Skip to content

Commit

Permalink
Merge pull request #491 from mvlvrd/competing_risks
Browse files Browse the repository at this point in the history
Cumulative Incidence (Kaplan-Meier) for competing risks.
  • Loading branch information
sebp authored Nov 27, 2024
2 parents 76de413 + 9fc899a commit 3a3ef80
Show file tree
Hide file tree
Showing 9 changed files with 557 additions and 24 deletions.
1 change: 1 addition & 0 deletions doc/api/datasets.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ Datasets
get_x_y
load_aids
load_arff_files_standardized
load_bmt
load_breast_cancer
load_flchain
load_gbsg2
Expand Down
1 change: 1 addition & 0 deletions doc/api/nonparametric.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ Non-parametric Estimators

CensoringDistributionEstimator
SurvivalFunctionEstimator
cumulative_incidence_competing_risks
ipc_weights
kaplan_meier_estimator
nelson_aalen_estimator
1 change: 1 addition & 0 deletions sksurv/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
get_x_y, # noqa: F401
load_aids, # noqa: F401
load_arff_files_standardized, # noqa: F401
load_bmt, # noqa: F401
load_breast_cancer, # noqa: F401
load_flchain, # noqa: F401
load_gbsg2, # noqa: F401
Expand Down
58 changes: 52 additions & 6 deletions sksurv/datasets/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
"get_x_y",
"load_arff_files_standardized",
"load_aids",
"load_bmt",
"load_breast_cancer",
"load_flchain",
"load_gbsg2",
Expand All @@ -26,13 +27,17 @@ def _get_data_path(name):
return files(__package__) / "data" / name


def _get_x_y_survival(dataset, col_event, col_time, val_outcome):
def _get_x_y_survival(dataset, col_event, col_time, val_outcome, competing_risks=False):
if col_event is None or col_time is None:
y = None
x_frame = dataset
else:
y = np.empty(dtype=[(col_event, bool), (col_time, np.float64)], shape=dataset.shape[0])
y[col_event] = (dataset[col_event] == val_outcome).values
event_type = np.int64 if competing_risks else bool
y = np.empty(dtype=[(col_event, event_type), (col_time, np.float64)], shape=dataset.shape[0])
if competing_risks:
y[col_event] = dataset[col_event].values
else:
y[col_event] = (dataset[col_event] == val_outcome).values
y[col_time] = dataset[col_time].values

x_frame = dataset.drop([col_event, col_time], axis=1)
Expand All @@ -51,7 +56,7 @@ def _get_x_y_other(dataset, col_label):
return x_frame, y


def get_x_y(data_frame, attr_labels, pos_label=None, survival=True):
def get_x_y(data_frame, attr_labels, pos_label=None, survival=True, competing_risks=False):
"""Split data frame into features and labels.
Parameters
Expand All @@ -75,6 +80,9 @@ def get_x_y(data_frame, attr_labels, pos_label=None, survival=True):
survival : bool, optional, default: True
Whether to return `y` that can be used for survival analysis.
competing_risks : bool, optional, default: False
Whether `y` refers to competing risks situation. Only used if `survival` is True
Returns
-------
X : pandas.DataFrame, shape = (n_samples, n_columns - len(attr_labels))
Expand All @@ -89,9 +97,9 @@ def get_x_y(data_frame, attr_labels, pos_label=None, survival=True):
if survival:
if len(attr_labels) != 2:
raise ValueError(f"expected sequence of length two for attr_labels, but got {len(attr_labels)}")
if pos_label is None:
if pos_label is None and not competing_risks:
raise ValueError("pos_label needs to be specified if survival=True")
return _get_x_y_survival(data_frame, attr_labels[0], attr_labels[1], pos_label)
return _get_x_y_survival(data_frame, attr_labels[0], attr_labels[1], pos_label, competing_risks)

return _get_x_y_other(data_frame, attr_labels)

Expand Down Expand Up @@ -434,3 +442,41 @@ def load_flchain():
"""
fn = _get_data_path("flchain.arff")
return get_x_y(loadarff(fn), attr_labels=["death", "futime"], pos_label="dead")


def load_bmt():
    """Load and return response to Hematopoietic stem cell transplantation (HSCT) for acute leukemia patients.

    The dataset has 35 samples and 1 feature:

    1. dis: Type of leukemia. 0=ALL (Acute Lymphoblastic Leukemia), 1=AML (Acute Myeloid Leukemia)

    The endpoints (status) are:

    0. Survival (right-censored data). 11 patients (31.4%)
    1. Transplant related mortality (TRM). 9 events (25.7%)
    2. Relapse. 15 events (42.8%)

    See [1]_ for further description and [2]_ for the dataset.

    Returns
    -------
    x : pandas.DataFrame
        The measurements for each patient.

    y : structured array with 2 fields
        *status*: Integer indicating the endpoint: 0-(survival i.e. right censored data), 1-(TRM), 2-(relapse)

        *ftime*: total length of follow-up or time of event.

    References
    ----------
    .. [1] https://doi.org/10.1038/sj.bmt.1705727
           Scrucca, L., Santucci, A. & Aversa, F.:
           "Competing risk analysis using R: an easy guide for clinicians. Bone Marrow Transplant 40, 381–387 (2007)"

    .. [2] https://luca-scr.github.io/R/bmt.csv
    """
    frame = loadarff(_get_data_path("bmt.arff"))
    # follow-up times are whole days; store them as integers rather than ARFF's default floats
    frame["ftime"] = frame["ftime"].astype(int)
    # status is a multi-valued event indicator (0=censored, 1=TRM, 2=relapse),
    # so request the competing-risks form of y
    return get_x_y(frame, attr_labels=["status", "ftime"], competing_risks=True)
19 changes: 11 additions & 8 deletions sksurv/datasets/data/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,15 @@
This folder contains freely available datasets that can be used
for survival analysis.

| Dataset | Description | Samples | Features | Events | Outcome |
| ------- | ----------- | ------- | -------- | ------ | ------- |
| actg320_aids or actg320_death | [AIDS study][Hosmer2008] | 1,151 | 11 | 96 (8.3%) | AIDS defining event or death |
| breast-cancer | [Breast cancer][Desmedt2007] | 198 | 80 | 51 (25.8%) | Distant metastases |
| flchain | [Assay of serum free light chain][Dispenzieri2012] | 7874 | 9 | 2169 (27.5%) | Death |
| GBSG2 | [German Breast Cancer Study Group 2][Schumacher1994] | 686 | 8 | 299 (43.6%) | Recurrence free survival |
| veteran | [Veteran's Lung Cancer][Kalbfleisch2008] | 137 | 6 | 128 (93.4%) | Death |
| whas500 | [Worcester Heart Attack Study][Hosmer2008] | 500 | 14 | 215 (43.0%) | Death|
| Dataset | Description | Samples | Features | Events | Outcome |
|-------------------------------|------------------------------------------------------|---------|----------|--------------|------------------------------|
| actg320_aids or actg320_death | [AIDS study][Hosmer2008] | 1,151 | 11 | 96 (8.3%) | AIDS defining event or death |
| breast-cancer | [Breast cancer][Desmedt2007] | 198 | 80 | 51 (25.8%) | Distant metastases |
| flchain | [Assay of serum free light chain][Dispenzieri2012] | 7874 | 9 | 2169 (27.5%) | Death |
| GBSG2 | [German Breast Cancer Study Group 2][Schumacher1994] | 686 | 8 | 299 (43.6%) | Recurrence free survival |
| veteran | [Veteran's Lung Cancer][Kalbfleisch2008] | 137 | 6 | 128 (93.4%) | Death |
| whas500 | [Worcester Heart Attack Study][Hosmer2008] | 500 | 14 | 215 (43.0%) | Death |
| BMT | [Leukemia HSC Transplant][Scrucca2007] | 35 | 1 | 24 (68.6%) | Transplant related death or relapse |

[Desmedt2007]: http://dx.doi.org/10.1158/1078-0432.CCR-06-2765 "Desmedt, C., Piette, F., Loi et al.: Strong Time Dependence of the 76-Gene Prognostic Signature for Node-Negative Breast Cancer Patients in the TRANSBIG Multicenter Independent Validation Series. Clin. Cancer Res. 13(11), 3207–14 (2007)"

Expand All @@ -21,3 +22,5 @@ for survival analysis.
[Kalbfleisch2008]: http://www.wiley.com/WileyCDA/WileyTitle/productCd-047136357X.html "Kalbfleisch, J.D., Prentice, R.L.: The Statistical Analysis of Failure Time Data. John Wiley & Sons, Inc. (2002)"

[Schumacher1994]: http://ascopubs.org/doi/abs/10.1200/jco.1994.12.10.2086 "Schumacher, M., Basert, G., Bojar, H., et al. Randomized 2 × 2 trial evaluating hormonal treatment and the duration of chemotherapy in node-positive breast cancer patients. Journal of Clinical Oncology 12, 2086–2093. (1994)"

[Scrucca2007]: https://doi.org/10.1038/sj.bmt.1705727 "Scrucca, L., Santucci, A. & Aversa, F. Competing risk analysis using R: an easy guide for clinicians. Bone Marrow Transplant 40, 381–387 (2007)"
46 changes: 46 additions & 0 deletions sksurv/datasets/data/bmt.arff
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
% Scrucca L., Santucci A., Aversa F. (2007)
% Competing risks analysis using R: an easy guide for clinicians.
% Bone Marrow Transplantation 40, 381-387.
% https://luca-scr.github.io/R/bmt.csv
@RELATION BMT

@ATTRIBUTE dis {0,1}
@ATTRIBUTE ftime NUMERIC
@ATTRIBUTE status {0,1,2}

@DATA
0,13,2
0,1,1
0,72,0
0,7,2
0,8,2
1,67,0
0,9,2
0,5,2
1,70,0
1,4,0
1,7,0
1,68,0
0,1,2
1,10,2
1,7,2
1,3,1
1,4,1
1,4,1
1,3,1
1,3,1
0,22,2
1,8,1
1,2,2
0,0,2
0,0,1
0,35,0
1,35,0
0,4,2
0,14,2
0,26,2
0,3,2
1,2,0
1,8,0
1,32,0
0,12,1
107 changes: 101 additions & 6 deletions sksurv/nonparametric.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
"nelson_aalen_estimator",
"ipc_weights",
"SurvivalFunctionEstimator",
"cumulative_incidence_competing_risks",
]


Expand All @@ -36,6 +37,9 @@ def _compute_counts(event, time, order=None):
----------
event : array
Boolean event indicator.
Integer in the case of multiple risks.
Zero means right-censored event.
Positive values for each of the possible risk events.
time : array
Survival time or time of censoring.
Expand All @@ -51,6 +55,7 @@ def _compute_counts(event, time, order=None):
n_events : array
Number of events at each time point.
2D array with shape `(n_unique_time_points, n_risks + 1)` in the case of competing risks.
n_at_risk : array
Number of samples that have not been censored or have not had an event at each time point.
Expand All @@ -59,23 +64,27 @@ def _compute_counts(event, time, order=None):
Number of censored samples at each time point.
"""
n_samples = event.shape[0]
n_risks = event.max() if (np.issubdtype(event.dtype, np.integer) and event.max() > 1) else 0

if order is None:
order = np.argsort(time, kind="mergesort")

uniq_times = np.empty(n_samples, dtype=time.dtype)
uniq_events = np.empty(n_samples, dtype=int)
uniq_events = np.empty((n_samples, n_risks + 1), dtype=int)
uniq_counts = np.empty(n_samples, dtype=int)

i = 0
prev_val = time[order[0]]
j = 0
while True:
count_event = 0
count_event = np.zeros(n_risks + 1, dtype=int)
count = 0
while i < n_samples and prev_val == time[order[i]]:
if event[order[i]]:
count_event += 1
event_type = event[order[i]]
if event_type:
count_event[0] += 1
if n_risks:
count_event[event_type] += 1

count += 1
i += 1
Expand All @@ -91,9 +100,13 @@ def _compute_counts(event, time, order=None):
prev_val = time[order[i]]

times = np.resize(uniq_times, j)
n_events = np.resize(uniq_events, j)
total_count = np.resize(uniq_counts, j)
n_censored = total_count - n_events
if n_risks:
n_events = np.resize(uniq_events, (j, n_risks + 1))
n_censored = total_count - n_events[:, 0]
else:
n_events = np.resize(uniq_events, j)
n_censored = total_count - n_events

# offset cumulative sum by one
total_count = np.r_[0, total_count]
Expand Down Expand Up @@ -586,3 +599,85 @@ def predict_ipcw(self, y):
weights[event] = 1.0 / Ghat

return weights


def cumulative_incidence_competing_risks(event, time_exit, time_min=None):
    """Non-parametric estimator of Cumulative Incidence function in the case of competing risks.

    See [1]_ for further description.

    Parameters
    ----------
    event : array-like, shape = (n_samples,)
        Contains event indicators.
        '0' indicates right-censoring.
        Positive integers (between 1 and n_risks, n_risks being the total number of different risks)
        indicate the possible different risks.
        It assumes there are events for all possible risks.

    time_exit : array-like, shape = (n_samples,)
        Contains event/censoring times.

    time_min : float, optional, default: None
        Compute estimator conditional on survival at least up to
        the specified time.

    Returns
    -------
    time : array, shape = (n_times,)
        Unique times.

    cum_incidence : array, shape = (n_risks + 1, n_times)
        Cumulative incidence at each unique time point.
        The first dimension indicates total risk (``cum_incidence[0]``),
        the dimension `i=1,..,n_risks` the incidence for each competing risk.

    Examples
    --------
    Creating cumulative incidence curves:

    >>> import matplotlib.pyplot as plt
    >>> from sksurv.datasets import load_bmt
    >>> dis, bmt_df = load_bmt()
    >>> event = bmt_df["status"]
    >>> time = bmt_df["ftime"]
    >>> n_risks = event.max()
    >>> x, y = cumulative_incidence_competing_risks(event, time)
    >>> plt.step(x, y[0], where="post", label="Total risk")
    >>> for i in range(1, n_risks + 1):
    >>>     plt.step(x, y[i], where="post", label=f"{i}-risk")
    >>> plt.ylim(0, 1)
    >>> plt.legend()
    >>> plt.show()

    References
    ----------
    .. [1] Kalbfleisch, J.D. and Prentice, R.L. (2002)
           The Statistical Analysis of Failure Time Data. 2nd Edition, John Wiley and Sons, New York.
    """
    event, time_exit = check_y_survival(event, time_exit, allow_all_censored=True, competing_risks=True)
    check_consistent_length(event, time_exit)

    n_risks = event.max()
    uniq_times, n_events_cr, n_at_risk, _n_censored = _compute_counts(event, time_exit)

    # n_events_cr[:, 0] is the total event count at each time point,
    # n_events_cr[:, i] (i >= 1) the count for risk i.
    # account for 0/0 = nan: time points with zero events keep a ratio of 0
    ratio = np.divide(
        n_events_cr,
        n_at_risk[..., np.newaxis],
        out=np.zeros((uniq_times.shape[0], n_risks + 1), dtype=float),
        where=n_events_cr != 0,
    )

    if time_min is not None:
        mask = uniq_times >= time_min
        uniq_times = np.compress(mask, uniq_times)
        ratio = np.compress(mask, ratio, axis=0)

    # Determine the number of time points AFTER the optional time_min filtering,
    # so that cum_inc is allocated with the same length as kpe/ratio.
    # (Computing it before the filtering caused a shape mismatch when time_min was set.)
    n_t = uniq_times.shape[0]

    # Overall Kaplan-Meier survival estimate; kpe_prime is S(t-), i.e. survival
    # just before each time point (shifted by one, starting at 1).
    kpe = np.cumprod(1.0 - ratio[:, 0])
    kpe_prime = np.ones_like(kpe)
    kpe_prime[1:] = kpe[:-1]
    cum_inc = np.empty((n_risks + 1, n_t), dtype=float)
    # Aalen-Johansen form: CIF_i(t) = sum over t_j <= t of S(t_j-) * d_ij / n_j
    cum_inc[1 : n_risks + 1] = np.cumsum(ratio[:, 1 : n_risks + 1].T * kpe_prime, axis=1)
    cum_inc[0] = 1.0 - kpe

    return uniq_times, cum_inc
Loading

0 comments on commit 3a3ef80

Please sign in to comment.