Skip to content

Commit

Permalink
Merge pull request #491 from mvlvrd/competing_risks
Browse files Browse the repository at this point in the history
Cumulative Incidence (Kaplan-Meier) for competing risks.
  • Loading branch information
sebp authored Nov 27, 2024
2 parents 76de413 + 9fc899a commit 3a3ef80
Show file tree
Hide file tree
Showing 9 changed files with 557 additions and 24 deletions.
1 change: 1 addition & 0 deletions doc/api/datasets.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ Datasets
get_x_y
load_aids
load_arff_files_standardized
load_bmt
load_breast_cancer
load_flchain
load_gbsg2
Expand Down
1 change: 1 addition & 0 deletions doc/api/nonparametric.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ Non-parametric Estimators

CensoringDistributionEstimator
SurvivalFunctionEstimator
cumulative_incidence_competing_risks
ipc_weights
kaplan_meier_estimator
nelson_aalen_estimator
1 change: 1 addition & 0 deletions sksurv/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
get_x_y, # noqa: F401
load_aids, # noqa: F401
load_arff_files_standardized, # noqa: F401
load_bmt, # noqa: F401
load_breast_cancer, # noqa: F401
load_flchain, # noqa: F401
load_gbsg2, # noqa: F401
Expand Down
58 changes: 52 additions & 6 deletions sksurv/datasets/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
"get_x_y",
"load_arff_files_standardized",
"load_aids",
"load_bmt",
"load_breast_cancer",
"load_flchain",
"load_gbsg2",
Expand All @@ -26,13 +27,17 @@ def _get_data_path(name):
return files(__package__) / "data" / name


def _get_x_y_survival(dataset, col_event, col_time, val_outcome):
def _get_x_y_survival(dataset, col_event, col_time, val_outcome, competing_risks=False):
if col_event is None or col_time is None:
y = None
x_frame = dataset
else:
y = np.empty(dtype=[(col_event, bool), (col_time, np.float64)], shape=dataset.shape[0])
y[col_event] = (dataset[col_event] == val_outcome).values
event_type = np.int64 if competing_risks else bool
y = np.empty(dtype=[(col_event, event_type), (col_time, np.float64)], shape=dataset.shape[0])
if competing_risks:
y[col_event] = dataset[col_event].values
else:
y[col_event] = (dataset[col_event] == val_outcome).values
y[col_time] = dataset[col_time].values

x_frame = dataset.drop([col_event, col_time], axis=1)
Expand All @@ -51,7 +56,7 @@ def _get_x_y_other(dataset, col_label):
return x_frame, y


def get_x_y(data_frame, attr_labels, pos_label=None, survival=True):
def get_x_y(data_frame, attr_labels, pos_label=None, survival=True, competing_risks=False):
"""Split data frame into features and labels.
Parameters
Expand All @@ -75,6 +80,9 @@ def get_x_y(data_frame, attr_labels, pos_label=None, survival=True):
survival : bool, optional, default: True
Whether to return `y` that can be used for survival analysis.
competing_risks : bool, optional, default: False
Whether `y` refers to competing risks situation. Only used if `survival` is True
Returns
-------
X : pandas.DataFrame, shape = (n_samples, n_columns - len(attr_labels))
Expand All @@ -89,9 +97,9 @@ def get_x_y(data_frame, attr_labels, pos_label=None, survival=True):
if survival:
if len(attr_labels) != 2:
raise ValueError(f"expected sequence of length two for attr_labels, but got {len(attr_labels)}")
if pos_label is None:
if pos_label is None and not competing_risks:
raise ValueError("pos_label needs to be specified if survival=True")
return _get_x_y_survival(data_frame, attr_labels[0], attr_labels[1], pos_label)
return _get_x_y_survival(data_frame, attr_labels[0], attr_labels[1], pos_label, competing_risks)

return _get_x_y_other(data_frame, attr_labels)

Expand Down Expand Up @@ -434,3 +442,41 @@ def load_flchain():
"""
fn = _get_data_path("flchain.arff")
return get_x_y(loadarff(fn), attr_labels=["death", "futime"], pos_label="dead")


def load_bmt():
    """Load and return response to Hematopoietic stem cell transplantation (HSCT) for acute leukemia patients.

    The dataset has 35 samples and 1 feature:

    1. dis: Type of leukemia. 0=ALL (Acute Lymphoblastic Leukemia), 1=AML (Acute Myeloid Leukemia)

    The endpoints (status) are:

    0. Survival (right-censored data). 11 patients (31.4%)
    1. Transplant related mortality (TRM). 9 events (25.7%)
    2. Relapse. 15 events (42.8%)

    See [1]_ for further description and [2]_ for the dataset.

    Returns
    -------
    x : pandas.DataFrame
        The measurements for each patient.

    y : structured array with 2 fields
        *status*: Integer indicating the endpoint: 0-(survival i.e. right censored data), 1-(TRM), 2-(relapse)

        *ftime*: total length of follow-up or time of event.

    References
    ----------
    .. [1] https://doi.org/10.1038/sj.bmt.1705727
           Scrucca, L., Santucci, A. & Aversa, F.:
           "Competing risk analysis using R: an easy guide for clinicians. Bone Marrow Transplant 40, 381–387 (2007)"

    .. [2] https://luca-scr.github.io/R/bmt.csv
    """
    frame = loadarff(_get_data_path("bmt.arff"))
    # follow-up times are whole days; store them as integers rather than ARFF's default floats
    frame["ftime"] = frame["ftime"].astype(int)
    # status is a multi-valued event indicator (0=censored, 1=TRM, 2=relapse),
    # so request the competing-risks form of y
    return get_x_y(frame, attr_labels=["status", "ftime"], competing_risks=True)
19 changes: 11 additions & 8 deletions sksurv/datasets/data/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,15 @@
This folder contains freely available datasets that can be used
for survival analysis.

| Dataset | Description | Samples | Features | Events | Outcome |
| ------- | ----------- | ------- | -------- | ------ | ------- |
| actg320_aids or actg320_death | [AIDS study][Hosmer2008] | 1,151 | 11 | 96 (8.3%) | AIDS defining event or death |
| breast-cancer | [Breast cancer][Desmedt2007] | 198 | 80 | 51 (25.8%) | Distant metastases |
| flchain | [Assay of serum free light chain][Dispenzieri2012] | 7874 | 9 | 2169 (27.5%) | Death |
| GBSG2 | [German Breast Cancer Study Group 2][Schumacher1994] | 686 | 8 | 299 (43.6%) | Recurrence free survival |
| veteran | [Veteran's Lung Cancer][Kalbfleisch2008] | 137 | 6 | 128 (93.4%) | Death |
| whas500 | [Worcester Heart Attack Study][Hosmer2008] | 500 | 14 | 215 (43.0%) | Death|
| Dataset | Description | Samples | Features | Events | Outcome |
|-------------------------------|------------------------------------------------------|---------|----------|--------------|------------------------------|
| actg320_aids or actg320_death | [AIDS study][Hosmer2008] | 1,151 | 11 | 96 (8.3%) | AIDS defining event or death |
| breast-cancer | [Breast cancer][Desmedt2007] | 198 | 80 | 51 (25.8%) | Distant metastases |
| flchain | [Assay of serum free light chain][Dispenzieri2012] | 7874 | 9 | 2169 (27.5%) | Death |
| GBSG2 | [German Breast Cancer Study Group 2][Schumacher1994] | 686 | 8 | 299 (43.6%) | Recurrence free survival |
| veteran | [Veteran's Lung Cancer][Kalbfleisch2008] | 137 | 6 | 128 (93.4%) | Death |
| whas500 | [Worcester Heart Attack Study][Hosmer2008] | 500 | 14 | 215 (43.0%) | Death |
| BMT | [Leukemia HSC Transplant][Scrucca2007] | 35 | 1 | 24 (68.6%) | Transplant related death or relapse |

[Desmedt2007]: http://dx.doi.org/10.1158/1078-0432.CCR-06-2765 "Desmedt, C., Piette, F., Loi et al.: Strong Time Dependence of the 76-Gene Prognostic Signature for Node-Negative Breast Cancer Patients in the TRANSBIG Multicenter Independent Validation Series. Clin. Cancer Res. 13(11), 3207–14 (2007)"

Expand All @@ -21,3 +22,5 @@ for survival analysis.
[Kalbfleisch2008]: http://www.wiley.com/WileyCDA/WileyTitle/productCd-047136357X.html "Kalbfleisch, J.D., Prentice, R.L.: The Statistical Analysis of Failure Time Data. John Wiley & Sons, Inc. (2002)"

[Schumacher1994]: http://ascopubs.org/doi/abs/10.1200/jco.1994.12.10.2086 "Schumacher, M., Basert, G., Bojar, H., et al. Randomized 2 × 2 trial evaluating hormonal treatment and the duration of chemotherapy in node-positive breast cancer patients. Journal of Clinical Oncology 12, 2086–2093. (1994)"

[Scrucca2007]: https://doi.org/10.1038/sj.bmt.1705727 "Scrucca, L., Santucci, A. & Aversa, F. Competing risk analysis using R: an easy guide for clinicians. Bone Marrow Transplant 40, 381–387 (2007)"
46 changes: 46 additions & 0 deletions sksurv/datasets/data/bmt.arff
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
% Scrucca L., Santucci A., Aversa F. (2007)
% Competing risks analysis using R: an easy guide for clinicians.
% Bone Marrow Transplantation 40, 381-387.
% https://luca-scr.github.io/R/bmt.csv
@RELATION BMT

@ATTRIBUTE dis {0,1}
@ATTRIBUTE ftime NUMERIC
@ATTRIBUTE status {0,1,2}

@DATA
0,13,2
0,1,1
0,72,0
0,7,2
0,8,2
1,67,0
0,9,2
0,5,2
1,70,0
1,4,0
1,7,0
1,68,0
0,1,2
1,10,2
1,7,2
1,3,1
1,4,1
1,4,1
1,3,1
1,3,1
0,22,2
1,8,1
1,2,2
0,0,2
0,0,1
0,35,0
1,35,0
0,4,2
0,14,2
0,26,2
0,3,2
1,2,0
1,8,0
1,32,0
0,12,1
107 changes: 101 additions & 6 deletions sksurv/nonparametric.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
"nelson_aalen_estimator",
"ipc_weights",
"SurvivalFunctionEstimator",
"cumulative_incidence_competing_risks",
]


Expand All @@ -36,6 +37,9 @@ def _compute_counts(event, time, order=None):
----------
event : array
Boolean event indicator.
Integer in the case of multiple risks.
Zero means right-censored event.
Positive values for each of the possible risk events.
time : array
Survival time or time of censoring.
Expand All @@ -51,6 +55,7 @@ def _compute_counts(event, time, order=None):
n_events : array
Number of events at each time point.
2D array with shape `(n_unique_time_points, n_risks + 1)` in the case of competing risks.
n_at_risk : array
Number of samples that have not been censored or have not had an event at each time point.
Expand All @@ -59,23 +64,27 @@ def _compute_counts(event, time, order=None):
Number of censored samples at each time point.
"""
n_samples = event.shape[0]
n_risks = event.max() if (np.issubdtype(event.dtype, np.integer) and event.max() > 1) else 0

if order is None:
order = np.argsort(time, kind="mergesort")

uniq_times = np.empty(n_samples, dtype=time.dtype)
uniq_events = np.empty(n_samples, dtype=int)
uniq_events = np.empty((n_samples, n_risks + 1), dtype=int)
uniq_counts = np.empty(n_samples, dtype=int)

i = 0
prev_val = time[order[0]]
j = 0
while True:
count_event = 0
count_event = np.zeros(n_risks + 1, dtype=int)
count = 0
while i < n_samples and prev_val == time[order[i]]:
if event[order[i]]:
count_event += 1
event_type = event[order[i]]
if event_type:
count_event[0] += 1
if n_risks:
count_event[event_type] += 1

count += 1
i += 1
Expand All @@ -91,9 +100,13 @@ def _compute_counts(event, time, order=None):
prev_val = time[order[i]]

times = np.resize(uniq_times, j)
n_events = np.resize(uniq_events, j)
total_count = np.resize(uniq_counts, j)
n_censored = total_count - n_events
if n_risks:
n_events = np.resize(uniq_events, (j, n_risks + 1))
n_censored = total_count - n_events[:, 0]
else:
n_events = np.resize(uniq_events, j)
n_censored = total_count - n_events

# offset cumulative sum by one
total_count = np.r_[0, total_count]
Expand Down Expand Up @@ -586,3 +599,85 @@ def predict_ipcw(self, y):
weights[event] = 1.0 / Ghat

return weights


def cumulative_incidence_competing_risks(event, time_exit, time_min=None):
    """Non-parametric estimator of Cumulative Incidence function in the case of competing risks.

    See [1]_ for further description.

    Parameters
    ----------
    event : array-like, shape = (n_samples,)
        Contains event indicators.
        '0' indicates right-censoring.
        Positive integers (between 1 and n_risks, n_risks being the total number of different risks)
        indicate the possible different risks.
        It assumes there are events for all possible risks.

    time_exit : array-like, shape = (n_samples,)
        Contains event/censoring times.

    time_min : float, optional, default: None
        Compute estimator conditional on survival at least up to
        the specified time.

    Returns
    -------
    time : array, shape = (n_times,)
        Unique times.

    cum_incidence : array, shape = (n_risks + 1, n_times)
        Cumulative incidence at each unique time point.
        The first dimension indicates total risk (``cum_incidence[0]``),
        the dimension `i=1,..,n_risks` the incidence for each competing risk.

    Examples
    --------
    Creating cumulative incidence curves:

    >>> import matplotlib.pyplot as plt
    >>> from sksurv.datasets import load_bmt
    >>> dis, bmt_df = load_bmt()
    >>> event = bmt_df["status"]
    >>> time = bmt_df["ftime"]
    >>> n_risks = event.max()
    >>> x, y = cumulative_incidence_competing_risks(event, time)
    >>> plt.step(x, y[0], where="post", label="Total risk")
    >>> for i in range(1, n_risks + 1):
    >>>     plt.step(x, y[i], where="post", label=f"{i}-risk")
    >>> plt.ylim(0, 1)
    >>> plt.legend()
    >>> plt.show()

    References
    ----------
    .. [1] Kalbfleisch, J.D. and Prentice, R.L. (2002)
           The Statistical Analysis of Failure Time Data. 2nd Edition, John Wiley and Sons, New York.
    """
    event, time_exit = check_y_survival(event, time_exit, allow_all_censored=True, competing_risks=True)
    check_consistent_length(event, time_exit)

    n_risks = event.max()
    uniq_times, n_events_cr, n_at_risk, _n_censored = _compute_counts(event, time_exit)

    # n_events_cr[:, 0] is the total event count at each time point,
    # n_events_cr[:, i] (i >= 1) the count for risk i.
    # account for 0/0 = nan: time points with zero events keep a ratio of 0
    ratio = np.divide(
        n_events_cr,
        n_at_risk[..., np.newaxis],
        out=np.zeros((uniq_times.shape[0], n_risks + 1), dtype=float),
        where=n_events_cr != 0,
    )

    if time_min is not None:
        mask = uniq_times >= time_min
        uniq_times = np.compress(mask, uniq_times)
        ratio = np.compress(mask, ratio, axis=0)

    # Determine the number of time points AFTER the optional time_min filtering,
    # so that cum_inc is allocated with the same length as kpe/ratio.
    # (Computing it before the filtering caused a shape mismatch when time_min was set.)
    n_t = uniq_times.shape[0]

    # Overall Kaplan-Meier survival estimate; kpe_prime is S(t-), i.e. survival
    # just before each time point (shifted by one, starting at 1).
    kpe = np.cumprod(1.0 - ratio[:, 0])
    kpe_prime = np.ones_like(kpe)
    kpe_prime[1:] = kpe[:-1]
    cum_inc = np.empty((n_risks + 1, n_t), dtype=float)
    # Aalen-Johansen form: CIF_i(t) = sum over t_j <= t of S(t_j-) * d_ij / n_j
    cum_inc[1 : n_risks + 1] = np.cumsum(ratio[:, 1 : n_risks + 1].T * kpe_prime, axis=1)
    cum_inc[0] = 1.0 - kpe

    return uniq_times, cum_inc
Loading

0 comments on commit 3a3ef80

Please sign in to comment.