Changes in the pull request review:

- Better documentation. - Some errors with `::` in Rest are corrected. - Clearer processing of the cum_inc = 0 case when calculating Confidence Intervals.
sebp · Jan 7, 2025 · 90a379c · 90a379c
1 parent 344e15d
commit 90a379c
Show file tree

Hide file tree

Showing 3 changed files with 60 additions and 34 deletions.
diff --git a/sksurv/datasets/base.py b/sksurv/datasets/base.py
@@ -489,26 +489,47 @@ def load_cgvhd():
     initiated for patients with a myeloid malignancy who were to
     undergo an allogeneic bone marrow transplant.
 
-    The dataset has 100 samples and 4 features:::
+    The dataset has 100 samples.
+
+    +-------+------------+----------------------------------------------+-------------------------------------------+
+    | Index | Name       | Description                                  | Encoding                                  |
+    +=======+============+==============================================+===========================================+
+    | 1     | dx         | Diagnosis                                    | | AML=acute myeloid leukaemia             |
+    |       |            |                                              | | CML=chronic myeloid leukaemia           |
+    |       |            |                                              | | MDS=myelodysplastic syndrome            |
+    +-------+------------+----------------------------------------------+-------------------------------------------+
+    | 2     | tx         | Randomized treatment                         | | BM=cell harvested from the bone marrow  |
+    |       |            |                                              | | PB=cell harvested from peripheral blood |
+    +-------+------------+----------------------------------------------+-------------------------------------------+
+    | 3     | extent     | Extent of disease                            | L=limited, E=extensive                    |
+    +-------+------------+----------------------------------------------+-------------------------------------------+
+    | 4     | agvhdgd    | Grade of acute GVHD                          |                                           |
+    +-------+------------+----------------------------------------------+-------------------------------------------+
+    | 5     | age        | Age                                          | Years                                     |
+    +-------+------------+----------------------------------------------+-------------------------------------------+
+    | 6     | survtime   | Time from date of transplant to death        | Years                                     |
+    |       |            | or last follow-up                            |                                           |
+    +-------+------------+----------------------------------------------+-------------------------------------------+
+    | 7     | reltime    | Time from date of transplant to relapse      | Years                                     |
+    |       |            | or last follow-up                            |                                           |
+    +-------+------------+----------------------------------------------+-------------------------------------------+
+    | 8     | agvhtime   | Time from date of transplant to acute GVHD   | Years                                     |
+    |       |            | or last follow-up                            |                                           |
+    +-------+------------+----------------------------------------------+-------------------------------------------+
+    | 9     | cgvhtime   | Time from date of transplant to chronic GVHD | Years                                     |
+    |       |            | or last follow-up                            |                                           |
+    +-------+------------+----------------------------------------------+-------------------------------------------+
+    | 10    | stat       | Status                                       | 1=Dead, 0=Alive                           |
+    +-------+------------+----------------------------------------------+-------------------------------------------+
+    | 11    | rcens      | Relapse                                      | 1=Yes, 0=No                               |
+    +-------+------------+----------------------------------------------+-------------------------------------------+
+    | 12    | agvh       | Acute GVHD                                   | 1=Yes, 0=No                               |
+    +-------+------------+----------------------------------------------+-------------------------------------------+
+    | 13    | cgvh       | Chronic GVHD                                 | 1=Yes, 0=No                               |
+    +-------+------------+----------------------------------------------+-------------------------------------------+
+    | 14    | patient ID |                                              |                                           |
+    +-------+------------+----------------------------------------------+-------------------------------------------+
 
-        1. dx: Diagnosis: AML=acute myeloid leukaemia, CML=chronic myeloid leukaemia, MDS=myelodysplastic syndrome
-        2. tx: Randomized treatment: BM=cell harvested from the bone marrow, PB=cell harvested from the peripheral blood
-        3. extent: Extent of disease: L=limited, E=extensive
-        5. age: Age (years)
-
-    The rest of the columns correspond to outcome variables:::
-
-        6.  survtime: Time from date of transplant to death or last follow-up (Years)
-        7.  reltime: Time from date of transplant to relapse or last follow-up (Years)
-        8.  agvhtime: Time from date of transplant to acute GVHD or last follow-up (Years)
-        9.  cgvhtime: Time from date of transplant to chronic GVHD or last follow-up (Years)
-        10. stat Status: 1=Dead, 0=Alive
-        11. rcens Relapse: 1=Yes, 0=No
-        4.  agvhdgd: Grade of acute GVHD
-        12. agvh: Acute GVHD: 1=Yes, 0=No
-        13. cgvh: Chronic GVHD: 1=Yes, 0=No
-
-    The last column (14) is a patient ID.
 
     Columns 6,7 and 9 contain the time to death, relapse and CGVHD
     calculated in years (survtime, reltime, cgvhtime) and the
@@ -518,7 +539,7 @@ def load_cgvhd():
     censoring variable cens is coded as 0 when no events were
     observed, 1 if CGVHD was observed as first event, 2 if a relapse
     was observed as the first event and 3 if death occurred before
-    either of the events: The endpoint (status) are therefore:
+    either of the events: The endpoint (status) are therefore::
 
         0. Survival (Right-censored data). 4 patients (4%)
         1. Chronic graft versus host disease (CGVHD). 86 events (86%)
@@ -541,8 +562,7 @@ def load_cgvhd():
     ----------
     .. [1] Melania Pintilie: "Competing Risks: A Practical Perspective". John Wiley & Sons, 2006
 
-    .. [2] https://drive.google.com/file/d/1FPM264pE\_-F8DB7lvFeLB1yQ3HDo7c-i/view
-           https://sites.google.com/view/melaniapintiliemscstatistics/home/statistics
+    .. [2] https://sites.google.com/view/melaniapintiliemscstatistics/home/statistics
     """
     full_path = _get_data_path("cgvhd.arff")
     data = loadarff(full_path)

diff --git a/sksurv/nonparametric.py b/sksurv/nonparametric.py
@@ -604,15 +604,15 @@ def predict_ipcw(self, y):
 def _cr_ci_logmlog(cum_inc, sigma_t, z):
     """Compute the pointwise log-minus-log transformed confidence intervals"""
     eps = np.finfo(cum_inc.dtype).eps
+    zero_count = cum_inc <= eps
     log_cum_i = np.zeros_like(cum_inc)
-    np.log(cum_inc, where=cum_inc > eps, out=log_cum_i)
+    np.log(cum_inc, where=~zero_count, out=log_cum_i)
     theta = np.zeros_like(cum_inc)
     den = cum_inc * log_cum_i
-    np.divide(sigma_t, den, where=den < -eps, out=theta)
+    np.divide(sigma_t, den, where=~zero_count, out=theta)
     theta = z * np.multiply.outer(np.array([-1, 1]), theta)
     ci = np.exp(log_cum_i * np.exp(theta))
-    ci[:, cum_inc <= eps] = 0.0
-    ci[:, 1.0 - cum_inc <= eps] = 1.0
+    ci[:, zero_count] = 0.0
     return ci
 
 
@@ -635,7 +635,7 @@ def cumulative_incidence_competing_risks(
     time_min=None,
     conf_level=0.95,
     conf_type=None,
-    var_type=None,
+    var_type="Dinse",
 ):
     """Non-parametric estimator of Cumulative Incidence function in the case of competing risks.
 
@@ -665,8 +665,9 @@ def cumulative_incidence_competing_risks(
         If "log-log", estimate confidence intervals using
         the log hazard or :math:`log(-log(S(t)))`.
 
-    var_type : None or one of {'Dinse', 'Dinse_Approx', 'Aalen'}, optional, default: None
+    var_type : None or one of {'Dinse', 'Dinse_Approx', 'Aalen'}, optional, default: 'Dinse'
         The method for estimating the variance of the estimator.
+        See [2]_, [3]_ and [4]_ for each of the methods.
         Only valid if conf_type is valid.
 
     Returns
@@ -708,6 +709,11 @@ def cumulative_incidence_competing_risks(
     ----------
     .. [1] Kalbfleisch, J.D. and Prentice, R.L. (2002)
            The Statistical Analysis of Failure Time Data. 2nd Edition, John Wiley and Sons, New York.
+    .. [2] Dinse and Larson, Biometrika (1986), 379. Sect. 4, Eqs. 4 and 5.
+    .. [3] Dinse and Larson, Biometrika (1986), 379. Sect. 4, Eq. 6.
+    .. [4] Aalen, O. (1978a). Annals of Statistics, 6, 534–545.
+           We implement the formula in M. Pintilie: "Competing Risks: A Practical Perspective".
+           John Wiley & Sons, 2006, Eq. 4.5
     """
     event, time_exit = check_y_survival(event, time_exit, allow_all_censored=True, competing_risks=True)
     check_consistent_length(event, time_exit)
@@ -746,7 +752,7 @@ def cumulative_incidence_competing_risks(
     elif var_type == "Aalen":
         var = _var_aalen(n_events_cr, kpe_prime, n_at_risk, cum_inc)
     else:
-        raise ValueError(f"{var_type=} not implemented.")
+        raise ValueError(f"{var_type=} must be one of 'Dinse', 'Dinse_approx' or 'Aalen'.")
 
     _x, _y, conf_int_km = kaplan_meier_estimator(event > 0, time_exit, conf_type="log-log")
     ci = np.empty(shape=(2, n_risks + 1, n_t), dtype=conf_int_km.dtype)
@@ -768,9 +774,9 @@ def _var_dinse_approx(n_events_cr, kpe_prime, n_at_risk, cum_inc):
     irt = cum_inc[1:, :, np.newaxis] - cum_inc[1:, np.newaxis, :]
     mask = np.tril(np.ones_like(irt[0]))
 
-    var_a = np.einsum("rjk,jk,k->rj", irt**2, mask, dr / (n_at_risk * (n_at_risk - dr)))
+    var_a = np.sum(irt**2 * mask * (dr / (n_at_risk * (n_at_risk - dr))), axis=2)
     var_b = np.cumsum(((n_at_risk - dr_cr) / n_at_risk) * (dr_cr / n_at_risk**2) * kpe_prime**2, axis=1)
-    var_c = -2 * np.einsum("rjk,jk,rk,k->rj", irt, mask, dr_cr, kpe_prime / n_at_risk**2)
+    var_c = -2 * np.sum(irt * mask * dr_cr[:, np.newaxis, :] * (kpe_prime / n_at_risk**2), axis=2)
 
     var = var_a + var_b + var_c
     return var
@@ -821,7 +827,7 @@ def _var_aalen(n_events_cr, kpe_prime, n_at_risk, cum_inc):
     _va = np.zeros_like(kpe_prime)
     den_a = (n_at_risk - 1) * (n_at_risk - dr)
     np.divide(dr, den_a, out=_va, where=den_a > 0)
-    var_a = np.einsum("rjk,jk,k->rj", irt**2, mask, _va)
+    var_a = np.sum(irt**2 * mask * _va, axis=2)
 
     _vb = np.zeros_like(kpe_prime)
     den_b = (n_at_risk - 1) * n_at_risk**2
@@ -832,7 +838,7 @@ def _var_aalen(n_events_cr, kpe_prime, n_at_risk, cum_inc):
     _vcb = np.zeros_like(kpe_prime)
     den_c = n_at_risk * (n_at_risk - dr) * (n_at_risk - 1)
     np.divide(kpe_prime, den_c, out=_vcb, where=den_c > 0)
-    var_c = -2 * np.einsum("rjk,jk,rk,k->rj", irt, mask, _vca, _vcb)
+    var_c = -2 * np.sum(irt * mask * _vca[:, np.newaxis, :] * _vcb, axis=2)
 
     var = var_a + var_b + var_c
     return var
diff --git a/tests/test_nonparametric.py b/tests/test_nonparametric.py
@@ -7657,6 +7657,6 @@ def test_invalid_conf_type_competing_risks(event, time, true_x, true_y, conf_typ
     @pytest.mark.parametrize("event, time, true_x, true_y", SimpleDataBMTCases().get_cases())
     @pytest.mark.parametrize("var_type", ["None", "dinse", 1, "", "not"])
     def test_invalid_var_type_competing_risks(event, time, true_x, true_y, var_type):
-        msg = f"{var_type=} not implemented."
+        msg = f"{var_type=} must be one of 'Dinse', 'Dinse_approx' or 'Aalen'."
         with pytest.raises(ValueError, match=msg):
             cumulative_incidence_competing_risks(event, time, conf_level=0.95, conf_type="log-log", var_type=var_type)