diff --git a/sksurv/datasets/base.py b/sksurv/datasets/base.py index 43970590..9e386fc3 100644 --- a/sksurv/datasets/base.py +++ b/sksurv/datasets/base.py @@ -489,26 +489,47 @@ def load_cgvhd(): initiated for patients with a myeloid malignancy who were to undergo an allogeneic bone marrow transplant. - The dataset has 100 samples and 4 features::: + The dataset has 100 samples. + + +-------+------------+----------------------------------------------+-------------------------------------------+ + | Index | Name | Description | Encoding | + +=======+============+==============================================+===========================================+ + | 1 | dx | Diagnosis | | AML=acute myeloid leukaemia | + | | | | | CML=chronic myeloid leukaemia | + | | | | | MDS=myelodysplastic syndrome | + +-------+------------+----------------------------------------------+-------------------------------------------+ + | 2 | tx | Randomized treatment | | BM=cell harvested from the bone marrow | + | | | | | PB=cell harvested from peripheral blood | + +-------+------------+----------------------------------------------+-------------------------------------------+ + | 3 | extent | Extent of disease | L=limited, E=extensive | + +-------+------------+----------------------------------------------+-------------------------------------------+ + | 4 | agvhdgd | Grade of acute GVHD | | + +-------+------------+----------------------------------------------+-------------------------------------------+ + | 5 | age | Age | Years | + +-------+------------+----------------------------------------------+-------------------------------------------+ + | 6 | survtime | Time from date of transplant to death | Years | + | | | or last follow-up | | + +-------+------------+----------------------------------------------+-------------------------------------------+ + | 7 | reltime | Time from date of transplant to relapse | Years | + | | | or last follow-up | | + +-------+------------+----------------------------------------------+-------------------------------------------+ + | 8 | agvhtime | Time from date of transplant to acute GVHD | Years | + | | | or last follow-up | | + +-------+------------+----------------------------------------------+-------------------------------------------+ + | 9 | cgvhtime | Time from date of transplant to chronic GVHD | Years | + | | | or last follow-up | | + +-------+------------+----------------------------------------------+-------------------------------------------+ + | 10 | stat | Status | 1=Dead, 0=Alive | + +-------+------------+----------------------------------------------+-------------------------------------------+ + | 11 | rcens | Relapse | 1=Yes, 0=No | + +-------+------------+----------------------------------------------+-------------------------------------------+ + | 12 | agvh | Acute GVHD | 1=Yes, 0=No | + +-------+------------+----------------------------------------------+-------------------------------------------+ + | 13 | cgvh | Chronic GVHD | 1=Yes, 0=No | + +-------+------------+----------------------------------------------+-------------------------------------------+ + | 14 | patient ID | | | + +-------+------------+----------------------------------------------+-------------------------------------------+ - 1. dx: Diagnosis: AML=acute myeloid leukaemia, CML=chronic myeloid leukaemia, MDS=myelodysplastic syndrome - 2. tx: Randomized treatment: BM=cell harvested from the bone marrow, PB=cell harvested from the peripheral blood - 3. extent: Extent of disease: L=limited, E=extensive - 5. age: Age (years) - - The rest of the columns correspond to outcome variables::: - - 6. survtime: Time from date of transplant to death or last follow-up (Years) - 7. reltime: Time from date of transplant to relapse or last follow-up (Years) - 8. agvhtime: Time from date of transplant to acute GVHD or last follow-up (Years) - 9. cgvhtime: Time from date of transplant to chronic GVHD or last follow-up (Years) - 10. stat Status: 1=Dead, 0=Alive - 11. rcens Relapse: 1=Yes, 0=No - 4. agvhdgd: Grade of acute GVHD - 12. agvh: Acute GVHD: 1=Yes, 0=No - 13. cgvh: Chronic GVHD: 1=Yes, 0=No - - The last column (14) is a patient ID. Columns 6,7 and 9 contain the time to death, relapse and CGVHD calculated in years (survtime, reltime, cgvhtime) and the @@ -518,7 +539,7 @@ def load_cgvhd(): censoring variable cens is coded as 0 when no events were observed, 1 if CGVHD was observed as first event, 2 if a relapse was observed as the first event and 3 if death occurred before - either of the events: The endpoint (status) are therefore: + either of the events: The endpoint (status) are therefore:: 0. Survival (Right-censored data). 4 patients (4%) 1. Chronic graft versus host disease (CGVHD). 86 events (86%) @@ -541,8 +562,7 @@ def load_cgvhd(): ---------- .. [1] Melania Pintilie: "Competing Risks: A Practical Perspective". John Wiley & Sons, 2006 - .. [2] https://drive.google.com/file/d/1FPM264pE\_-F8DB7lvFeLB1yQ3HDo7c-i/view - https://sites.google.com/view/melaniapintiliemscstatistics/home/statistics + .. [2] https://sites.google.com/view/melaniapintiliemscstatistics/home/statistics """ full_path = _get_data_path("cgvhd.arff") data = loadarff(full_path) diff --git a/sksurv/nonparametric.py b/sksurv/nonparametric.py index 4eb3e0f5..c5693e05 100644 --- a/sksurv/nonparametric.py +++ b/sksurv/nonparametric.py @@ -604,15 +604,15 @@ def predict_ipcw(self, y): def _cr_ci_logmlog(cum_inc, sigma_t, z): """Compute the pointwise log-minus-log transformed confidence intervals""" eps = np.finfo(cum_inc.dtype).eps + zero_count = cum_inc <= eps log_cum_i = np.zeros_like(cum_inc) - np.log(cum_inc, where=cum_inc > eps, out=log_cum_i) + np.log(cum_inc, where=~zero_count, out=log_cum_i) theta = np.zeros_like(cum_inc) den = cum_inc * log_cum_i - np.divide(sigma_t, den, where=den < -eps, out=theta) + np.divide(sigma_t, den, where=~zero_count, out=theta) theta = z * np.multiply.outer(np.array([-1, 1]), theta) ci = np.exp(log_cum_i * np.exp(theta)) - ci[:, cum_inc <= eps] = 0.0 - ci[:, 1.0 - cum_inc <= eps] = 1.0 + ci[:, zero_count] = 0.0 return ci @@ -635,7 +635,7 @@ def cumulative_incidence_competing_risks( time_min=None, conf_level=0.95, conf_type=None, - var_type=None, + var_type="Dinse", ): """Non-parametric estimator of Cumulative Incidence function in the case of competing risks. @@ -665,8 +665,9 @@ def cumulative_incidence_competing_risks( If "log-log", estimate confidence intervals using the log hazard or :math:`log(-log(S(t)))`. - var_type : None or one of {'Dinse', 'Dinse_Approx', 'Aalen'}, optional, default: None + var_type : None or one of {'Dinse', 'Dinse_Approx', 'Aalen'}, optional, default: 'Dinse' The method for estimating the variance of the estimator. + See [2]_, [3]_ and [4]_ for each of the methods. Only valid if conf_type is valid. Returns @@ -708,6 +709,11 @@ def cumulative_incidence_competing_risks( ---------- .. [1] Kalbfleisch, J.D. and Prentice, R.L. (2002) The Statistical Analysis of Failure Time Data. 2nd Edition, John Wiley and Sons, New York. + .. [2] Dinse and Larson, Biometrika (1986), 379. Sect. 4, Eqs. 4 and 5. + .. [3] Dinse and Larson, Biometrika (1986), 379. Sect. 4, Eq. 6. + .. [4] Aalen, O. (1978a). Annals of Statistics, 6, 534–545. + We implement the formula in M. Pintilie: "Competing Risks: A Practical Perspective". + John Wiley & Sons, 2006, Eq. 4.5 """ event, time_exit = check_y_survival(event, time_exit, allow_all_censored=True, competing_risks=True) check_consistent_length(event, time_exit) @@ -746,7 +752,7 @@ def cumulative_incidence_competing_risks( elif var_type == "Aalen": var = _var_aalen(n_events_cr, kpe_prime, n_at_risk, cum_inc) else: - raise ValueError(f"{var_type=} not implemented.") + raise ValueError(f"{var_type=} must be one of 'Dinse', 'Dinse_approx' or 'Aalen'.") _x, _y, conf_int_km = kaplan_meier_estimator(event > 0, time_exit, conf_type="log-log") ci = np.empty(shape=(2, n_risks + 1, n_t), dtype=conf_int_km.dtype) @@ -768,9 +774,9 @@ def _var_dinse_approx(n_events_cr, kpe_prime, n_at_risk, cum_inc): irt = cum_inc[1:, :, np.newaxis] - cum_inc[1:, np.newaxis, :] mask = np.tril(np.ones_like(irt[0])) - var_a = np.einsum("rjk,jk,k->rj", irt**2, mask, dr / (n_at_risk * (n_at_risk - dr))) + var_a = np.sum(irt**2 * mask * (dr / (n_at_risk * (n_at_risk - dr))), axis=2) var_b = np.cumsum(((n_at_risk - dr_cr) / n_at_risk) * (dr_cr / n_at_risk**2) * kpe_prime**2, axis=1) - var_c = -2 * np.einsum("rjk,jk,rk,k->rj", irt, mask, dr_cr, kpe_prime / n_at_risk**2) + var_c = -2 * np.sum(irt * mask * dr_cr[:, np.newaxis, :] * (kpe_prime / n_at_risk**2), axis=2) var = var_a + var_b + var_c return var @@ -821,7 +827,7 @@ def _var_aalen(n_events_cr, kpe_prime, n_at_risk, cum_inc): _va = np.zeros_like(kpe_prime) den_a = (n_at_risk - 1) * (n_at_risk - dr) np.divide(dr, den_a, out=_va, where=den_a > 0) - var_a = np.einsum("rjk,jk,k->rj", irt**2, mask, _va) + var_a = np.sum(irt**2 * mask * _va, axis=2) _vb = np.zeros_like(kpe_prime) den_b = (n_at_risk - 1) * n_at_risk**2 @@ -832,7 +838,7 @@ def _var_aalen(n_events_cr, kpe_prime, n_at_risk, cum_inc): _vcb = np.zeros_like(kpe_prime) den_c = n_at_risk * (n_at_risk - dr) * (n_at_risk - 1) np.divide(kpe_prime, den_c, out=_vcb, where=den_c > 0) - var_c = -2 * np.einsum("rjk,jk,rk,k->rj", irt, mask, _vca, _vcb) + var_c = -2 * np.sum(irt * mask * _vca[:, np.newaxis, :] * _vcb, axis=2) var = var_a + var_b + var_c return var diff --git a/tests/test_nonparametric.py b/tests/test_nonparametric.py index 82927dc1..4b661c0a 100644 --- a/tests/test_nonparametric.py +++ b/tests/test_nonparametric.py @@ -7657,6 +7657,6 @@ def test_invalid_conf_type_competing_risks(event, time, true_x, true_y, conf_typ @pytest.mark.parametrize("event, time, true_x, true_y", SimpleDataBMTCases().get_cases()) @pytest.mark.parametrize("var_type", ["None", "dinse", 1, "", "not"]) def test_invalid_var_type_competing_risks(event, time, true_x, true_y, var_type): - msg = f"{var_type=} not implemented." + msg = f"{var_type=} must be one of 'Dinse', 'Dinse_approx' or 'Aalen'." with pytest.raises(ValueError, match=msg): cumulative_incidence_competing_risks(event, time, conf_level=0.95, conf_type="log-log", var_type=var_type)