Skip to content

Commit

Permalink
Changes in the pull request review:
Browse files Browse the repository at this point in the history
- Better documentation.
- Some errors with `::` in Rest are corrected.
- Clearer processing of the cum_inc = 0 case when calculating Confidence
  Intervals.
  • Loading branch information
mvlvrd committed Jan 7, 2025
1 parent 344e15d commit 90a379c
Show file tree
Hide file tree
Showing 3 changed files with 60 additions and 34 deletions.
64 changes: 42 additions & 22 deletions sksurv/datasets/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -489,26 +489,47 @@ def load_cgvhd():
initiated for patients with a myeloid malignancy who were to
undergo an allogeneic bone marrow transplant.
The dataset has 100 samples and 4 features:::
The dataset has 100 samples.
+-------+------------+----------------------------------------------+-------------------------------------------+
| Index | Name | Description | Encoding |
+=======+============+==============================================+===========================================+
| 1 | dx | Diagnosis | | AML=acute myeloid leukaemia |
| | | | | CML=chronic myeloid leukaemia |
| | | | | MDS=myelodysplastic syndrome |
+-------+------------+----------------------------------------------+-------------------------------------------+
| 2 | tx | Randomized treatment | | BM=cell harvested from the bone marrow |
| | | | | PB=cell harvested from peripheral blood |
+-------+------------+----------------------------------------------+-------------------------------------------+
| 3 | extent | Extent of disease | L=limited, E=extensive |
+-------+------------+----------------------------------------------+-------------------------------------------+
| 4 | agvhdgd | Grade of acute GVHD | |
+-------+------------+----------------------------------------------+-------------------------------------------+
| 5 | age | Age | Years |
+-------+------------+----------------------------------------------+-------------------------------------------+
| 6 | survtime | Time from date of transplant to death | Years |
| | | or last follow-up | |
+-------+------------+----------------------------------------------+-------------------------------------------+
| 7 | reltime | Time from date of transplant to relapse | Years |
| | | or last follow-up | |
+-------+------------+----------------------------------------------+-------------------------------------------+
| 8 | agvhtime | Time from date of transplant to acute GVHD | Years |
| | | or last follow-up | |
+-------+------------+----------------------------------------------+-------------------------------------------+
| 9 | cgvhtime | Time from date of transplant to chronic GVHD | Years |
| | | or last follow-up | |
+-------+------------+----------------------------------------------+-------------------------------------------+
| 10 | stat | Status | 1=Dead, 0=Alive |
+-------+------------+----------------------------------------------+-------------------------------------------+
| 11 | rcens | Relapse | 1=Yes, 0=No |
+-------+------------+----------------------------------------------+-------------------------------------------+
| 12 | agvh | Acute GVHD | 1=Yes, 0=No |
+-------+------------+----------------------------------------------+-------------------------------------------+
| 13 | cgvh | Chronic GVHD | 1=Yes, 0=No |
+-------+------------+----------------------------------------------+-------------------------------------------+
| 14 | patient ID | | |
+-------+------------+----------------------------------------------+-------------------------------------------+
1. dx: Diagnosis: AML=acute myeloid leukaemia, CML=chronic myeloid leukaemia, MDS=myelodysplastic syndrome
2. tx: Randomized treatment: BM=cell harvested from the bone marrow, PB=cell harvested from the peripheral blood
3. extent: Extent of disease: L=limited, E=extensive
5. age: Age (years)
The rest of the columns correspond to outcome variables:::
6. survtime: Time from date of transplant to death or last follow-up (Years)
7. reltime: Time from date of transplant to relapse or last follow-up (Years)
8. agvhtime: Time from date of transplant to acute GVHD or last follow-up (Years)
9. cgvhtime: Time from date of transplant to chronic GVHD or last follow-up (Years)
10. stat Status: 1=Dead, 0=Alive
11. rcens Relapse: 1=Yes, 0=No
4. agvhdgd: Grade of acute GVHD
12. agvh: Acute GVHD: 1=Yes, 0=No
13. cgvh: Chronic GVHD: 1=Yes, 0=No
The last column (14) is a patient ID.
Columns 6,7 and 9 contain the time to death, relapse and CGVHD
calculated in years (survtime, reltime, cgvhtime) and the
Expand All @@ -518,7 +539,7 @@ def load_cgvhd():
censoring variable cens is coded as 0 when no events were
observed, 1 if CGVHD was observed as first event, 2 if a relapse
was observed as the first event and 3 if death occurred before
either of the events: The endpoint (status) are therefore:
either of the events: The endpoint (status) are therefore::
0. Survival (Right-censored data). 4 patients (4%)
1. Chronic graft versus host disease (CGVHD). 86 events (86%)
Expand All @@ -541,8 +562,7 @@ def load_cgvhd():
----------
.. [1] Melania Pintilie: "Competing Risks: A Practical Perspective". John Wiley & Sons, 2006
.. [2] https://drive.google.com/file/d/1FPM264pE\_-F8DB7lvFeLB1yQ3HDo7c-i/view
https://sites.google.com/view/melaniapintiliemscstatistics/home/statistics
.. [2] https://sites.google.com/view/melaniapintiliemscstatistics/home/statistics
"""
full_path = _get_data_path("cgvhd.arff")
data = loadarff(full_path)
Expand Down
28 changes: 17 additions & 11 deletions sksurv/nonparametric.py
Original file line number Diff line number Diff line change
Expand Up @@ -604,15 +604,15 @@ def predict_ipcw(self, y):
def _cr_ci_logmlog(cum_inc, sigma_t, z):
"""Compute the pointwise log-minus-log transformed confidence intervals"""
eps = np.finfo(cum_inc.dtype).eps
zero_count = cum_inc <= eps
log_cum_i = np.zeros_like(cum_inc)
np.log(cum_inc, where=cum_inc > eps, out=log_cum_i)
np.log(cum_inc, where=~zero_count, out=log_cum_i)
theta = np.zeros_like(cum_inc)
den = cum_inc * log_cum_i
np.divide(sigma_t, den, where=den < -eps, out=theta)
np.divide(sigma_t, den, where=~zero_count, out=theta)
theta = z * np.multiply.outer(np.array([-1, 1]), theta)
ci = np.exp(log_cum_i * np.exp(theta))
ci[:, cum_inc <= eps] = 0.0
ci[:, 1.0 - cum_inc <= eps] = 1.0
ci[:, zero_count] = 0.0
return ci


Expand All @@ -635,7 +635,7 @@ def cumulative_incidence_competing_risks(
time_min=None,
conf_level=0.95,
conf_type=None,
var_type=None,
var_type="Dinse",
):
"""Non-parametric estimator of Cumulative Incidence function in the case of competing risks.
Expand Down Expand Up @@ -665,8 +665,9 @@ def cumulative_incidence_competing_risks(
If "log-log", estimate confidence intervals using
the log hazard or :math:`log(-log(S(t)))`.
var_type : None or one of {'Dinse', 'Dinse_Approx', 'Aalen'}, optional, default: None
var_type : None or one of {'Dinse', 'Dinse_Approx', 'Aalen'}, optional, default: 'Dinse'
The method for estimating the variance of the estimator.
See [2]_, [3]_ and [4]_ for each of the methods.
Only valid if conf_type is valid.
Returns
Expand Down Expand Up @@ -708,6 +709,11 @@ def cumulative_incidence_competing_risks(
----------
.. [1] Kalbfleisch, J.D. and Prentice, R.L. (2002)
The Statistical Analysis of Failure Time Data. 2nd Edition, John Wiley and Sons, New York.
.. [2] Dinse and Larson, Biometrika (1986), 379. Sect. 4, Eqs. 4 and 5.
.. [3] Dinse and Larson, Biometrika (1986), 379. Sect. 4, Eq. 6.
.. [4] Aalen, O. (1978a). Annals of Statistics, 6, 534–545.
We implement the formula in M. Pintilie: "Competing Risks: A Practical Perspective".
John Wiley & Sons, 2006, Eq. 4.5
"""
event, time_exit = check_y_survival(event, time_exit, allow_all_censored=True, competing_risks=True)
check_consistent_length(event, time_exit)
Expand Down Expand Up @@ -746,7 +752,7 @@ def cumulative_incidence_competing_risks(
elif var_type == "Aalen":
var = _var_aalen(n_events_cr, kpe_prime, n_at_risk, cum_inc)
else:
raise ValueError(f"{var_type=} not implemented.")
raise ValueError(f"{var_type=} must be one of 'Dinse', 'Dinse_approx' or 'Aalen'.")

_x, _y, conf_int_km = kaplan_meier_estimator(event > 0, time_exit, conf_type="log-log")
ci = np.empty(shape=(2, n_risks + 1, n_t), dtype=conf_int_km.dtype)
Expand All @@ -768,9 +774,9 @@ def _var_dinse_approx(n_events_cr, kpe_prime, n_at_risk, cum_inc):
irt = cum_inc[1:, :, np.newaxis] - cum_inc[1:, np.newaxis, :]
mask = np.tril(np.ones_like(irt[0]))

var_a = np.einsum("rjk,jk,k->rj", irt**2, mask, dr / (n_at_risk * (n_at_risk - dr)))
var_a = np.sum(irt**2 * mask * (dr / (n_at_risk * (n_at_risk - dr))), axis=2)
var_b = np.cumsum(((n_at_risk - dr_cr) / n_at_risk) * (dr_cr / n_at_risk**2) * kpe_prime**2, axis=1)
var_c = -2 * np.einsum("rjk,jk,rk,k->rj", irt, mask, dr_cr, kpe_prime / n_at_risk**2)
var_c = -2 * np.sum(irt * mask * dr_cr[:, np.newaxis, :] * (kpe_prime / n_at_risk**2), axis=2)

var = var_a + var_b + var_c
return var
Expand Down Expand Up @@ -821,7 +827,7 @@ def _var_aalen(n_events_cr, kpe_prime, n_at_risk, cum_inc):
_va = np.zeros_like(kpe_prime)
den_a = (n_at_risk - 1) * (n_at_risk - dr)
np.divide(dr, den_a, out=_va, where=den_a > 0)
var_a = np.einsum("rjk,jk,k->rj", irt**2, mask, _va)
var_a = np.sum(irt**2 * mask * _va, axis=2)

_vb = np.zeros_like(kpe_prime)
den_b = (n_at_risk - 1) * n_at_risk**2
Expand All @@ -832,7 +838,7 @@ def _var_aalen(n_events_cr, kpe_prime, n_at_risk, cum_inc):
_vcb = np.zeros_like(kpe_prime)
den_c = n_at_risk * (n_at_risk - dr) * (n_at_risk - 1)
np.divide(kpe_prime, den_c, out=_vcb, where=den_c > 0)
var_c = -2 * np.einsum("rjk,jk,rk,k->rj", irt, mask, _vca, _vcb)
var_c = -2 * np.sum(irt * mask * _vca[:, np.newaxis, :] * _vcb, axis=2)

var = var_a + var_b + var_c
return var
2 changes: 1 addition & 1 deletion tests/test_nonparametric.py
Original file line number Diff line number Diff line change
Expand Up @@ -7657,6 +7657,6 @@ def test_invalid_conf_type_competing_risks(event, time, true_x, true_y, conf_typ
@pytest.mark.parametrize("event, time, true_x, true_y", SimpleDataBMTCases().get_cases())
@pytest.mark.parametrize("var_type", ["None", "dinse", 1, "", "not"])
def test_invalid_var_type_competing_risks(event, time, true_x, true_y, var_type):
msg = f"{var_type=} not implemented."
msg = f"{var_type=} must be one of 'Dinse', 'Dinse_approx' or 'Aalen'."
with pytest.raises(ValueError, match=msg):
cumulative_incidence_competing_risks(event, time, conf_level=0.95, conf_type="log-log", var_type=var_type)

0 comments on commit 90a379c

Please sign in to comment.