Skip to content

Commit

Permalink
Merge pull request #500 from mvlvrd/conf_int_for_competing_risks
Browse files Browse the repository at this point in the history
Add confidence intervals to the cumulative incidence estimator for competing risks.
  • Loading branch information
sebp authored Jan 12, 2025
2 parents adf382d + 28d7652 commit 7b6d478
Show file tree
Hide file tree
Showing 7 changed files with 1,514 additions and 16 deletions.
1 change: 1 addition & 0 deletions doc/api/datasets.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ Datasets
load_aids
load_arff_files_standardized
load_bmt
load_cgvhd
load_breast_cancer
load_flchain
load_gbsg2
Expand Down
1 change: 1 addition & 0 deletions sksurv/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
load_arff_files_standardized, # noqa: F401
load_bmt, # noqa: F401
load_breast_cancer, # noqa: F401
load_cgvhd, # noqa: F401
load_flchain, # noqa: F401
load_gbsg2, # noqa: F401
load_veterans_lung_cancer, # noqa: F401
Expand Down
100 changes: 100 additions & 0 deletions sksurv/datasets/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
"load_arff_files_standardized",
"load_aids",
"load_bmt",
"load_cgvhd",
"load_breast_cancer",
"load_flchain",
"load_gbsg2",
Expand Down Expand Up @@ -474,9 +475,108 @@ def load_bmt():
.. [1] https://doi.org/10.1038/sj.bmt.1705727
Scrucca, L., Santucci, A. & Aversa, F.:
"Competing risk analysis using R: an easy guide for clinicians. Bone Marrow Transplant 40, 381–387 (2007)"
.. [2] https://luca-scr.github.io/R/bmt.csv
"""
full_path = _get_data_path("bmt.arff")
data = loadarff(full_path)
data["ftime"] = data["ftime"].astype(int)
return get_x_y(data, attr_labels=["status", "ftime"], competing_risks=True)


def load_cgvhd():
    r"""Load and return data from multicentre randomized clinical trial
    initiated for patients with a myeloid malignancy who were to
    undergo an allogeneic bone marrow transplant.

    The available dataset [1]_ is a 100 size subsample of the full data set. See [2]_ for further details.

    +-------+------------+----------------------------------------------+-------------------------------------------+
    | Index | Name       | Description                                  | Encoding                                  |
    +=======+============+==============================================+===========================================+
    | 1     | dx         | Diagnosis                                    | | AML=acute myeloid leukaemia             |
    |       |            |                                              | | CML=chronic myeloid leukaemia           |
    +-------+------------+----------------------------------------------+-------------------------------------------+
    | 2     | tx         | Randomized treatment                         | | BM=cell harvested from the bone marrow  |
    |       |            |                                              | | PB=cell harvested from peripheral blood |
    +-------+------------+----------------------------------------------+-------------------------------------------+
    | 3     | extent     | Extent of disease                            | L=limited, E=extensive                    |
    +-------+------------+----------------------------------------------+-------------------------------------------+
    | 4     | agvhdgd    | Grade of acute GVHD                          |                                           |
    +-------+------------+----------------------------------------------+-------------------------------------------+
    | 5     | age        | Age                                          | Years                                     |
    +-------+------------+----------------------------------------------+-------------------------------------------+
    | 6     | survtime   | Time from date of transplant to death        | Years                                     |
    |       |            | or last follow-up                            |                                           |
    +-------+------------+----------------------------------------------+-------------------------------------------+
    | 7     | reltime    | Time from date of transplant to relapse      | Years                                     |
    |       |            | or last follow-up                            |                                           |
    +-------+------------+----------------------------------------------+-------------------------------------------+
    | 8     | agvhtime   | Time from date of transplant to acute GVHD   | Years                                     |
    |       |            | or last follow-up                            |                                           |
    +-------+------------+----------------------------------------------+-------------------------------------------+
    | 9     | cgvhtime   | Time from date of transplant to chronic GVHD | Years                                     |
    |       |            | or last follow-up                            |                                           |
    +-------+------------+----------------------------------------------+-------------------------------------------+
    | 10    | stat       | Status                                       | 1=Dead, 0=Alive                           |
    +-------+------------+----------------------------------------------+-------------------------------------------+
    | 11    | rcens      | Relapse                                      | 1=Yes, 0=No                               |
    +-------+------------+----------------------------------------------+-------------------------------------------+
    | 12    | agvh       | Acute GVHD                                   | 1=Yes, 0=No                               |
    +-------+------------+----------------------------------------------+-------------------------------------------+
    | 13    | cgvh       | Chronic GVHD                                 | 1=Yes, 0=No                               |
    +-------+------------+----------------------------------------------+-------------------------------------------+
    | 14    | stnum      | patient ID                                   |                                           |
    +-------+------------+----------------------------------------------+-------------------------------------------+

    Columns 6, 7 and 9 contain the time to death, relapse and CGVHD
    calculated in years (survtime, reltime, cgvhtime) and the
    respective indicator variables are in columns 10, 11 and 13 (stat,
    rcens, cgvh). The earliest time that any of these events happened
    is calculated by taking the minimum of the observed times. The
    event indicator (status) is coded as 0 when no events were
    observed, 1 if CGVHD was observed as first event, 2 if a relapse
    was observed as the first event and 3 if death occurred before
    either of the events. The endpoint (status) is therefore defined as

    +-------+-------------------------------------------+-----------------+
    | Value | Description                               | Count (%)       |
    +=======+===========================================+=================+
    | 0     | Survival (Right-censored data)            | 4 patients (4%) |
    +-------+-------------------------------------------+-----------------+
    | 1     | Chronic graft versus host disease (CGVHD) | 86 events (86%) |
    +-------+-------------------------------------------+-----------------+
    | 2     | Relapse                                   | 5 events (5%)   |
    +-------+-------------------------------------------+-----------------+
    | 3     | Death                                     | 5 events (5%)   |
    +-------+-------------------------------------------+-----------------+

    See [2]_ for further description and [1]_ for the dataset.

    Returns
    -------
    x : pandas.DataFrame
        The measurements for each patient. Only the columns
        dx, tx, extent, and age are retained as features.

    y : structured array with 2 fields
        *status*: Integer indicating the endpoint: 0: right censored data; 1: CGVHD; 2: relapse; 3: death.

        *ftime*: total length of follow-up or time of event.

    References
    ----------
    .. [1] https://sites.google.com/view/melaniapintiliemscstatistics/home/statistics
    .. [2] Melania Pintilie: "Competing Risks: A Practical Perspective". John Wiley & Sons, 2006
    """
    full_path = _get_data_path("cgvhd.arff")
    data = loadarff(full_path)

    # Earliest of the three recorded times: either the first event or,
    # if no event was observed, the last follow-up.
    ftime = data[["survtime", "reltime", "cgvhtime"]].min(axis=1)

    # An event is the *first* event only if its indicator is "1" and its
    # recorded time coincides with the earliest time.
    is_cgvhd = (ftime == data["cgvhtime"]) & (data["cgvh"] == "1")
    is_relapse = (ftime == data["reltime"]) & (data["rcens"] == "1")
    is_death = (ftime == data["survtime"]) & (data["stat"] == "1")

    # Assign the code by priority (CGVHD > relapse > death) instead of summing
    # weighted masks: a sum yields an invalid or misleading code whenever two
    # events share the same earliest time (e.g. CGVHD + relapse = 3 = "death").
    status = 3 * is_death.astype(int)
    status = status.mask(is_relapse, 2)
    status = status.mask(is_cgvhd, 1)

    data["ftime"] = ftime
    data["status"] = status
    data = data[["ftime", "status", "dx", "tx", "extent", "age"]]

    return get_x_y(data, attr_labels=["status", "ftime"], competing_risks=True)
3 changes: 3 additions & 0 deletions sksurv/datasets/data/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ for survival analysis.
| veteran | [Veteran's Lung Cancer][Kalbfleisch2008] | 137 | 6 | 128 (93.4%) | Death |
| whas500 | [Worcester Heart Attack Study][Hosmer2008] | 500 | 14 | 215 (43.0%) | Death |
| BMT | [Leukemia HSC Transplant][Scrucca2007] | 35 | 1 | 24 (68.6%) | Transplant related death or relapse |
| CGVHD | [CGVHD][Pintilie2006] | 100 | 4 | 96 (96.0%) | Chronic graft-versus-host disease (CGVHD), relapse or death |

[Desmedt2007]: http://dx.doi.org/10.1158/1078-0432.CCR-06-2765 "Desmedt, C., Piette, F., Loi et al.: Strong Time Dependence of the 76-Gene Prognostic Signature for Node-Negative Breast Cancer Patients in the TRANSBIG Multicenter Independent Validation Series. Clin. Cancer Res. 13(11), 3207–14 (2007)"

Expand All @@ -24,3 +25,5 @@ for survival analysis.
[Schumacher1994]: http://ascopubs.org/doi/abs/10.1200/jco.1994.12.10.2086 "Schumacher, M., Basert, G., Bojar, H., et al. Randomized 2 × 2 trial evaluating hormonal treatment and the duration of chemotherapy in node-positive breast cancer patients. Journal of Clinical Oncology 12, 2086–2093. (1994)"

[Scrucca2007]: https://doi.org/10.1038/sj.bmt.1705727 "Scrucca, L., Santucci, A. & Aversa, F. Competing risk analysis using R: an easy guide for clinicians. Bone Marrow Transplant 40, 381–387 (2007)"

[Pintilie2006]: https://www.wiley.com/en-us/Competing+Risks%3A+A+Practical+Perspective-p-9780470870693 "Melania Pintilie: Competing Risks: A Practical Perspective. John Wiley & Sons, (2006)"
118 changes: 118 additions & 0 deletions sksurv/datasets/data/cgvhd.arff
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
@RELATION CGVHD

@ATTRIBUTE dx {CML, AML}
@ATTRIBUTE tx {PB, BM}
@ATTRIBUTE extent {L, E}
@ATTRIBUTE agvhdgd NUMERIC
@ATTRIBUTE age NUMERIC
@ATTRIBUTE survtime NUMERIC
@ATTRIBUTE reltime NUMERIC
@ATTRIBUTE agvhtime NUMERIC
@ATTRIBUTE cgvhtime NUMERIC
@ATTRIBUTE stat {0, 1}
@ATTRIBUTE rcens {0, 1}
@ATTRIBUTE agvh {1, 0}
@ATTRIBUTE cgvh {1, 0}
@ATTRIBUTE stnum NUMERIC

@DATA
CML,PB,L,1,36,4.895,4.895,0.099,0.52,0,0,1,1,1
AML,PB,L,3,57,3.474,0.753,0.101,0.408,1,1,1,1,2
CML,PB,L,0,48,4.95,4.95,4.95,0.348,0,0,0,1,3
AML,PB,L,2,52,4.643,4.643,0.057,0.482,0,0,1,1,4
AML,PB,L,3,45,4.066,4.066,0.137,0.378,0,0,1,1,5
AML,PB,L,3,47,1.558,0.416,0.055,1.558,1,1,1,0,6
CML,PB,L,1,40,4.512,4.512,0.09,0.381,0,0,1,1,7
AML,PB,L,3,38,4.041,4.041,0.082,0.914,0,0,1,1,8
AML,PB,L,2,41,4.164,4.164,0.055,0.923,0,0,1,1,9
CML,PB,L,0,50,4.011,4.011,4.011,0.397,0,0,0,1,10
CML,PB,L,1,56,3.945,3.945,0.047,0.479,0,0,1,1,11
CML,PB,L,2,56,4.361,4.361,0.079,0.991,0,0,1,1,12
AML,PB,L,1,54,0.841,0.654,0.077,0.474,1,1,1,1,13
CML,PB,L,3,25,2.951,2.951,0.164,0.339,0,0,1,1,14
CML,PB,L,4,40,0.586,0.586,0.055,0.277,1,0,1,1,15
CML,PB,L,0,41,3.559,3.559,3.559,0.367,0,0,0,1,16
CML,PB,L,2,57,3.422,3.422,0.131,0.742,0,0,1,1,17
CML,PB,L,3,62,0.408,0.408,0.408,0.408,0,0,1,1,18
CML,PB,L,1,29,3.428,3.428,0.09,0.958,0,0,1,1,19
AML,PB,L,1,44,0.063,0.063,0.014,0.063,1,0,1,0,20
CML,PB,L,2,40,1.572,1.572,0.09,0.282,1,0,1,1,21
CML,PB,L,1,54,1.013,1.013,0.093,0.413,1,0,1,1,22
AML,PB,L,2,37,3.023,3.023,0.074,0.394,0,0,1,1,23
AML,PB,L,1,58,2.979,2.979,0.079,0.342,0,0,1,1,24
CML,PB,L,3,39,2.817,2.817,0.049,0.367,0,0,1,1,25
CML,PB,L,2,31,2.804,2.804,0.137,0.277,0,0,1,1,26
CML,PB,L,2,45,2.609,2.609,0.252,0.367,0,0,1,1,27
AML,PB,L,0,48,2.508,2.508,2.508,0.331,0,0,0,1,28
CML,PB,L,0,53,0.665,0.665,0.665,0.32,1,0,0,1,29
CML,PB,L,0,29,2.497,2.497,2.497,0.329,0,0,0,1,30
CML,PB,L,0,27,1.799,1.799,1.799,0.444,1,0,0,1,31
AML,PB,L,3,45,0.471,0.438,0.071,0.471,1,1,1,0,32
CML,PB,L,1,39,2.031,2.031,0.112,0.964,0,0,1,1,33
CML,PB,L,3,49,2.073,2.073,0.063,0.564,0,0,1,1,34
AML,PB,L,1,37,0.999,0.75,0.274,0.402,1,1,1,1,35
AML,PB,L,3,53,0.427,0.427,0.055,0.277,1,0,1,1,36
CML,PB,L,1,48,1.766,1.766,0.216,0.4,0,0,1,1,37
AML,PB,L,1,59,1.555,1.555,0.178,0.446,0,0,1,1,38
CML,PB,L,2,33,1.67,1.67,0.11,0.474,0,0,1,1,39
CML,PB,L,0,38,1.607,1.607,1.607,0.329,0,0,0,1,40
CML,PB,L,4,37,1.511,1.511,0.055,0.323,0,0,1,1,41
AML,PB,L,3,41,1.287,1.287,0.049,0.392,0,0,1,1,42
AML,PB,E,1,64,1.227,1.227,0.23,0.496,0,0,1,1,43
CML,PB,L,3,32,1.3,1.3,0.063,0.63,0,0,1,1,44
CML,PB,L,0,41,1.27,1.27,1.27,0.383,0,0,0,1,45
AML,PB,E,1,56,1.205,1.205,0.074,1.205,0,0,1,0,46
CML,PB,L,1,50,1.147,1.147,0.131,0.361,0,0,1,1,47
CML,PB,L,3,37,1.109,1.109,0.055,0.277,0,0,1,1,48
CML,PB,L,0,27,0.994,0.994,0.994,0.287,0,0,0,1,49
CML,BM,L,3,45,4.572,4.572,0.066,0.619,0,0,1,1,50
AML,BM,L,3,45,4.616,4.616,0.101,0.452,0,0,1,1,51
AML,BM,L,2,42,4.0,4.0,0.027,0.29,0,0,1,1,52
CML,BM,L,0,22,4.238,4.238,4.238,0.479,0,0,0,1,53
AML,BM,L,4,47,0.11,0.11,0.074,0.11,1,0,1,0,54
AML,BM,L,2,48,4.03,4.03,0.101,0.857,0,0,1,1,55
AML,BM,L,2,49,3.124,2.527,0.115,1.993,1,1,1,1,56
CML,BM,L,2,38,0.515,0.515,0.079,0.463,1,0,1,1,57
CML,BM,L,1,39,4.222,3.149,0.085,0.496,0,1,1,1,58
CML,BM,L,3,41,4.027,4.027,0.104,0.422,0,0,1,1,59
CML,BM,L,2,46,1.969,1.969,0.038,0.307,1,0,1,1,60
AML,BM,L,0,24,3.792,3.792,3.792,0.701,0,0,0,1,61
AML,BM,L,3,32,0.427,0.427,0.041,0.279,1,0,1,1,62
CML,BM,L,0,36,3.34,3.34,3.34,0.419,0,0,0,1,63
CML,BM,L,1,53,3.504,0.72,0.112,0.616,0,1,1,1,64
CML,BM,L,0,52,3.685,3.685,3.685,0.331,0,0,0,1,65
CML,BM,L,1,59,0.181,0.181,0.049,0.181,1,0,1,0,66
CML,BM,L,3,42,0.736,0.736,0.09,0.567,1,0,1,1,67
CML,BM,L,1,65,0.287,0.287,0.052,0.287,1,0,1,0,68
CML,BM,E,0,60,0.057,0.057,0.057,0.057,0,0,0,0,69
CML,BM,L,2,61,3.107,3.107,0.088,0.764,0,0,1,1,70
CML,BM,L,1,55,3.088,3.088,0.11,0.381,0,0,1,1,71
AML,BM,E,0,48,0.446,0.274,0.446,0.446,1,1,0,0,72
AML,BM,E,0,49,2.776,2.776,2.776,2.776,0,0,0,0,73
CML,BM,L,0,36,0.693,0.172,0.693,0.635,1,1,0,1,74
AML,BM,L,1,48,2.01,2.01,0.077,0.553,0,0,1,1,75
CML,BM,L,0,47,2.374,2.374,2.374,0.287,0,0,0,1,76
AML,BM,L,3,43,1.079,1.079,0.088,0.345,1,0,1,1,77
CML,BM,L,0,56,2.604,2.604,2.604,0.375,0,0,0,1,78
CML,BM,L,1,56,2.478,2.478,0.17,0.517,0,0,1,1,79
CML,BM,L,0,36,2.338,2.338,2.338,0.457,0,0,0,1,80
CML,BM,L,2,52,2.3,2.3,0.049,0.345,0,0,1,1,81
CML,BM,E,1,44,0.219,0.219,0.145,0.219,1,0,1,0,82
AML,BM,L,3,32,2.127,2.127,0.118,0.422,0,0,1,1,83
AML,BM,L,1,44,2.034,2.034,0.096,0.479,0,0,1,1,84
CML,BM,L,0,45,2.034,2.034,2.034,0.29,0,0,0,1,85
AML,BM,L,3,48,2.007,2.007,0.088,0.35,0,0,1,1,86
CML,BM,L,0,48,1.183,1.183,1.183,0.372,0,0,0,1,87
AML,BM,L,3,42,0.375,0.375,0.096,0.277,1,0,1,1,88
AML,BM,E,2,24,0.353,0.301,0.096,0.353,1,1,1,0,89
CML,BM,L,2,26,1.566,1.566,0.137,0.474,0,0,1,1,90
CML,BM,L,2,34,1.588,1.588,0.129,0.465,0,0,1,1,91
CML,BM,L,0,57,1.243,1.243,1.243,0.433,0,0,0,1,92
CML,BM,L,3,51,1.555,1.555,0.09,0.359,0,0,1,1,93
AML,BM,L,2,54,1.202,1.202,0.192,1.202,0,0,1,0,94
AML,BM,E,0,20,1.251,1.251,1.251,0.408,0,0,0,1,95
AML,BM,L,2,39,1.114,1.114,0.074,0.402,0,0,1,1,96
AML,BM,L,0,49,1.15,1.15,1.15,0.35,0,0,0,1,97
CML,BM,L,1,42,0.997,0.997,0.142,0.411,0,0,1,1,98
CML,BM,L,0,44,1.057,1.057,1.057,0.301,0,0,0,1,99
CML,BM,L,1,56,1.125,1.125,0.129,0.32,0,0,1,1,100
Loading

0 comments on commit 7b6d478

Please sign in to comment.