import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from lifelines import KaplanMeierFitter
from lifelines.statistics import survival_difference_at_fixed_point_in_time_test
from lifelines import CoxPHFitter

df = pd.read_csv("https://query.data.world/s/bxiwadzq5bazw3xqgvqoycwxwxufsp?dws=00000")

print(df.head(3))
print(df.shape)

  Patient  ID  age  eventdeath   survival  timerecurrence  chemo  hormonal  \
0    s122  18   43           0  14.817248       14.817248      0         0   
1    s123  19   48           0  14.261465       14.261465      0         0   
2    s124  20   38           0   6.644764        6.644764      0         0   

   amputation  histtype  ...  Contig36312_RC  Contig38980_RC  NM_000853  \
0           1         1  ...        0.591103       -0.355018   0.373644   
1           0         1  ...       -0.199829       -0.001635  -0.062922   
2           0         1  ...        0.328736       -0.047571   0.084228   

   NM_000854  NM_000860  Contig29014_RC  Contig46616_RC  NM_000888  NM_000898  \
0  -0.760690  -0.164025       -0.038726        0.237856  -0.087631  -0.369153   
1  -0.682204  -0.220934       -0.100088       -0.466537  -0.231547  -0.643019   
2  -0.695950  -0.402840       -0.099965        0.110155  -0.114298   0.258495   

   AF067420  
0  0.153795  
1 -0.014098  
2 -0.198911  

[3 rows x 1570 columns]
(272, 1570)

df_sur = df[['Patient','survival','eventdeath']]
df_sur.head()

#Construct the Kaplan-Meier Estimator
kmf = KaplanMeierFitter(label='Cancer Survival')

#fit it to the data set
kmf = kmf.fit(durations=df_sur['survival'], event_observed=df_sur['eventdeath'])

#plot the Kaplan-Meier plot showing S(T>t) against t
kmf.plot()

#Mark the point (5, 0.825) corresponding to S(T > 6)
plt.plot(5, 0.825, color='red', marker='+', markeredgewidth=3, markersize=16)

plt.show()

#create a differentiated index based on chemo=1
idx = df['chemo'] == 1
#Create the respective time series
survival_time_with_chemo, survival_status_with_chemo = df.loc[idx, 'survival'], df.loc[idx, 'eventdeath']

survival_time_without_chemo, survival_status_without_chemo = df.loc[~idx, 'survival'], df.loc[~idx, 'eventdeath']

#Fit a Kaplan-Meier plot to the respective sample, with and without prior surgery
kmf_with_chemo = KaplanMeierFitter(label='Group with chemotherapy').fit(durations=survival_time_with_chemo, event_observed=survival_status_with_chemo)

kmf_without_chemo = KaplanMeierFitter(label='Group without chemotherapy').fit(durations=survival_time_without_chemo, event_observed=survival_status_without_chemo)

#plot the two curves
kmf_with_chemo.plot()
kmf_without_chemo.plot()
plt.show()

#Perform the point in time at t=10 years
results = survival_difference_at_fixed_point_in_time_test(point_in_time=10, fitterA=kmf_with_chemo, fitterB=kmf_without_chemo)

#Print the test statistic and it's p-value for 1 degree of freedom on the Chi-square table
print('Chi-squared(1) Test statistic='+str(results.test_statistic) + ' p-value='+str(results.p_value))

Chi-squared(1) Test statistic=1.4135103503568223 p-value=0.23447452813394776

#Let's create a subset that contains the columns of our interest
df2 = df[['age', 'chemo', 'amputation', 'survival', 'eventdeath']]

#Carve out the training and test data sets
mask = np.random.rand(len(df2)) < 0.8
df2_train = df2[mask]
df2_test = df2[~mask]
print('Training data set length='+str(len(df2_train)))
print('Testing data set length='+str(len(df2_test)))

Training data set length=221
Testing data set length=51

#Create and train the Cox model on the training set
cph_model = CoxPHFitter()
cph_model.fit(df=df2_train, duration_col='survival', event_col='eventdeath')

<lifelines.CoxPHFitter: fitted with 221 total observations, 165 right-censored observations>

#Display the model training summary
cph_model.print_summary()

#here is the way to get the estimated expectation of the survival times on the test data set
expected_survival_times = cph_model.predict_expectation(df2_test)
print(expected_survival_times[:10])
print('expected_survival_times: ', expected_survival_times.mean())

14     9.068793
15    13.730592
29    11.081538
37    14.270172
39    12.446330
44    12.613216
48    14.388012
52    12.533832
53    11.183863
61    12.346908
dtype: float64
expected_survival_times:  13.392619490668654

model	lifelines.CoxPHFitter
duration col	'survival'
event col	'eventdeath'
baseline estimation	breslow
number of observations	221
number of events observed	56
partial log-likelihood	-281.90
time fit was run	2024-05-09 10:29:51 UTC

	coef	exp(coef)	se(coef)	coef lower 95%	coef upper 95%	exp(coef) lower 95%	exp(coef) upper 95%	z	p	-log2(p)
age	-0.06	0.94	0.02	-0.11	-0.02	0.89	0.98	-2.59	0.01	6.70
chemo	-0.24	0.79	0.28	-0.79	0.31	0.45	1.37	-0.85	0.40	1.33
amputation	0.15	1.16	0.27	-0.38	0.68	0.69	1.97	0.56	0.58	0.79

Concordance	0.60
Partial AIC	569.79
log-likelihood ratio test	7.22 on 3 df
-log2(p) of ll-ratio test	3.94

Survival Analysis Using Python¶

Loading data¶

Building Kaplan-Meier model¶

Kaplan-Meier chart with and without chemotherapy¶

Building Cox model¶

	Patient	survival
0	s122	14.817248
1	s123	14.261465
2	s124	6.644764
3	s125	7.748118
4	s126	6.436687