Script to the report on the survival analysis on a set of Cancer survival data.

NOTICE TO MARINERS: Results are commented in a seperate report. If you find this interesting but confusing, email me.

prescript

#If you, like me, run your code in RStudio, this could save you 1.5 seconds :-):

library(rstudioapi)
setwd(dirname(rstudioapi::callFun("getActiveDocumentContext")$path))

script

Data wrangling

data0 <- read.csv2("Cancerfaktorer.csv") #read the data
sum0 <- summary(data0) #Have a look.
sum0
##      Patnr             Time              Status             PS        
##  Min.   :  1.00   Min.   : -73.462   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.: 28.25   1st Qu.:   8.172   1st Qu.:1.0000   1st Qu.:0.0000  
##  Median : 58.00   Median :  17.338   Median :1.0000   Median :0.0000  
##  Mean   : 64.14   Mean   :  44.914   Mean   :0.8063   Mean   :0.4459  
##  3rd Qu.: 96.75   3rd Qu.:  35.384   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :152.00   Max.   :3650.238   Max.   :1.0000   Max.   :1.0000  
##                                                                       
##   Stage          Age            Gender         HB.risk   
##  IIIA: 77   Min.   : 42.00   Female:109   Anemia   : 21  
##  IIIB:145   1st Qu.: 56.72   Male  :113   No anemia:198  
##             Median : 62.00                NA's     :  3  
##             Mean   : 65.26                               
##             3rd Qu.: 68.12                               
##             Max.   :710.00                               
##             NA's   :3                                    
##             LPK.risk                TPK.risk  
##  Leukocytosis   :123   No thrombocytosis: 96  
##  No leukocytosis: 95   Thrombocytosis   :123  
##  NA's           :  4   NA's             :  3  
##                                               
##                                               
##                                               
## 
write.csv(sum0, "sum0.csv")
#Ok, so we have a problem: 
#Maximum Time is 3650 and minimum is -73.
#Values such as these are probably typos and need to be removed not to distort the analysis.
#Same goes for Age, with a maximum of 710.

#Let's have a closer look:
hist(data0$Time, breaks=20) 

hist(data0$Age, breaks=20)

#Based on the density, I will suggest the below filtering:
library(dplyr)
data1 <- filter(data0, Time >= 0 & Time < 500 & Age < 110)

#Whitespaces may be problematic, lets strip the factors
data1$HB.risk <- as.factor(gsub(" ", "_", data1$HB.risk, fixed = TRUE))
data1$LPK.risk <- as.factor(gsub(" ", "_", data1$LPK.risk, fixed = TRUE))
data1$TPK.risk <- as.factor(gsub(" ", "_", data1$TPK.risk, fixed = TRUE))

#Inspect:
hist(data1$Time, breaks=20)

hist(data1$Age, breaks=20)
#Inspect:
summary(data1)
##      Patnr             Time             Status             PS        
##  Min.   :  1.00   Min.   : 0.8993   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.: 28.50   1st Qu.: 8.3778   1st Qu.:1.0000   1st Qu.:0.0000  
##  Median : 58.00   Median :17.6756   Median :1.0000   Median :0.0000  
##  Mean   : 64.13   Mean   :25.4477   Mean   :0.8047   Mean   :0.4465  
##  3rd Qu.: 96.50   3rd Qu.:35.3511   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :152.00   Max.   :97.5770   Max.   :1.0000   Max.   :1.0000  
##   Stage          Age           Gender         HB.risk   
##  IIIA: 76   Min.   :42.00   Female:105   Anemia   : 20  
##  IIIB:139   1st Qu.:56.84   Male  :110   No_anemia:195  
##             Median :62.00                               
##             Mean   :62.36                               
##             3rd Qu.:68.00                               
##             Max.   :81.00                               
##             LPK.risk                TPK.risk  
##  Leukocytosis   :120   No_thrombocytosis: 93  
##  No_leukocytosis: 94   Thrombocytosis   :122  
##  NA's           :  1                          
##                                               
##                                               
## 
#Looks ok. We have some NA:s as well, but not so many that they should cause any problems.

#However, since we have had typos in data, let's be extraordinary careful and check for duplicates as well.
#Duplicates are problematic if not handled properly (with a design where time and interval are accounted for),
#since we will reuse information and therefore inflate the sample: Say a patient has no event on both occasions,
#then that information is fully readable the second occasion. The first occasion will be an artificial extra row of data,
#falsely increasing the survival rate.
library(tidyverse)

data1$dup <- duplicated(data1$Patnr)
summary(data1)
##      Patnr             Time             Status             PS        
##  Min.   :  1.00   Min.   : 0.8993   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.: 28.50   1st Qu.: 8.3778   1st Qu.:1.0000   1st Qu.:0.0000  
##  Median : 58.00   Median :17.6756   Median :1.0000   Median :0.0000  
##  Mean   : 64.13   Mean   :25.4477   Mean   :0.8047   Mean   :0.4465  
##  3rd Qu.: 96.50   3rd Qu.:35.3511   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :152.00   Max.   :97.5770   Max.   :1.0000   Max.   :1.0000  
##   Stage          Age           Gender         HB.risk   
##  IIIA: 76   Min.   :42.00   Female:105   Anemia   : 20  
##  IIIB:139   1st Qu.:56.84   Male  :110   No_anemia:195  
##             Median :62.00                               
##             Mean   :62.36                               
##             3rd Qu.:68.00                               
##             Max.   :81.00                               
##             LPK.risk                TPK.risk      dup         
##  Leukocytosis   :120   No_thrombocytosis: 93   Mode :logical  
##  No_leukocytosis: 94   Thrombocytosis   :122   FALSE:149      
##  NA's           :  1                           TRUE :66       
##                                                               
##                                                               
## 
#Oh, Nelly! 66 duplicates. However, by the looks of it, it is a separate patient group producing all the duplicates:
#One patient can only have one age when given the diagnosis and here, they differ between duplicates. 
#E.g.: 1:TRUE has Age = 58, and 1:FALSE has age = 68, so they are NOT the same patient.
#This pattern is consistent: Age differs between the duplicates, throughout.
#We should be good to go.
#Nonetheless, lets mark the presumed patient groups for future reference:
data1$group <- seq.int(nrow(data1)) #create a column with sequential numbers.
data1$group <- as.factor(ifelse(data1$group > 144, "B", "A")) #by studying the dataset we know that the sequence of Patnr starts over at 144.

#Since R follows the alphabet, we need to set the reference level for two of the risk factors:
data1$HB.risk = relevel(data1$HB.risk, ref = "No_anemia")
data1$LPK.risk = relevel(data1$LPK.risk, ref = "No_leukocytosis")

#Inspect and export
sum1 <- summary(data1) #Have a look.
sum1
##      Patnr             Time             Status             PS        
##  Min.   :  1.00   Min.   : 0.8993   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.: 28.50   1st Qu.: 8.3778   1st Qu.:1.0000   1st Qu.:0.0000  
##  Median : 58.00   Median :17.6756   Median :1.0000   Median :0.0000  
##  Mean   : 64.13   Mean   :25.4477   Mean   :0.8047   Mean   :0.4465  
##  3rd Qu.: 96.50   3rd Qu.:35.3511   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :152.00   Max.   :97.5770   Max.   :1.0000   Max.   :1.0000  
##   Stage          Age           Gender         HB.risk   
##  IIIA: 76   Min.   :42.00   Female:105   No_anemia:195  
##  IIIB:139   1st Qu.:56.84   Male  :110   Anemia   : 20  
##             Median :62.00                               
##             Mean   :62.36                               
##             3rd Qu.:68.00                               
##             Max.   :81.00                               
##             LPK.risk                TPK.risk      dup          group  
##  No_leukocytosis: 94   No_thrombocytosis: 93   Mode :logical   A:144  
##  Leukocytosis   :120   Thrombocytosis   :122   FALSE:149       B: 71  
##  NA's           :  1                           TRUE :66               
##                                                                       
##                                                                       
## 
write.csv(sum1, "sum1.csv")
#Now we are definitely good to go.

Kaplan-Meier survival estimate

library(survival) #This is the main survival package used to compute survival curves and fit cox regressions
library(survminer) #This package will allow us to plot our results

#We will start the analysis by creating a "survival object", which is a product of time and the event.
#the survival object will be used in all subsequent analyses.
surv_object <- Surv(time = data1$Time, event = data1$Status)
surv_object #a plus indicates a censored data point, i.e. the the event (death) has not occurred.
##   [1] 28.7802900   5.4209450  35.2854200  53.6509200   7.3264880 
##   [6] 14.9486700  14.0944600  25.0677600  97.5770000+  1.2813140 
##  [11] 97.1170400+ 25.1006200  51.1211500  96.8213600+ 20.6324400 
##  [16] 58.5790600   8.5092400   3.3182750  30.6529800  34.2012300 
##  [21] 94.1273100+  8.8377820   4.4024640  10.5133500   8.3449690 
##  [26]  4.9609860   7.3264880   9.3634500  23.6221800  14.9158100 
##  [31] 26.2176600  19.6139600  92.1560600+ 12.4517500  17.8726900 
##  [36]  7.4579060  11.7618100   7.5893230  33.0513300   6.3737170 
##  [41] 90.8747400+ 13.2073900  17.7741300  65.8069800   8.1149890 
##  [46] 20.5667300   4.0082140   8.3778240  38.8665300  15.4086200 
##  [51]  8.7720740  77.3059500  27.6961000   9.1991790  31.1129400 
##  [56]  4.1067760   6.8008210   7.8193020  12.9445600   3.1540040 
##  [61] 39.1622200  86.4394200+ 20.8624200  13.8644800   6.6694050 
##  [66]  7.6221770   1.5770020  33.5113000  15.5400400  15.8685800 
##  [71] 14.4558500  24.2792600  84.1724900+  8.7063660  19.3182800 
##  [76]  6.2751540   5.0924020   2.8583160  12.8788500   6.9979470 
##  [81] 36.2381900  82.5626300+  0.8993056   7.1950720  21.9794700 
##  [86] 53.3552400  29.0431200  11.9917900  80.9856300+ 80.9527700+
##  [91] 20.1396300  23.7535900  80.2628300+  6.5708420   2.7826389 
##  [96] 78.8172500+  5.1252570  13.8973300  29.1088300   6.4722790 
## [101]  5.2895280  77.2731000+ 15.0800800  26.1191000  36.0082100 
## [106]  6.5379880  16.0985600  75.0061600+  6.0451750  20.6981500 
## [111] 74.5462000+  6.4394250  74.0862400+ 18.1683800  12.7145800 
## [116]  8.3778240  17.6755600  42.7433300  12.7145800  72.4763900+
## [121] 26.1191000  16.3285400  19.9753600   8.4435320   9.0677610 
## [126] 33.6427100  26.4476400  36.9281300  14.5872700  25.7905500 
## [131] 55.5893200   6.5051340  32.4928100  68.7967100+ 68.6981500+
## [136]  5.7166320  17.6755600  19.8439400  19.7125300  23.6550300 
## [141] 22.4722800  35.4168400  23.7535900  11.1375800  49.5000000+
## [146]  5.5000000   7.0000000  48.1000000+  5.6000000   6.7000000 
## [151] 17.0000000  14.3000000  13.1000000  15.8000000   3.7000000 
## [156]  7.9000000  13.0000000  46.6000000+  8.5000000  13.8000000 
## [161] 22.4000000  21.5000000   9.1000000   6.3000000  44.5000000+
## [166] 22.5000000  43.8000000+ 43.3000000+  8.9000000  35.9000000 
## [171] 23.2000000  16.3000000   3.3000000  15.8000000  41.9000000+
## [176]  7.2000000  14.9000000  25.0000000  28.1000000  17.9000000 
## [181] 41.0000000+ 20.3000000  14.7000000   9.7000000  10.9000000 
## [186] 39.1000000+ 39.0000000+  1.4000000  38.7000000+  8.5000000 
## [191] 22.9000000  38.3000000+ 38.3000000+ 16.4000000  31.1000000 
## [196] 38.0000000+  6.2000000  26.3000000  23.9000000  16.4000000 
## [201]  9.8000000  37.1000000+ 36.7000000+  7.9000000  36.6000000+
## [206] 36.2000000+  7.9000000  35.9000000+  6.7000000   3.1000000 
## [211] 35.0000000+  4.4000000   2.5000000  34.3000000+ 14.6000000+
#kaplan-Meier survival estimates for the risk factors of (main) interest.

#Compare survival for HB.risk (Anemia/No anemia):
HBcompare <- survfit(surv_object ~ HB.risk, data = data1)
print(HBcompare)
## Call: survfit(formula = surv_object ~ HB.risk, data = data1)
## 
##                     n events median 0.95LCL 0.95UCL
## HB.risk=No_anemia 195    156   17.9   15.80    22.5
## HB.risk=Anemia     20     17   12.7    8.84    53.7
#Lets print survival curves for both states of LPK.risk (Anemia/No anemia)
ggsurvplot(HBcompare,
           pval = TRUE, conf.int = TRUE,
           risk.table = "abs_pct", # Add risk table with no and % at risk
           risk.table.col = "strata", # Change risk table color by groups
           linetype = "strata", # Change line type by groups
           surv.median.line = "hv", # Specify median survival
           ggtheme = theme_minimal(), # Change ggplot2 theme
           palette = "Dark2")

#Test the significance of the differences between the two survival curves:
HB_diff <- survdiff(surv_object ~ HB.risk, data = data1)
HB_diff #Not significant. A problem here is the low number of HB.risk = Anemia.
## Call:
## survdiff(formula = surv_object ~ HB.risk, data = data1)
## 
##                     N Observed Expected (O-E)^2/E (O-E)^2/V
## HB.risk=No_anemia 195      156    158.3    0.0326     0.385
## HB.risk=Anemia     20       17     14.7    0.3503     0.385
## 
##  Chisq= 0.4  on 1 degrees of freedom, p= 0.5
#Compare survival for LPK.risk (Leukocytosis/No leukocytosis):
LPKcompare <- survfit(surv_object ~ LPK.risk, data = data1)
print(LPKcompare)
## Call: survfit(formula = surv_object ~ LPK.risk, data = data1)
## 
##    1 observation deleted due to missingness 
##                            n events median 0.95LCL 0.95UCL
## LPK.risk=No_leukocytosis  94     72   21.7    17.8    28.8
## LPK.risk=Leukocytosis    120    100   14.7    10.5    19.6
#Let's print survival curves for both states of LPK.risk (Leukocytosis/No leukocytosis)
ggsurvplot(LPKcompare,
           pval = TRUE, conf.int = TRUE,
           risk.table = "abs_pct", # Add risk table with no and % at risk
           risk.table.col = "strata", # Change risk table color by groups
           linetype = "strata", # Change line type by groups
           surv.median.line = "hv", # Specify median survival
           ggtheme = theme_minimal(), # Change ggplot2 theme
           palette = "Dark2")

#Test the significance of the differences between the two survival curves:
LPK_diff <- survdiff(surv_object ~ LPK.risk, data = data1)
LPK_diff #Significant at a 2% level.
## Call:
## survdiff(formula = surv_object ~ LPK.risk, data = data1)
## 
## n=214, 1 observation deleted due to missingness.
## 
##                            N Observed Expected (O-E)^2/E (O-E)^2/V
## LPK.risk=No_leukocytosis  94       72     87.2      2.66      5.43
## LPK.risk=Leukocytosis    120      100     84.8      2.74      5.43
## 
##  Chisq= 5.4  on 1 degrees of freedom, p= 0.02
#Compare survival for TPK.risk (Thrombocytosis/No thrombocytosis):
TPKcompare <- survfit(surv_object ~ TPK.risk, data = data1)
print(TPKcompare)
## Call: survfit(formula = surv_object ~ TPK.risk, data = data1)
## 
##                              n events median 0.95LCL 0.95UCL
## TPK.risk=No_thrombocytosis  93     68   23.2    17.9    29.0
## TPK.risk=Thrombocytosis    122    105   14.7    12.5    18.2
#Let's print survival curves for both states of TPK.risk (Thrombocytosis/No thrombocytosis)
ggsurvplot(TPKcompare,
           pval = TRUE, conf.int = TRUE,
           risk.table = "abs_pct", # Add risk table with no and % at risk
           risk.table.col = "strata",# Change risk table color by groups
           linetype = "strata", # Change line type by groups
           surv.median.line = "hv", # Specify median survival
           ggtheme = theme_minimal(), # Change ggplot2 theme
           palette = "Dark2")

#Test the significance of the differences between the two survival curves:
TPK_diff <- survdiff(surv_object ~ TPK.risk, data = data1)
TPK_diff #Significant at a 0.6% level.
## Call:
## survdiff(formula = surv_object ~ TPK.risk, data = data1)
## 
##                              N Observed Expected (O-E)^2/E (O-E)^2/V
## TPK.risk=No_thrombocytosis  93       68     86.1      3.81      7.68
## TPK.risk=Thrombocytosis    122      105     86.9      3.78      7.68
## 
##  Chisq= 7.7  on 1 degrees of freedom, p= 0.006

Cox Proportional-Hazards Model

#Fit a model
cox1 <- coxph(surv_object ~ PS + Stage + Age + Gender + HB.risk + LPK.risk + TPK.risk, data = data1)

summary(cox1) #Interestingly, the significance for the LKP.risk is gone.
## Call:
## coxph(formula = surv_object ~ PS + Stage + Age + Gender + HB.risk + 
##     LPK.risk + TPK.risk, data = data1)
## 
##   n= 214, number of events= 172 
##    (1 observation deleted due to missingness)
## 
##                             coef exp(coef)  se(coef)      z Pr(>|z|)   
## PS                      0.466353  1.594170  0.158433  2.944  0.00324 **
## StageIIIB              -0.161875  0.850548  0.163693 -0.989  0.32272   
## Age                     0.008474  1.008510  0.009591  0.884  0.37694   
## GenderMale              0.351796  1.421618  0.159682  2.203  0.02759 * 
## HB.riskAnemia           0.010648  1.010705  0.262993  0.040  0.96770   
## LPK.riskLeukocytosis    0.154123  1.166634  0.174627  0.883  0.37746   
## TPK.riskThrombocytosis  0.395300  1.484830  0.185948  2.126  0.03351 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                        exp(coef) exp(-coef) lower .95 upper .95
## PS                        1.5942     0.6273    1.1686     2.175
## StageIIIB                 0.8505     1.1757    0.6171     1.172
## Age                       1.0085     0.9916    0.9897     1.028
## GenderMale                1.4216     0.7034    1.0396     1.944
## HB.riskAnemia             1.0107     0.9894    0.6036     1.692
## LPK.riskLeukocytosis      1.1666     0.8572    0.8285     1.643
## TPK.riskThrombocytosis    1.4848     0.6735    1.0313     2.138
## 
## Concordance= 0.622  (se = 0.023 )
## Rsquare= 0.1   (max possible= 0.999 )
## Likelihood ratio test= 22.48  on 7 df,   p=0.002
## Wald test            = 22.63  on 7 df,   p=0.002
## Score (logrank) test = 22.95  on 7 df,   p=0.002
#Due to some other cofounding factors now included in the analysis, for sure.

#Let's do the boring stuff first: diagnostics of the model
ggcoxdiagnostics(cox1, type = "deviance",
                 linear.predictions = FALSE, ggtheme = theme_minimal()) #No apparent structure in the residual.

testPH <- cox.zph(cox1)
testPH
##                            rho  chisq       p
## PS                     -0.0670  0.804 0.36977
## StageIIIB              -0.0830  1.226 0.26812
## Age                    -0.0372  0.233 0.62950
## GenderMale             -0.0305  0.162 0.68738
## HB.riskAnemia          -0.0451  0.354 0.55209
## LPK.riskLeukocytosis   -0.1949  7.195 0.00731
## TPK.riskThrombocytosis  0.0741  1.103 0.29355
## GLOBAL                      NA 10.646 0.15480
ggcoxzph(testPH, font.main = 10, font.x = 10, font.y = 10, ggtheme = theme_minimal()) #graph.

#Stratification on the problematic variable may solve our dilemma with non-proportional hazard for LKP.risk.
#Lets get a competing model:
cox2 <- coxph(surv_object ~ PS + Stage + Age + Gender + HB.risk + strata(LPK.risk) + TPK.risk, data = data1)
#will output a hazard ratio in the presence of two hazards intrinsic to the levels of LPK.risk
testPH2 <- cox.zph(cox2)
testPH2 #problem solved.
##                            rho chisq     p
## PS                     -0.0605 0.643 0.423
## StageIIIB              -0.0886 1.389 0.239
## Age                    -0.0285 0.136 0.713
## GenderMale             -0.0268 0.123 0.726
## HB.riskAnemia          -0.0410 0.290 0.590
## TPK.riskThrombocytosis  0.0682 0.929 0.335
## GLOBAL                      NA 3.585 0.733
summary(cox2)#However, overall fit not improved...
## Call:
## coxph(formula = surv_object ~ PS + Stage + Age + Gender + HB.risk + 
##     strata(LPK.risk) + TPK.risk, data = data1)
## 
##   n= 214, number of events= 172 
##    (1 observation deleted due to missingness)
## 
##                             coef exp(coef)  se(coef)      z Pr(>|z|)   
## PS                      0.482984  1.620904  0.158648  3.044  0.00233 **
## StageIIIB              -0.170637  0.843127  0.163998 -1.040  0.29812   
## Age                     0.009027  1.009068  0.009583  0.942  0.34621   
## GenderMale              0.334367  1.397056  0.159642  2.094  0.03622 * 
## HB.riskAnemia           0.018707  1.018883  0.262551  0.071  0.94320   
## TPK.riskThrombocytosis  0.406908  1.502167  0.187479  2.170  0.02998 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                        exp(coef) exp(-coef) lower .95 upper .95
## PS                        1.6209     0.6169    1.1877     2.212
## StageIIIB                 0.8431     1.1861    0.6114     1.163
## Age                       1.0091     0.9910    0.9903     1.028
## GenderMale                1.3971     0.7158    1.0217     1.910
## HB.riskAnemia             1.0189     0.9815    0.6090     1.705
## TPK.riskThrombocytosis    1.5022     0.6657    1.0402     2.169
## 
## Concordance= 0.603  (se = 0.024 )
## Rsquare= 0.078   (max possible= 0.999 )
## Likelihood ratio test= 17.49  on 6 df,   p=0.008
## Wald test            = 17.66  on 6 df,   p=0.007
## Score (logrank) test = 17.82  on 6 df,   p=0.007
#The violation of the PH criteria does not worry me much,
#since the overall PH is ok and since LPK.risk is not significant anyway. 
#In addition, the slight decrease of hazard is, by looking at the visuals, very modest.
#However, should, for some reason, LPK.risk be the main focus of attention in future research efforts,
#the design of the study need to take into account the probable time-dependence of LPT.risk,
#and the data structure needs to be changed.
#FOr now, I will continue with the first model (cox1).

ggforest(cox1, data = data1) #Hazards per covariate

ggsurvplot(survfit(cox1, data = data1), palette = "Grey",
           ggtheme = theme_minimal()) #Overall survival plot, all covariates considered.

#Now, let's do something akin to the kaplan-Meier survival estimates but in the context of the cox regression

#1
HB_df <- with(data1,
              data.frame(HB.risk = c("No_anemia", "Anemia"), 
                         Age = rep(mean(Age, na.rm = TRUE), 2),
                         PS = c(0, 0),
                         Stage = c("IIIA", "IIIA"),
                         Gender = c("Female", "Female"),
                         LPK.risk = c("No_leukocytosis", "No_leukocytosis"),
                         TPK.risk = c("No_thrombocytosis", "No_thrombocytosis")
              )
)
HB_df
##     HB.risk      Age PS Stage Gender        LPK.risk          TPK.risk
## 1 No_anemia 62.35902  0  IIIA Female No_leukocytosis No_thrombocytosis
## 2    Anemia 62.35902  0  IIIA Female No_leukocytosis No_thrombocytosis
HB_diff <- survfit(cox1, newdata = HB_df)
ggsurvplot(HB_diff, data = HB_diff, conf.int = TRUE, legend.labs=c("No_anemia", "Anemia"),
           ggtheme = theme_minimal(),
           palette = "Dark2"
           )

#2
LPK_df <- with(data1,
                  data.frame(LPK.risk = c("No_leukocytosis", "Leukocytosis"), 
                             Age = rep(mean(Age, na.rm = TRUE), 2),
                             PS = c(0, 0),
                             Stage = c("IIIA", "IIIA"),
                             Gender = c("Female", "Female"),
                             HB.risk = c("No_anemia", "No_anemia"),
                             TPK.risk = c("No_thrombocytosis", "No_thrombocytosis")
                  )
)
LPK_df
##          LPK.risk      Age PS Stage Gender   HB.risk          TPK.risk
## 1 No_leukocytosis 62.35902  0  IIIA Female No_anemia No_thrombocytosis
## 2    Leukocytosis 62.35902  0  IIIA Female No_anemia No_thrombocytosis
LKP_diff <- survfit(cox1, newdata = LPK_df)
ggsurvplot(LKP_diff, data = LKP_diff, conf.int = TRUE, legend.labs=c("No_leukocytosis", "Leukocytosis"),
           ggtheme = theme_minimal(),
           palette = "Dark2"
           )

#3
TPK_df <- with(data1,
              data.frame(TPK.risk = c("No_thrombocytosis", "Thrombocytosis"), 
                         Age = rep(mean(Age, na.rm = TRUE), 2),
                         PS = c(0, 0),
                         Stage = c("IIIA", "IIIA"),
                         Gender = c("Female", "Female"),
                         HB.risk =c("No_anemia", "No_anemia"),
                         LPK.risk = c("No_leukocytosis", "No_leukocytosis")
              )
)
TPK_df
##            TPK.risk      Age PS Stage Gender   HB.risk        LPK.risk
## 1 No_thrombocytosis 62.35902  0  IIIA Female No_anemia No_leukocytosis
## 2    Thrombocytosis 62.35902  0  IIIA Female No_anemia No_leukocytosis
TPK_diff <- survfit(cox1, newdata = TPK_df)
ggsurvplot(TPK_diff, data = TPK_diff, conf.int = TRUE, legend.labs=c("No_thrombocytosis", "Thrombocytosis"),
           ggtheme = theme_minimal(),
           palette = "Dark2"
           )

GMY
## [1] "MYA"