Load Data

data <- read.csv("C:/Users/tanf/OneDrive - Bausch & Lomb, Inc/Desktop/All/HU/cirrhosis_cleaned.csv", header = TRUE)
#age in years
data1 <- data %>% mutate(age_yr=Age/365.25,
                         age_gp= ifelse(Age/365.25 >65, "Age >=65", "Age<65"),
                         censor= ifelse(Status=="D", "Death", "Censor"),
                         censor_n= ifelse(Status=="D", 1, 0))
set.seed=21762
train.test.split<-sample(2, nrow(data1), replace=TRUE, prob=c(0.8,0.2))
train=data1[train.test.split==1,]
test=data1[train.test.split==2,]

head(train, 5)%>% DT::datatable()
head(test, 5)%>% DT::datatable()

0.1 Continuous Variables Disbribution

0.1.1 Age

#1. Age
c1<- ggplot(data1, aes(x=age_yr))+ geom_histogram(binwidth=5, colour="white", fill="darkseagreen2", alpha=0.8)+
  geom_density(eval(bquote(aes(y=..count..*5))),colour="darkgreen", fill="darkgreen", alpha=0.3)+ scale_x_continuous(breaks=seq(40,100,10))+geom_vline(xintercept = 65, linetype="dashed")+ annotate("text", x=50, y=45, label="Age <65", size=2.5, color="dark green") + annotate("text", x=80, y=45, label="Age >= 65", size=2.5, color="dark red") +labs(title="Age Distribution") + theme_minimal(base_size = 8)
c1
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

0.1.2 alk_phos

#2. cpk
c2<- ggplot(data1, aes(x=Alk_Phos))+ geom_histogram(binwidth=100, colour="white", fill="mediumpurple2", alpha=0.8)+
  geom_density(eval(bquote(aes(y=..count..*150))),colour="mediumorchid1", fill="mediumorchid1", alpha=0.3)+ scale_x_continuous(breaks=seq(0,10000,1000))+geom_vline(xintercept = 120, linetype="dashed")+ annotate("text", x=0, y=100, label="CPK Normal", size=2.5, color="dark green") + annotate("text", x=1000, y=80, label="CPK Abnormal", size=2.5, color="dark red")+labs(title="Creatinine Phosphokinase Distribution") + theme_minimal(base_size = 8)
c2

0.1.3 Platelets Count

c4<- ggplot(data1, aes(x=Platelets))+ geom_histogram(binwidth=200, colour="white", fill="lightskyblue2", alpha=0.8)+
  geom_density(eval(bquote(aes(y=..count..*25000))),colour="lightsteelblue", fill="lightsteelblue", alpha=0.3)+
  geom_vline(xintercept = 150000, linetype="dashed")+geom_vline(xintercept = 450000, linetype="dashed")+ annotate("text", x=100000, y=30, label="Abnormal", size=2.5, color="dark red") + annotate("text", x=300000, y=30, label="Normal", color="dark green")+  annotate("text", x=500000, y=30, label="Abnormal", size=2.5, color="dark red")+labs(title="Platelets Count") + theme_minimal(base_size = 8)
c4

d1 <- group_by(data1,N_Days,censor_n)
d2<- summarise(d1,count=n())
## `summarise()` has grouped output by 'N_Days'. You can override using the
## `.groups` argument.
d22 <- arrange(d2, desc(N_Days))


ggplot(d22, aes(x=reorder( N_Days, count), y=N_Days))+ geom_point(aes(size=count, colour=factor(count), shape=factor(censor_n)), alpha=1/2)+
  theme_ipsum() +
    theme(axis.text.x = element_text(angle = 90, hjust = 1) ,  legend.position="none") +coord_flip() +ylab("Survival days") +xlab("Survival counts")+ ggtitle("Patient survival time with counts")

data1$idc <- paste("id",as.factor(data1$ID))

lol1_100<-ggplot(data1[0:100,], aes(x=idc, y=N_Days)) +
  geom_segment( aes(x=idc, xend=idc, y=0, yend=N_Days),        color=ifelse(data1[0:100,]$censor_n==1, "orange", "skyblue"))+
  geom_point( color=ifelse(data1[0:100,]$censor_n==1, "red", "darkgreen"), size=0.1, alpha=0.6) +
  theme_light() +
  coord_flip() +
  theme(
    panel.grid.major.y = element_blank(),
    panel.border = element_blank(),
    axis.ticks.y = element_blank()
  )+ ylab("Survival days") +xlab("Patient ID 1-100") + ggtitle("Patient 1-100 Survival Status with Censor")

lol101<-ggplot(data1[101:200,], aes(x=idc, y=N_Days)) +
  geom_segment( aes(x=idc, xend=idc, y=0, yend=N_Days),        color=ifelse(data1[101:200,]$censor_n==1, "orange", "skyblue"))+
  geom_point( color=ifelse(data1[101:200,]$censor_n==1, "red", "darkgreen"), size=0.1, alpha=0.6) +
  theme_light() +
  coord_flip() +
  theme(
    panel.grid.major.y = element_blank(),
    panel.border = element_blank(),
    axis.ticks.y = element_blank()
  )+ ylab("Survival days") +xlab("Patient ID 101-200") + ggtitle("Patient 101-200 Survival Status with Censor")

lol201<-ggplot(data1[201:300,], aes(x=idc, y=N_Days)) +
  geom_segment( aes(x=idc, xend=idc, y=0, yend=N_Days),      color=ifelse(data1[201:300,]$censor_n==1, "orange", "skyblue"))+
  geom_point( color=ifelse(data1[201:300,]$censor_n==1, "red", "darkgreen"), size=0.1, alpha=0.6) +
  theme_light() +
  coord_flip() +
  theme(
    panel.grid.major.y = element_blank(),
    panel.border = element_blank(),
    axis.ticks.y = element_blank()
  )+ ylab("Survival days") +xlab("Patient ID 201-299") + ggtitle("Patient 201-299 Survival Status with Censor")

lol301<-ggplot(data1[301:400,], aes(x=idc, y=N_Days)) +
  geom_segment( aes(x=idc, xend=idc, y=0, yend=N_Days),      color=ifelse(data1[301:400,]$censor_n==1, "orange", "skyblue"))+
  geom_point( color=ifelse(data1[301:400,]$censor_n==1, "red", "darkgreen"), size=0.1, alpha=0.6) +
  theme_light() +
  coord_flip() +
  theme(
    panel.grid.major.y = element_blank(),
    panel.border = element_blank(),
    axis.ticks.y = element_blank()
  )+ ylab("Survival days") +xlab("Patient ID 301-399") + ggtitle("Patient 301-399 Survival Status with Censor")

lol401<-ggplot(data1[400:418,], aes(x=idc, y=N_Days)) +
  geom_segment( aes(x=idc, xend=idc, y=0, yend=N_Days),      color=ifelse(data1[400:418,]$censor_n==1, "orange", "skyblue"))+
  geom_point( color=ifelse(data1[400:418,]$censor_n==1, "red", "darkgreen"), size=0.1, alpha=0.6) +
  theme_light() +
  coord_flip() +
  theme(
    panel.grid.major.y = element_blank(),
    panel.border = element_blank(),
    axis.ticks.y = element_blank()
  )+ ylab("Survival days") +xlab("Patient ID 400-418") + ggtitle("Patient 400-418 Survival Status with Censor")

lol1_100

lol101

lol201

lol301

lol401

df <- data1 %>% dplyr::select(N_Days, age_yr, Bilirubin, Cholesterol, Albumin, Copper, Alk_Phos, SGOT, Tryglicerides,Platelets, Prothrombin,Stage)

r=cor(df, use = "pairwise.complete.obs")
corrplot(r, type = "upper", order = "hclust", 
         tl.col = "black", tl.srt = 90)

coul <- colorRampPalette(brewer.pal(8, "PiYG"))(25)
heatmap(r, scale="column", col = coul)

draw_confusion_matrix <- function(cm) {

  total <- sum(cm$table)
  res <- as.numeric(cm$table)

  # Generate color gradients. Palettes come from RColorBrewer.
  greenPalette <- c("#F7FCF5","#E5F5E0","#C7E9C0","#A1D99B","#74C476","#41AB5D","#238B45","#006D2C","#00441B")
  redPalette <- c("#FFF5F0","#FEE0D2","#FCBBA1","#FC9272","#FB6A4A","#EF3B2C","#CB181D","#A50F15","#67000D")
  getColor <- function (greenOrRed = "green", amount = 0) {
    if (amount == 0)
      return("#FFFFFF")
    palette <- greenPalette
    if (greenOrRed == "red")
      palette <- redPalette
    colorRampPalette(palette)(100)[10 + ceiling(90 * amount / total)]
  }

  # set the basic layout
  layout(matrix(c(1,1,2)))
  par(mar=c(2,2,2,2))
  plot(c(100, 345), c(300, 450), type = "n", xlab="", ylab="", xaxt='n', yaxt='n')
  title('CONFUSION MATRIX', cex.main=2)

  # create the matrix 
  classes = colnames(cm$table)
  rect(150, 430, 240, 370, col=getColor("green", res[1]))
  text(195, 435, classes[1], cex=1.2)
  rect(250, 430, 340, 370, col=getColor("red", res[3]))
  text(295, 435, classes[2], cex=1.2)
  text(125, 370, 'Predicted', cex=1.3, srt=90, font=2)
  text(245, 450, 'Actual', cex=1.3, font=2)
  rect(150, 305, 240, 365, col=getColor("red", res[2]))
  rect(250, 305, 340, 365, col=getColor("green", res[4]))
  text(140, 400, classes[1], cex=1.2, srt=90)
  text(140, 335, classes[2], cex=1.2, srt=90)

  # add in the cm results
  text(195, 400, res[1], cex=1.6, font=2, col='white')
  text(195, 335, res[2], cex=1.6, font=2, col='white')
  text(295, 400, res[3], cex=1.6, font=2, col='white')
  text(295, 335, res[4], cex=1.6, font=2, col='white')

  # add in the specifics 
  plot(c(100, 0), c(100, 0), type = "n", xlab="", ylab="", main = "DETAILS", xaxt='n', yaxt='n')
  text(10, 85, names(cm$byClass[1]), cex=1.2, font=2)
  text(10, 70, round(as.numeric(cm$byClass[1]), 3), cex=1.2)
  text(30, 85, names(cm$byClass[2]), cex=1.2, font=2)
  text(30, 70, round(as.numeric(cm$byClass[2]), 3), cex=1.2)
  text(50, 85, names(cm$byClass[5]), cex=1.2, font=2)
  text(50, 70, round(as.numeric(cm$byClass[5]), 3), cex=1.2)
  text(70, 85, names(cm$byClass[6]), cex=1.2, font=2)
  text(70, 70, round(as.numeric(cm$byClass[6]), 3), cex=1.2)
  text(90, 85, names(cm$byClass[7]), cex=1.2, font=2)
  text(90, 70, round(as.numeric(cm$byClass[7]), 3), cex=1.2)

  # add in the accuracy information 
  text(30, 35, names(cm$overall[1]), cex=1.5, font=2)
  text(30, 20, round(as.numeric(cm$overall[1]), 3), cex=1.4)
  text(70, 35, names(cm$overall[2]), cex=1.5, font=2)
  text(70, 20, round(as.numeric(cm$overall[2]), 3), cex=1.4)
}

0.1.4 GBM with Original Data

train1<- train %>% dplyr::select(-ID, -Status, -Age, -censor, -Drug)



train1 <- train1 %>% mutate_if(is.character, as.factor)

gbm.m<- gbm(train1$censor_n ~. , data=train1, distribution = "bernoulli", cv.folds=10, shrinkage=0.01, n.minobsinnode = 10, n.trees=1000)
#gbm.m
gbm.imp=summary(gbm.m)

gbm.imp
##                         var     rel.inf
## N_Days               N_Days 26.74148360
## Bilirubin         Bilirubin 19.96773291
## Prothrombin     Prothrombin 14.61978856
## Alk_Phos           Alk_Phos  9.63902307
## age_yr               age_yr  8.09459637
## SGOT                   SGOT  5.84196658
## Copper               Copper  5.24593707
## Cholesterol     Cholesterol  3.33431756
## Platelets         Platelets  2.24744817
## Albumin             Albumin  1.77207800
## Tryglicerides Tryglicerides  1.48232362
## Stage                 Stage  0.39932660
## bili_cat           bili_cat  0.26573865
## Sex                     Sex  0.16779091
## Hepatomegaly   Hepatomegaly  0.12846636
## Edema                 Edema  0.05198196
## Ascites             Ascites  0.00000000
## Spiders             Spiders  0.00000000
## alb_cat             alb_cat  0.00000000
## age_gp               age_gp  0.00000000
gmb.t =predict(object=gbm.m, newdata=test, n.trees=1000, type="response")
## Warning in predict.gbm(object = gbm.m, newdata = test, n.trees = 1000, type =
## "response"): NAs introduced by coercion
presult<- as.factor(ifelse(gmb.t>0.5,1,0))
test$censor_n1<-as.factor(test$censor_n)
g<-confusionMatrix(presult,test$censor_n1)
draw_confusion_matrix(g)

# Make sure your survival time and censoring are correctly set up
gbm_model <- gbm(Surv(N_Days, censor_n) ~ ., 
                 data = train1,
                 distribution = "coxph",
                 n.trees = 1000,
                 interaction.depth = 3,
                 shrinkage = 0.01,
                 cv.folds = 5)

# Best number of trees
best_iter <- gbm.perf(gbm_model, method = "cv")

# View variable importance
summary(gbm_model, n.trees = best_iter)

##                         var    rel.inf
## Bilirubin         Bilirubin 38.1247055
## Prothrombin     Prothrombin 14.1575984
## age_yr               age_yr 11.3009157
## Albumin             Albumin  7.2510181
## Copper               Copper  6.0600975
## Edema                 Edema  5.5357270
## Cholesterol     Cholesterol  3.7664012
## Stage                 Stage  3.6339048
## Platelets         Platelets  3.0608936
## SGOT                   SGOT  2.3461227
## Tryglicerides Tryglicerides  2.0122501
## Alk_Phos           Alk_Phos  0.8206455
## Ascites             Ascites  0.5627233
## bili_cat           bili_cat  0.4365556
## Spiders             Spiders  0.3900100
## age_gp               age_gp  0.2064574
## Sex                     Sex  0.1767241
## Hepatomegaly   Hepatomegaly  0.1572494
## alb_cat             alb_cat  0.0000000

0.1.5 Random Forest with Original Data

#train1<- train %>% dplyr::select(-ID, -Status, -Age, -censor, -idc)

train1<- train %>% dplyr::select(-ID, -Status, -Age, -censor)

#Missing value
train1 <- na.omit(train1)
train1 <- train1 %>% mutate_if(is.character, as.factor)

rforest<- randomForest(factor(censor_n) ~. , data=train1, ntree=500, importance=TRUE)
#summary(rforest)
imp<-varImp(rforest)
varImpPlot(rforest)

#test1<- test %>% dplyr::select(-ID, -Status, -Age, -censor, -idc)
test1<- test %>% dplyr::select(-ID, -Status, -Age, -censor)
test1 <- test1 %>% mutate_if(is.character, as.factor)


rpredict<- predict(rforest, test1, type="class")
cm2<-confusionMatrix(rpredict, test$censor_n1)
draw_confusion_matrix(cm2)

factor(censor_n) ~.

rsf_model <- rfsrc(Surv(N_Days, censor_n) ~ ., data = train1,
                   ntree = 1000, importance = TRUE)

# View variable importance
print(rsf_model$importance)
##          Drug           Sex       Ascites  Hepatomegaly       Spiders 
## -0.0002808270  0.0006604285  0.0025945078  0.0023082595  0.0007012017 
##         Edema     Bilirubin   Cholesterol       Albumin        Copper 
##  0.1609040603  0.2166894274  0.0284716792  0.0345822811  0.0335416571 
##      Alk_Phos          SGOT Tryglicerides     Platelets   Prothrombin 
##  0.0047691903  0.0101447994  0.0054850774  0.0098787044  0.0527124552 
##         Stage       alb_cat      bili_cat        age_yr        age_gp 
##  0.0152463953 -0.0006260315  0.0379777196  0.0213926134  0.0149223300
lm1 <- glm(censor_n ~., data=train1, family=binomial(link="logit"))

#summary(lm1)
limp<-varImp(lm1)
backward<-step(lm1,direction="backward", trace=0)
#vi(backward)
p2<- vip(backward,num_features = length(coef(backward)),
         geom="point", horizontal = TRUE, mapping = aes_string(color="Sign"))
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
p2

glm.t =predict(object=lm1, newdata=test, type="response")
presult<- as.factor(ifelse(glm.t>0.5,1,0))
test1$censor_n1<-as.factor(test1$censor_n)
cm3<- confusionMatrix(presult,test1$censor_n1)
draw_confusion_matrix(cm3)

mv_fit <- coxph(Surv(N_Days,censor_n) ~ bili_cat + age_yr + Prothrombin +Alk_Phos + Stage +alb_cat, data=data1)
ccox<- cox.zph(mv_fit)
print(ccox)
##               chisq df       p
## bili_cat     3.9103  1 0.04799
## age_yr       0.0622  1 0.80299
## Prothrombin 16.5500  1 4.7e-05
## Alk_Phos     0.0233  1 0.87862
## Stage        6.6747  1 0.00978
## alb_cat     12.6055  1 0.00038
## GLOBAL      30.0252  6 3.9e-05
options(repr.plot.width=10, repr.plot.height=40)
ggcoxzph(ccox)

ggforest(mv_fit)
## Warning in .get_data(model, data = data): The `data` argument is not provided.
## Data will be extracted from model fit.

mo <- coxph(Surv(N_Days,censor_n) ~ Drug + Sex + Ascites + Edema +Spiders, data=data1)
ggforest(mo)
## Warning in .get_data(model, data = data): The `data` argument is not provided.
## Data will be extracted from model fit.

fit_ef<-survfit(Surv(N_Days,censor_n)~Edema, data=data1)
fit_sc<-survfit(Surv(N_Days,censor_n)~ bili_cat, data=data1)
fit_age<-survfit(Surv(N_Days,censor_n)~Stage, data=data1)
fit_sd<-survfit(Surv(N_Days,censor_n)~alb_cat, data=data1)

splots<- list()
splots[[1]]<-ggsurvplot(fit_ef,data=data1 ,xlab="Days", ggtheme=theme_minimal())
splots[[2]]<-ggsurvplot(fit_sc,data=data1, xlab="Days", ggtheme=theme_minimal())
splots[[3]]<-ggsurvplot(fit_age,data=data1,xlab="Days", ggtheme=theme_minimal())
splots[[4]]<-ggsurvplot(fit_sd,data=data1,xlab="Days", ggtheme=theme_minimal())

arrange_ggsurvplots(splots, print=TRUE, ncol=2, nrow=2)

data1$riskgp <- ifelse(data1$bili_cat=="Not Normal" & data1$Edema=="Y" & data1$alb_cat=="Not Normal", "Risk High", "Risk Low")
fit<-survfit(Surv(N_Days,censor_n)~riskgp, data=data1)
km<-ggsurvplot(fit,data=data1, risk.table=TRUE, legend="none", break.time.by=200, size=0.1,tables.height=0.3, xlab="Days")
km

#log rank p value for groups
survdiff(Surv(N_Days, censor_n) ~ riskgp, data=data1)
## Call:
## survdiff(formula = Surv(N_Days, censor_n) ~ riskgp, data = data1)
## 
##                    N Observed Expected (O-E)^2/E (O-E)^2/V
## riskgp=Risk High  16       15     1.21    157.73       161
## riskgp=Risk Low  402      146   159.79      1.19       161
## 
##  Chisq= 161  on 1 degrees of freedom, p= <2e-16
coxph(Surv(N_Days, censor_n) ~ riskgp, data=data1) %>%
  gtsummary::tbl_regression(exp=TRUE)
Characteristic HR 95% CI p-value
riskgp


    Risk High
    Risk Low 0.06 0.03, 0.11 <0.001
Abbreviations: CI = Confidence Interval, HR = Hazard Ratio
data2 <- data1%>% dplyr::select(-ID, -Status, -Age, -censor, -Drug, -riskgp, -Bilirubin, -Albumin)

mv_fit1 <- coxph(Surv(N_Days,censor_n) ~ . , data=data2)
## Warning in coxph.fit(X, Y, istrat, offset, init, control, weights = weights, :
## Ran out of iterations and did not converge
## Warning in coxph.fit(X, Y, istrat, offset, init, control, weights = weights, :
## one or more coefficients may be infinite
ccox1<- cox.zph(mv_fit1)
print(ccox1)
##               chisq  df   p
## Sex             NaN   1 NaN
## Ascites         NaN   1 NaN
## Hepatomegaly    NaN   1 NaN
## Spiders         NaN   1 NaN
## Edema           NaN   2 NaN
## Cholesterol     NaN   1 NaN
## Copper          NaN   1 NaN
## Alk_Phos        NaN   1 NaN
## SGOT            NaN   1 NaN
## Tryglicerides   NaN   1 NaN
## Platelets       NaN   1 NaN
## Prothrombin     NaN   1 NaN
## Stage           NaN   1 NaN
## alb_cat         NaN   1 NaN
## bili_cat        NaN   1 NaN
## age_yr          NaN   1 NaN
## age_gp          NaN   1 NaN
## idc             NaN 417 NaN
## GLOBAL          NaN 435 NaN
#ccox1 <- coxph(Surv(N_Days, censor_n) ~ ., data = data2)

# Plot forest plot
#ggforest(ccox1, data = data2)