Load Data
data <- read.csv("C:/Users/tanf/OneDrive - Bausch & Lomb, Inc/Desktop/All/HU/cirrhosis_cleaned.csv", header = TRUE)
#age in years
data1 <- data %>% mutate(age_yr=Age/365.25,
age_gp= ifelse(Age/365.25 >65, "Age >=65", "Age<65"),
censor= ifelse(Status=="D", "Death", "Censor"),
censor_n= ifelse(Status=="D", 1, 0))
set.seed=21762
train.test.split<-sample(2, nrow(data1), replace=TRUE, prob=c(0.8,0.2))
train=data1[train.test.split==1,]
test=data1[train.test.split==2,]
head(train, 5)%>% DT::datatable()
#1. Age
c1<- ggplot(data1, aes(x=age_yr))+ geom_histogram(binwidth=5, colour="white", fill="darkseagreen2", alpha=0.8)+
geom_density(eval(bquote(aes(y=..count..*5))),colour="darkgreen", fill="darkgreen", alpha=0.3)+ scale_x_continuous(breaks=seq(40,100,10))+geom_vline(xintercept = 65, linetype="dashed")+ annotate("text", x=50, y=45, label="Age <65", size=2.5, color="dark green") + annotate("text", x=80, y=45, label="Age >= 65", size=2.5, color="dark red") +labs(title="Age Distribution") + theme_minimal(base_size = 8)
c1
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
#2. cpk
c2<- ggplot(data1, aes(x=Alk_Phos))+ geom_histogram(binwidth=100, colour="white", fill="mediumpurple2", alpha=0.8)+
geom_density(eval(bquote(aes(y=..count..*150))),colour="mediumorchid1", fill="mediumorchid1", alpha=0.3)+ scale_x_continuous(breaks=seq(0,10000,1000))+geom_vline(xintercept = 120, linetype="dashed")+ annotate("text", x=0, y=100, label="CPK Normal", size=2.5, color="dark green") + annotate("text", x=1000, y=80, label="CPK Abnormal", size=2.5, color="dark red")+labs(title="Creatinine Phosphokinase Distribution") + theme_minimal(base_size = 8)
c2
c4<- ggplot(data1, aes(x=Platelets))+ geom_histogram(binwidth=200, colour="white", fill="lightskyblue2", alpha=0.8)+
geom_density(eval(bquote(aes(y=..count..*25000))),colour="lightsteelblue", fill="lightsteelblue", alpha=0.3)+
geom_vline(xintercept = 150000, linetype="dashed")+geom_vline(xintercept = 450000, linetype="dashed")+ annotate("text", x=100000, y=30, label="Abnormal", size=2.5, color="dark red") + annotate("text", x=300000, y=30, label="Normal", color="dark green")+ annotate("text", x=500000, y=30, label="Abnormal", size=2.5, color="dark red")+labs(title="Platelets Count") + theme_minimal(base_size = 8)
c4
## `summarise()` has grouped output by 'N_Days'. You can override using the
## `.groups` argument.
d22 <- arrange(d2, desc(N_Days))
ggplot(d22, aes(x=reorder( N_Days, count), y=N_Days))+ geom_point(aes(size=count, colour=factor(count), shape=factor(censor_n)), alpha=1/2)+
theme_ipsum() +
theme(axis.text.x = element_text(angle = 90, hjust = 1) , legend.position="none") +coord_flip() +ylab("Survival days") +xlab("Survival counts")+ ggtitle("Patient survival time with counts")
data1$idc <- paste("id",as.factor(data1$ID))
lol1_100<-ggplot(data1[0:100,], aes(x=idc, y=N_Days)) +
geom_segment( aes(x=idc, xend=idc, y=0, yend=N_Days), color=ifelse(data1[0:100,]$censor_n==1, "orange", "skyblue"))+
geom_point( color=ifelse(data1[0:100,]$censor_n==1, "red", "darkgreen"), size=0.1, alpha=0.6) +
theme_light() +
coord_flip() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank()
)+ ylab("Survival days") +xlab("Patient ID 1-100") + ggtitle("Patient 1-100 Survival Status with Censor")
lol101<-ggplot(data1[101:200,], aes(x=idc, y=N_Days)) +
geom_segment( aes(x=idc, xend=idc, y=0, yend=N_Days), color=ifelse(data1[101:200,]$censor_n==1, "orange", "skyblue"))+
geom_point( color=ifelse(data1[101:200,]$censor_n==1, "red", "darkgreen"), size=0.1, alpha=0.6) +
theme_light() +
coord_flip() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank()
)+ ylab("Survival days") +xlab("Patient ID 101-200") + ggtitle("Patient 101-200 Survival Status with Censor")
lol201<-ggplot(data1[201:300,], aes(x=idc, y=N_Days)) +
geom_segment( aes(x=idc, xend=idc, y=0, yend=N_Days), color=ifelse(data1[201:300,]$censor_n==1, "orange", "skyblue"))+
geom_point( color=ifelse(data1[201:300,]$censor_n==1, "red", "darkgreen"), size=0.1, alpha=0.6) +
theme_light() +
coord_flip() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank()
)+ ylab("Survival days") +xlab("Patient ID 201-299") + ggtitle("Patient 201-299 Survival Status with Censor")
lol301<-ggplot(data1[301:400,], aes(x=idc, y=N_Days)) +
geom_segment( aes(x=idc, xend=idc, y=0, yend=N_Days), color=ifelse(data1[301:400,]$censor_n==1, "orange", "skyblue"))+
geom_point( color=ifelse(data1[301:400,]$censor_n==1, "red", "darkgreen"), size=0.1, alpha=0.6) +
theme_light() +
coord_flip() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank()
)+ ylab("Survival days") +xlab("Patient ID 301-399") + ggtitle("Patient 301-399 Survival Status with Censor")
lol401<-ggplot(data1[400:418,], aes(x=idc, y=N_Days)) +
geom_segment( aes(x=idc, xend=idc, y=0, yend=N_Days), color=ifelse(data1[400:418,]$censor_n==1, "orange", "skyblue"))+
geom_point( color=ifelse(data1[400:418,]$censor_n==1, "red", "darkgreen"), size=0.1, alpha=0.6) +
theme_light() +
coord_flip() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank()
)+ ylab("Survival days") +xlab("Patient ID 400-418") + ggtitle("Patient 400-418 Survival Status with Censor")
lol1_100
df <- data1 %>% dplyr::select(N_Days, age_yr, Bilirubin, Cholesterol, Albumin, Copper, Alk_Phos, SGOT, Tryglicerides,Platelets, Prothrombin,Stage)
r=cor(df, use = "pairwise.complete.obs")
corrplot(r, type = "upper", order = "hclust",
tl.col = "black", tl.srt = 90)
draw_confusion_matrix <- function(cm) {
total <- sum(cm$table)
res <- as.numeric(cm$table)
# Generate color gradients. Palettes come from RColorBrewer.
greenPalette <- c("#F7FCF5","#E5F5E0","#C7E9C0","#A1D99B","#74C476","#41AB5D","#238B45","#006D2C","#00441B")
redPalette <- c("#FFF5F0","#FEE0D2","#FCBBA1","#FC9272","#FB6A4A","#EF3B2C","#CB181D","#A50F15","#67000D")
getColor <- function (greenOrRed = "green", amount = 0) {
if (amount == 0)
return("#FFFFFF")
palette <- greenPalette
if (greenOrRed == "red")
palette <- redPalette
colorRampPalette(palette)(100)[10 + ceiling(90 * amount / total)]
}
# set the basic layout
layout(matrix(c(1,1,2)))
par(mar=c(2,2,2,2))
plot(c(100, 345), c(300, 450), type = "n", xlab="", ylab="", xaxt='n', yaxt='n')
title('CONFUSION MATRIX', cex.main=2)
# create the matrix
classes = colnames(cm$table)
rect(150, 430, 240, 370, col=getColor("green", res[1]))
text(195, 435, classes[1], cex=1.2)
rect(250, 430, 340, 370, col=getColor("red", res[3]))
text(295, 435, classes[2], cex=1.2)
text(125, 370, 'Predicted', cex=1.3, srt=90, font=2)
text(245, 450, 'Actual', cex=1.3, font=2)
rect(150, 305, 240, 365, col=getColor("red", res[2]))
rect(250, 305, 340, 365, col=getColor("green", res[4]))
text(140, 400, classes[1], cex=1.2, srt=90)
text(140, 335, classes[2], cex=1.2, srt=90)
# add in the cm results
text(195, 400, res[1], cex=1.6, font=2, col='white')
text(195, 335, res[2], cex=1.6, font=2, col='white')
text(295, 400, res[3], cex=1.6, font=2, col='white')
text(295, 335, res[4], cex=1.6, font=2, col='white')
# add in the specifics
plot(c(100, 0), c(100, 0), type = "n", xlab="", ylab="", main = "DETAILS", xaxt='n', yaxt='n')
text(10, 85, names(cm$byClass[1]), cex=1.2, font=2)
text(10, 70, round(as.numeric(cm$byClass[1]), 3), cex=1.2)
text(30, 85, names(cm$byClass[2]), cex=1.2, font=2)
text(30, 70, round(as.numeric(cm$byClass[2]), 3), cex=1.2)
text(50, 85, names(cm$byClass[5]), cex=1.2, font=2)
text(50, 70, round(as.numeric(cm$byClass[5]), 3), cex=1.2)
text(70, 85, names(cm$byClass[6]), cex=1.2, font=2)
text(70, 70, round(as.numeric(cm$byClass[6]), 3), cex=1.2)
text(90, 85, names(cm$byClass[7]), cex=1.2, font=2)
text(90, 70, round(as.numeric(cm$byClass[7]), 3), cex=1.2)
# add in the accuracy information
text(30, 35, names(cm$overall[1]), cex=1.5, font=2)
text(30, 20, round(as.numeric(cm$overall[1]), 3), cex=1.4)
text(70, 35, names(cm$overall[2]), cex=1.5, font=2)
text(70, 20, round(as.numeric(cm$overall[2]), 3), cex=1.4)
}
train1<- train %>% dplyr::select(-ID, -Status, -Age, -censor, -Drug)
train1 <- train1 %>% mutate_if(is.character, as.factor)
gbm.m<- gbm(train1$censor_n ~. , data=train1, distribution = "bernoulli", cv.folds=10, shrinkage=0.01, n.minobsinnode = 10, n.trees=1000)
#gbm.m
gbm.imp=summary(gbm.m)
## var rel.inf
## N_Days N_Days 26.74148360
## Bilirubin Bilirubin 19.96773291
## Prothrombin Prothrombin 14.61978856
## Alk_Phos Alk_Phos 9.63902307
## age_yr age_yr 8.09459637
## SGOT SGOT 5.84196658
## Copper Copper 5.24593707
## Cholesterol Cholesterol 3.33431756
## Platelets Platelets 2.24744817
## Albumin Albumin 1.77207800
## Tryglicerides Tryglicerides 1.48232362
## Stage Stage 0.39932660
## bili_cat bili_cat 0.26573865
## Sex Sex 0.16779091
## Hepatomegaly Hepatomegaly 0.12846636
## Edema Edema 0.05198196
## Ascites Ascites 0.00000000
## Spiders Spiders 0.00000000
## alb_cat alb_cat 0.00000000
## age_gp age_gp 0.00000000
## Warning in predict.gbm(object = gbm.m, newdata = test, n.trees = 1000, type =
## "response"): NAs introduced by coercion
presult<- as.factor(ifelse(gmb.t>0.5,1,0))
test$censor_n1<-as.factor(test$censor_n)
g<-confusionMatrix(presult,test$censor_n1)
draw_confusion_matrix(g)
# Make sure your survival time and censoring are correctly set up
gbm_model <- gbm(Surv(N_Days, censor_n) ~ .,
data = train1,
distribution = "coxph",
n.trees = 1000,
interaction.depth = 3,
shrinkage = 0.01,
cv.folds = 5)
# Best number of trees
best_iter <- gbm.perf(gbm_model, method = "cv")
## var rel.inf
## Bilirubin Bilirubin 38.1247055
## Prothrombin Prothrombin 14.1575984
## age_yr age_yr 11.3009157
## Albumin Albumin 7.2510181
## Copper Copper 6.0600975
## Edema Edema 5.5357270
## Cholesterol Cholesterol 3.7664012
## Stage Stage 3.6339048
## Platelets Platelets 3.0608936
## SGOT SGOT 2.3461227
## Tryglicerides Tryglicerides 2.0122501
## Alk_Phos Alk_Phos 0.8206455
## Ascites Ascites 0.5627233
## bili_cat bili_cat 0.4365556
## Spiders Spiders 0.3900100
## age_gp age_gp 0.2064574
## Sex Sex 0.1767241
## Hepatomegaly Hepatomegaly 0.1572494
## alb_cat alb_cat 0.0000000
#train1<- train %>% dplyr::select(-ID, -Status, -Age, -censor, -idc)
train1<- train %>% dplyr::select(-ID, -Status, -Age, -censor)
#Missing value
train1 <- na.omit(train1)
train1 <- train1 %>% mutate_if(is.character, as.factor)
rforest<- randomForest(factor(censor_n) ~. , data=train1, ntree=500, importance=TRUE)
#summary(rforest)
imp<-varImp(rforest)
varImpPlot(rforest)
#test1<- test %>% dplyr::select(-ID, -Status, -Age, -censor, -idc)
test1<- test %>% dplyr::select(-ID, -Status, -Age, -censor)
test1 <- test1 %>% mutate_if(is.character, as.factor)
rpredict<- predict(rforest, test1, type="class")
cm2<-confusionMatrix(rpredict, test$censor_n1)
draw_confusion_matrix(cm2)
factor(censor_n) ~.
rsf_model <- rfsrc(Surv(N_Days, censor_n) ~ ., data = train1,
ntree = 1000, importance = TRUE)
# View variable importance
print(rsf_model$importance)
## Drug Sex Ascites Hepatomegaly Spiders
## -0.0002808270 0.0006604285 0.0025945078 0.0023082595 0.0007012017
## Edema Bilirubin Cholesterol Albumin Copper
## 0.1609040603 0.2166894274 0.0284716792 0.0345822811 0.0335416571
## Alk_Phos SGOT Tryglicerides Platelets Prothrombin
## 0.0047691903 0.0101447994 0.0054850774 0.0098787044 0.0527124552
## Stage alb_cat bili_cat age_yr age_gp
## 0.0152463953 -0.0006260315 0.0379777196 0.0213926134 0.0149223300
lm1 <- glm(censor_n ~., data=train1, family=binomial(link="logit"))
#summary(lm1)
limp<-varImp(lm1)
backward<-step(lm1,direction="backward", trace=0)
#vi(backward)
p2<- vip(backward,num_features = length(coef(backward)),
geom="point", horizontal = TRUE, mapping = aes_string(color="Sign"))
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
glm.t =predict(object=lm1, newdata=test, type="response")
presult<- as.factor(ifelse(glm.t>0.5,1,0))
test1$censor_n1<-as.factor(test1$censor_n)
cm3<- confusionMatrix(presult,test1$censor_n1)
draw_confusion_matrix(cm3)
mv_fit <- coxph(Surv(N_Days,censor_n) ~ bili_cat + age_yr + Prothrombin +Alk_Phos + Stage +alb_cat, data=data1)
ccox<- cox.zph(mv_fit)
print(ccox)
## chisq df p
## bili_cat 3.9103 1 0.04799
## age_yr 0.0622 1 0.80299
## Prothrombin 16.5500 1 4.7e-05
## Alk_Phos 0.0233 1 0.87862
## Stage 6.6747 1 0.00978
## alb_cat 12.6055 1 0.00038
## GLOBAL 30.0252 6 3.9e-05
## Warning in .get_data(model, data = data): The `data` argument is not provided.
## Data will be extracted from model fit.
## Warning in .get_data(model, data = data): The `data` argument is not provided.
## Data will be extracted from model fit.
fit_ef<-survfit(Surv(N_Days,censor_n)~Edema, data=data1)
fit_sc<-survfit(Surv(N_Days,censor_n)~ bili_cat, data=data1)
fit_age<-survfit(Surv(N_Days,censor_n)~Stage, data=data1)
fit_sd<-survfit(Surv(N_Days,censor_n)~alb_cat, data=data1)
splots<- list()
splots[[1]]<-ggsurvplot(fit_ef,data=data1 ,xlab="Days", ggtheme=theme_minimal())
splots[[2]]<-ggsurvplot(fit_sc,data=data1, xlab="Days", ggtheme=theme_minimal())
splots[[3]]<-ggsurvplot(fit_age,data=data1,xlab="Days", ggtheme=theme_minimal())
splots[[4]]<-ggsurvplot(fit_sd,data=data1,xlab="Days", ggtheme=theme_minimal())
arrange_ggsurvplots(splots, print=TRUE, ncol=2, nrow=2)
data1$riskgp <- ifelse(data1$bili_cat=="Not Normal" & data1$Edema=="Y" & data1$alb_cat=="Not Normal", "Risk High", "Risk Low")
fit<-survfit(Surv(N_Days,censor_n)~riskgp, data=data1)
km<-ggsurvplot(fit,data=data1, risk.table=TRUE, legend="none", break.time.by=200, size=0.1,tables.height=0.3, xlab="Days")
km
## Call:
## survdiff(formula = Surv(N_Days, censor_n) ~ riskgp, data = data1)
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## riskgp=Risk High 16 15 1.21 157.73 161
## riskgp=Risk Low 402 146 159.79 1.19 161
##
## Chisq= 161 on 1 degrees of freedom, p= <2e-16
Characteristic | HR | 95% CI | p-value |
---|---|---|---|
riskgp | |||
Risk High | — | — | |
Risk Low | 0.06 | 0.03, 0.11 | <0.001 |
Abbreviations: CI = Confidence Interval, HR = Hazard Ratio |
data2 <- data1%>% dplyr::select(-ID, -Status, -Age, -censor, -Drug, -riskgp, -Bilirubin, -Albumin)
mv_fit1 <- coxph(Surv(N_Days,censor_n) ~ . , data=data2)
## Warning in coxph.fit(X, Y, istrat, offset, init, control, weights = weights, :
## Ran out of iterations and did not converge
## Warning in coxph.fit(X, Y, istrat, offset, init, control, weights = weights, :
## one or more coefficients may be infinite
## chisq df p
## Sex NaN 1 NaN
## Ascites NaN 1 NaN
## Hepatomegaly NaN 1 NaN
## Spiders NaN 1 NaN
## Edema NaN 2 NaN
## Cholesterol NaN 1 NaN
## Copper NaN 1 NaN
## Alk_Phos NaN 1 NaN
## SGOT NaN 1 NaN
## Tryglicerides NaN 1 NaN
## Platelets NaN 1 NaN
## Prothrombin NaN 1 NaN
## Stage NaN 1 NaN
## alb_cat NaN 1 NaN
## bili_cat NaN 1 NaN
## age_yr NaN 1 NaN
## age_gp NaN 1 NaN
## idc NaN 417 NaN
## GLOBAL NaN 435 NaN