Step 1. Import data and clean Data
Delete observations with missing values on the outcome variables, DX_LASTCONTACT_DEATH_MONTHS & PUF_VITAL_STATUS
Delete observations with value 0 in TUMORS_SIZE_NEW_CAT (only 9 cases) due to difficulty in balancing PS
Select OC patients as a subset
Select OC patients with primary_site_recode = “other and unspecified parts of tongue”
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.5.3
library(survival)
## Warning: package 'survival' was built under R version 3.5.3
library(survminer)
## Warning: package 'survminer' was built under R version 3.5.3
## Loading required package: ggplot2
## Loading required package: ggpubr
## Loading required package: magrittr
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:tidyr':
##
## extract
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(timereg)
## Warning: package 'timereg' was built under R version 3.5.3
library(twang)
## Warning: package 'twang' was built under R version 3.5.3
## Loading required package: gbm
## Warning: package 'gbm' was built under R version 3.5.3
## Loaded gbm 2.1.5
## Loading required package: survey
## Loading required package: grid
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following object is masked from 'package:tidyr':
##
## expand
##
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
##
## dotchart
## Loading required package: xtable
## Loading required package: lattice
## Loading required package: latticeExtra
## Warning: package 'latticeExtra' was built under R version 3.5.3
## Loading required package: RColorBrewer
##
## Attaching package: 'latticeExtra'
## The following object is masked from 'package:ggplot2':
##
## layer
library(survey)
library(readxl)
cancer <- read_excel("PREPSM_AllSites_2004Onwards_NEW2019.xlsx")
attach(cancer)
cancer_s=data.frame(cbind(age_youngold, AGE, SEX,RACE_RECODE,INSURANCE_STATUS,CDCC_TOTAL,MED_INC_QUAR_12, UR_new,
Primary_site_3grp,TNM_T_new,TNM_N_new,TUMORS_SIZE_NEW_CAT, HPVStatus, TreatmentGrp,
Primary_site_recode,FACILITY_TYPE_CD,
DX_LASTCONTACT_DEATH_MONTHS,PUF_VITAL_STATUS
))
#summary(cancer_s)
detach(cancer)
# DX_LASTCONTACT_DEATH_MONTHS has 11714 missing values
# PUF_VITAL_STATUS has 11707 missing values, alive = 1, dead = 0
# Remove observations with missing values in vital status and survival month
cancer_s1=cancer_s %>% drop_na(DX_LASTCONTACT_DEATH_MONTHS)
cancer_s1=cancer_s1[cancer_s1$TUMORS_SIZE_NEW_CAT!=0,]
# Make variables factor
cancer_s1$dead[cancer_s1$PUF_VITAL_STATUS==1]=0
cancer_s1$dead[cancer_s1$PUF_VITAL_STATUS==0]=1
cancer_s1$age_youngold=factor(cancer_s1$age_youngold)
cancer_s1$SEX=factor(cancer_s1$SEX)
cancer_s1$RACE_RECODE=factor(cancer_s1$RACE_RECODE)
cancer_s1$INSURANCE_STATUS=factor(cancer_s1$INSURANCE_STATUS)
cancer_s1$CDCC_TOTAL=factor(cancer_s1$CDCC_TOTAL)
cancer_s1$MED_INC_QUAR_12=factor(cancer_s1$MED_INC_QUAR_12)
cancer_s1$UR_new=factor(cancer_s1$UR_new)
cancer_s1$Primary_site_3grp=factor(cancer_s1$Primary_site_3grp)
cancer_s1$TNM_T_new=factor(cancer_s1$TNM_T_new)
cancer_s1$TNM_N_new=factor(cancer_s1$TNM_N_new)
cancer_s1$TUMORS_SIZE_NEW_CAT=factor(cancer_s1$TUMORS_SIZE_NEW_CAT)
cancer_s1$HPVStatus=factor(cancer_s1$HPVStatus)
cancer_s1$TreatmentGrp=factor(cancer_s1$TreatmentGrp)
cancer_s1$Primary_site_recode=factor(cancer_s1$Primary_site_recode)
cancer_s1$FACILITY_TYPE_CD=factor(cancer_s1$FACILITY_TYPE_CD)
# Select OC subset
cancer_oc=cancer_s1[cancer_s1$Primary_site_3grp==1, ]
cancer_oc$Primary_site_recode=as.numeric(cancer_oc$Primary_site_recode)
cancer_oc$Primary_site_recode=as.factor(cancer_oc$Primary_site_recode)
table(cancer_oc$Primary_site_recode)
##
## 1 3 4 5 6 7
## 2573 6897 2892 4353 2156 3946
# 1 = lip, 3= other and unspecified parts of tongue, 4 = gum, 5 = floor of mouth
# 6 = palate, 7 = other and unspecified parts of mouth
# Select OC_tongue subset
cancer_oc_t=cancer_oc[cancer_oc$Primary_site_recode==3,]
KM curves before propensity score weighting
# K-M curve for OC
surv_object_oc=Surv(time=cancer_oc$DX_LASTCONTACT_DEATH_MONTHS,event=cancer_oc$dead)
fit_oc=survfit(surv_object_oc~age_youngold,data=cancer_oc)
ggsurvplot(fit_oc,data=cancer_oc,
legend="bottom",legend.title="Age",legend.labs=c("0-39","40-49","50-59","60-69","70+"),
risk.table = TRUE, tables.theme = theme_cleantable())

# K-M curve for OC_tongue
surv_object_oc_t=Surv(time=cancer_oc_t$DX_LASTCONTACT_DEATH_MONTHS,event=cancer_oc_t$dead)
fit_oc_t=survfit(surv_object_oc_t~age_youngold,data=cancer_oc_t)
ggsurvplot(fit_oc_t,data=cancer_oc_t,
legend="bottom",legend.title="Age",legend.labs=c("0-39","40-49","50-59","60-69","70+"),
risk.table = TRUE, tables.theme = theme_cleantable())

Propensity score weighting and balanace checking (separate for OC and OC_tongue)
set.seed(1)
# OC
mnps.oc <- mnps(age_youngold ~ SEX+RACE_RECODE+CDCC_TOTAL+MED_INC_QUAR_12+UR_new
+TNM_T_new+TNM_N_new+TUMORS_SIZE_NEW_CAT+TreatmentGrp+Primary_site_recode,
data = cancer_oc,
estimand = "ATE",
verbose = FALSE,
stop.method = c("es.mean"),
n.trees = 5000)
table_oc=bal.table(mnps.oc,digits=2)
write.csv(table_oc, file = "table_oc.csv",row.names=T, na="")
table_oc_cov=bal.table(mnps.oc,digits = 2,collapse.to = "covariate")
write.csv(table_oc_cov, file = "table_oc_cov.csv",row.names=T, na="")
plot(mnps.oc,plots=3)

plot(mnps.oc,plots=4)

# OC_tongue
mnps.oc.t <- mnps(age_youngold ~ SEX+RACE_RECODE+CDCC_TOTAL+MED_INC_QUAR_12+UR_new
+TNM_T_new+TNM_N_new+TUMORS_SIZE_NEW_CAT+TreatmentGrp,
data = cancer_oc_t,
estimand = "ATE",
verbose = FALSE,
stop.method = c("es.mean"),
n.trees = 5000)
table_oc_t=bal.table(mnps.oc.t,digits=3)
write.csv(table_oc_t, file = "table_oc_t.csv",row.names=T, na="")
table_oc_cov_t=bal.table(mnps.oc.t,digits = 3,collapse.to = "covariate")
write.csv(table_oc_cov_t, file = "table_oc_cov_t.csv",row.names=T, na="")
plot(mnps.oc.t,plots=3)

plot(mnps.oc.t,plots=4)

# OC
cancer_oc$weight=get.weights(mnps.oc,stop.method="es.mean")
surv_oc=Surv(cancer_oc$DX_LASTCONTACT_DEATH_MONTHS,event=cancer_oc$PUF_VITAL_STATUS)
# Post- PSW KM curve for OC
fit_oc_ps=survfit(surv_oc~age_youngold,data=cancer_oc)
ggsurvplot(fit_oc_ps,data=cancer_oc,pval=TRUE,
legend="bottom",legend.title="Age",legend.labs=c("0-39","40-49","50-59","60-69","70+"),
risk.table = TRUE, tables.theme = theme_cleantable())

# Post-PSW Cox regression for OC
fit_cox_oc=coxph(Surv(DX_LASTCONTACT_DEATH_MONTHS,PUF_VITAL_STATUS==1)~age_youngold+TreatmentGrp,
data=cancer_oc,weights=cancer_oc$weight)
summary(fit_cox_oc)
## Call:
## coxph(formula = Surv(DX_LASTCONTACT_DEATH_MONTHS, PUF_VITAL_STATUS ==
## 1) ~ age_youngold + TreatmentGrp, data = cancer_oc, weights = cancer_oc$weight)
##
## n= 22817, number of events= 15451
##
## coef exp(coef) se(coef) z Pr(>|z|)
## age_youngold1 -0.09402 0.91027 0.01114 -8.439 < 2e-16 ***
## age_youngold2 -0.09251 0.91164 0.01119 -8.264 < 2e-16 ***
## age_youngold3 -0.11602 0.89046 0.01132 -10.248 < 2e-16 ***
## age_youngold4 -0.16106 0.85125 0.01200 -13.420 < 2e-16 ***
## TreatmentGrp2 -0.14276 0.86696 0.02612 -5.465 4.63e-08 ***
## TreatmentGrp3 -0.11202 0.89403 0.02338 -4.792 1.65e-06 ***
## TreatmentGrp4 -0.24331 0.78403 0.02380 -10.225 < 2e-16 ***
## TreatmentGrp5 0.03935 1.04014 0.02190 1.797 0.0723 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## age_youngold1 0.9103 1.0986 0.8906 0.9304
## age_youngold2 0.9116 1.0969 0.8919 0.9319
## age_youngold3 0.8905 1.1230 0.8709 0.9104
## age_youngold4 0.8512 1.1747 0.8315 0.8715
## TreatmentGrp2 0.8670 1.1535 0.8237 0.9125
## TreatmentGrp3 0.8940 1.1185 0.8540 0.9359
## TreatmentGrp4 0.7840 1.2755 0.7483 0.8215
## TreatmentGrp5 1.0401 0.9614 0.9964 1.0858
##
## Concordance= 0.547 (se = 0.006 )
## Likelihood ratio test= 974.2 on 8 df, p=<2e-16
## Wald test = 950.6 on 8 df, p=<2e-16
## Score (logrank) test = 953.9 on 8 df, p=<2e-16
# Check proportional hazards assumption
temp_oc1=cox.zph(fit_cox_oc,global=T)
print(temp_oc1)
## rho chisq p
## age_youngold1 -0.00975 0.0980 0.754
## age_youngold2 0.01383 0.3491 0.555
## age_youngold3 0.00590 0.0682 0.794
## age_youngold4 0.00240 0.0149 0.903
## TreatmentGrp2 0.02107 1.5300 0.216
## TreatmentGrp3 0.01945 1.2683 0.260
## TreatmentGrp4 0.02074 1.4541 0.228
## TreatmentGrp5 0.01279 0.5566 0.456
## GLOBAL NA 4.2572 0.833
# Assumption is satisfied
# OC_tongue
cancer_oc_t$weight=get.weights(mnps.oc.t,stop.method="es.mean")
surv_oc_t=Surv(cancer_oc_t$DX_LASTCONTACT_DEATH_MONTHS,event=cancer_oc_t$PUF_VITAL_STATUS)
# Post- PSW KM curve for OC
fit_oc_t_ps=survfit(surv_oc_t~age_youngold,data=cancer_oc_t)
ggsurvplot(fit_oc_t_ps,data=cancer_oc_t,
legend="bottom",legend.title="Age",legend.labs=c("0-39","40-49","50-59","60-69","70+"),
risk.table = TRUE, tables.theme = theme_cleantable())

# Post-PSW Cox regression for OC
fit_cox_oc_t=coxph(Surv(DX_LASTCONTACT_DEATH_MONTHS,PUF_VITAL_STATUS==1)~age_youngold+TreatmentGrp,
data=cancer_oc_t,weights=cancer_oc_t$weight)
summary(fit_cox_oc_t)
## Call:
## coxph(formula = Surv(DX_LASTCONTACT_DEATH_MONTHS, PUF_VITAL_STATUS ==
## 1) ~ age_youngold + TreatmentGrp, data = cancer_oc_t, weights = cancer_oc_t$weight)
##
## n= 6897, number of events= 4987
##
## coef exp(coef) se(coef) z Pr(>|z|)
## age_youngold1 0.018404 1.018574 0.019740 0.932 0.351193
## age_youngold2 0.010008 1.010059 0.019759 0.507 0.612498
## age_youngold3 0.004214 1.004223 0.019981 0.211 0.832957
## age_youngold4 -0.071590 0.930912 0.021205 -3.376 0.000735 ***
## TreatmentGrp2 0.283812 1.328184 0.091945 3.087 0.002023 **
## TreatmentGrp3 0.263116 1.300977 0.088343 2.978 0.002898 **
## TreatmentGrp4 0.205729 1.228420 0.088500 2.325 0.020092 *
## TreatmentGrp5 0.340706 1.405940 0.086892 3.921 8.82e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## age_youngold1 1.0186 0.9818 0.9799 1.0588
## age_youngold2 1.0101 0.9900 0.9717 1.0499
## age_youngold3 1.0042 0.9958 0.9657 1.0443
## age_youngold4 0.9309 1.0742 0.8930 0.9704
## TreatmentGrp2 1.3282 0.7529 1.1092 1.5905
## TreatmentGrp3 1.3010 0.7687 1.0941 1.5469
## TreatmentGrp4 1.2284 0.8141 1.0328 1.4611
## TreatmentGrp5 1.4059 0.7113 1.1858 1.6670
##
## Concordance= 0.52 (se = 0.005 )
## Likelihood ratio test= 91.1 on 8 df, p=3e-16
## Wald test = 88.49 on 8 df, p=9e-16
## Score (logrank) test = 88.67 on 8 df, p=9e-16
# Check proportional hazards assumption
temp_oc2=cox.zph(fit_cox_oc_t,global=T)
print(temp_oc2)
## rho chisq p
## age_youngold1 0.01971 0.2015 0.654
## age_youngold2 0.04408 1.4633 0.226
## age_youngold3 0.02164 0.3664 0.545
## age_youngold4 0.00965 0.0793 0.778
## TreatmentGrp2 0.01584 0.3563 0.551
## TreatmentGrp3 0.01427 0.2928 0.588
## TreatmentGrp4 0.01515 0.3301 0.566
## TreatmentGrp5 0.00823 0.0983 0.754
## GLOBAL NA 4.5288 0.807