Step 1. Import data and clean Data

Delete observations with missing values on the outcome variables, DX_LASTCONTACT_DEATH_MONTHS & PUF_VITAL_STATUS

Delete observations with value 0 in TUMORS_SIZE_NEW_CAT (only 9 cases) due to difficulty in balancing PS

Select OC patients as a subset

Select OC patients with primary_site_recode = “other and unspecified parts of tongue”

library(tidyr)
## Warning: package 'tidyr' was built under R version 3.5.3
library(survival)
## Warning: package 'survival' was built under R version 3.5.3
library(survminer)
## Warning: package 'survminer' was built under R version 3.5.3
## Loading required package: ggplot2
## Loading required package: ggpubr
## Loading required package: magrittr
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:tidyr':
## 
##     extract
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(timereg)
## Warning: package 'timereg' was built under R version 3.5.3
library(twang)
## Warning: package 'twang' was built under R version 3.5.3
## Loading required package: gbm
## Warning: package 'gbm' was built under R version 3.5.3
## Loaded gbm 2.1.5
## Loading required package: survey
## Loading required package: grid
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following object is masked from 'package:tidyr':
## 
##     expand
## 
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
## 
##     dotchart
## Loading required package: xtable
## Loading required package: lattice
## Loading required package: latticeExtra
## Warning: package 'latticeExtra' was built under R version 3.5.3
## Loading required package: RColorBrewer
## 
## Attaching package: 'latticeExtra'
## The following object is masked from 'package:ggplot2':
## 
##     layer
library(survey)
library(readxl)

cancer <- read_excel("PREPSM_AllSites_2004Onwards_NEW2019.xlsx")

attach(cancer)

cancer_s=data.frame(cbind(age_youngold, AGE, SEX,RACE_RECODE,INSURANCE_STATUS,CDCC_TOTAL,MED_INC_QUAR_12, UR_new,
                          Primary_site_3grp,TNM_T_new,TNM_N_new,TUMORS_SIZE_NEW_CAT, HPVStatus, TreatmentGrp,
                          Primary_site_recode,FACILITY_TYPE_CD,
                          DX_LASTCONTACT_DEATH_MONTHS,PUF_VITAL_STATUS
                          ))

#summary(cancer_s)

detach(cancer)


# DX_LASTCONTACT_DEATH_MONTHS has 11714 missing values 
# PUF_VITAL_STATUS has 11707 missing values, alive = 1, dead = 0
# Remove observations with missing values in vital status and survival month

cancer_s1=cancer_s %>% drop_na(DX_LASTCONTACT_DEATH_MONTHS)
cancer_s1=cancer_s1[cancer_s1$TUMORS_SIZE_NEW_CAT!=0,]
# Make variables factor 

cancer_s1$dead[cancer_s1$PUF_VITAL_STATUS==1]=0
cancer_s1$dead[cancer_s1$PUF_VITAL_STATUS==0]=1
cancer_s1$age_youngold=factor(cancer_s1$age_youngold)
cancer_s1$SEX=factor(cancer_s1$SEX)
cancer_s1$RACE_RECODE=factor(cancer_s1$RACE_RECODE)
cancer_s1$INSURANCE_STATUS=factor(cancer_s1$INSURANCE_STATUS)
cancer_s1$CDCC_TOTAL=factor(cancer_s1$CDCC_TOTAL)
cancer_s1$MED_INC_QUAR_12=factor(cancer_s1$MED_INC_QUAR_12)
cancer_s1$UR_new=factor(cancer_s1$UR_new)
cancer_s1$Primary_site_3grp=factor(cancer_s1$Primary_site_3grp)
cancer_s1$TNM_T_new=factor(cancer_s1$TNM_T_new)
cancer_s1$TNM_N_new=factor(cancer_s1$TNM_N_new)
cancer_s1$TUMORS_SIZE_NEW_CAT=factor(cancer_s1$TUMORS_SIZE_NEW_CAT)
cancer_s1$HPVStatus=factor(cancer_s1$HPVStatus)
cancer_s1$TreatmentGrp=factor(cancer_s1$TreatmentGrp)
cancer_s1$Primary_site_recode=factor(cancer_s1$Primary_site_recode)
cancer_s1$FACILITY_TYPE_CD=factor(cancer_s1$FACILITY_TYPE_CD)

# Select OC subset 

cancer_oc=cancer_s1[cancer_s1$Primary_site_3grp==1, ]
cancer_oc$Primary_site_recode=as.numeric(cancer_oc$Primary_site_recode)
cancer_oc$Primary_site_recode=as.factor(cancer_oc$Primary_site_recode)
table(cancer_oc$Primary_site_recode)
## 
##    1    3    4    5    6    7 
## 2573 6897 2892 4353 2156 3946
# 1 = lip, 3= other and unspecified parts of tongue, 4 = gum, 5 = floor of mouth
# 6 = palate, 7 = other and unspecified parts of mouth 

# Select OC_tongue subset 
cancer_oc_t=cancer_oc[cancer_oc$Primary_site_recode==3,]

KM curves before propensity score weighting

# K-M curve for OC
surv_object_oc=Surv(time=cancer_oc$DX_LASTCONTACT_DEATH_MONTHS,event=cancer_oc$dead)
fit_oc=survfit(surv_object_oc~age_youngold,data=cancer_oc)
ggsurvplot(fit_oc,data=cancer_oc,
           legend="bottom",legend.title="Age",legend.labs=c("0-39","40-49","50-59","60-69","70+"),
           risk.table = TRUE, tables.theme = theme_cleantable())

# K-M curve for OC_tongue
surv_object_oc_t=Surv(time=cancer_oc_t$DX_LASTCONTACT_DEATH_MONTHS,event=cancer_oc_t$dead)
fit_oc_t=survfit(surv_object_oc_t~age_youngold,data=cancer_oc_t)
ggsurvplot(fit_oc_t,data=cancer_oc_t,
           legend="bottom",legend.title="Age",legend.labs=c("0-39","40-49","50-59","60-69","70+"),
           risk.table = TRUE, tables.theme = theme_cleantable())

Propensity score weighting and balanace checking (separate for OC and OC_tongue)

set.seed(1)

# OC 
mnps.oc <- mnps(age_youngold ~ SEX+RACE_RECODE+CDCC_TOTAL+MED_INC_QUAR_12+UR_new
                +TNM_T_new+TNM_N_new+TUMORS_SIZE_NEW_CAT+TreatmentGrp+Primary_site_recode,
                 data = cancer_oc,
                 estimand = "ATE",
                 verbose = FALSE,
                 stop.method = c("es.mean"),
                 n.trees = 5000)
table_oc=bal.table(mnps.oc,digits=2)
write.csv(table_oc, file = "table_oc.csv",row.names=T, na="")

table_oc_cov=bal.table(mnps.oc,digits = 2,collapse.to = "covariate")
write.csv(table_oc_cov, file = "table_oc_cov.csv",row.names=T, na="")

plot(mnps.oc,plots=3)

plot(mnps.oc,plots=4)

# OC_tongue
mnps.oc.t <- mnps(age_youngold ~ SEX+RACE_RECODE+CDCC_TOTAL+MED_INC_QUAR_12+UR_new
                +TNM_T_new+TNM_N_new+TUMORS_SIZE_NEW_CAT+TreatmentGrp,
                 data = cancer_oc_t,
                 estimand = "ATE",
                 verbose = FALSE,
                 stop.method = c("es.mean"),
                 n.trees = 5000)

table_oc_t=bal.table(mnps.oc.t,digits=3)
write.csv(table_oc_t, file = "table_oc_t.csv",row.names=T, na="")

table_oc_cov_t=bal.table(mnps.oc.t,digits = 3,collapse.to = "covariate")
write.csv(table_oc_cov_t, file = "table_oc_cov_t.csv",row.names=T, na="")
plot(mnps.oc.t,plots=3)

plot(mnps.oc.t,plots=4)

# OC

cancer_oc$weight=get.weights(mnps.oc,stop.method="es.mean")
surv_oc=Surv(cancer_oc$DX_LASTCONTACT_DEATH_MONTHS,event=cancer_oc$PUF_VITAL_STATUS)

# Post- PSW KM curve for OC 

fit_oc_ps=survfit(surv_oc~age_youngold,data=cancer_oc)
ggsurvplot(fit_oc_ps,data=cancer_oc,pval=TRUE,
           legend="bottom",legend.title="Age",legend.labs=c("0-39","40-49","50-59","60-69","70+"),
           risk.table = TRUE, tables.theme = theme_cleantable())

# Post-PSW Cox regression for OC

fit_cox_oc=coxph(Surv(DX_LASTCONTACT_DEATH_MONTHS,PUF_VITAL_STATUS==1)~age_youngold+TreatmentGrp,
              data=cancer_oc,weights=cancer_oc$weight)
summary(fit_cox_oc)
## Call:
## coxph(formula = Surv(DX_LASTCONTACT_DEATH_MONTHS, PUF_VITAL_STATUS == 
##     1) ~ age_youngold + TreatmentGrp, data = cancer_oc, weights = cancer_oc$weight)
## 
##   n= 22817, number of events= 15451 
## 
##                   coef exp(coef) se(coef)       z Pr(>|z|)    
## age_youngold1 -0.09402   0.91027  0.01114  -8.439  < 2e-16 ***
## age_youngold2 -0.09251   0.91164  0.01119  -8.264  < 2e-16 ***
## age_youngold3 -0.11602   0.89046  0.01132 -10.248  < 2e-16 ***
## age_youngold4 -0.16106   0.85125  0.01200 -13.420  < 2e-16 ***
## TreatmentGrp2 -0.14276   0.86696  0.02612  -5.465 4.63e-08 ***
## TreatmentGrp3 -0.11202   0.89403  0.02338  -4.792 1.65e-06 ***
## TreatmentGrp4 -0.24331   0.78403  0.02380 -10.225  < 2e-16 ***
## TreatmentGrp5  0.03935   1.04014  0.02190   1.797   0.0723 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##               exp(coef) exp(-coef) lower .95 upper .95
## age_youngold1    0.9103     1.0986    0.8906    0.9304
## age_youngold2    0.9116     1.0969    0.8919    0.9319
## age_youngold3    0.8905     1.1230    0.8709    0.9104
## age_youngold4    0.8512     1.1747    0.8315    0.8715
## TreatmentGrp2    0.8670     1.1535    0.8237    0.9125
## TreatmentGrp3    0.8940     1.1185    0.8540    0.9359
## TreatmentGrp4    0.7840     1.2755    0.7483    0.8215
## TreatmentGrp5    1.0401     0.9614    0.9964    1.0858
## 
## Concordance= 0.547  (se = 0.006 )
## Likelihood ratio test= 974.2  on 8 df,   p=<2e-16
## Wald test            = 950.6  on 8 df,   p=<2e-16
## Score (logrank) test = 953.9  on 8 df,   p=<2e-16
# Check proportional hazards assumption 
temp_oc1=cox.zph(fit_cox_oc,global=T)
print(temp_oc1)
##                    rho  chisq     p
## age_youngold1 -0.00975 0.0980 0.754
## age_youngold2  0.01383 0.3491 0.555
## age_youngold3  0.00590 0.0682 0.794
## age_youngold4  0.00240 0.0149 0.903
## TreatmentGrp2  0.02107 1.5300 0.216
## TreatmentGrp3  0.01945 1.2683 0.260
## TreatmentGrp4  0.02074 1.4541 0.228
## TreatmentGrp5  0.01279 0.5566 0.456
## GLOBAL              NA 4.2572 0.833
# Assumption is satisfied 

# OC_tongue

cancer_oc_t$weight=get.weights(mnps.oc.t,stop.method="es.mean")
surv_oc_t=Surv(cancer_oc_t$DX_LASTCONTACT_DEATH_MONTHS,event=cancer_oc_t$PUF_VITAL_STATUS)

# Post- PSW KM curve for OC 

fit_oc_t_ps=survfit(surv_oc_t~age_youngold,data=cancer_oc_t)
ggsurvplot(fit_oc_t_ps,data=cancer_oc_t,
           legend="bottom",legend.title="Age",legend.labs=c("0-39","40-49","50-59","60-69","70+"),
           risk.table = TRUE, tables.theme = theme_cleantable())

# Post-PSW Cox regression for OC

fit_cox_oc_t=coxph(Surv(DX_LASTCONTACT_DEATH_MONTHS,PUF_VITAL_STATUS==1)~age_youngold+TreatmentGrp,
              data=cancer_oc_t,weights=cancer_oc_t$weight)
summary(fit_cox_oc_t)
## Call:
## coxph(formula = Surv(DX_LASTCONTACT_DEATH_MONTHS, PUF_VITAL_STATUS == 
##     1) ~ age_youngold + TreatmentGrp, data = cancer_oc_t, weights = cancer_oc_t$weight)
## 
##   n= 6897, number of events= 4987 
## 
##                    coef exp(coef)  se(coef)      z Pr(>|z|)    
## age_youngold1  0.018404  1.018574  0.019740  0.932 0.351193    
## age_youngold2  0.010008  1.010059  0.019759  0.507 0.612498    
## age_youngold3  0.004214  1.004223  0.019981  0.211 0.832957    
## age_youngold4 -0.071590  0.930912  0.021205 -3.376 0.000735 ***
## TreatmentGrp2  0.283812  1.328184  0.091945  3.087 0.002023 ** 
## TreatmentGrp3  0.263116  1.300977  0.088343  2.978 0.002898 ** 
## TreatmentGrp4  0.205729  1.228420  0.088500  2.325 0.020092 *  
## TreatmentGrp5  0.340706  1.405940  0.086892  3.921 8.82e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##               exp(coef) exp(-coef) lower .95 upper .95
## age_youngold1    1.0186     0.9818    0.9799    1.0588
## age_youngold2    1.0101     0.9900    0.9717    1.0499
## age_youngold3    1.0042     0.9958    0.9657    1.0443
## age_youngold4    0.9309     1.0742    0.8930    0.9704
## TreatmentGrp2    1.3282     0.7529    1.1092    1.5905
## TreatmentGrp3    1.3010     0.7687    1.0941    1.5469
## TreatmentGrp4    1.2284     0.8141    1.0328    1.4611
## TreatmentGrp5    1.4059     0.7113    1.1858    1.6670
## 
## Concordance= 0.52  (se = 0.005 )
## Likelihood ratio test= 91.1  on 8 df,   p=3e-16
## Wald test            = 88.49  on 8 df,   p=9e-16
## Score (logrank) test = 88.67  on 8 df,   p=9e-16
# Check proportional hazards assumption 
temp_oc2=cox.zph(fit_cox_oc_t,global=T)
print(temp_oc2)
##                   rho  chisq     p
## age_youngold1 0.01971 0.2015 0.654
## age_youngold2 0.04408 1.4633 0.226
## age_youngold3 0.02164 0.3664 0.545
## age_youngold4 0.00965 0.0793 0.778
## TreatmentGrp2 0.01584 0.3563 0.551
## TreatmentGrp3 0.01427 0.2928 0.588
## TreatmentGrp4 0.01515 0.3301 0.566
## TreatmentGrp5 0.00823 0.0983 0.754
## GLOBAL             NA 4.5288 0.807