Import data

library(tidyr)
## Warning: package 'tidyr' was built under R version 3.5.3
library(survival)
## Warning: package 'survival' was built under R version 3.5.3
library(survminer)
## Warning: package 'survminer' was built under R version 3.5.3
## Loading required package: ggplot2
## Loading required package: ggpubr
## Loading required package: magrittr
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:tidyr':
## 
##     extract
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(timereg)
## Warning: package 'timereg' was built under R version 3.5.3
library(twang)
## Warning: package 'twang' was built under R version 3.5.3
## Loading required package: gbm
## Warning: package 'gbm' was built under R version 3.5.3
## Loaded gbm 2.1.5
## Loading required package: survey
## Loading required package: grid
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following object is masked from 'package:tidyr':
## 
##     expand
## 
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
## 
##     dotchart
## Loading required package: xtable
## Loading required package: lattice
## Loading required package: latticeExtra
## Warning: package 'latticeExtra' was built under R version 3.5.3
## Loading required package: RColorBrewer
## 
## Attaching package: 'latticeExtra'
## The following object is masked from 'package:ggplot2':
## 
##     layer
library(survey)
library(readxl)

cancer <- read_excel("PREPSM_AllSites_2004Onwards_NEW2019.xlsx")

attach(cancer)

cancer_s=data.frame(cbind(age_youngold, AGE, SEX,RACE_RECODE,INSURANCE_STATUS,CDCC_TOTAL,MED_INC_QUAR_12, UR_new,
                          Primary_site_3grp,TNM_T_new,TNM_N_new,TUMORS_SIZE_NEW_CAT, HPVStatus, TreatmentGrp,
                          Primary_site_recode,FACILITY_TYPE_CD,
                          DX_LASTCONTACT_DEATH_MONTHS,PUF_VITAL_STATUS
                          ))

summary(cancer_s)
##   age_youngold        AGE            SEX         RACE_RECODE    
##  Min.   :0.000   Min.   :18.0   Min.   :1.000   Min.   : 1.000  
##  1st Qu.:2.000   1st Qu.:54.0   1st Qu.:1.000   1st Qu.: 1.000  
##  Median :3.000   Median :61.0   Median :1.000   Median : 1.000  
##  Mean   :2.683   Mean   :62.3   Mean   :1.294   Mean   : 1.741  
##  3rd Qu.:4.000   3rd Qu.:70.0   3rd Qu.:2.000   3rd Qu.: 1.000  
##  Max.   :4.000   Max.   :90.0   Max.   :2.000   Max.   :98.000  
##                                                 NA's   :421     
##  INSURANCE_STATUS   CDCC_TOTAL     MED_INC_QUAR_12     UR_new     
##  Min.   :0.00     Min.   :0.0000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:1.00     1st Qu.:0.0000   1st Qu.:2.000   1st Qu.:1.000  
##  Median :5.00     Median :0.0000   Median :3.000   Median :1.000  
##  Mean   :3.12     Mean   :0.2552   Mean   :2.765   Mean   :1.197  
##  3rd Qu.:5.00     3rd Qu.:0.0000   3rd Qu.:4.000   3rd Qu.:1.000  
##  Max.   :9.00     Max.   :2.0000   Max.   :4.000   Max.   :3.000  
##                                    NA's   :221     NA's   :1217   
##  Primary_site_3grp   TNM_T_new       TNM_N_new     TUMORS_SIZE_NEW_CAT
##  Min.   :1.00      Min.   :0.000   Min.   :0.000   Min.   :  0.0      
##  1st Qu.:1.00      1st Qu.:1.000   1st Qu.:0.000   1st Qu.:992.0      
##  Median :1.00      Median :2.000   Median :0.000   Median :993.0      
##  Mean   :1.42      Mean   :1.748   Mean   :0.494   Mean   :991.7      
##  3rd Qu.:2.00      3rd Qu.:2.000   3rd Qu.:1.000   3rd Qu.:994.0      
##  Max.   :2.00      Max.   :4.000   Max.   :3.000   Max.   :999.0      
##                    NA's   :10260   NA's   :13934                      
##    HPVStatus      TreatmentGrp   Primary_site_recode FACILITY_TYPE_CD
##  Min.   :0.000   Min.   :1.000   Min.   : 0.000      Min.   :1.000   
##  1st Qu.:0.000   1st Qu.:2.000   1st Qu.: 2.000      1st Qu.:2.000   
##  Median :2.000   Median :4.000   Median : 4.000      Median :3.000   
##  Mean   :4.503   Mean   :3.502   Mean   : 4.359      Mean   :2.655   
##  3rd Qu.:9.000   3rd Qu.:5.000   3rd Qu.: 9.000      3rd Qu.:3.000   
##  Max.   :9.000   Max.   :5.000   Max.   :10.000      Max.   :4.000   
##                                                      NA's   :1190    
##  DX_LASTCONTACT_DEATH_MONTHS PUF_VITAL_STATUS
##  Min.   : 0.00               Min.   :0.000   
##  1st Qu.:16.39               1st Qu.:0.000   
##  Median :27.14               Median :1.000   
##  Mean   :29.04               Mean   :0.729   
##  3rd Qu.:40.90               3rd Qu.:1.000   
##  Max.   :71.33               Max.   :1.000   
##  NA's   :11714               NA's   :11707
detach(cancer)


# Remove all cases with missing values

cancer_s1=cancer_s[complete.cases(cancer_s),]


# Make variables factor 

cancer_s1$dead[cancer_s1$PUF_VITAL_STATUS==1]=0
cancer_s1$dead[cancer_s1$PUF_VITAL_STATUS==0]=1
cancer_s1$age_youngold=factor(cancer_s1$age_youngold)
cancer_s1$SEX=factor(cancer_s1$SEX)
cancer_s1$RACE_RECODE=factor(cancer_s1$RACE_RECODE)
cancer_s1$INSURANCE_STATUS=factor(cancer_s1$INSURANCE_STATUS)
cancer_s1$CDCC_TOTAL=factor(cancer_s1$CDCC_TOTAL)
cancer_s1$MED_INC_QUAR_12=factor(cancer_s1$MED_INC_QUAR_12)
cancer_s1$UR_new=factor(cancer_s1$UR_new)
cancer_s1$Primary_site_3grp=factor(cancer_s1$Primary_site_3grp)
cancer_s1$TNM_T_new=factor(cancer_s1$TNM_T_new)
cancer_s1$TNM_N_new=factor(cancer_s1$TNM_N_new)
cancer_s1$TUMORS_SIZE_NEW_CAT=factor(cancer_s1$TUMORS_SIZE_NEW_CAT)
cancer_s1$HPVStatus=factor(cancer_s1$HPVStatus)
cancer_s1$TreatmentGrp=factor(cancer_s1$TreatmentGrp)
cancer_s1$Primary_site_recode=factor(cancer_s1$Primary_site_recode)
cancer_s1$FACILITY_TYPE_CD=factor(cancer_s1$FACILITY_TYPE_CD)

summary(cancer_s1)
##  age_youngold      AGE        SEX       RACE_RECODE INSURANCE_STATUS
##  1:2231       Min.   :40.00   1:14303   1 :19618    0:  953         
##  2:6151       1st Qu.:55.00   2: 7025   2 : 1151    1: 8975         
##  3:6311       Median :63.00             3 :   43    5:11072         
##  4:6635       Mean   :63.89             4 :  402    9:  328         
##               3rd Qu.:72.00             98:  114                    
##               Max.   :90.00                                         
##                                                                     
##  CDCC_TOTAL MED_INC_QUAR_12 UR_new    Primary_site_3grp TNM_T_new
##  0:16665    1:3408          1:17317   1:14549           0:   84  
##  1: 3598    2:5185          2: 3591   2: 6779           1:10368  
##  2: 1065    3:5869          3:  420                     2: 7999  
##             4:6866                                      3: 2498  
##                                                         4:  379  
##                                                                  
##                                                                  
##  TNM_N_new TUMORS_SIZE_NEW_CAT HPVStatus TreatmentGrp Primary_site_recode
##  0:14706   992    :6170        0: 5729   1: 1245      2      :4864       
##  1: 3604   993    :4666        2: 4290   2: 3745      9      :3708       
##  2: 2491   991    :4222        3:   95   3: 2815      4      :2616       
##  3:  527   994    :2860        9:11214   4: 3106      1      :2473       
##            999    :1326                  5:10417      6      :2285       
##            995    :1235                               0      :2029       
##            (Other): 849                               (Other):3353       
##  FACILITY_TYPE_CD DX_LASTCONTACT_DEATH_MONTHS PUF_VITAL_STATUS
##  1: 1589          Min.   : 0.00               Min.   :0.0000  
##  2: 7089          1st Qu.:17.48               1st Qu.:1.0000  
##  3:10597          Median :28.25               Median :1.0000  
##  4: 2053          Mean   :29.90               Mean   :0.7534  
##                   3rd Qu.:41.99               3rd Qu.:1.0000  
##                   Max.   :71.00               Max.   :1.0000  
##                                                               
##       dead       
##  Min.   :0.0000  
##  1st Qu.:0.0000  
##  Median :0.0000  
##  Mean   :0.2466  
##  3rd Qu.:0.0000  
##  Max.   :1.0000  
## 
# Divide data into OP and OC subsets 

# Recode variables 

cancer_op=cancer_s1[cancer_s1$Primary_site_3grp==2, ]
cancer_op$Primary_site_recode=as.numeric(cancer_op$Primary_site_recode)
cancer_op$Primary_site_recode=as.factor(cancer_op$Primary_site_recode)
# 2 = base of tong, 8 = Tonsil, 9 = Orpharynx

cancer_op$HPVStatus=as.numeric(cancer_op$HPVStatus)
cancer_op$HPVStatus=as.factor(cancer_op$HPVStatus)
# 1 = HPV Neg, 2 = HPV Pos


cancer_oc=cancer_s1[cancer_s1$Primary_site_3grp==1, ]
cancer_oc$Primary_site_recode=as.numeric(cancer_oc$Primary_site_recode)
cancer_oc$Primary_site_recode=as.factor(cancer_oc$Primary_site_recode)
# 1 = lip, 3= other and unspecified parts of tongue, 4 = gum, 5 = floor of mouth
# 6 = palate, 7 = other and unspecified parts of mouth 

KM curves before propensity score weighting

surv_object_op=Surv(time=cancer_op$DX_LASTCONTACT_DEATH_MONTHS,event=cancer_op$dead)

# K-M curve OP
fit_op=survfit(surv_object_op~age_youngold,data=cancer_op)
ggsurvplot(fit_op,data=cancer_op,pval=TRUE,
           legend="bottom",legend.title="Age",legend.labs=c("40-49","50-59","60-69","70+"),
           risk.table = TRUE, tables.theme = theme_cleantable())

surv_object_oc=Surv(time=cancer_oc$DX_LASTCONTACT_DEATH_MONTHS,event=cancer_oc$dead)

# K-M curve for OC
fit_oc=survfit(surv_object_oc~age_youngold,data=cancer_oc)
ggsurvplot(fit_oc,data=cancer_oc,pval=TRUE,
           legend="bottom",legend.title="Age",legend.labs=c("40-49","50-59","60-69","70+"),
           risk.table = TRUE, tables.theme = theme_cleantable())

Propensity score weighting and balanace checking

set.seed(1)

## OP
mnps.op <- mnps(age_youngold ~ SEX+RACE_RECODE+CDCC_TOTAL+MED_INC_QUAR_12+UR_new
                +TNM_T_new+TNM_N_new+TUMORS_SIZE_NEW_CAT+TreatmentGrp+Primary_site_recode+HPVStatus,
                data = cancer_op,
                estimand = "ATE",
                verbose = FALSE,
                stop.method = c("es.mean"),
                n.trees = 5000)
table_op=bal.table(mnps.op,digits=2)
write.csv(table_op, file = "table_op_new.csv",row.names=T, na="")

table_op_cov=bal.table(mnps.op,digits = 2,collapse.to = "covariate")
write.csv(table_op_cov, file = "table_op_cov_new.csv",row.names=T, na="")

## OC

mnps.oc <- mnps(age_youngold ~ SEX+RACE_RECODE+CDCC_TOTAL+MED_INC_QUAR_12+UR_new
                +TNM_T_new+TNM_N_new+TUMORS_SIZE_NEW_CAT+TreatmentGrp+Primary_site_recode,
                 data = cancer_oc,
                 estimand = "ATE",
                 verbose = FALSE,
                 stop.method = c("es.mean"),
                 n.trees = 5000)
table_oc=bal.table(mnps.oc,digits=2)
write.csv(table_oc, file = "table_oc_new.csv",row.names=T, na="")

table_oc_cov=bal.table(mnps.oc,digits = 2,collapse.to = "covariate")
write.csv(table_op_cov, file = "table_op_cov_new.csv",row.names=T, na="")

#plot(mnps.oc,plots=1)
#plot(mnps.oc,plots=2)
#plot(mnps.oc,plots=3,pairwiseMax = F)
#plot(mnps.oc,plots=4)

Analysis on propensity score weighted data

# Cox regression for OP 

cancer_op$weight=get.weights(mnps.op,stop.method="es.mean")
surv_op=Surv(cancer_op$DX_LASTCONTACT_DEATH_MONTHS,event=cancer_op$PUF_VITAL_STATUS)
fit_cox_op1=coxph(Surv(DX_LASTCONTACT_DEATH_MONTHS,PUF_VITAL_STATUS==1)~age_youngold,
                 data=cancer_op,weights=cancer_op$weight)
summary(fit_cox_op1)
## Call:
## coxph(formula = Surv(DX_LASTCONTACT_DEATH_MONTHS, PUF_VITAL_STATUS == 
##     1) ~ age_youngold, data = cancer_op, weights = cancer_op$weight)
## 
##   n= 6779, number of events= 5635 
## 
##                   coef exp(coef) se(coef)      z Pr(>|z|)    
## age_youngold2 -0.01323   0.98686  0.01904 -0.695   0.4873    
## age_youngold3  0.04102   1.04188  0.01915  2.142   0.0322 *  
## age_youngold4  0.07800   1.08113  0.02005  3.890   0.0001 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##               exp(coef) exp(-coef) lower .95 upper .95
## age_youngold2    0.9869     1.0133    0.9507     1.024
## age_youngold3    1.0419     0.9598    1.0035     1.082
## age_youngold4    1.0811     0.9250    1.0395     1.124
## 
## Concordance= 0.509  (se = 0.005 )
## Likelihood ratio test= 25.67  on 3 df,   p=1e-05
## Wald test            = 25.8  on 3 df,   p=1e-05
## Score (logrank) test = 25.81  on 3 df,   p=1e-05
# Check proportional hazards assumption 
temp_op1=cox.zph(fit_cox_op1,global=T)
print(temp_op1)
##                     rho    chisq     p
## age_youngold2  0.002191 0.006374 0.936
## age_youngold3 -0.006517 0.052790 0.818
## age_youngold4 -0.000678 0.000397 0.984
## GLOBAL               NA 0.105392 0.991
# Assumption is satisfied 


# Cox regression for OC 

cancer_oc$weight=get.weights(mnps.oc,stop.method="es.mean")
surv_oc=Surv(cancer_oc$DX_LASTCONTACT_DEATH_MONTHS,event=cancer_oc$PUF_VITAL_STATUS)
fit_cox_oc=coxph(Surv(DX_LASTCONTACT_DEATH_MONTHS,PUF_VITAL_STATUS==1)~age_youngold,
              data=cancer_oc,weights=cancer_oc$weight)
summary(fit_cox_oc)
## Call:
## coxph(formula = Surv(DX_LASTCONTACT_DEATH_MONTHS, PUF_VITAL_STATUS == 
##     1) ~ age_youngold, data = cancer_oc, weights = cancer_oc$weight)
## 
##   n= 14549, number of events= 10433 
## 
##                   coef exp(coef) se(coef)      z Pr(>|z|)    
## age_youngold2 -0.02092   0.97929  0.01335 -1.567 0.117115    
## age_youngold3 -0.04772   0.95340  0.01350 -3.535 0.000408 ***
## age_youngold4 -0.09694   0.90761  0.01429 -6.784 1.16e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##               exp(coef) exp(-coef) lower .95 upper .95
## age_youngold2    0.9793      1.021    0.9540    1.0053
## age_youngold3    0.9534      1.049    0.9285    0.9790
## age_youngold4    0.9076      1.102    0.8825    0.9334
## 
## Concordance= 0.511  (se = 0.004 )
## Likelihood ratio test= 51.12  on 3 df,   p=5e-11
## Wald test            = 50.75  on 3 df,   p=6e-11
## Score (logrank) test = 50.78  on 3 df,   p=5e-11
# Check proportional hazards assumption 

temp_oc1=cox.zph(fit_cox_oc,global=T)
print(temp_oc1)
##                     rho    chisq     p
## age_youngold2  0.004723 0.037888 0.846
## age_youngold3  0.001057 0.002058 0.964
## age_youngold4 -0.000363 0.000309 0.986
## GLOBAL               NA 0.053655 0.997
# Assumption is satisfied