Step 1. Import data and clean Data

Delete observations with missing values on the outcome variables, DX_LASTCONTACT_DEATH_MONTHS & PUF_VITAL_STATUS

Delete observations with value 0 in TUMORS_SIZE_NEW_CAT (only 9 cases) due to difficulty in balancing PS

Then divide dataset into OP and OC subsets

library(tidyr)
## Warning: package 'tidyr' was built under R version 3.5.3
library(survival)
## Warning: package 'survival' was built under R version 3.5.3
library(survminer)
## Warning: package 'survminer' was built under R version 3.5.3
## Loading required package: ggplot2
## Loading required package: ggpubr
## Loading required package: magrittr
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:tidyr':
## 
##     extract
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(timereg)
## Warning: package 'timereg' was built under R version 3.5.3
library(twang)
## Warning: package 'twang' was built under R version 3.5.3
## Loading required package: gbm
## Warning: package 'gbm' was built under R version 3.5.3
## Loaded gbm 2.1.5
## Loading required package: survey
## Loading required package: grid
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following object is masked from 'package:tidyr':
## 
##     expand
## 
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
## 
##     dotchart
## Loading required package: xtable
## Loading required package: lattice
## Loading required package: latticeExtra
## Warning: package 'latticeExtra' was built under R version 3.5.3
## Loading required package: RColorBrewer
## 
## Attaching package: 'latticeExtra'
## The following object is masked from 'package:ggplot2':
## 
##     layer
library(survey)
library(readxl)

cancer <- read_excel("PREPSM_AllSites_2004Onwards_NEW2019.xlsx")

attach(cancer)

cancer_s=data.frame(cbind(age_youngold, AGE, SEX,RACE_RECODE,INSURANCE_STATUS,CDCC_TOTAL,MED_INC_QUAR_12, UR_new,
                          Primary_site_3grp,TNM_T_new,TNM_N_new,TUMORS_SIZE_NEW_CAT, HPVStatus, TreatmentGrp,
                          Primary_site_recode,FACILITY_TYPE_CD,
                          DX_LASTCONTACT_DEATH_MONTHS,PUF_VITAL_STATUS
                          ))

#summary(cancer_s)

detach(cancer)


# DX_LASTCONTACT_DEATH_MONTHS has 11714 missing values 
# PUF_VITAL_STATUS has 11707 missing values, alive = 1, dead = 0
# Remove observations with missing values in vital status and survival month

cancer_s1=cancer_s %>% drop_na(DX_LASTCONTACT_DEATH_MONTHS)
cancer_s1=cancer_s1[cancer_s1$TUMORS_SIZE_NEW_CAT!=0,]
# Make variables factor 

cancer_s1$dead[cancer_s1$PUF_VITAL_STATUS==1]=0
cancer_s1$dead[cancer_s1$PUF_VITAL_STATUS==0]=1
cancer_s1$age_youngold=factor(cancer_s1$age_youngold)
cancer_s1$SEX=factor(cancer_s1$SEX)
cancer_s1$RACE_RECODE=factor(cancer_s1$RACE_RECODE)
cancer_s1$INSURANCE_STATUS=factor(cancer_s1$INSURANCE_STATUS)
cancer_s1$CDCC_TOTAL=factor(cancer_s1$CDCC_TOTAL)
cancer_s1$MED_INC_QUAR_12=factor(cancer_s1$MED_INC_QUAR_12)
cancer_s1$UR_new=factor(cancer_s1$UR_new)
cancer_s1$Primary_site_3grp=factor(cancer_s1$Primary_site_3grp)
cancer_s1$TNM_T_new=factor(cancer_s1$TNM_T_new)
cancer_s1$TNM_N_new=factor(cancer_s1$TNM_N_new)
cancer_s1$TUMORS_SIZE_NEW_CAT=factor(cancer_s1$TUMORS_SIZE_NEW_CAT)
cancer_s1$HPVStatus=factor(cancer_s1$HPVStatus)
cancer_s1$TreatmentGrp=factor(cancer_s1$TreatmentGrp)
cancer_s1$Primary_site_recode=factor(cancer_s1$Primary_site_recode)
cancer_s1$FACILITY_TYPE_CD=factor(cancer_s1$FACILITY_TYPE_CD)


# Divide data into OP and OC subsets 

cancer_op=cancer_s1[cancer_s1$Primary_site_3grp==2, ]
cancer_op$Primary_site_recode=as.numeric(cancer_op$Primary_site_recode)
cancer_op$Primary_site_recode=as.factor(cancer_op$Primary_site_recode)
table(cancer_op$Primary_site_recode)
## 
##    2    8    9 
## 6006 7697 1300
# 2 = base of tong, 8 = Tonsil, 9 = Orpharynx
cancer_op$HPVStatus=as.numeric(cancer_op$HPVStatus)
cancer_op$HPVStatus=as.factor(cancer_op$HPVStatus)
table(cancer_op$HPVStatus)
## 
##    1    2 
## 5866 9137
# 1 = HPV Neg, 2 = HPV Pos


cancer_oc=cancer_s1[cancer_s1$Primary_site_3grp==1, ]
cancer_oc$Primary_site_recode=as.numeric(cancer_oc$Primary_site_recode)
cancer_oc$Primary_site_recode=as.factor(cancer_oc$Primary_site_recode)
table(cancer_oc$Primary_site_recode)
## 
##    1    3    4    5    6    7 
## 2573 6897 2892 4353 2156 3946
# 1 = lip, 3= other and unspecified parts of tongue, 4 = gum, 5 = floor of mouth
# 6 = palate, 7 = other and unspecified parts of mouth 

KM curves before propensity score weighting

surv_object_op=Surv(time=cancer_op$DX_LASTCONTACT_DEATH_MONTHS,event=cancer_op$dead)

# K-M curve OP
fit_op=survfit(surv_object_op~age_youngold,data=cancer_op)
ggsurvplot(fit_op,data=cancer_op,pval=TRUE,
           legend="bottom",legend.title="Age",legend.labs=c("0-39","40-49","50-59","60-69","70+"),
           risk.table = TRUE, tables.theme = theme_cleantable())

surv_object_oc=Surv(time=cancer_oc$DX_LASTCONTACT_DEATH_MONTHS,event=cancer_oc$dead)

# K-M curve for OC
fit_oc=survfit(surv_object_oc~age_youngold,data=cancer_oc)
ggsurvplot(fit_oc,data=cancer_oc,pval=TRUE,
           legend="bottom",legend.title="Age",legend.labs=c("0-39","40-49","50-59","60-69","70+"),
           risk.table = TRUE, tables.theme = theme_cleantable())

Propensity score weighting and balanace checking (separate for OC and OP)

set.seed(1)

mnps.op <- mnps(age_youngold ~ SEX+RACE_RECODE+CDCC_TOTAL+MED_INC_QUAR_12+UR_new
                +TNM_T_new+TNM_N_new+TUMORS_SIZE_NEW_CAT+TreatmentGrp+Primary_site_recode+HPVStatus,
                data = cancer_op,
                estimand = "ATE",
                verbose = FALSE,
                stop.method = c("es.mean"),
                n.trees = 5000)
table_op=bal.table(mnps.op,digits=2)
write.csv(table_op, file = "table_op.csv",row.names=T, na="")

table_op_cov=bal.table(mnps.op,digits = 2,collapse.to = "covariate")
write.csv(table_op_cov, file = "table_op_cov.csv",row.names=T, na="")

#plot(mnps.op,plots=1)
#plot(mnps.op,plots=2)
plot(mnps.op,plots=3)

plot(mnps.op,plots=4)

mnps.oc <- mnps(age_youngold ~ SEX+RACE_RECODE+CDCC_TOTAL+MED_INC_QUAR_12+UR_new
                +TNM_T_new+TNM_N_new+TUMORS_SIZE_NEW_CAT+TreatmentGrp+Primary_site_recode,
                 data = cancer_oc,
                 estimand = "ATE",
                 verbose = FALSE,
                 stop.method = c("es.mean"),
                 n.trees = 5000)
table_oc=bal.table(mnps.oc,digits=2)
write.csv(table_oc, file = "table_oc.csv",row.names=T, na="")

table_oc_cov=bal.table(mnps.oc,digits = 2,collapse.to = "covariate")
write.csv(table_oc_cov, file = "table_op_cov.csv",row.names=T, na="")

#plot(mnps.oc,plots=1)
#plot(mnps.oc,plots=2)
plot(mnps.oc,plots=3)

plot(mnps.oc,plots=4)

Regression Analysis on propensity score weighted data (separate for OC and OP)

# OP 
cancer_op$weight=get.weights(mnps.op,stop.method="es.mean")
surv_op=Surv(cancer_op$DX_LASTCONTACT_DEATH_MONTHS,event=cancer_op$PUF_VITAL_STATUS)

# Post-PSW KM curve for OP 
fit_op_ps=survfit(surv_op~age_youngold,data=cancer_op)
ggsurvplot(fit_op_ps,data=cancer_op,pval=TRUE,
           legend="bottom",legend.title="Age",legend.labs=c("0-39","40-49","50-59","60-69","70+"),
           risk.table = TRUE, tables.theme = theme_cleantable())

# Post PSW Cox Regression for OP
fit_cox_op1=coxph(Surv(DX_LASTCONTACT_DEATH_MONTHS,PUF_VITAL_STATUS==1)~age_youngold,
                 data=cancer_op,weights=cancer_op$weight)
summary(fit_cox_op1)
## Call:
## coxph(formula = Surv(DX_LASTCONTACT_DEATH_MONTHS, PUF_VITAL_STATUS == 
##     1) ~ age_youngold, data = cancer_op, weights = cancer_op$weight)
## 
##   n= 15003, number of events= 12107 
## 
##                  coef exp(coef) se(coef)     z Pr(>|z|)    
## age_youngold1 0.02733   1.02770  0.01360 2.009   0.0445 *  
## age_youngold2 0.05645   1.05808  0.01353 4.174 2.99e-05 ***
## age_youngold3 0.10034   1.10554  0.01360 7.377 1.62e-13 ***
## age_youngold4 0.11744   1.12461  0.01417 8.286  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##               exp(coef) exp(-coef) lower .95 upper .95
## age_youngold1     1.028     0.9730     1.001     1.055
## age_youngold2     1.058     0.9451     1.030     1.087
## age_youngold3     1.106     0.9045     1.076     1.135
## age_youngold4     1.125     0.8892     1.094     1.156
## 
## Concordance= 0.507  (se = 0.008 )
## Likelihood ratio test= 100.8  on 4 df,   p=<2e-16
## Wald test            = 100.9  on 4 df,   p=<2e-16
## Score (logrank) test = 100.9  on 4 df,   p=<2e-16
# Check proportional hazards assumption 
temp_op1=cox.zph(fit_cox_op1,global=T)
print(temp_op1)
##                  rho chisq       p
## age_youngold1 0.0958  7.74 0.00539
## age_youngold2 0.0810  9.97 0.00159
## age_youngold3 0.0505  3.64 0.05637
## age_youngold4 0.0731  4.83 0.02791
## GLOBAL            NA 11.69 0.01983
# Proportional hazard assumption is not satisfied. Run Aalen's additive hazards model for OP.

add3=aalen(surv_op~age_youngold,data=cancer_op, weights=cancer_op$weight,residuals = 1)
summary(add3)
## Additive Aalen Model 
## 
## Test for nonparametric terms 
## 
## Test for non-significant effects 
##               Supremum-test of significance p-value H_0: B(t)=0
## (Intercept)                             Inf                   0
## age_youngold1                           Inf                   0
## age_youngold2                           Inf                   0
## age_youngold3                           Inf                   0
## age_youngold4                           Inf                   0
## 
## Test for time invariant effects 
##                     Kolmogorov-Smirnov test p-value H_0:constant effect
## (Intercept)                            1.48                       0.017
## age_youngold1                          2.29                       0.013
## age_youngold2                          1.67                       0.017
## age_youngold3                          1.38                       0.073
## age_youngold4                          1.29                       0.161
##                       Cramer von Mises test p-value H_0:constant effect
## (Intercept)                            74.5                           0
## age_youngold1                         132.0                           0
## age_youngold2                          61.7                           0
## age_youngold3                          38.1                           0
## age_youngold4                          28.9                           0
## 
##    
##    
##   Call: 
## aalen(formula = surv_op ~ age_youngold, data = cancer_op, residuals = 1, 
##     weights = cancer_op$weight)
# Create estimated cumulative regression coefficients in the additive hazards model with 95% pointwise CI

par(mfrow=c(2,2))
plot(add3,specific.comps = 2, ylim=c(-1,5), mains=F)
title(main="40-49 vs. <40")
plot(add3,specific.comps = 3, ylim=c(-1,5), mains=F)
title(main="50-59 vs. <40")
plot(add3,specific.comps = 4, ylim=c(-1,5), mains=F)
title(main="60-69 vs. <40")
plot(add3,specific.comps = 5, ylim=c(-1,5), mains=F)
title(main="70+ vs. <40")

# OC

cancer_oc$weight=get.weights(mnps.oc,stop.method="es.mean")
surv_oc=Surv(cancer_oc$DX_LASTCONTACT_DEATH_MONTHS,event=cancer_oc$PUF_VITAL_STATUS)

# Post- PSW KM curve for OC 

fit_oc_ps=survfit(surv_oc~age_youngold,data=cancer_oc)
ggsurvplot(fit_oc_ps,data=cancer_oc,pval=TRUE,
           legend="bottom",legend.title="Age",legend.labs=c("0-39","40-49","50-59","60-69","70+"),
           risk.table = TRUE, tables.theme = theme_cleantable())

# Post-PSW Cox regression for OC

fit_cox_oc=coxph(Surv(DX_LASTCONTACT_DEATH_MONTHS,PUF_VITAL_STATUS==1)~age_youngold+TreatmentGrp,
              data=cancer_oc,weights=cancer_oc$weight)
summary(fit_cox_oc)
## Call:
## coxph(formula = Surv(DX_LASTCONTACT_DEATH_MONTHS, PUF_VITAL_STATUS == 
##     1) ~ age_youngold + TreatmentGrp, data = cancer_oc, weights = cancer_oc$weight)
## 
##   n= 22817, number of events= 15451 
## 
##                   coef exp(coef) se(coef)       z Pr(>|z|)    
## age_youngold1 -0.09402   0.91027  0.01114  -8.439  < 2e-16 ***
## age_youngold2 -0.09251   0.91164  0.01119  -8.264  < 2e-16 ***
## age_youngold3 -0.11602   0.89046  0.01132 -10.248  < 2e-16 ***
## age_youngold4 -0.16106   0.85125  0.01200 -13.420  < 2e-16 ***
## TreatmentGrp2 -0.14276   0.86696  0.02612  -5.465 4.63e-08 ***
## TreatmentGrp3 -0.11202   0.89403  0.02338  -4.792 1.65e-06 ***
## TreatmentGrp4 -0.24331   0.78403  0.02380 -10.225  < 2e-16 ***
## TreatmentGrp5  0.03935   1.04014  0.02190   1.797   0.0723 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##               exp(coef) exp(-coef) lower .95 upper .95
## age_youngold1    0.9103     1.0986    0.8906    0.9304
## age_youngold2    0.9116     1.0969    0.8919    0.9319
## age_youngold3    0.8905     1.1230    0.8709    0.9104
## age_youngold4    0.8512     1.1747    0.8315    0.8715
## TreatmentGrp2    0.8670     1.1535    0.8237    0.9125
## TreatmentGrp3    0.8940     1.1185    0.8540    0.9359
## TreatmentGrp4    0.7840     1.2755    0.7483    0.8215
## TreatmentGrp5    1.0401     0.9614    0.9964    1.0858
## 
## Concordance= 0.547  (se = 0.006 )
## Likelihood ratio test= 974.2  on 8 df,   p=<2e-16
## Wald test            = 950.6  on 8 df,   p=<2e-16
## Score (logrank) test = 953.9  on 8 df,   p=<2e-16
# Check proportional hazards assumption 
temp_oc1=cox.zph(fit_cox_oc,global=T)
print(temp_oc1)
##                    rho  chisq     p
## age_youngold1 -0.00975 0.0980 0.754
## age_youngold2  0.01383 0.3491 0.555
## age_youngold3  0.00590 0.0682 0.794
## age_youngold4  0.00240 0.0149 0.903
## TreatmentGrp2  0.02107 1.5300 0.216
## TreatmentGrp3  0.01945 1.2683 0.260
## TreatmentGrp4  0.02074 1.4541 0.228
## TreatmentGrp5  0.01279 0.5566 0.456
## GLOBAL              NA 4.2572 0.833
# Assumption is satisfied