Step 1. Import data and clean Data
Delete observations with missing values on the outcome variables, DX_LASTCONTACT_DEATH_MONTHS & PUF_VITAL_STATUS
Delete observations with value 0 in TUMORS_SIZE_NEW_CAT (only 9 cases) due to difficulty in balancing PS
Then divide dataset into OP and OC subsets
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.5.3
library(survival)
## Warning: package 'survival' was built under R version 3.5.3
library(survminer)
## Warning: package 'survminer' was built under R version 3.5.3
## Loading required package: ggplot2
## Loading required package: ggpubr
## Loading required package: magrittr
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:tidyr':
##
## extract
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(timereg)
## Warning: package 'timereg' was built under R version 3.5.3
library(twang)
## Warning: package 'twang' was built under R version 3.5.3
## Loading required package: gbm
## Warning: package 'gbm' was built under R version 3.5.3
## Loaded gbm 2.1.5
## Loading required package: survey
## Loading required package: grid
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following object is masked from 'package:tidyr':
##
## expand
##
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
##
## dotchart
## Loading required package: xtable
## Loading required package: lattice
## Loading required package: latticeExtra
## Warning: package 'latticeExtra' was built under R version 3.5.3
## Loading required package: RColorBrewer
##
## Attaching package: 'latticeExtra'
## The following object is masked from 'package:ggplot2':
##
## layer
library(survey)
library(readxl)
cancer <- read_excel("PREPSM_AllSites_2004Onwards_NEW2019.xlsx")
attach(cancer)
cancer_s=data.frame(cbind(age_youngold, AGE, SEX,RACE_RECODE,INSURANCE_STATUS,CDCC_TOTAL,MED_INC_QUAR_12, UR_new,
Primary_site_3grp,TNM_T_new,TNM_N_new,TUMORS_SIZE_NEW_CAT, HPVStatus, TreatmentGrp,
Primary_site_recode,FACILITY_TYPE_CD,
DX_LASTCONTACT_DEATH_MONTHS,PUF_VITAL_STATUS
))
#summary(cancer_s)
detach(cancer)
# DX_LASTCONTACT_DEATH_MONTHS has 11714 missing values
# PUF_VITAL_STATUS has 11707 missing values, alive = 1, dead = 0
# Remove observations with missing values in vital status and survival month
cancer_s1=cancer_s %>% drop_na(DX_LASTCONTACT_DEATH_MONTHS)
cancer_s1=cancer_s1[cancer_s1$TUMORS_SIZE_NEW_CAT!=0,]
# Make variables factor
cancer_s1$dead[cancer_s1$PUF_VITAL_STATUS==1]=0
cancer_s1$dead[cancer_s1$PUF_VITAL_STATUS==0]=1
cancer_s1$age_youngold=factor(cancer_s1$age_youngold)
cancer_s1$SEX=factor(cancer_s1$SEX)
cancer_s1$RACE_RECODE=factor(cancer_s1$RACE_RECODE)
cancer_s1$INSURANCE_STATUS=factor(cancer_s1$INSURANCE_STATUS)
cancer_s1$CDCC_TOTAL=factor(cancer_s1$CDCC_TOTAL)
cancer_s1$MED_INC_QUAR_12=factor(cancer_s1$MED_INC_QUAR_12)
cancer_s1$UR_new=factor(cancer_s1$UR_new)
cancer_s1$Primary_site_3grp=factor(cancer_s1$Primary_site_3grp)
cancer_s1$TNM_T_new=factor(cancer_s1$TNM_T_new)
cancer_s1$TNM_N_new=factor(cancer_s1$TNM_N_new)
cancer_s1$TUMORS_SIZE_NEW_CAT=factor(cancer_s1$TUMORS_SIZE_NEW_CAT)
cancer_s1$HPVStatus=factor(cancer_s1$HPVStatus)
cancer_s1$TreatmentGrp=factor(cancer_s1$TreatmentGrp)
cancer_s1$Primary_site_recode=factor(cancer_s1$Primary_site_recode)
cancer_s1$FACILITY_TYPE_CD=factor(cancer_s1$FACILITY_TYPE_CD)
# Divide data into OP and OC subsets
cancer_op=cancer_s1[cancer_s1$Primary_site_3grp==2, ]
cancer_op$Primary_site_recode=as.numeric(cancer_op$Primary_site_recode)
cancer_op$Primary_site_recode=as.factor(cancer_op$Primary_site_recode)
table(cancer_op$Primary_site_recode)
##
## 2 8 9
## 6006 7697 1300
# 2 = base of tong, 8 = Tonsil, 9 = Orpharynx
cancer_op$HPVStatus=as.numeric(cancer_op$HPVStatus)
cancer_op$HPVStatus=as.factor(cancer_op$HPVStatus)
table(cancer_op$HPVStatus)
##
## 1 2
## 5866 9137
# 1 = HPV Neg, 2 = HPV Pos
cancer_oc=cancer_s1[cancer_s1$Primary_site_3grp==1, ]
cancer_oc$Primary_site_recode=as.numeric(cancer_oc$Primary_site_recode)
cancer_oc$Primary_site_recode=as.factor(cancer_oc$Primary_site_recode)
table(cancer_oc$Primary_site_recode)
##
## 1 3 4 5 6 7
## 2573 6897 2892 4353 2156 3946
# 1 = lip, 3= other and unspecified parts of tongue, 4 = gum, 5 = floor of mouth
# 6 = palate, 7 = other and unspecified parts of mouth
KM curves before propensity score weighting
surv_object_op=Surv(time=cancer_op$DX_LASTCONTACT_DEATH_MONTHS,event=cancer_op$dead)
# K-M curve OP
fit_op=survfit(surv_object_op~age_youngold,data=cancer_op)
ggsurvplot(fit_op,data=cancer_op,pval=TRUE,
legend="bottom",legend.title="Age",legend.labs=c("0-39","40-49","50-59","60-69","70+"),
risk.table = TRUE, tables.theme = theme_cleantable())

surv_object_oc=Surv(time=cancer_oc$DX_LASTCONTACT_DEATH_MONTHS,event=cancer_oc$dead)
# K-M curve for OC
fit_oc=survfit(surv_object_oc~age_youngold,data=cancer_oc)
ggsurvplot(fit_oc,data=cancer_oc,pval=TRUE,
legend="bottom",legend.title="Age",legend.labs=c("0-39","40-49","50-59","60-69","70+"),
risk.table = TRUE, tables.theme = theme_cleantable())

Propensity score weighting and balanace checking (separate for OC and OP)
set.seed(1)
mnps.op <- mnps(age_youngold ~ SEX+RACE_RECODE+CDCC_TOTAL+MED_INC_QUAR_12+UR_new
+TNM_T_new+TNM_N_new+TUMORS_SIZE_NEW_CAT+TreatmentGrp+Primary_site_recode+HPVStatus,
data = cancer_op,
estimand = "ATE",
verbose = FALSE,
stop.method = c("es.mean"),
n.trees = 5000)
table_op=bal.table(mnps.op,digits=2)
write.csv(table_op, file = "table_op.csv",row.names=T, na="")
table_op_cov=bal.table(mnps.op,digits = 2,collapse.to = "covariate")
write.csv(table_op_cov, file = "table_op_cov.csv",row.names=T, na="")
#plot(mnps.op,plots=1)
#plot(mnps.op,plots=2)
plot(mnps.op,plots=3)

plot(mnps.op,plots=4)

mnps.oc <- mnps(age_youngold ~ SEX+RACE_RECODE+CDCC_TOTAL+MED_INC_QUAR_12+UR_new
+TNM_T_new+TNM_N_new+TUMORS_SIZE_NEW_CAT+TreatmentGrp+Primary_site_recode,
data = cancer_oc,
estimand = "ATE",
verbose = FALSE,
stop.method = c("es.mean"),
n.trees = 5000)
table_oc=bal.table(mnps.oc,digits=2)
write.csv(table_oc, file = "table_oc.csv",row.names=T, na="")
table_oc_cov=bal.table(mnps.oc,digits = 2,collapse.to = "covariate")
write.csv(table_oc_cov, file = "table_op_cov.csv",row.names=T, na="")
#plot(mnps.oc,plots=1)
#plot(mnps.oc,plots=2)
plot(mnps.oc,plots=3)

plot(mnps.oc,plots=4)

Regression Analysis on propensity score weighted data (separate for OC and OP)
# OP
cancer_op$weight=get.weights(mnps.op,stop.method="es.mean")
surv_op=Surv(cancer_op$DX_LASTCONTACT_DEATH_MONTHS,event=cancer_op$PUF_VITAL_STATUS)
# Post-PSW KM curve for OP
fit_op_ps=survfit(surv_op~age_youngold,data=cancer_op)
ggsurvplot(fit_op_ps,data=cancer_op,pval=TRUE,
legend="bottom",legend.title="Age",legend.labs=c("0-39","40-49","50-59","60-69","70+"),
risk.table = TRUE, tables.theme = theme_cleantable())

# Post PSW Cox Regression for OP
fit_cox_op1=coxph(Surv(DX_LASTCONTACT_DEATH_MONTHS,PUF_VITAL_STATUS==1)~age_youngold,
data=cancer_op,weights=cancer_op$weight)
summary(fit_cox_op1)
## Call:
## coxph(formula = Surv(DX_LASTCONTACT_DEATH_MONTHS, PUF_VITAL_STATUS ==
## 1) ~ age_youngold, data = cancer_op, weights = cancer_op$weight)
##
## n= 15003, number of events= 12107
##
## coef exp(coef) se(coef) z Pr(>|z|)
## age_youngold1 0.02733 1.02770 0.01360 2.009 0.0445 *
## age_youngold2 0.05645 1.05808 0.01353 4.174 2.99e-05 ***
## age_youngold3 0.10034 1.10554 0.01360 7.377 1.62e-13 ***
## age_youngold4 0.11744 1.12461 0.01417 8.286 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## age_youngold1 1.028 0.9730 1.001 1.055
## age_youngold2 1.058 0.9451 1.030 1.087
## age_youngold3 1.106 0.9045 1.076 1.135
## age_youngold4 1.125 0.8892 1.094 1.156
##
## Concordance= 0.507 (se = 0.008 )
## Likelihood ratio test= 100.8 on 4 df, p=<2e-16
## Wald test = 100.9 on 4 df, p=<2e-16
## Score (logrank) test = 100.9 on 4 df, p=<2e-16
# Check proportional hazards assumption
temp_op1=cox.zph(fit_cox_op1,global=T)
print(temp_op1)
## rho chisq p
## age_youngold1 0.0958 7.74 0.00539
## age_youngold2 0.0810 9.97 0.00159
## age_youngold3 0.0505 3.64 0.05637
## age_youngold4 0.0731 4.83 0.02791
## GLOBAL NA 11.69 0.01983
# Proportional hazard assumption is not satisfied. Run Aalen's additive hazards model for OP.
add3=aalen(surv_op~age_youngold,data=cancer_op, weights=cancer_op$weight,residuals = 1)
summary(add3)
## Additive Aalen Model
##
## Test for nonparametric terms
##
## Test for non-significant effects
## Supremum-test of significance p-value H_0: B(t)=0
## (Intercept) Inf 0
## age_youngold1 Inf 0
## age_youngold2 Inf 0
## age_youngold3 Inf 0
## age_youngold4 Inf 0
##
## Test for time invariant effects
## Kolmogorov-Smirnov test p-value H_0:constant effect
## (Intercept) 1.48 0.017
## age_youngold1 2.29 0.013
## age_youngold2 1.67 0.017
## age_youngold3 1.38 0.073
## age_youngold4 1.29 0.161
## Cramer von Mises test p-value H_0:constant effect
## (Intercept) 74.5 0
## age_youngold1 132.0 0
## age_youngold2 61.7 0
## age_youngold3 38.1 0
## age_youngold4 28.9 0
##
##
##
## Call:
## aalen(formula = surv_op ~ age_youngold, data = cancer_op, residuals = 1,
## weights = cancer_op$weight)
# Create estimated cumulative regression coefficients in the additive hazards model with 95% pointwise CI
par(mfrow=c(2,2))
plot(add3,specific.comps = 2, ylim=c(-1,5), mains=F)
title(main="40-49 vs. <40")
plot(add3,specific.comps = 3, ylim=c(-1,5), mains=F)
title(main="50-59 vs. <40")
plot(add3,specific.comps = 4, ylim=c(-1,5), mains=F)
title(main="60-69 vs. <40")
plot(add3,specific.comps = 5, ylim=c(-1,5), mains=F)
title(main="70+ vs. <40")

# OC
cancer_oc$weight=get.weights(mnps.oc,stop.method="es.mean")
surv_oc=Surv(cancer_oc$DX_LASTCONTACT_DEATH_MONTHS,event=cancer_oc$PUF_VITAL_STATUS)
# Post- PSW KM curve for OC
fit_oc_ps=survfit(surv_oc~age_youngold,data=cancer_oc)
ggsurvplot(fit_oc_ps,data=cancer_oc,pval=TRUE,
legend="bottom",legend.title="Age",legend.labs=c("0-39","40-49","50-59","60-69","70+"),
risk.table = TRUE, tables.theme = theme_cleantable())

# Post-PSW Cox regression for OC
fit_cox_oc=coxph(Surv(DX_LASTCONTACT_DEATH_MONTHS,PUF_VITAL_STATUS==1)~age_youngold+TreatmentGrp,
data=cancer_oc,weights=cancer_oc$weight)
summary(fit_cox_oc)
## Call:
## coxph(formula = Surv(DX_LASTCONTACT_DEATH_MONTHS, PUF_VITAL_STATUS ==
## 1) ~ age_youngold + TreatmentGrp, data = cancer_oc, weights = cancer_oc$weight)
##
## n= 22817, number of events= 15451
##
## coef exp(coef) se(coef) z Pr(>|z|)
## age_youngold1 -0.09402 0.91027 0.01114 -8.439 < 2e-16 ***
## age_youngold2 -0.09251 0.91164 0.01119 -8.264 < 2e-16 ***
## age_youngold3 -0.11602 0.89046 0.01132 -10.248 < 2e-16 ***
## age_youngold4 -0.16106 0.85125 0.01200 -13.420 < 2e-16 ***
## TreatmentGrp2 -0.14276 0.86696 0.02612 -5.465 4.63e-08 ***
## TreatmentGrp3 -0.11202 0.89403 0.02338 -4.792 1.65e-06 ***
## TreatmentGrp4 -0.24331 0.78403 0.02380 -10.225 < 2e-16 ***
## TreatmentGrp5 0.03935 1.04014 0.02190 1.797 0.0723 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## age_youngold1 0.9103 1.0986 0.8906 0.9304
## age_youngold2 0.9116 1.0969 0.8919 0.9319
## age_youngold3 0.8905 1.1230 0.8709 0.9104
## age_youngold4 0.8512 1.1747 0.8315 0.8715
## TreatmentGrp2 0.8670 1.1535 0.8237 0.9125
## TreatmentGrp3 0.8940 1.1185 0.8540 0.9359
## TreatmentGrp4 0.7840 1.2755 0.7483 0.8215
## TreatmentGrp5 1.0401 0.9614 0.9964 1.0858
##
## Concordance= 0.547 (se = 0.006 )
## Likelihood ratio test= 974.2 on 8 df, p=<2e-16
## Wald test = 950.6 on 8 df, p=<2e-16
## Score (logrank) test = 953.9 on 8 df, p=<2e-16
# Check proportional hazards assumption
temp_oc1=cox.zph(fit_cox_oc,global=T)
print(temp_oc1)
## rho chisq p
## age_youngold1 -0.00975 0.0980 0.754
## age_youngold2 0.01383 0.3491 0.555
## age_youngold3 0.00590 0.0682 0.794
## age_youngold4 0.00240 0.0149 0.903
## TreatmentGrp2 0.02107 1.5300 0.216
## TreatmentGrp3 0.01945 1.2683 0.260
## TreatmentGrp4 0.02074 1.4541 0.228
## TreatmentGrp5 0.01279 0.5566 0.456
## GLOBAL NA 4.2572 0.833
# Assumption is satisfied