library("tidyverse")
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 4.0.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library("dplyr")
library("readxl")
library("compareGroups")
library("readxl")
library("lubridate")
library("compareGroups")
library("survival")
library("survminer")
## Loading required package: ggpubr
##
## Attaching package: 'survminer'
##
## The following object is masked from 'package:survival':
##
## myeloma
##Combining new data
sts_2012_2022 <- read_excel("//users12/users12$/tumins01/Data/Personal/Projects/STS/Data 2012-2022/sts_2012_2022.xlsx") #3358 observations, 631 variables
#names(sts_2012_2022)
sts_2022_2024 <- read_excel("//users12/users12$/tumins01/Data/Personal/Projects/STS/Data 2022-2024/2022-2024_Thoracic_Operations.xlsx") #1410 observations, 554 variables
#names(sts_2012_2022)
sts_2022_2024_mrns <- read_excel("//users12/users12$/tumins01/Data/Personal/Projects/STS/Data 2022-2024/STS_THORACIC_2022-2024.xlsx")
sts_2022_2024_mrns <- sts_2022_2024_mrns %>% rename(PatID = "STS_PatID") %>% rename(mrn = "MRN") %>% rename(surgdt = "DateOfSurgery")
sts_2022_2024 <- left_join(sts_2022_2024,sts_2022_2024_mrns, by = "PatID" )
sts_2012_2022 <- sts_2012_2022 %>% mutate(across(everything(), as.character))
sts_2022_2024 <- sts_2022_2024 %>% mutate(across(everything(), as.character))
common_cols <- intersect(names(sts_2012_2022), names(sts_2022_2024))
sts_2012_2022_common <- sts_2012_2022[, common_cols]
sts_2022_2024_common <- sts_2022_2024[, common_cols]
sts_2012_2024 <- bind_rows(sts_2012_2022_common, sts_2022_2024_common)#4768 obs, 532 variables in common
#removing patients missing MRNs
sts_2012_2024 <- sts_2012_2024 %>% filter(!is.na(mrn))#4577 obs, 532 columns
#removing true duplicates (same patient same surgery date)
test <- sts_2012_2024 %>% select(mrn, surgdt, Age)
dups <- test[duplicated(test[c("mrn", "surgdt")]) | duplicated(test[c("mrn", "surgdt")], fromLast = TRUE), ]
dups <- sts_2012_2024[duplicated(sts_2012_2024[c("mrn", "surgdt")]) | duplicated(sts_2012_2024[c("mrn", "surgdt")], fromLast = TRUE), ]
sts_2012_2024_clean <- sts_2012_2024 %>%
distinct(mrn, surgdt, .keep_all = TRUE) #4141
dup_mrns <- unique(sts_2012_2024_clean$mrn[duplicated(sts_2012_2024_clean$mrn)])
#dup_mrns
##Creating categories of surgery
sts_2012_2024_clean <- sts_2012_2024_clean %>% mutate(disease = ifelse(CategoryPrim == 130 | CategoryPrim == 150 | CategoryPrim == 160 | CategoryPrim == 170 | CategoryPrim == 180 | CategoryPrim == 190 | CategoryPrim == 1720, "lung cancer",
ifelse(CategoryPrim == 710 | CategoryPrim == 700 | CategoryPrim == 690 | CategoryPrim == 680 | CategoryPrim == 1460, "esophageal cancer",
ifelse(CategoryPrim == 380, "thymus cancer",
ifelse(CategoryPrim == 510, "mesothelioma",
"other")))))
table(sts_2012_2024_clean$disease)
##
## esophageal cancer lung cancer mesothelioma other
## 81 2503 116 1318
## thymus cancer
## 121
meso_2012_2024 <- sts_2012_2024_clean %>% filter(disease == "mesothelioma") #116 obs
meso_2012_2024_unique <- meso_2012_2024 %>% group_by(mrn) %>% slice(1) %>% ungroup() #113 unique patients
dup_mrns_meso <- unique(meso_2012_2024$mrn[duplicated(meso_2012_2024$mrn)])
dup_mrns_meso
## [1] "9128120" "H186869"
#meso_2012_2024_unique_simple <- meso_2012_2024_unique %>% select(mrn, Age, surgdt, Surgeon)
#write.csv(meso_2012_2024_unique_simple, "Meso/meso_2012_2024_unique_simple.csv", row.names = FALSE)
##Nico Pinzon Data abstraction (Nov 2025)
meso_sts_EHR <- read_excel("//users12/users12$/tumins01/Data/Personal/Students/Nico/meso_2012_2024_unique_simple.xlsx")
##Abestos Exposure by Sex
table(meso_sts_EHR$Asbestos_Exposure) #61
##
## 0 1
## 29 61
table(meso_sts_EHR$Asbestos_Exposure_Probable) #68
##
## 0 1
## 28 68
table(meso_sts_EHR$Asbestos_Exposure_Type)
##
## 1980s in Houston, TX Apartment
## 1 7
## Boiler factory Child exposure
## 1 4
## Construction Dry cleaning breathing exposure
## 15 1
## Electricity work Factory
## 5 4
## Husband Not specified
## 2 7
## Occupational, not specified which Shipyard
## 2 11
## Tilling industry
## 1
table(meso_sts_EHR$BAP1_expression)#35
##
## 0 1
## 47 35
table(meso_sts_EHR$Asbestos_Exposure, meso_sts_EHR$BAP1_expression) #18/35
##
## 0 1
## 0 7 13
## 1 29 18
table(meso_sts_EHR$Asbestos_Exposure_Probable, meso_sts_EHR$BAP1_expression) #22/35
##
## 0 1
## 0 7 12
## 1 32 22
table(meso_sts_EHR$Sex)#29 (25.7%) females
##
## F M
## 29 83
table(meso_sts_EHR$Asbestos_Exposure_Probable, meso_sts_EHR$Sex)
##
## F M
## 0 13 15
## 1 10 58
table(meso_sts_EHR$BAP1_expression, meso_sts_EHR$Sex)
##
## F M
## 0 12 35
## 1 10 25
table_compare_sex <- compareGroups(Asbestos_Exposure_Probable ~ Sex, data = meso_sts_EHR)
summary(table_compare_sex)
##
## --- Descriptives of each row-variable by groups of 'Asbestos_Exposure_Probable' ---
##
## -------------------
## row-variable: Sex
##
## F M F% M% p.overall
## [ALL] 23 73 23.95833 76.04167
## 0 13 15 46.42857 53.57143 0.002313
## 1 10 58 14.70588 85.29412
##
## OR OR.lower OR.upper
## F 1
## M 4.900978 1.804754 13.87374
table_out <- createTable(table_compare_sex, show.ratio = TRUE, show.all = TRUE)
print(table_out, show.ratio = TRUE, show.all = TRUE)
##
## --------Summary descriptives table by 'Asbestos_Exposure_Probable'---------
##
## _________________________________________________________________________
## [ALL] 0 1 OR p.ratio p.overall
## N=96 N=28 N=68
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
## Sex: 0.002
## F 23 (24.0%) 13 (46.4%) 10 (14.7%) Ref. Ref.
## M 73 (76.0%) 15 (53.6%) 58 (85.3%) 4.90 [1.80;13.9] 0.002
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
meso_sts_EHR <- meso_sts_EHR %>% mutate(Asbestos_Exposure_Probable2 = ifelse(BAP1_expression == 1, 1, Asbestos_Exposure_Probable))
table(meso_sts_EHR$Asbestos_Exposure_Probable2)#67
##
## 0 1
## 7 67
table(meso_sts_EHR$Asbestos_Exposure_Probable2, meso_sts_EHR$BAP1_expression)
##
## 0 1
## 0 7 0
## 1 32 35
table_compare_sex2 <- compareGroups(Asbestos_Exposure_Probable2 ~ Sex + Age, data = meso_sts_EHR)
## Warning in chisq.test(xx, correct = FALSE): Chi-squared approximation may be
## incorrect
summary(table_compare_sex2)
##
## --- Descriptives of each row-variable by groups of 'Asbestos_Exposure_Probable2' ---
##
## -------------------
## row-variable: Sex
##
## F M F% M% p.overall
## [ALL] 20 54 27.02703 72.97297
## 0 5 2 71.42857 28.57143 0.013535
## 1 15 52 22.38806 77.61194
##
## OR OR.lower OR.upper
## F 1
## M 8.035833 1.493429 67.59628
##
## -------------------
## row-variable: Age
##
## N mean sd lower upper p.overall
## [ALL] 74 67.24324 11.6511 64.5439 69.94259
## 0 7 66.14286 11.49534 55.51144 76.77427 0.797815
## 1 67 67.35821 11.74696 64.4929 70.22352
##
## OR OR.lower OR.upper
## [1,] 1.008655 0.946154 1.075286
table_out <- createTable(table_compare_sex2, show.ratio = TRUE, show.all = TRUE)
print(table_out, show.ratio = TRUE, show.all = TRUE)
##
## --------Summary descriptives table by 'Asbestos_Exposure_Probable2'---------
##
## ____________________________________________________________________________
## [ALL] 0 1 OR p.ratio p.overall
## N=74 N=7 N=67
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
## Sex: 0.014
## F 20 (27.0%) 5 (71.4%) 15 (22.4%) Ref. Ref.
## M 54 (73.0%) 2 (28.6%) 52 (77.6%) 8.04 [1.49;67.6] 0.015
## Age 67.2 (11.7) 66.1 (11.5) 67.4 (11.7) 1.01 [0.95;1.08] 0.792 0.798
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
##Meso Survival by Asbestos Exposure and Sex #looked up deaths, last encounter dates in EPIC on 3.26.2024; needs to be updated with new patients & more recent EPIC data
meso2 <- read.csv("//users12/users12$/tumins01/Data/Personal/Projects/STS/Meso/meso.csv") #2012-2022; total patients n=80 (so 30 new patients since 2022)
table(meso2$Dead2) #11 dead
##
## 0 1
## 68 11
names(meso2) ##Which other biomarkers beyond BAP.1 are useful??
## [1] "mrn" "Age" "Gender" "surgdt" "truedup"
## [6] "Surgeon" "Proc" "Dead2" "deaddt" "CensorDate"
## [11] "PDL.1...." "Ki.67...." "BAP.1" "Cam.5.2" "calretinin"
## [16] "WT.1" "CK5.6" "D2.40" "BER.EP4" "CEA"
## [21] "B72.3" "TTF.1" "Leu.M1" "CK7"
meso2$surgdt <- as.Date(meso2$surgdt, format = "%m/%d/%Y")
meso2$CensorDate <- as.Date(meso2$CensorDate, format = "%m/%d/%Y")
meso2$survDays <- difftime(meso2$CensorDate, meso2$surgdt, units = "days")
meso2$survYears <- (meso2$survDays)/365
meso2$survDays <- as.numeric(meso2$survDays)
meso2$survYears <- as.numeric(meso2$survYears)
meso2 <- meso2 %>% select(mrn, Dead2:survYears)
meso2 <- right_join(meso_sts_EHR, meso2, by = "mrn")
#survival
Surv(meso2$survDays, meso2$Dead2)[1:10]
## [1] 1706+ 401+ 129+ 3138+ 150+ 658+ 38+ 1905+ 24 258+
surv_object <- Surv(meso2$survDays, meso2$Dead2)
surv_object
## [1] 1706+ 401+ 129+ 3138+ 150+ 658+ 38+ 1905+ 24 258+ 2184+ 406+
## [13] 1846+ 211+ 1169+ 181+ 426+ 213+ 367+ 373+ 2972+ 185+ 176+ 46+
## [25] 328 307+ 169+ 282+ 1605+ NA? 213+ 440+ 1659+ 17+ 545+ 499+
## [37] 448 527+ 1334+ 171+ 495+ 304+ 318+ 772+ 189+ 70+ 1122+ 1789+
## [49] 192+ 185+ 184+ 142+ 1403 828+ 247+ 296 20+ 58 2069+ 1970+
## [61] 154 1228+ 425+ 1228 1088+ 143+ 609+ 254+ 974+ 143 861 174+
## [73] 382+ 1031+ 525+ 419+ 633+ 149+ 736+ 32
fit1 <- survfit(surv_object ~ 1, data = meso2)
summary(fit1)
## Call: survfit(formula = surv_object ~ 1, data = meso2)
##
## 1 observation deleted due to missingness
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 24 77 1 0.987 0.0129 0.962 1.000
## 32 76 1 0.974 0.0181 0.939 1.000
## 58 73 1 0.961 0.0223 0.918 1.000
## 143 69 1 0.947 0.0259 0.897 0.999
## 154 65 1 0.932 0.0293 0.876 0.991
## 296 47 1 0.912 0.0348 0.847 0.983
## 328 43 1 0.891 0.0399 0.816 0.973
## 448 33 1 0.864 0.0470 0.777 0.961
## 861 21 1 0.823 0.0601 0.713 0.950
## 1228 15 1 0.768 0.0772 0.631 0.935
## 1403 12 1 0.704 0.0936 0.543 0.914
ggsurvplot(fit1, data = meso2, pval = TRUE)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## ℹ The deprecated feature was likely used in the ggpubr package.
## Please report the issue at <https://github.com/kassambara/ggpubr/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: ggtheme is not a valid theme.
## Please use `theme()` to construct themes.
## Warning in .pvalue(fit, data = data, method = method, pval = pval, pval.coord = pval.coord, : There are no survival curves to be compared.
## This is a null model.
## Ignoring unknown labels:
## • linetype : "1"
## Ignoring unknown labels:
## • linetype : "1"
fit2 <- survfit(surv_object ~ Sex, data = meso2)
summary(fit2)
## Call: survfit(formula = surv_object ~ Sex, data = meso2)
##
## 1 observation deleted due to missingness
## Sex=F
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 143 18 1 0.944 0.0540 0.844 1
## 154 16 1 0.885 0.0763 0.748 1
##
## Sex=M
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 24 58 1 0.983 0.0171 0.950 1.000
## 32 57 1 0.966 0.0240 0.920 1.000
## 58 54 1 0.948 0.0294 0.892 1.000
## 296 35 1 0.921 0.0391 0.847 1.000
## 328 31 1 0.891 0.0478 0.802 0.990
## 448 22 1 0.850 0.0604 0.740 0.977
## 861 11 1 0.773 0.0919 0.612 0.976
## 1228 6 1 0.644 0.1404 0.420 0.987
## 1403 4 1 0.483 0.1747 0.238 0.982
ggsurvplot(fit2, data = meso2, pval = TRUE)
## Warning: ggtheme is not a valid theme.
## Please use `theme()` to construct themes.
## Ignoring unknown labels:
## • fill : "Strata"
## • linetype : "1"
## Ignoring unknown labels:
## • fill : "Strata"
## • linetype : "1"
fit3 <- survfit(surv_object ~ BAP1_expression, data = meso2)
summary(fit3)
## Call: survfit(formula = surv_object ~ BAP1_expression, data = meso2)
##
## 29 observations deleted due to missingness
## BAP1_expression=0
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 58 29 1 0.966 0.0339 0.901 1
## 143 27 1 0.930 0.0479 0.840 1
## 861 9 1 0.826 0.1063 0.642 1
## 1228 5 1 0.661 0.1706 0.399 1
## 1403 3 1 0.441 0.2129 0.171 1
##
## BAP1_expression=1
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 32 20 1 0.950 0.0487 0.859 1
## 296 14 1 0.882 0.0795 0.739 1
## 448 10 1 0.794 0.1101 0.605 1
ggsurvplot(fit3, data = meso2, pval = TRUE)
## Warning: ggtheme is not a valid theme.
## Please use `theme()` to construct themes.
## Ignoring unknown labels:
## • fill : "Strata"
## • linetype : "1"
## Ignoring unknown labels:
## • fill : "Strata"
## • linetype : "1"