library("tidyverse")
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   4.0.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library("dplyr")
library("readxl")
library("compareGroups")
library("readxl")
library("lubridate")
library("compareGroups")
library("survival")
library("survminer")
## Loading required package: ggpubr
## 
## Attaching package: 'survminer'
## 
## The following object is masked from 'package:survival':
## 
##     myeloma

##Combining new data

sts_2012_2022 <- read_excel("//users12/users12$/tumins01/Data/Personal/Projects/STS/Data 2012-2022/sts_2012_2022.xlsx") #3358 observations, 631 variables
#names(sts_2012_2022)

sts_2022_2024 <- read_excel("//users12/users12$/tumins01/Data/Personal/Projects/STS/Data 2022-2024/2022-2024_Thoracic_Operations.xlsx") #1410 observations, 554 variables
#names(sts_2012_2022)
sts_2022_2024_mrns <- read_excel("//users12/users12$/tumins01/Data/Personal/Projects/STS/Data 2022-2024/STS_THORACIC_2022-2024.xlsx")
sts_2022_2024_mrns <- sts_2022_2024_mrns %>% rename(PatID = "STS_PatID") %>% rename(mrn = "MRN") %>% rename(surgdt = "DateOfSurgery")
sts_2022_2024 <- left_join(sts_2022_2024,sts_2022_2024_mrns, by = "PatID" )

sts_2012_2022 <- sts_2012_2022 %>% mutate(across(everything(), as.character))
sts_2022_2024 <- sts_2022_2024 %>% mutate(across(everything(), as.character))

common_cols <- intersect(names(sts_2012_2022), names(sts_2022_2024))

sts_2012_2022_common <- sts_2012_2022[, common_cols]
sts_2022_2024_common <- sts_2022_2024[, common_cols]

sts_2012_2024 <- bind_rows(sts_2012_2022_common, sts_2022_2024_common)#4768 obs, 532 variables in common

#removing patients missing MRNs
sts_2012_2024 <- sts_2012_2024 %>% filter(!is.na(mrn))#4577 obs, 532 columns 

#removing true duplicates (same patient same surgery date)
test <- sts_2012_2024 %>% select(mrn, surgdt, Age)
dups <- test[duplicated(test[c("mrn", "surgdt")]) | duplicated(test[c("mrn", "surgdt")], fromLast = TRUE), ]

dups <- sts_2012_2024[duplicated(sts_2012_2024[c("mrn", "surgdt")]) | duplicated(sts_2012_2024[c("mrn", "surgdt")], fromLast = TRUE), ]

sts_2012_2024_clean <- sts_2012_2024 %>%
distinct(mrn, surgdt, .keep_all = TRUE) #4141

dup_mrns <- unique(sts_2012_2024_clean$mrn[duplicated(sts_2012_2024_clean$mrn)])
#dup_mrns

##Creating categories of surgery 
sts_2012_2024_clean <- sts_2012_2024_clean %>% mutate(disease = ifelse(CategoryPrim == 130 | CategoryPrim == 150 | CategoryPrim == 160 | CategoryPrim == 170 | CategoryPrim == 180 | CategoryPrim == 190 | CategoryPrim == 1720, "lung cancer",
ifelse(CategoryPrim == 710 | CategoryPrim == 700 | CategoryPrim == 690 | CategoryPrim == 680 | CategoryPrim == 1460, "esophageal cancer",                 
ifelse(CategoryPrim == 380, "thymus cancer",  
ifelse(CategoryPrim == 510, "mesothelioma",
"other")))))

table(sts_2012_2024_clean$disease)
## 
## esophageal cancer       lung cancer      mesothelioma             other 
##                81              2503               116              1318 
##     thymus cancer 
##               121
meso_2012_2024 <- sts_2012_2024_clean %>% filter(disease == "mesothelioma") #116 obs
meso_2012_2024_unique <- meso_2012_2024 %>% group_by(mrn) %>% slice(1) %>% ungroup() #113 unique patients

dup_mrns_meso <- unique(meso_2012_2024$mrn[duplicated(meso_2012_2024$mrn)])
dup_mrns_meso
## [1] "9128120" "H186869"
#meso_2012_2024_unique_simple <- meso_2012_2024_unique %>% select(mrn, Age, surgdt, Surgeon)
#write.csv(meso_2012_2024_unique_simple, "Meso/meso_2012_2024_unique_simple.csv", row.names = FALSE)

##Nico Pinzon Data abstraction (Nov 2025)

meso_sts_EHR <- read_excel("//users12/users12$/tumins01/Data/Personal/Students/Nico/meso_2012_2024_unique_simple.xlsx")

##Abestos Exposure by Sex

table(meso_sts_EHR$Asbestos_Exposure) #61
## 
##  0  1 
## 30 61
table(meso_sts_EHR$Asbestos_Exposure_Probable) #68
## 
##  0  1 
## 29 68
table(meso_sts_EHR$Asbestos_Exposure_Type)
## 
##              1980s in Houston, TX                         Apartment 
##                                 1                                 7 
##                    Boiler factory                    Child exposure 
##                                 1                                 4 
##                      Construction   Dry cleaning breathing exposure 
##                                15                                 1 
##                  Electricity work                           Factory 
##                                 5                                 4 
##                           Husband                     Not specified 
##                                 2                                 7 
## Occupational, not specified which                          Shipyard 
##                                 2                                11 
##                  Tilling industry 
##                                 1
table(meso_sts_EHR$BAP1_expression)#35
## 
##  0  1 
## 47 36
table(meso_sts_EHR$Asbestos_Exposure, meso_sts_EHR$BAP1_expression) #18/35
##    
##      0  1
##   0  7 14
##   1 29 18
table(meso_sts_EHR$Asbestos_Exposure_Probable, meso_sts_EHR$BAP1_expression) #22/35
##    
##      0  1
##   0  7 13
##   1 32 22
table(meso_sts_EHR$Sex)#29 (25.7%) females
## 
##  F  M 
## 29 83
table(meso_sts_EHR$Asbestos_Exposure_Probable, meso_sts_EHR$Sex)
##    
##      F  M
##   0 13 15
##   1 10 58
table(meso_sts_EHR$BAP1_expression, meso_sts_EHR$Sex)
##    
##      F  M
##   0 12 35
##   1 10 25
table_compare_sex <- compareGroups(Asbestos_Exposure_Probable ~  Sex, data = meso_sts_EHR)
summary(table_compare_sex)
## 
##  --- Descriptives of each row-variable by groups of 'Asbestos_Exposure_Probable' ---
## 
## ------------------- 
## row-variable: Sex 
## 
##       F  M  F%       M%       p.overall
## [ALL] 23 73 23.95833 76.04167          
## 0     13 15 46.42857 53.57143 0.002313 
## 1     10 58 14.70588 85.29412          
## 
##   OR       OR.lower OR.upper
## F 1                         
## M 4.900978 1.804754 13.87374
table_out <- createTable(table_compare_sex, show.ratio = TRUE, show.all = TRUE)
print(table_out, show.ratio = TRUE, show.all = TRUE)
## 
## --------Summary descriptives table by 'Asbestos_Exposure_Probable'---------
## 
## _________________________________________________________________________ 
##         [ALL]        0          1             OR        p.ratio p.overall 
##          N=96       N=28       N=68                                       
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯ 
## Sex:                                                              0.002   
##     F 23 (24.0%) 13 (46.4%) 10 (14.7%)       Ref.        Ref.             
##     M 73 (76.0%) 15 (53.6%) 58 (85.3%) 4.90 [1.80;13.9]  0.002            
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
meso_sts_EHR <- meso_sts_EHR %>% mutate(Asbestos_Exposure_Probable2 = ifelse(BAP1_expression == 1, 1, Asbestos_Exposure_Probable))

table(meso_sts_EHR$Asbestos_Exposure_Probable2)#67
## 
##  0  1 
##  7 68
table(meso_sts_EHR$Asbestos_Exposure_Probable2, meso_sts_EHR$BAP1_expression)
##    
##      0  1
##   0  7  0
##   1 32 36
table_compare_sex2 <- compareGroups(Asbestos_Exposure_Probable2 ~  Sex + Age, data = meso_sts_EHR)
## Warning in chisq.test(xx, correct = FALSE): Chi-squared approximation may be
## incorrect
summary(table_compare_sex2)
## 
##  --- Descriptives of each row-variable by groups of 'Asbestos_Exposure_Probable2' ---
## 
## ------------------- 
## row-variable: Sex 
## 
##       F  M  F%       M%       p.overall
## [ALL] 20 54 27.02703 72.97297          
## 0     5  2  71.42857 28.57143 0.013535 
## 1     15 52 22.38806 77.61194          
## 
##   OR       OR.lower OR.upper
## F 1                         
## M 8.035833 1.493429 67.59628
## 
## ------------------- 
## row-variable: Age 
## 
##       N  mean     sd       lower    upper    p.overall
## [ALL] 75 67.26667 11.57389 64.60376 69.92958          
## 0     7  66.14286 11.49534 55.51144 76.77427 0.79366  
## 1     68 67.38235 11.66066 64.55987 70.20483          
## 
##      OR       OR.lower OR.upper
## [1,] 1.008927 0.946172 1.075844
table_out <- createTable(table_compare_sex2, show.ratio = TRUE, show.all = TRUE)
print(table_out, show.ratio = TRUE, show.all = TRUE)
## 
## --------Summary descriptives table by 'Asbestos_Exposure_Probable2'---------
## 
## ____________________________________________________________________________ 
##          [ALL]         0           1             OR        p.ratio p.overall 
##          N=75         N=7        N=68                                        
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯ 
## Sex:                                                                 0.014   
##     F 20 (27.0%)   5 (71.4%)  15 (22.4%)        Ref.        Ref.             
##     M 54 (73.0%)   2 (28.6%)  52 (77.6%)  8.04 [1.49;67.6]  0.015            
## Age   67.3 (11.6) 66.1 (11.5) 67.4 (11.7) 1.01 [0.95;1.08]  0.786    0.794   
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯

##Meso Survival by Asbestos Exposure and Sex #looked up deaths, last encounter dates in EPIC on 3.26.2024; needs to be updated with new patients & more recent EPIC data

meso2 <- read.csv("//users12/users12$/tumins01/Data/Personal/Projects/STS/Meso/meso.csv") #2012-2022; total patients n=80 (so 30 new patients since 2022)
table(meso2$Dead2) #11 dead 
## 
##  0  1 
## 68 11
names(meso2) ##Which other biomarkers beyond BAP.1 are useful??
##  [1] "mrn"        "Age"        "Gender"     "surgdt"     "truedup"   
##  [6] "Surgeon"    "Proc"       "Dead2"      "deaddt"     "CensorDate"
## [11] "PDL.1...."  "Ki.67...."  "BAP.1"      "Cam.5.2"    "calretinin"
## [16] "WT.1"       "CK5.6"      "D2.40"      "BER.EP4"    "CEA"       
## [21] "B72.3"      "TTF.1"      "Leu.M1"     "CK7"
meso2$surgdt <- as.Date(meso2$surgdt, format = "%m/%d/%Y")
meso2$CensorDate <- as.Date(meso2$CensorDate, format = "%m/%d/%Y")

meso2$survDays <- difftime(meso2$CensorDate, meso2$surgdt, units = "days")
meso2$survYears <- (meso2$survDays)/365

meso2$survDays <- as.numeric(meso2$survDays)
meso2$survYears <- as.numeric(meso2$survYears)

meso2 <- meso2 %>% select(mrn, Dead2:survYears)
meso2 <- right_join(meso_sts_EHR, meso2, by = "mrn")

#survival

Surv(meso2$survDays, meso2$Dead2)[1:10]
##  [1] 1706+  401+  129+ 3138+  150+  658+   38+ 1905+   24   258+
surv_object <- Surv(meso2$survDays, meso2$Dead2)
surv_object
##  [1] 1706+  401+  129+ 3138+  150+  658+   38+ 1905+   24   258+ 2184+  406+
## [13] 1846+  211+ 1169+  181+  426+  213+  367+  373+ 2972+  185+  176+   46+
## [25]  328   307+  169+  282+ 1605+   NA?  213+  440+ 1659+   17+  545+  499+
## [37]  448   527+ 1334+  171+  495+  304+  318+  772+  189+   70+ 1122+ 1789+
## [49]  192+  185+  184+  142+ 1403   828+  247+  296    20+   58  2069+ 1970+
## [61]  154  1228+  425+ 1228  1088+  143+  609+  254+  974+  143   861   174+
## [73]  382+ 1031+  525+  419+  633+  149+  736+   32
fit1 <- survfit(surv_object ~ 1, data = meso2)
summary(fit1)
## Call: survfit(formula = surv_object ~ 1, data = meso2)
## 
## 1 observation deleted due to missingness 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##    24     77       1    0.987  0.0129        0.962        1.000
##    32     76       1    0.974  0.0181        0.939        1.000
##    58     73       1    0.961  0.0223        0.918        1.000
##   143     69       1    0.947  0.0259        0.897        0.999
##   154     65       1    0.932  0.0293        0.876        0.991
##   296     47       1    0.912  0.0348        0.847        0.983
##   328     43       1    0.891  0.0399        0.816        0.973
##   448     33       1    0.864  0.0470        0.777        0.961
##   861     21       1    0.823  0.0601        0.713        0.950
##  1228     15       1    0.768  0.0772        0.631        0.935
##  1403     12       1    0.704  0.0936        0.543        0.914
ggsurvplot(fit1, data = meso2, pval = TRUE)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## ℹ The deprecated feature was likely used in the ggpubr package.
##   Please report the issue at <https://github.com/kassambara/ggpubr/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: ggtheme is not a valid theme.
## Please use `theme()` to construct themes.
## Warning in .pvalue(fit, data = data, method = method, pval = pval, pval.coord = pval.coord, : There are no survival curves to be compared. 
##  This is a null model.
## Ignoring unknown labels:
## • linetype : "1"
## Ignoring unknown labels:
## • linetype : "1"

fit2 <- survfit(surv_object ~ Sex, data = meso2)
summary(fit2)
## Call: survfit(formula = surv_object ~ Sex, data = meso2)
## 
## 1 observation deleted due to missingness 
##                 Sex=F 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##   143     18       1    0.944  0.0540        0.844            1
##   154     16       1    0.885  0.0763        0.748            1
## 
##                 Sex=M 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##    24     58       1    0.983  0.0171        0.950        1.000
##    32     57       1    0.966  0.0240        0.920        1.000
##    58     54       1    0.948  0.0294        0.892        1.000
##   296     35       1    0.921  0.0391        0.847        1.000
##   328     31       1    0.891  0.0478        0.802        0.990
##   448     22       1    0.850  0.0604        0.740        0.977
##   861     11       1    0.773  0.0919        0.612        0.976
##  1228      6       1    0.644  0.1404        0.420        0.987
##  1403      4       1    0.483  0.1747        0.238        0.982
ggsurvplot(fit2, data = meso2, pval = TRUE)
## Warning: ggtheme is not a valid theme.
## Please use `theme()` to construct themes.
## Ignoring unknown labels:
## • fill : "Strata"
## • linetype : "1"
## Ignoring unknown labels:
## • fill : "Strata"
## • linetype : "1"

fit3 <- survfit(surv_object ~ BAP1_expression, data = meso2)
summary(fit3)
## Call: survfit(formula = surv_object ~ BAP1_expression, data = meso2)
## 
## 29 observations deleted due to missingness 
##                 BAP1_expression=0 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##    58     29       1    0.966  0.0339        0.901            1
##   143     27       1    0.930  0.0479        0.840            1
##   861      9       1    0.826  0.1063        0.642            1
##  1228      5       1    0.661  0.1706        0.399            1
##  1403      3       1    0.441  0.2129        0.171            1
## 
##                 BAP1_expression=1 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##    32     20       1    0.950  0.0487        0.859            1
##   296     14       1    0.882  0.0795        0.739            1
##   448     10       1    0.794  0.1101        0.605            1
ggsurvplot(fit3, data = meso2, pval = TRUE)
## Warning: ggtheme is not a valid theme.
## Please use `theme()` to construct themes.
## Ignoring unknown labels:
## • fill : "Strata"
## • linetype : "1"
## Ignoring unknown labels:
## • fill : "Strata"
## • linetype : "1"