Import libraries

library(data.table);library(ggfortify);library(readxl);library(fitdistrplus)
library(stringr);library(lattice); library(nlme);library(gridExtra);library(mixdist)
library(ggplot2);library(tidyverse);library(naniar); library(gtsummary); library(mclust)

Import Aurora data, Batch 1 E2, and Batch 2 E2 data

#import full aurora data
df0 <- read.csv("full_aurora.csv") #4745 x341

#import batch 1
e2_batch1 <- read.csv("AURORA_estrogen_excl_outliers_excl_BMI.csv") #283 x 3035

#subset batch 1 to relevant demographic data
df <- e2_batch1[,c("PID","ED_Age","HighestGrade","BMI","WK8_Pain","M3_Pain","M6_Pain","E2.Concentration..pg.ml._T0_sqrt","ED_NowPain.x","Race","Date.Run_T0")] #283 x 14

#import batch 2 
e2_batch2 <- read_excel("E2_batch2_reran_final.xlsx") #900 x 6

Rename some columns in Batch 1

df <- df %>%
  rename(E2_T0_sqrt = `E2.Concentration..pg.ml._T0_sqrt`) %>%
  rename(ED_Pain = `ED_NowPain.x`) %>%
  mutate(`M6_Pain` = as.numeric(`M6_Pain`))#283 x 11

Import key for batch 2

#PID to inventory ID key for estrogen
key <- read_excel("e2_pid_key.xlsx") #906 x 6

#Link estrogen data with key
e2_batch2_keyed <- e2_batch2 %>% 
  inner_join(key,by=c("Inventory Code" = "InventoryCode")) #900 x 13

Combine keyed E2 and aurora demographic data

Note: the original Aurora data, named here df0, has several duplicates for some PIDs. For example, PID 101394 has 4 duplicate rows in df0. I imagine this is a feature of the long data, and there is a column somewhere that provides categorization for each entry. However, at cursory glance of the first 50 rows, I do not see a column to explain the multiple levels. For this reason I will remove the duplicates after joining the keyed E2 data e2_batch2_keyed with the Aurora data df0, and subsetting the data to estradiol collected at time point 0, T0.

df2 <- e2_batch2_keyed %>% 
  left_join(df0,by=c("PID" = "PID")) #1211 x 353

#subset to relevant demographic data
df2 <- df2[,c("PID","ED_Age","ED_HighestGrade","BMI","AlternateID", "WK8_Pain_C","M3_Pain_C","M6_Pain_C","Mean Concentration (pg/mL)","% CV","Comments","ED_NowPain_C","ED_RaceEthCode","Plate Number")] #1211 x 12

Batch 1 only has T0 for estrogen levels. Subset df2 to T0 collection only. While square root transform E2 values in batch 2 to match transformation in Batch 1. Rename some columns.

df2 <- df2 %>%
  filter(AlternateID == "T0") %>%
  mutate(`Mean Concentration (pg/mL)` = as.numeric(`Mean Concentration (pg/mL)`)) %>%
  mutate(E2_T0_sqrt = sqrt(`Mean Concentration (pg/mL)`)) %>%
  rename(CV = `% CV`) %>%
  mutate(`CV` = as.numeric(`CV`)) #df2 = 590 x 15

Remove duplicates. df2 rows goes from 590 cases to 457 cases.

df2 <- distinct(df2) #457 x 15

Add CV confidence brackets to df2

#note - use CVs_T0 for batch 1 CVs

df2 <- df2 %>%
  mutate(e2_confidence = case_when(
    CV >= 0 & CV <= 10 ~ 1,
    CV > 10 & CV <= 20 ~ 2,
    CV > 20 & CV <= 30 ~ 3,
    CV > 30 & CV <= 40 ~ 4,
    CV > 40 & CV <= 50 ~ 5,
    CV > 50 ~ 6,
    TRUE ~ NA_integer_
  )) #dimensions 457 x 16

Combine Batch 1 and Batch 2 data into one long df

df_batch1 <- data.frame(PID = df$PID, E2_T0_sqrt = df$E2_T0_sqrt, batch = "1", ED_Age = df$ED_Age, ED_Pain = df$ED_Pain, BMI = df$BMI, WK8_Pain = df$WK8_Pain, M3_Pain = df$M3_Pain, M6_Pain = df$M6_Pain)
df_batch2 <- data.frame(PID = df2$PID, E2_T0_sqrt = df2$E2_T0_sqrt, batch = "2", ED_Age = df2$ED_Age, ED_Pain = df2$ED_NowPain_C, BMI = df2$BMI, WK8_Pain = df2$WK8_Pain_C, M3_Pain = df2$M3_Pain_C, M6_Pain = df2$M6_Pain_C)

combined_batches <- rbind(df_batch1, df_batch2) #740 x 6

Plot data

#elements for plots
gghisto <- list(
  theme(axis.text.x = element_text(face="bold", color="cornflowerblue", size=14),
          axis.text.y = element_text(face="bold", color="royalblue4", 
          size=16, angle=25),
          axis.title=element_text(size=17,face="bold"),
          plot.title = element_text(size=14,face="bold.italic"))
)

create plots

p1 <- ggplot(combined_batches, aes(x = E2_T0_sqrt, fill = batch)) +
  geom_density(alpha = 0.4, linewidth=1.5) +
  labs(x = "Square Root transformed E2", y = "Density", title = "Density Plot of square root E2 values \nfor Batches 1 and 2") +
  theme_light() + gghisto + theme(legend.position = c(.9, .8)) +
  scale_fill_manual(values = c("1" = "violetred1", "2" = "steelblue1")) 

p2 <- ggplot(combined_batches, aes(x = BMI, fill = batch)) +
  geom_density(alpha = 0.4, linewidth=1.5) +
  labs(x = "BMI at ED admission", y = "Density", title = "Density Plot of BMI for Batches 1 and 2") +
  theme_light() + gghisto + theme(legend.position = c(0.8, 0.8)) +
  scale_fill_manual(values = c("1" = "violetred1", "2" = "steelblue1")) 

p3 <- ggplot(combined_batches, aes(x = ED_Age, fill = batch)) +
  geom_density(alpha = 0.4, linewidth=1.5) +
  labs(x = "Age at ED admission", y = "Density", title = "Density Plot of Age for Batches 1 and 2") +
  theme_light() + gghisto + theme(legend.position = c(0.8, 0.8)) +
  scale_fill_manual(values = c("1" = "violetred1", "2" = "steelblue1")) 

p4 <- ggplot(combined_batches, aes(x = ED_Pain, fill = batch)) +
  geom_density(alpha = 0.4, linewidth=1.5) +
  labs(x = "Pain at ED admission", y = "Density", title = "Density Plot of ED Pain for Batches 1 and 2") +
  theme_light() + gghisto + theme(legend.position = c(0.9, 0.85)) +
  scale_fill_manual(values = c("1" = "violetred1", "2" = "steelblue1")) 

plot timepoint pain

p5 <- ggplot(combined_batches, aes(x = WK8_Pain, fill = batch)) +
  geom_density(alpha = 0.4, linewidth=1.5) +
  labs(x = "Week 8 Pain", y = "Density", title = "Density Plot of WK8 Pain for Batches 1 and 2") +
  theme_light() + gghisto + theme(legend.position = c(0.9, 0.85)) +
  scale_fill_manual(values = c("1" = "violetred1", "2" = "steelblue1")) 

p6 <- ggplot(combined_batches, aes(x = M3_Pain, fill = batch)) +
  geom_density(alpha = 0.4, linewidth=1.5) +
  labs(x = "Month 3 Pain", y = "Density", title = "Density Plot of M3 Pain for Batches 1 and 2") +
  theme_light() + gghisto + theme(legend.position = c(0.9, 0.85)) +
  scale_fill_manual(values = c("1" = "violetred1", "2" = "steelblue1")) 

p7 <- ggplot(combined_batches, aes(x = M6_Pain, fill = batch)) +
  geom_density(alpha = 0.4, linewidth=1.5) +
  labs(x = "Month 6", y = "Density", title = "Density Plot of M6 Pain for Batches 1 and 2") +
  theme_light() + gghisto + theme(legend.position = c(0.9, 0.85)) +
  scale_fill_manual(values = c("1" = "violetred1", "2" = "steelblue1")) 
#p1;p2;p3;p4
grid.arrange(p1,p2,p3,p4)

grid.arrange(p5,p6,p7)

simple stats

combined_batches %>%
  select(ED_Age, ED_Pain, BMI, E2_T0_sqrt, WK8_Pain, M3_Pain, M6_Pain, batch) %>%
  tbl_summary(by = batch) %>%
  modify_caption("**Batch 1 vs Batch 2 Estradiol Measurements**") %>%
  modify_spanning_header(c("stat_1", "stat_2") ~ "**Batch**") %>%
  add_p()
Batch 1 vs Batch 2 Estradiol Measurements
Characteristic Batch p-value2
1, N = 2831 2, N = 4571
ED_Age 35 (26, 50) 35 (26, 49) 0.5
ED_Pain 7 (5, 8) 7 (4, 8) 0.3
    Unknown 1 0
BMI 30 (24, 35) 28 (24, 34) 0.2
    Unknown 45 36
E2_T0_sqrt 6.4 (4.1, 11.1) 4.8 (3.8, 6.6) <0.001
WK8_Pain 5 (3, 7) 5 (2, 7) 0.3
    Unknown 21 21
M3_Pain 5 (2, 7) 4 (1, 7) 0.4
    Unknown 22 22
M6_Pain 4.0 (0.0, 6.0) 4.0 (0.0, 6.0) >0.9
    Unknown 46 0
1 Median (IQR)
2 Wilcoxon rank sum test

Next: Fit distribution using Maximum-likelihood Fitting of Univariate Distributions

#let's re-examine the general distributions of each batch
p1 + facet_wrap(vars(batch))

#batch 1 looks like a bimodal distribution. I should try the `mixdist` library. For now I'll do standard distribution building to see where it leads.

#use fitdistrplus package to examine distribution fitting
#batch 1
descdist(df$E2_T0_sqrt, discrete = FALSE)

## summary statistics
## ------
## min:  0.8124038   max:  21.72901 
## median:  6.437158 
## mean:  7.964744 
## estimated sd:  4.96733 
## estimated skewness:  0.7455244 
## estimated kurtosis:  2.512118

Now try to fit different distributions

#try normal, weibull, exponential, and gamma distributions
#its not a uniform distribution so no need to plot
fit.in1 <- fitdist(df$E2_T0_sqrt, "norm")
fit.in2 <- fitdist(df$E2_T0_sqrt, "weibull")
fit.in3 <- fitdist(df$E2_T0_sqrt, "exp")
fit.in4 <- fitdist(df$E2_T0_sqrt, "gamma")


#plot diagnostics
par(mfrow=c(2,2))
denscomp(list(fit.in1,fit.in2,fit.in3,fit.in4),legendtext=c("Normal","Weibull","Exponential", "gamma"))
qqcomp(list(fit.in1,fit.in2,fit.in3,fit.in4),legendtext=c("Normal","Weibull","Exponential", "gamma"))
cdfcomp(list(fit.in1,fit.in2,fit.in3,fit.in4),legendtext=c("Normal","Weibull","Exponential", "gamma"))
ppcomp(list(fit.in1,fit.in2,fit.in3,fit.in4),legendtext=c("Normal","Weibull","Exponential", "gamma"))

check parameters of data

#normality
shapiro.test(df$E2_T0_sqrt) #strongly and significantly deviates from a normal distribution 
## 
##  Shapiro-Wilk normality test
## 
## data:  df$E2_T0_sqrt
## W = 0.91946, p-value = 3.23e-11
#uniformity
#ks.test(df$E2_T0_sqrt,"punif")

Hard to say.. will try with mixdist package to address bimodalism.

#Plot a histogram of batch 1 transformed estrogen values
hist1 <- hist(df$E2_T0_sqrt, breaks=25) #are there actually three subpopulations??

#build a data frame that has the bucket midpoints and counts
hist_df1 <- data.frame(mid=hist1$mids, cou=hist1$counts)
#The histogram shows 2-3 peaks that might be represented by 2-3 Normal Distributions.  
#Guess at the 3 Means in Ascending Order, with a guess for the associated 3 Sigmas and fit the distribution.  
guess.mean <- c(4, 10, 16)  
guess.sig <- c(1, 3, 4)  
guess.dist <- "norm"  
(fitpro <- mix(as.mixdata(hist_df1), mixparam(mu=guess.mean, sigma=guess.sig), dist=guess.dist))  
## 
## Parameters:
##       pi     mu sigma
## 1 0.5408  3.885 1.818
## 2 0.2878  9.162 2.454
## 3 0.1715 16.067 1.903
## 
## Distribution:
## [1] "norm"
## 
## Constraints:
##    conpi    conmu consigma 
##   "NONE"   "NONE"   "NONE"
#Plot the results  
plot(fitpro, main="Fit a Probability Distribution")  
grid()  
legend("topright", lty=1, lwd=c(1, 1, 2), c("Original Distribution to be Fit", "Individual Fitted Distributions", "Fitted Distributions Combined"), col=c("blue", "red", rgb(0.2, 0.7, 0.2)), bg="white")  

Use Gaussian finite mixture model fitted by EM algorithm with mclust This appears to confirm three populations

model1 <- Mclust(df$E2_T0_sqrt)
summary(model1)
## ---------------------------------------------------- 
## Gaussian finite mixture model fitted by EM algorithm 
## ---------------------------------------------------- 
## 
## Mclust E (univariate, equal variance) model with 3 components: 
## 
##  log-likelihood   n df      BIC       ICL
##       -804.1235 283  6 -1642.12 -1693.896
## 
## Clustering table:
##   1   2   3 
## 174  59  50
#model suggests 3 components are ideal based on equal variance model BIC score

model1$parameters
## $pro
## [1] 0.5953757 0.2252469 0.1793775
## 
## $mean
##         1         2         3 
##  4.563737 10.217595 16.424164 
## 
## $variance
## $variance$modelName
## [1] "E"
## 
## $variance$d
## [1] 1
## 
## $variance$G
## [1] 3
## 
## $variance$sigmasq
## [1] 3.720778
#plot distribution classifications 
plot(model1, what = c("classification"))

plot(model1, what = "BIC")

k-means clustering

set.seed(1000)

km <- kmeans(df$E2_T0_sqrt, 3, iter.max = 10, nstart = 20)
print(km)
## K-means clustering with 3 clusters of sizes 61, 133, 89
## 
## Cluster means:
##        [,1]
## 1 15.911381
## 2  3.843823
## 3  8.676403
## 
## Clustering vector:
##   [1] 2 3 3 2 2 3 2 3 3 2 3 3 3 2 3 3 3 3 2 2 1 3 1 3 2 2 1 1 2 2 2 2 2 1 2 3 2
##  [38] 2 2 2 3 2 1 3 2 2 3 2 2 3 2 3 2 3 2 3 2 3 3 3 2 2 3 2 2 2 3 3 3 3 2 2 3 2
##  [75] 3 3 2 2 2 2 2 2 3 3 2 2 3 2 1 3 2 1 2 3 3 2 3 3 2 2 2 2 3 2 2 2 2 2 2 1 2
## [112] 3 3 3 1 2 1 2 3 2 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [149] 1 1 1 1 1 1 1 1 2 2 3 3 2 3 2 3 2 2 3 3 1 3 2 2 2 3 1 3 2 3 2 2 1 2 2 2 3
## [186] 3 1 1 1 1 2 1 2 1 2 1 2 2 2 3 2 3 2 2 2 2 3 3 3 3 2 2 3 2 2 2 3 3 3 3 1 2
## [223] 2 2 3 2 2 1 2 3 2 2 3 3 2 2 2 2 2 3 2 2 3 2 2 3 2 2 2 2 2 2 2 3 2 2 2 2 2
## [260] 3 1 3 3 2 3 2 2 2 1 3 3 3 2 2 3 3 3 2 1 2 2 2 3
## 
## Within cluster sum of squares by cluster:
## [1] 279.1594 258.3275 264.9127
##  (between_SS / total_SS =  88.5 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
#cluster using E2 and M6_pain
e2_m6_batch1 <- na.omit(df[, c("M6_Pain", "E2_T0_sqrt")]) #237 x 2

km2 <- kmeans(e2_m6_batch1, centers = 3, nstart = 20)

#add point classifications to batch 1 data 
e2_m6_batch1$cluster <- factor(km2$cluster)

#plot
ggplot(e2_m6_batch1, aes(E2_T0_sqrt, M6_Pain, color = cluster)) +
    geom_point(alpha = 0.25) +
    xlab("Transformed E2 level") +
    ylab("NRS Pain at 6 Months")