Import libraries
library(data.table);library(ggfortify);library(readxl);library(fitdistrplus)
library(stringr);library(lattice); library(nlme);library(gridExtra);library(mixdist)
library(ggplot2);library(tidyverse);library(naniar); library(gtsummary); library(mclust)
Import Aurora data, Batch 1 E2, and Batch 2 E2 data
#import full aurora data
df0 <- read.csv("full_aurora.csv") #4745 x341
#import batch 1
e2_batch1 <- read.csv("AURORA_estrogen_excl_outliers_excl_BMI.csv") #283 x 3035
#subset batch 1 to relevant demographic data
df <- e2_batch1[,c("PID","ED_Age","HighestGrade","BMI","WK8_Pain","M3_Pain","M6_Pain","E2.Concentration..pg.ml._T0_sqrt","ED_NowPain.x","Race","Date.Run_T0")] #283 x 14
#import batch 2
e2_batch2 <- read_excel("E2_batch2_reran_final.xlsx") #900 x 6
Rename some columns in Batch 1
df <- df %>%
rename(E2_T0_sqrt = `E2.Concentration..pg.ml._T0_sqrt`) %>%
rename(ED_Pain = `ED_NowPain.x`) %>%
mutate(`M6_Pain` = as.numeric(`M6_Pain`))#283 x 11
Import key for batch 2
#PID to inventory ID key for estrogen
key <- read_excel("e2_pid_key.xlsx") #906 x 6
#Link estrogen data with key
e2_batch2_keyed <- e2_batch2 %>%
inner_join(key,by=c("Inventory Code" = "InventoryCode")) #900 x 13
Combine keyed E2 and aurora demographic data
Note: the original Aurora data, named here df0, has
several duplicates for some PIDs. For example, PID 101394
has 4 duplicate rows in df0. I imagine this is a feature of
the long data, and there is a column somewhere that provides
categorization for each entry. However, at cursory glance of the first
50 rows, I do not see a column to explain the multiple levels. For this
reason I will remove the duplicates after joining the keyed E2 data
e2_batch2_keyed with the Aurora data df0, and
subsetting the data to estradiol collected at time point 0,
T0.
df2 <- e2_batch2_keyed %>%
left_join(df0,by=c("PID" = "PID")) #1211 x 353
#subset to relevant demographic data
df2 <- df2[,c("PID","ED_Age","ED_HighestGrade","BMI","AlternateID", "WK8_Pain_C","M3_Pain_C","M6_Pain_C","Mean Concentration (pg/mL)","% CV","Comments","ED_NowPain_C","ED_RaceEthCode","Plate Number")] #1211 x 12
Batch 1 only has T0 for estrogen levels. Subset df2 to T0 collection only. While square root transform E2 values in batch 2 to match transformation in Batch 1. Rename some columns.
df2 <- df2 %>%
filter(AlternateID == "T0") %>%
mutate(`Mean Concentration (pg/mL)` = as.numeric(`Mean Concentration (pg/mL)`)) %>%
mutate(E2_T0_sqrt = sqrt(`Mean Concentration (pg/mL)`)) %>%
rename(CV = `% CV`) %>%
mutate(`CV` = as.numeric(`CV`)) #df2 = 590 x 15
Remove duplicates. df2 rows goes from 590 cases to 457 cases.
df2 <- distinct(df2) #457 x 15
Add CV confidence brackets to df2
#note - use CVs_T0 for batch 1 CVs
df2 <- df2 %>%
mutate(e2_confidence = case_when(
CV >= 0 & CV <= 10 ~ 1,
CV > 10 & CV <= 20 ~ 2,
CV > 20 & CV <= 30 ~ 3,
CV > 30 & CV <= 40 ~ 4,
CV > 40 & CV <= 50 ~ 5,
CV > 50 ~ 6,
TRUE ~ NA_integer_
)) #dimensions 457 x 16
Combine Batch 1 and Batch 2 data into one long df
df_batch1 <- data.frame(PID = df$PID, E2_T0_sqrt = df$E2_T0_sqrt, batch = "1", ED_Age = df$ED_Age, ED_Pain = df$ED_Pain, BMI = df$BMI, WK8_Pain = df$WK8_Pain, M3_Pain = df$M3_Pain, M6_Pain = df$M6_Pain)
df_batch2 <- data.frame(PID = df2$PID, E2_T0_sqrt = df2$E2_T0_sqrt, batch = "2", ED_Age = df2$ED_Age, ED_Pain = df2$ED_NowPain_C, BMI = df2$BMI, WK8_Pain = df2$WK8_Pain_C, M3_Pain = df2$M3_Pain_C, M6_Pain = df2$M6_Pain_C)
combined_batches <- rbind(df_batch1, df_batch2) #740 x 6
Plot data
#elements for plots
gghisto <- list(
theme(axis.text.x = element_text(face="bold", color="cornflowerblue", size=14),
axis.text.y = element_text(face="bold", color="royalblue4",
size=16, angle=25),
axis.title=element_text(size=17,face="bold"),
plot.title = element_text(size=14,face="bold.italic"))
)
create plots
p1 <- ggplot(combined_batches, aes(x = E2_T0_sqrt, fill = batch)) +
geom_density(alpha = 0.4, linewidth=1.5) +
labs(x = "Square Root transformed E2", y = "Density", title = "Density Plot of square root E2 values \nfor Batches 1 and 2") +
theme_light() + gghisto + theme(legend.position = c(.9, .8)) +
scale_fill_manual(values = c("1" = "violetred1", "2" = "steelblue1"))
p2 <- ggplot(combined_batches, aes(x = BMI, fill = batch)) +
geom_density(alpha = 0.4, linewidth=1.5) +
labs(x = "BMI at ED admission", y = "Density", title = "Density Plot of BMI for Batches 1 and 2") +
theme_light() + gghisto + theme(legend.position = c(0.8, 0.8)) +
scale_fill_manual(values = c("1" = "violetred1", "2" = "steelblue1"))
p3 <- ggplot(combined_batches, aes(x = ED_Age, fill = batch)) +
geom_density(alpha = 0.4, linewidth=1.5) +
labs(x = "Age at ED admission", y = "Density", title = "Density Plot of Age for Batches 1 and 2") +
theme_light() + gghisto + theme(legend.position = c(0.8, 0.8)) +
scale_fill_manual(values = c("1" = "violetred1", "2" = "steelblue1"))
p4 <- ggplot(combined_batches, aes(x = ED_Pain, fill = batch)) +
geom_density(alpha = 0.4, linewidth=1.5) +
labs(x = "Pain at ED admission", y = "Density", title = "Density Plot of ED Pain for Batches 1 and 2") +
theme_light() + gghisto + theme(legend.position = c(0.9, 0.85)) +
scale_fill_manual(values = c("1" = "violetred1", "2" = "steelblue1"))
plot timepoint pain
p5 <- ggplot(combined_batches, aes(x = WK8_Pain, fill = batch)) +
geom_density(alpha = 0.4, linewidth=1.5) +
labs(x = "Week 8 Pain", y = "Density", title = "Density Plot of WK8 Pain for Batches 1 and 2") +
theme_light() + gghisto + theme(legend.position = c(0.9, 0.85)) +
scale_fill_manual(values = c("1" = "violetred1", "2" = "steelblue1"))
p6 <- ggplot(combined_batches, aes(x = M3_Pain, fill = batch)) +
geom_density(alpha = 0.4, linewidth=1.5) +
labs(x = "Month 3 Pain", y = "Density", title = "Density Plot of M3 Pain for Batches 1 and 2") +
theme_light() + gghisto + theme(legend.position = c(0.9, 0.85)) +
scale_fill_manual(values = c("1" = "violetred1", "2" = "steelblue1"))
p7 <- ggplot(combined_batches, aes(x = M6_Pain, fill = batch)) +
geom_density(alpha = 0.4, linewidth=1.5) +
labs(x = "Month 6", y = "Density", title = "Density Plot of M6 Pain for Batches 1 and 2") +
theme_light() + gghisto + theme(legend.position = c(0.9, 0.85)) +
scale_fill_manual(values = c("1" = "violetred1", "2" = "steelblue1"))
#p1;p2;p3;p4
grid.arrange(p1,p2,p3,p4)
grid.arrange(p5,p6,p7)
simple stats
combined_batches %>%
select(ED_Age, ED_Pain, BMI, E2_T0_sqrt, WK8_Pain, M3_Pain, M6_Pain, batch) %>%
tbl_summary(by = batch) %>%
modify_caption("**Batch 1 vs Batch 2 Estradiol Measurements**") %>%
modify_spanning_header(c("stat_1", "stat_2") ~ "**Batch**") %>%
add_p()
| Characteristic | Batch | p-value2 | |
|---|---|---|---|
| 1, N = 2831 | 2, N = 4571 | ||
| ED_Age | 35 (26, 50) | 35 (26, 49) | 0.5 |
| ED_Pain | 7 (5, 8) | 7 (4, 8) | 0.3 |
| Â Â Â Â Unknown | 1 | 0 | |
| BMI | 30 (24, 35) | 28 (24, 34) | 0.2 |
| Â Â Â Â Unknown | 45 | 36 | |
| E2_T0_sqrt | 6.4 (4.1, 11.1) | 4.8 (3.8, 6.6) | <0.001 |
| WK8_Pain | 5 (3, 7) | 5 (2, 7) | 0.3 |
| Â Â Â Â Unknown | 21 | 21 | |
| M3_Pain | 5 (2, 7) | 4 (1, 7) | 0.4 |
| Â Â Â Â Unknown | 22 | 22 | |
| M6_Pain | 4.0 (0.0, 6.0) | 4.0 (0.0, 6.0) | >0.9 |
| Â Â Â Â Unknown | 46 | 0 | |
| 1 Median (IQR) | |||
| 2 Wilcoxon rank sum test | |||
Next: Fit distribution using Maximum-likelihood Fitting of Univariate Distributions
#let's re-examine the general distributions of each batch
p1 + facet_wrap(vars(batch))
#batch 1 looks like a bimodal distribution. I should try the `mixdist` library. For now I'll do standard distribution building to see where it leads.
#use fitdistrplus package to examine distribution fitting
#batch 1
descdist(df$E2_T0_sqrt, discrete = FALSE)
## summary statistics
## ------
## min: 0.8124038 max: 21.72901
## median: 6.437158
## mean: 7.964744
## estimated sd: 4.96733
## estimated skewness: 0.7455244
## estimated kurtosis: 2.512118
Now try to fit different distributions
#try normal, weibull, exponential, and gamma distributions
#its not a uniform distribution so no need to plot
fit.in1 <- fitdist(df$E2_T0_sqrt, "norm")
fit.in2 <- fitdist(df$E2_T0_sqrt, "weibull")
fit.in3 <- fitdist(df$E2_T0_sqrt, "exp")
fit.in4 <- fitdist(df$E2_T0_sqrt, "gamma")
#plot diagnostics
par(mfrow=c(2,2))
denscomp(list(fit.in1,fit.in2,fit.in3,fit.in4),legendtext=c("Normal","Weibull","Exponential", "gamma"))
qqcomp(list(fit.in1,fit.in2,fit.in3,fit.in4),legendtext=c("Normal","Weibull","Exponential", "gamma"))
cdfcomp(list(fit.in1,fit.in2,fit.in3,fit.in4),legendtext=c("Normal","Weibull","Exponential", "gamma"))
ppcomp(list(fit.in1,fit.in2,fit.in3,fit.in4),legendtext=c("Normal","Weibull","Exponential", "gamma"))
check parameters of data
#normality
shapiro.test(df$E2_T0_sqrt) #strongly and significantly deviates from a normal distribution
##
## Shapiro-Wilk normality test
##
## data: df$E2_T0_sqrt
## W = 0.91946, p-value = 3.23e-11
#uniformity
#ks.test(df$E2_T0_sqrt,"punif")
Hard to say.. will try with mixdist package to address
bimodalism.
#Plot a histogram of batch 1 transformed estrogen values
hist1 <- hist(df$E2_T0_sqrt, breaks=25) #are there actually three subpopulations??
#build a data frame that has the bucket midpoints and counts
hist_df1 <- data.frame(mid=hist1$mids, cou=hist1$counts)
#The histogram shows 2-3 peaks that might be represented by 2-3 Normal Distributions.
#Guess at the 3 Means in Ascending Order, with a guess for the associated 3 Sigmas and fit the distribution.
guess.mean <- c(4, 10, 16)
guess.sig <- c(1, 3, 4)
guess.dist <- "norm"
(fitpro <- mix(as.mixdata(hist_df1), mixparam(mu=guess.mean, sigma=guess.sig), dist=guess.dist))
##
## Parameters:
## pi mu sigma
## 1 0.5408 3.885 1.818
## 2 0.2878 9.162 2.454
## 3 0.1715 16.067 1.903
##
## Distribution:
## [1] "norm"
##
## Constraints:
## conpi conmu consigma
## "NONE" "NONE" "NONE"
#Plot the results
plot(fitpro, main="Fit a Probability Distribution")
grid()
legend("topright", lty=1, lwd=c(1, 1, 2), c("Original Distribution to be Fit", "Individual Fitted Distributions", "Fitted Distributions Combined"), col=c("blue", "red", rgb(0.2, 0.7, 0.2)), bg="white")
Use Gaussian finite mixture model fitted by EM algorithm with
mclust This appears to confirm three populations
model1 <- Mclust(df$E2_T0_sqrt)
summary(model1)
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust E (univariate, equal variance) model with 3 components:
##
## log-likelihood n df BIC ICL
## -804.1235 283 6 -1642.12 -1693.896
##
## Clustering table:
## 1 2 3
## 174 59 50
#model suggests 3 components are ideal based on equal variance model BIC score
model1$parameters
## $pro
## [1] 0.5953757 0.2252469 0.1793775
##
## $mean
## 1 2 3
## 4.563737 10.217595 16.424164
##
## $variance
## $variance$modelName
## [1] "E"
##
## $variance$d
## [1] 1
##
## $variance$G
## [1] 3
##
## $variance$sigmasq
## [1] 3.720778
#plot distribution classifications
plot(model1, what = c("classification"))
plot(model1, what = "BIC")
k-means clustering
set.seed(1000)
km <- kmeans(df$E2_T0_sqrt, 3, iter.max = 10, nstart = 20)
print(km)
## K-means clustering with 3 clusters of sizes 61, 133, 89
##
## Cluster means:
## [,1]
## 1 15.911381
## 2 3.843823
## 3 8.676403
##
## Clustering vector:
## [1] 2 3 3 2 2 3 2 3 3 2 3 3 3 2 3 3 3 3 2 2 1 3 1 3 2 2 1 1 2 2 2 2 2 1 2 3 2
## [38] 2 2 2 3 2 1 3 2 2 3 2 2 3 2 3 2 3 2 3 2 3 3 3 2 2 3 2 2 2 3 3 3 3 2 2 3 2
## [75] 3 3 2 2 2 2 2 2 3 3 2 2 3 2 1 3 2 1 2 3 3 2 3 3 2 2 2 2 3 2 2 2 2 2 2 1 2
## [112] 3 3 3 1 2 1 2 3 2 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [149] 1 1 1 1 1 1 1 1 2 2 3 3 2 3 2 3 2 2 3 3 1 3 2 2 2 3 1 3 2 3 2 2 1 2 2 2 3
## [186] 3 1 1 1 1 2 1 2 1 2 1 2 2 2 3 2 3 2 2 2 2 3 3 3 3 2 2 3 2 2 2 3 3 3 3 1 2
## [223] 2 2 3 2 2 1 2 3 2 2 3 3 2 2 2 2 2 3 2 2 3 2 2 3 2 2 2 2 2 2 2 3 2 2 2 2 2
## [260] 3 1 3 3 2 3 2 2 2 1 3 3 3 2 2 3 3 3 2 1 2 2 2 3
##
## Within cluster sum of squares by cluster:
## [1] 279.1594 258.3275 264.9127
## (between_SS / total_SS = 88.5 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
#cluster using E2 and M6_pain
e2_m6_batch1 <- na.omit(df[, c("M6_Pain", "E2_T0_sqrt")]) #237 x 2
km2 <- kmeans(e2_m6_batch1, centers = 3, nstart = 20)
#add point classifications to batch 1 data
e2_m6_batch1$cluster <- factor(km2$cluster)
#plot
ggplot(e2_m6_batch1, aes(E2_T0_sqrt, M6_Pain, color = cluster)) +
geom_point(alpha = 0.25) +
xlab("Transformed E2 level") +
ylab("NRS Pain at 6 Months")