library(data.table);library(ggfortify);library(readxl)
library(factoextra);library(ClassDiscovery)
library(stringr);library(lattice);library(kableExtra)
library(ggplot2);library(MASS);library(memisc)
library(tidyverse);library(naniar);library(gtsummary)
require(lme4); require(nnet); require(tidyverse);
import aurora data
#aurora data
df0 <- read.csv("full_aurora.csv") #4745 x341
Batch 2 estrogen data from Simran
#estrogen levels
e2_conc <- read.csv("E2_concentrated.csv")
#e2_od <- read.csv("E2_OD.csv") #Do I need this data?
viz
e2_conc$Mean.Concentration..pg.mL. <- as.numeric(as.character(e2_conc$Mean.Concentration..pg.mL.))
ggplot(e2_conc, aes(x = as.factor(Plate.Number), y = Mean.Concentration..pg.mL.)) +
geom_boxplot(fill = "skyblue", color = "black") +
stat_summary(fun.y = mean, geom = "point", shape = 20, size = 2, color = "red") +
labs(x = "Plate Number", y = "Mean Concentration (pg/mL)") +
theme_minimal()
e2_conc %>%
filter(X..CV != "#DIV/0!") %>%
group_by(Plate.Number) %>%
summarize(
Mean_Concentration = mean(Mean.Concentration..pg.mL.),
Standard_Deviation = sd(Mean.Concentration..pg.mL.),
Avg_CV = mean(as.numeric(X..CV)),
Sample_Count = n(),
)
## # A tibble: 25 × 5
## Plate.Number Mean_Concentration Standard_Deviation Avg_CV Sample_Count
## <int> <dbl> <dbl> <dbl> <int>
## 1 1 74.1 91.5 37.1 21
## 2 2 41.7 51.3 40.2 35
## 3 3 29.9 24.6 63.8 29
## 4 4 41.0 50.2 20.6 34
## 5 5 54.6 58.2 21.2 37
## 6 6 36.2 44.2 15.4 37
## 7 7 36.1 43.8 13.0 37
## 8 8 45.5 77.7 18.8 35
## 9 9 32.9 31.7 13.1 36
## 10 10 57.0 51.5 20.4 37
## # ℹ 15 more rows
clean estrogen data and transform
#filter out the 0 values, 420 values, 0 CV values, >20 CV values
#make columns numeric
e2_conc$Mean.Concentration..pg.mL. <- as.numeric(e2_conc$Mean.Concentration..pg.mL.)
e2_conc$X..CV <- as.numeric(e2_conc$X..CV)
#filter samples
e2_conc <- e2_conc %>%
rename(mean.conc = Mean.Concentration..pg.mL.) %>%
filter(mean.conc > 0) %>%
filter(mean.conc < 420) %>%
filter(`X..CV` > 0) %>%
#filter(`X..CV` < 20) %>% #if filtering out CV values greater than 20
rename(CV = X..CV)
#square root transform concentration
e2_conc <- e2_conc %>%
mutate(sqrt.conc = sqrt(mean.conc)) %>%
select(everything(),mean.conc, sqrt.conc)
ggplot(e2_conc, aes(x = as.factor(Plate.Number), y = sqrt.conc)) +
geom_boxplot(fill = "skyblue", color = "black") +
stat_summary(fun.y = mean, geom = "point", shape = 20, size = 2, color = "red") +
labs(x = "Plate Number", y = "Sqrt Mean Concentration (pg/mL)") +
theme_minimal()
e2_conc %>%
group_by(Plate.Number) %>%
summarize(
sqrt_Mean_Concentration = mean(sqrt.conc),
Standard_Deviation = sd(sqrt.conc),
Avg_CV = mean(as.numeric(CV)),
Sample_Count = n(),
)
## # A tibble: 25 × 5
## Plate.Number sqrt_Mean_Concentration Standard_Deviation Avg_CV Sample_Count
## <int> <dbl> <dbl> <dbl> <int>
## 1 1 6.92 3.06 39.0 20
## 2 2 5.84 2.81 40.2 35
## 3 3 5.12 1.96 63.8 29
## 4 4 5.66 3.04 20.6 34
## 5 5 6.78 2.98 21.2 37
## 6 6 5.37 2.74 15.4 37
## 7 7 5.29 2.89 13.0 37
## 8 8 5.56 3.89 18.8 35
## 9 9 5.30 2.21 13.1 36
## 10 10 7.02 2.81 20.4 37
## # ℹ 15 more rows
#remove extra columns
e2_conc <- e2_conc[, -c(4, 5, 7, 9, 10)] #dimensions 860 x 6 ... #or dimensions 644 x 6 if filtering out CV>20 values
read keys and combine with E2 data file to link to full Aurora data
#PID to inventory ID key for estrogen
key <- read_excel("e2_pid_key.xlsx")
#Link estrogen data with key
e2_keyed <- e2_conc %>%
inner_join(key,by=c("Inventory.Code" = "InventoryCode")) #860
Subset by estrogen draw timepoints
#t0 <- e2_keyed %>% filter(AlternateID == "T0")
#t2 <- e2_keyed %>% filter(AlternateID == "T1")
#t3 <- e2_keyed %>% filter(AlternateID == "T2")
#t4 <- e2_keyed %>% filter(AlternateID == "T3")
length(unique(e2_keyed$PID)) # unique PID 466
## [1] 466
#the key has 906 inventory ID/PIDs and the estrogen has 860 samples
#for timepoint 0 for estrogen, only keep the T0 of 432 samples
#length(unique(e2_keyed[["PID"]])) #466 total PID in estrogen
Create wide df of estrogen (T0) and pain data
#aurora data is df0 #4745 rows
#combine e2 and aurora data
e2_keyed_full <- e2_keyed %>%
left_join(df0,by=c("PID" = "PID")) #1161 rows x 353 columns
## Warning in left_join(., df0, by = c(PID = "PID")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 15 of `x` matches multiple rows in `y`.
## ℹ Row 946 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
sum(duplicated(e2_keyed_full)) #no duplicates
## [1] 0
create axial pain
#make axial pain feature combines either neck, or left and right shoulder, or lower and upper back. Reporting a 7+ in at least 1 of the axial body regions are defined as having severe axial pain. "M6_PainNeck","M6_PainLeftShoulder","M6_PainRightShoulder","M6_PainUpperBack", "M6_PainLowerBack"
e2_keyed_full$M6_AxialPain <- ifelse(
e2_keyed_full$M6_PainNeck >= 7 |
e2_keyed_full$M6_PainLeftShoulder >= 7 |
e2_keyed_full$M6_PainRightShoulder >= 7 |
e2_keyed_full$M6_PainUpperBack >= 7 |
e2_keyed_full$M6_PainLowerBack >= 7,
1,0)
#M3 axial pain
e2_keyed_full$M3_AxialPain <- ifelse(
e2_keyed_full$M3_PainNeck >= 7 |
e2_keyed_full$M3_PainLeftShoulder >= 7 |
e2_keyed_full$M3_PainRightShoulder >= 7 |
e2_keyed_full$M3_PainUpperBack >= 7 |
e2_keyed_full$M3_PainLowerBack >= 7,
1,0)
#M2 axial pain
e2_keyed_full$WK8_AxialPain <- ifelse(
e2_keyed_full$WK8_PainNeck >= 7 |
e2_keyed_full$WK8_PainLeftShoulder >= 7 |
e2_keyed_full$WK8_PainRightShoulder >= 7 |
e2_keyed_full$WK8_PainUpperBack >= 7 |
e2_keyed_full$WK8_PainLowerBack >= 7,
1,0)
sum(duplicated(e2_keyed_full)) #no duplicates
## [1] 0
add CV confidence levels
e2_keyed_full <- e2_keyed_full %>%
mutate(e2_confidence = case_when(
CV >= 0 & CV <= 10 ~ 1,
CV > 10 & CV <= 20 ~ 2,
CV > 20 & CV <= 30 ~ 3,
CV > 30 ~ 4,
TRUE ~ NA_integer_ # Handle other cases if necessary
))
sum(duplicated(e2_keyed_full)) #no duplicates
## [1] 0
fix the education category
#unique(e2_keyed_full$ED_HighestGrade)
#reduce education categories to 4 levels
level_mapping <- c(
"Some college, no degree" = "Some College or Associate Degree",
"Professional school degree: MD, DDS, DVM, JD" = "Advanced Degree",
"Associate degree: Occupational, technical, or vocational program" = "Some College or Associate Degree",
"High school graduate" = "High School or Less",
"Bachelor's degree: BA, AB, BS, BBA" = "Bachelor's Degree",
"Master's degree: MA, MS, MEng, MEd, MBA" = "Advanced Degree",
"GED or equivalent" = "High School or Less",
"8th grade" = "High School or Less",
"Doctoral degree: PhD, EdD" = "Advanced Degree",
"Associate degree: Academic program" = "Some College or Associate Degree",
"12th grade, no diploma" = "High School or Less",
"11th grade" = "High School or Less",
"9th grade" = "High School or Less",
"10th grade" = "High School or Less",
"7th grade" = "High School or Less"
)
#apply new mapping to education category
e2_keyed_full$ED_HighestGrade <- factor(e2_keyed_full$ED_HighestGrade, levels = names(level_mapping), labels = level_mapping)
#dim 1161 x 357
#check duplicates
sum(duplicated(e2_keyed_full)) #0
## [1] 0
pull other features of interest
#where is the NRS pain score
grep("NRS", names(e2_keyed_full), value = TRUE)
## [1] "ED_Somatic_NRS_SUM" "WK2_Somatic_NRS_SUM" "WK8_Somatic_NRS_SUM"
## [4] "M3_Somatic_NRS_SUM" "M6_Somatic_NRS_SUM"
#added plate number to control for batch effects
#create wide df below
#add 2016 widespread pain bc it has a slight negative correlation with e2
#cor.test(e2_full_wide$sqrt.conc,e2_full_wide$M6_Pain_Widespread_2016)
e2_full_wide <- e2_keyed_full[,c("PID","ED_Age","ED_HighestGrade","BMI","ED_RaceEthCode","sqrt.conc","ED_NowPain_C","WK8_Pain_C", "M3_Pain_C", "M6_Pain_C","WK8_Pain_Widespread_2016","M3_Pain_Widespread_2016","M6_Pain_Widespread_2016","WK8_AxialPain","M3_AxialPain","M6_AxialPain", "Plate.Number", "AlternateID","e2_confidence")] #dim 1161 x 19
#check duplicates
sum(duplicated(e2_full_wide)) #301?
## [1] 301
#e2_full_wide[duplicated(e2_full_wide), ]
#subset_df <- e2_keyed_full[e2_keyed_full$PID == 114723, ]
#subset_df <- subset_df[1:3,]
#duplicates from CRP
#delete CRP duplicates
e2_full_wide <- distinct(e2_full_wide) #dim 860 x19
#rename some of the columns
e2_full_wide <- e2_full_wide %>%
rename(ED_Pain = ED_NowPain_C) %>%
rename(WK8_Pain = WK8_Pain_C) %>%
rename(M3_Pain = M3_Pain_C) %>%
rename(M6_Pain = M6_Pain_C) %>%
rename(Race = ED_RaceEthCode)
length(unique(e2_full_wide[["PID"]])) #466
## [1] 466
Create long df of estrogen and pain data
#for NRS pain
e2_full_long_NRS <- e2_full_wide %>%
gather(key = names, value = NRS_pain, WK8_Pain, M3_Pain, M6_Pain) %>%
mutate(timepoint = case_when( names == "WK8_Pain" ~ 2, names == "M3_Pain" ~ 3,
names == "M6_Pain" ~ 6,TRUE ~ NA_real_)) %>%
arrange(PID, timepoint) %>%
mutate(index = row_number())
#for widespread pain
e2_full_long_WS <- e2_full_wide %>%
gather(key = names, value = Widespread_pain_2016, WK8_Pain_Widespread_2016, M3_Pain_Widespread_2016, M6_Pain_Widespread_2016) %>%
mutate(timepoint = case_when(names == "WK8_Pain_Widespread_2016" ~ 2,names == "M3_Pain_Widespread_2016" ~ 3,
names == "M6_Pain_Widespread_2016" ~ 6,TRUE ~ NA_real_)) %>%
arrange(PID, timepoint) %>%
mutate(index = row_number())%>%
select(1, 17:20)
#for axial pain
e2_full_long_Axial <- e2_full_wide %>%
gather(key = names, value = Axial_Pain, WK8_AxialPain, M3_AxialPain, M6_AxialPain) %>%
mutate(timepoint = case_when(names == "WK8_AxialPain" ~ 2,names == "M3_AxialPain" ~ 3,
names == "M6_AxialPain" ~ 6,TRUE ~ NA_real_)) %>%
arrange(PID, timepoint) %>%
mutate(index = row_number()) %>%
select(1, 17:20)
# Merge the data frames to make final e2_full_long
dfs <- list(e2_full_long_NRS, e2_full_long_WS, e2_full_long_Axial)
e2_full_long <- reduce(dfs, full_join, by = c("PID", "timepoint", "index")) #2580 x 24
#check
length(unique(e2_full_long[["PID"]])) #466 PID
## [1] 466
add postmenopause var and remove the column called index bc its no longer needed
e2_full_long <- e2_full_long %>%
mutate(postmenopause = ifelse(ED_Age >= 50, 1, 0)) %>%
select(-index)
#2580 x 24
examine e2 values and re-level
#visualize
summary(e2_full_long$sqrt.conc)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.325 3.728 4.703 5.644 6.498 20.223
q1 <- quantile(e2_full_long$sqrt.conc, 0.25)
q3 <- quantile(e2_full_long$sqrt.conc, 0.75)
ggplot(data = e2_full_long, aes(x = sqrt.conc, y = ..density..)) +
geom_density(color = "deeppink3", fill = "deeppink3", alpha = .4, size=1.8) +
geom_histogram(fill = "skyblue", color = "gray20", alpha = 0.9, bins = 50) +
geom_vline(xintercept = q1, linetype = "dashed", color = "gray30", size = .5) +
geom_vline(xintercept = q3, linetype = "dashed", color = "gray30", size = .5) +
annotate("text", x = q1, y = 0.24, label = paste("Q1 =", round(q1, 2)), color = "gray20", hjust = 1.3, size = 4) +
annotate("text", x = q3, y = 0.24, label = paste("Q3 =", round(q3, 2)), color = "gray20", hjust = -0.1, size = 4) +
labs(x = "Estrogen Level (sqrt.conc)", y = "Density",
title = "Density Plot of Estrogen Levels")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
explore cut off points, pick levels
summary(e2_full_long$sqrt.conc)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.325 3.728 4.703 5.644 6.498 20.223
length(e2_full_long[e2_full_long$sqrt.conc >= q3, "sqrt.conc"]) #645 values greater than or equal to 3rd quartile
## [1] 645
length(e2_full_long[e2_full_long$sqrt.conc <= q1, "sqrt.conc"]) #645 values less than or equal to 1st quartile
## [1] 645
length(e2_full_long$sqrt.conc[e2_full_long$sqrt.conc > q1 & e2_full_long$sqrt.conc < q3]) #1290 values between 1st and 3rd quartile
## [1] 1290
make new E2 categories based on quartiles
e2_full_long <- e2_full_long %>%
mutate(e2_bin = case_when(
sqrt.conc <= q1 ~ "low E2",
sqrt.conc > q1 & sqrt.conc < q3 ~ "mid E2",
sqrt.conc >= q3 ~ "high E2"
))
#new dimensions 2580 x25
make new levels for pain
e2_full_long <- e2_full_long %>%
mutate(bin_pain = case_when(
NRS_pain < 1 ~ "0 pain",
NRS_pain > 0 ~ "Any pain"
))
e2_full_long <- e2_full_long %>%
mutate(bin_pain2 = case_when(
NRS_pain < 1 ~ "0 pain",
NRS_pain > 0 & NRS_pain < 10 ~ "1 to 9 pain",
NRS_pain > 9 ~ "10 pain"
))
#new dimensions 2580 x 27
#check duplicates
sum(duplicated(e2_full_long)) #no duplicates
## [1] 0
#check unique pts
length(unique(e2_full_long[["PID"]])) #432 PID
## [1] 466
output cleaned E2 x Aurora file
write.csv(e2_full_long, file = "cleaned_E2_and_Aurora_Data.csv", row.names = FALSE)