Cleaning e2 and aurora data

library(data.table);library(ggfortify);library(readxl)
library(factoextra);library(ClassDiscovery)
library(stringr);library(lattice);library(kableExtra)
library(ggplot2);library(MASS);library(memisc)
library(tidyverse);library(naniar);library(gtsummary)
require(lme4); require(nnet); require(tidyverse);

import aurora data

#aurora data
df0 <- read.csv("full_aurora.csv") #4745 x341

Batch 2 estrogen data from Simran

#estrogen levels
e2_conc <- read.csv("E2_concentrated.csv")
#e2_od <- read.csv("E2_OD.csv") #Do I need this data?

viz

e2_conc$Mean.Concentration..pg.mL. <- as.numeric(as.character(e2_conc$Mean.Concentration..pg.mL.))

ggplot(e2_conc, aes(x = as.factor(Plate.Number), y = Mean.Concentration..pg.mL.)) +
  geom_boxplot(fill = "skyblue", color = "black") +
  stat_summary(fun.y = mean, geom = "point", shape = 20, size = 2, color = "red") +
  labs(x = "Plate Number", y = "Mean Concentration (pg/mL)") +
  theme_minimal()

e2_conc %>%
  filter(X..CV != "#DIV/0!") %>%
  group_by(Plate.Number) %>%
  summarize(
    Mean_Concentration = mean(Mean.Concentration..pg.mL.),
    Standard_Deviation = sd(Mean.Concentration..pg.mL.),
    Avg_CV = mean(as.numeric(X..CV)),
    Sample_Count = n(),
  )

## # A tibble: 25 × 5
##    Plate.Number Mean_Concentration Standard_Deviation Avg_CV Sample_Count
##           <int>              <dbl>              <dbl>  <dbl>        <int>
##  1            1               74.1               91.5   37.1           21
##  2            2               41.7               51.3   40.2           35
##  3            3               29.9               24.6   63.8           29
##  4            4               41.0               50.2   20.6           34
##  5            5               54.6               58.2   21.2           37
##  6            6               36.2               44.2   15.4           37
##  7            7               36.1               43.8   13.0           37
##  8            8               45.5               77.7   18.8           35
##  9            9               32.9               31.7   13.1           36
## 10           10               57.0               51.5   20.4           37
## # ℹ 15 more rows

clean estrogen data and transform

#filter out the 0 values, 420 values, 0 CV values, >20 CV values
#make columns numeric
e2_conc$Mean.Concentration..pg.mL. <- as.numeric(e2_conc$Mean.Concentration..pg.mL.)
e2_conc$X..CV <- as.numeric(e2_conc$X..CV)

#filter samples
e2_conc <- e2_conc %>% 
  rename(mean.conc = Mean.Concentration..pg.mL.) %>% 
  filter(mean.conc > 0) %>%
  filter(mean.conc < 420) %>%
  filter(`X..CV` > 0) %>%
  #filter(`X..CV` < 20) %>% #if filtering out CV values greater than 20
  rename(CV = X..CV)

#square root transform concentration
e2_conc <- e2_conc %>%
  mutate(sqrt.conc = sqrt(mean.conc)) %>%
  select(everything(),mean.conc, sqrt.conc)

ggplot(e2_conc, aes(x = as.factor(Plate.Number), y = sqrt.conc)) +
  geom_boxplot(fill = "skyblue", color = "black") +
  stat_summary(fun.y = mean, geom = "point", shape = 20, size = 2, color = "red") +
  labs(x = "Plate Number", y = "Sqrt Mean Concentration (pg/mL)") +
  theme_minimal()

e2_conc %>%
  group_by(Plate.Number) %>%
  summarize(
    sqrt_Mean_Concentration = mean(sqrt.conc),
    Standard_Deviation = sd(sqrt.conc),
    Avg_CV = mean(as.numeric(CV)),
    Sample_Count = n(),
  )

## # A tibble: 25 × 5
##    Plate.Number sqrt_Mean_Concentration Standard_Deviation Avg_CV Sample_Count
##           <int>                   <dbl>              <dbl>  <dbl>        <int>
##  1            1                    6.92               3.06   39.0           20
##  2            2                    5.84               2.81   40.2           35
##  3            3                    5.12               1.96   63.8           29
##  4            4                    5.66               3.04   20.6           34
##  5            5                    6.78               2.98   21.2           37
##  6            6                    5.37               2.74   15.4           37
##  7            7                    5.29               2.89   13.0           37
##  8            8                    5.56               3.89   18.8           35
##  9            9                    5.30               2.21   13.1           36
## 10           10                    7.02               2.81   20.4           37
## # ℹ 15 more rows

#remove extra columns
e2_conc <- e2_conc[, -c(4, 5, 7, 9, 10)] #dimensions 860 x 6 ... #or dimensions 644 x 6 if filtering out CV>20 values

read keys and combine with E2 data file to link to full Aurora data

#PID to inventory ID key for estrogen
key <- read_excel("e2_pid_key.xlsx")

#Link estrogen data with key
e2_keyed <- e2_conc %>% 
  inner_join(key,by=c("Inventory.Code" = "InventoryCode")) #860

Subset by estrogen draw timepoints

#t0 <- e2_keyed %>% filter(AlternateID == "T0") 
#t2 <- e2_keyed %>% filter(AlternateID == "T1") 
#t3 <- e2_keyed %>% filter(AlternateID == "T2") 
#t4 <- e2_keyed %>% filter(AlternateID == "T3") 

length(unique(e2_keyed$PID)) # unique PID 466

## [1] 466

#the key has 906 inventory ID/PIDs and the estrogen has 860 samples
#for timepoint 0 for estrogen, only keep the T0 of 432 samples

#length(unique(e2_keyed[["PID"]])) #466 total PID in estrogen

Create wide df of estrogen (T0) and pain data

#aurora data is df0 #4745 rows

#combine e2 and aurora data
e2_keyed_full <- e2_keyed %>% 
  left_join(df0,by=c("PID" = "PID")) #1161 rows x 353 columns

## Warning in left_join(., df0, by = c(PID = "PID")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 15 of `x` matches multiple rows in `y`.
## ℹ Row 946 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

sum(duplicated(e2_keyed_full)) #no duplicates

## [1] 0

create axial pain

#make axial pain feature combines either neck, or left and right shoulder, or lower and upper back. Reporting a 7+ in at least 1 of the axial body regions are defined as having severe axial pain. "M6_PainNeck","M6_PainLeftShoulder","M6_PainRightShoulder","M6_PainUpperBack", "M6_PainLowerBack"
e2_keyed_full$M6_AxialPain <- ifelse(
  e2_keyed_full$M6_PainNeck >= 7 |
  e2_keyed_full$M6_PainLeftShoulder >= 7 |
  e2_keyed_full$M6_PainRightShoulder >= 7 |
  e2_keyed_full$M6_PainUpperBack >= 7 |
  e2_keyed_full$M6_PainLowerBack >= 7,
  1,0)

#M3 axial pain
e2_keyed_full$M3_AxialPain <- ifelse(
  e2_keyed_full$M3_PainNeck >= 7 |
  e2_keyed_full$M3_PainLeftShoulder >= 7 |
  e2_keyed_full$M3_PainRightShoulder >= 7 |
  e2_keyed_full$M3_PainUpperBack >= 7 |
  e2_keyed_full$M3_PainLowerBack >= 7,
  1,0)

#M2 axial pain
e2_keyed_full$WK8_AxialPain <- ifelse(
  e2_keyed_full$WK8_PainNeck >= 7 |
  e2_keyed_full$WK8_PainLeftShoulder >= 7 |
  e2_keyed_full$WK8_PainRightShoulder >= 7 |
  e2_keyed_full$WK8_PainUpperBack >= 7 |
  e2_keyed_full$WK8_PainLowerBack >= 7,
  1,0)

sum(duplicated(e2_keyed_full)) #no duplicates

## [1] 0

add CV confidence levels

e2_keyed_full <- e2_keyed_full %>%
  mutate(e2_confidence = case_when(
    CV >= 0 & CV <= 10 ~ 1,
    CV > 10 & CV <= 20 ~ 2,
    CV > 20 & CV <= 30 ~ 3,
    CV > 30 ~ 4,
    TRUE ~ NA_integer_  # Handle other cases if necessary
  ))

sum(duplicated(e2_keyed_full)) #no duplicates

## [1] 0

fix the education category

#unique(e2_keyed_full$ED_HighestGrade)

#reduce education categories to 4 levels
level_mapping <- c(
  "Some college, no degree" = "Some College or Associate Degree",
  "Professional school degree: MD, DDS, DVM, JD" = "Advanced Degree",
  "Associate degree: Occupational, technical, or vocational program" = "Some College or Associate Degree",
  "High school graduate" = "High School or Less",
  "Bachelor's degree: BA, AB, BS, BBA" = "Bachelor's Degree",
  "Master's degree: MA, MS, MEng, MEd, MBA" = "Advanced Degree",
  "GED or equivalent" = "High School or Less",
  "8th grade" = "High School or Less",
  "Doctoral degree: PhD, EdD" = "Advanced Degree",
  "Associate degree: Academic program" = "Some College or Associate Degree",
  "12th grade, no diploma" = "High School or Less",
  "11th grade" = "High School or Less",
  "9th grade" = "High School or Less",
  "10th grade" = "High School or Less",
  "7th grade" = "High School or Less"
)

#apply new mapping to education category
e2_keyed_full$ED_HighestGrade <- factor(e2_keyed_full$ED_HighestGrade, levels = names(level_mapping), labels = level_mapping)
#dim 1161 x 357

#check duplicates
sum(duplicated(e2_keyed_full)) #0

## [1] 0

pull other features of interest

#where is the NRS pain score
grep("NRS", names(e2_keyed_full), value = TRUE)

## [1] "ED_Somatic_NRS_SUM"  "WK2_Somatic_NRS_SUM" "WK8_Somatic_NRS_SUM"
## [4] "M3_Somatic_NRS_SUM"  "M6_Somatic_NRS_SUM"

#added plate number to control for batch effects
#create wide df below
#add 2016 widespread pain bc it has a slight negative correlation with e2
#cor.test(e2_full_wide$sqrt.conc,e2_full_wide$M6_Pain_Widespread_2016)

e2_full_wide <- e2_keyed_full[,c("PID","ED_Age","ED_HighestGrade","BMI","ED_RaceEthCode","sqrt.conc","ED_NowPain_C","WK8_Pain_C", "M3_Pain_C", "M6_Pain_C","WK8_Pain_Widespread_2016","M3_Pain_Widespread_2016","M6_Pain_Widespread_2016","WK8_AxialPain","M3_AxialPain","M6_AxialPain", "Plate.Number", "AlternateID","e2_confidence")] #dim 1161 x 19

#check duplicates
sum(duplicated(e2_full_wide)) #301?

## [1] 301

#e2_full_wide[duplicated(e2_full_wide), ]

#subset_df <- e2_keyed_full[e2_keyed_full$PID == 114723, ]
#subset_df <- subset_df[1:3,]
#duplicates from CRP

#delete CRP duplicates
e2_full_wide <- distinct(e2_full_wide) #dim 860 x19

#rename some of the columns
e2_full_wide <- e2_full_wide %>% 
  rename(ED_Pain = ED_NowPain_C) %>% 
  rename(WK8_Pain = WK8_Pain_C) %>% 
  rename(M3_Pain = M3_Pain_C) %>% 
  rename(M6_Pain = M6_Pain_C) %>% 
  rename(Race = ED_RaceEthCode)   

length(unique(e2_full_wide[["PID"]])) #466

## [1] 466

Create long df of estrogen and pain data

#for NRS pain
e2_full_long_NRS <- e2_full_wide %>%
  gather(key = names, value = NRS_pain, WK8_Pain, M3_Pain, M6_Pain) %>%
  mutate(timepoint = case_when( names == "WK8_Pain" ~ 2, names == "M3_Pain" ~ 3, 
                                names == "M6_Pain" ~ 6,TRUE ~ NA_real_)) %>%
  arrange(PID, timepoint) %>%
  mutate(index = row_number())

#for widespread pain
e2_full_long_WS <- e2_full_wide %>%
  gather(key = names, value = Widespread_pain_2016, WK8_Pain_Widespread_2016, M3_Pain_Widespread_2016, M6_Pain_Widespread_2016) %>%
  mutate(timepoint = case_when(names == "WK8_Pain_Widespread_2016" ~ 2,names == "M3_Pain_Widespread_2016" ~ 3,
                               names == "M6_Pain_Widespread_2016" ~ 6,TRUE ~ NA_real_)) %>%
  arrange(PID, timepoint) %>%
  mutate(index = row_number())%>%
  select(1, 17:20)

#for axial pain
e2_full_long_Axial <- e2_full_wide %>%
  gather(key = names, value = Axial_Pain, WK8_AxialPain, M3_AxialPain, M6_AxialPain) %>%
  mutate(timepoint = case_when(names == "WK8_AxialPain" ~ 2,names == "M3_AxialPain" ~ 3,
                               names == "M6_AxialPain" ~ 6,TRUE ~ NA_real_)) %>%
  arrange(PID, timepoint) %>%
  mutate(index = row_number()) %>%
  select(1, 17:20)


# Merge the data frames to make final e2_full_long
dfs <- list(e2_full_long_NRS, e2_full_long_WS, e2_full_long_Axial)
e2_full_long <- reduce(dfs, full_join, by = c("PID", "timepoint", "index")) #2580 x 24

#check
length(unique(e2_full_long[["PID"]])) #466 PID

## [1] 466

add postmenopause var and remove the column called index bc its no longer needed

e2_full_long <- e2_full_long %>%
  mutate(postmenopause = ifelse(ED_Age >= 50, 1, 0))  %>% 
  select(-index)
#2580 x 24

examine e2 values and re-level

#visualize
summary(e2_full_long$sqrt.conc)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.325   3.728   4.703   5.644   6.498  20.223

q1 <- quantile(e2_full_long$sqrt.conc, 0.25)
q3 <- quantile(e2_full_long$sqrt.conc, 0.75)

ggplot(data = e2_full_long, aes(x = sqrt.conc, y = ..density..)) +
  geom_density(color = "deeppink3", fill = "deeppink3", alpha = .4, size=1.8) +
  geom_histogram(fill = "skyblue", color = "gray20", alpha = 0.9, bins = 50) +
  geom_vline(xintercept = q1, linetype = "dashed", color = "gray30", size = .5) +
  geom_vline(xintercept = q3, linetype = "dashed", color = "gray30", size = .5) +
  annotate("text", x = q1, y = 0.24, label = paste("Q1 =", round(q1, 2)), color = "gray20", hjust = 1.3, size = 4) +
  annotate("text", x = q3, y = 0.24, label = paste("Q3 =", round(q3, 2)), color = "gray20", hjust = -0.1, size = 4) +
  labs(x = "Estrogen Level (sqrt.conc)", y = "Density",
       title = "Density Plot of Estrogen Levels")

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

explore cut off points, pick levels

summary(e2_full_long$sqrt.conc)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.325   3.728   4.703   5.644   6.498  20.223

length(e2_full_long[e2_full_long$sqrt.conc >= q3, "sqrt.conc"]) #645 values greater than or equal to 3rd quartile

## [1] 645

length(e2_full_long[e2_full_long$sqrt.conc <= q1, "sqrt.conc"]) #645 values less than or equal to 1st quartile

## [1] 645

length(e2_full_long$sqrt.conc[e2_full_long$sqrt.conc > q1 & e2_full_long$sqrt.conc < q3]) #1290 values between 1st and 3rd quartile

## [1] 1290

make new E2 categories based on quartiles

e2_full_long <- e2_full_long %>%
  mutate(e2_bin = case_when(
    sqrt.conc <= q1 ~ "low E2",
    sqrt.conc > q1 & sqrt.conc < q3 ~ "mid E2",
    sqrt.conc >= q3 ~ "high E2"
  ))
#new dimensions 2580 x25

make new levels for pain

e2_full_long <- e2_full_long %>%
  mutate(bin_pain = case_when(
    NRS_pain < 1 ~ "0 pain",
    NRS_pain > 0 ~ "Any pain"
  ))

e2_full_long <- e2_full_long %>%
  mutate(bin_pain2 = case_when(
    NRS_pain < 1 ~ "0 pain",
    NRS_pain > 0 & NRS_pain < 10 ~ "1 to 9 pain",
    NRS_pain > 9 ~ "10 pain"
  ))
#new dimensions 2580 x 27

#check duplicates
sum(duplicated(e2_full_long)) #no duplicates

## [1] 0

#check unique pts
length(unique(e2_full_long[["PID"]])) #432 PID

## [1] 466

output cleaned E2 x Aurora file

write.csv(e2_full_long, file = "cleaned_E2_and_Aurora_Data.csv", row.names = FALSE)

Cleaning e2 and aurora data

Alice Woolard

2024-03-14