Import and Wrangle CAMP Data

# Load the CAMP data set from disk; readr guesses the column types.
df <- readr::read_csv(file = "data/CAMP_3280.csv")
## Rows: 9947 Columns: 28
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): TG, ETHNIC
## dbl (26): TX, id, age_rz, GENDER, hemog, PREFEV, PREFVC, PREFF, PREPF, POSFE...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# TX: convert the numeric codes 0-3 to a factor and label all four levels
# in a single recode call.
df$TX <- fct_recode(
  as.factor(df$TX),
  "budesonide" = "0",
  "nedocromil" = "1",
  "budesonide placebo" = "2",
  "nedocromil placebo" = "3"
)

# TG: convert the letter codes A/B/C to a labelled factor.
df$TG <- fct_recode(
  as.factor(df$TG),
  "budesonide" = "A",
  "nedocromil" = "B",
  "placebo" = "C"
)

# GENDER: 0 is labelled "female", 1 is labelled "male".
df$GENDER <- fct_recode(
  as.factor(df$GENDER),
  "female" = "0",
  "male" = "1"
)

# ETHNIC: expand the single-letter codes into full labels.
df$ETHNIC <- fct_recode(
  as.factor(df$ETHNIC),
  "black" = "b",
  "hispanic" = "h",
  "other" = "o",
  "white" = "w"
)

# These five indicator columns all use the same coding (1 = yes, 2 = no).
# They are converted to labelled factors in one pass instead of repeating the
# same three statements per column, so the coding is defined in one place.
yes_no_cols <- c(
  "anypet", "woodstove", "dehumid", "parent_smokes", "any_smokes"
)
df <- df %>%
  mutate(across(
    all_of(yes_no_cols),
    function(x) fct_recode(as.factor(x), "yes" = "1", "no" = "2")
  ))

# camp: "low" when POSFVCPP is below 75, otherwise "normal"
# (stays NA wherever POSFVCPP is missing).
df$camp <- with(df, ifelse(POSFVCPP < 75, "low", "normal"))

# dFEV: percent change in FEV1 from the PRE to the POS measurement.
df$dFEV <- with(df, (POSFEV - PREFEV) / PREFEV * 100)

# First pass: stack columns 8-15 of df into name/value pairs (test, liters).
df.l1 <- df %>%
  pivot_longer(cols = 8:15, names_to = "test", values_to = "liters")

# Second pass: stack columns 8-11 of the intermediate table into
# name/value pairs (predicted, percent).
df.l <- df.l1 %>%
  pivot_longer(cols = 8:11, names_to = "predicted", values_to = "percent")

Question 1

Question 1 notes and explanations here…

#Month of follow-up is stored in long format; pre/post bronchodilator measurements are stored in wide format

Question 2

Question 2 notes and characteristics here (no code needed)…

#1. Variable names use underscores instead of spaces.

#2. Variable names do not start with numbers

#3. Some variable names are written in all capitals while others are lowercase -> inconsistent, not good

#4. No special characters

#5. No empty cells

Question 3

Question 3 notes and explanations here…

#6 measures, each with a PRE and a POS version: FEV, FVC, FF, PF, FEVPP, FVCPP

Question 4

Question 4 notes and explanations here…

# One row per distinct (id, GENDER, dehumid) combination, so each person
# contributes at most one yes/no answer.
df_unique <- distinct(df, id, GENDER, dehumid)
# Drop combinations in which any of the three fields is missing.
df_unique1 <- df_unique %>% filter(complete.cases(.))
# Keep only rows that are both "yes" for dehumid and male.
df.dehumid <- df_unique1 %>% filter(dehumid == "yes" & GENDER == "male")
#76

Question 5

Question 5 notes and explanations here…

# Mean POSFEV at each visit (visitc), ignoring missing readings.
# The first entry (visit 0) is the answer -> 1.808862
with(df, tapply(POSFEV, visitc, mean, na.rm = TRUE))
##        0        2        4       12       16       24       28       36 
## 1.808862 1.870573 1.898087 2.063120 2.116611 2.282950 2.358638 2.526866 
##       40       44       48       52       56       60       64       72 
## 2.603704 2.605000 2.788770 2.894299 2.977554 3.068783 2.944211 3.283833 
##       84       96      108      120 
## 3.492565 3.701359 3.811414 3.754789
#1.808862

Question 6

Question 6 notes and explanations here…

# Restrict to observations of Hispanic participants.
df.h <- df %>% filter(ETHNIC == "hispanic")
# Of those, keep rows with POSFEVPP of at least 80 or wbc of at least 100
# (i.e. drop rows where POSFEVPP is below 80 and wbc is below 100).
df.h1 <- df.h %>% filter(POSFEVPP >= 80 | wbc >= 100)
#933

Question 7

Question 7 notes and explanations here…

#Done in the wrangling code chunk at the top of the document

Question 8

Question 8 notes and explanations here…

# dFEV itself is created in the wrangling chunk at the top of the document.
# Print its summary (dFEV = percent change in FEV1, PRE vs POS).
summary(df[["dFEV"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
## -47.661   3.932   7.186   9.262  11.977 115.447      37
#3.932

Question 9

Question 9 notes and explanations here…

# Keep only the PRE/POS FEV1 readings and the follow-up day. The columns are
# picked by name (previously positions 8, 12 and 28) so the selection cannot
# silently pick the wrong columns if the column order of the file changes.
df.9 <- select(df, PREFEV, POSFEV, fdays)
# First two years of follow-up only, and both readings must be present.
df.9.1 <- filter(df.9, fdays <= 2 * 365, !is.na(POSFEV), !is.na(PREFEV))
#3717

Question 10

Question 10 notes and explanations here…

# Baseline entries only. fdays is numeric, so it is compared with the number
# 0 rather than the string "0" (which only matched through implicit
# numeric-to-character coercion).
df.b <- filter(df, fdays == 0)
# Grouped bar chart of mean POSFEV by pet status and gender; the error bars
# span mean - sd to mean + sd.
ggplot(df.b, aes(anypet, POSFEV, fill = GENDER)) +
  geom_bar(
    stat = "summary",
    fun = mean,
    position = position_dodge(.95)
  ) +
  geom_errorbar(
    stat = "summary",
    fun = mean,
    fun.min = function(x) mean(x) - sd(x),
    fun.max = function(x) mean(x) + sd(x),
    position = position_dodge(.95)
  ) +
  labs(
    title = "Baseline Post-bronchodilator FEV1 by Gender and Pet Status",
    x = "Does the Child have a Pet at Home?",
    y = "Baseline Post-bronchodilator FEV1 (L)"
  )

Question 11

# Summarise wbc within each treatment group to read off the median and
# quartiles (needed for the IQR-based outlier rule below).
with(df, tapply(wbc, TG, summary))
## $budesonide
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    27.0    56.0    68.0    70.7    80.0   177.0    2629 
## 
## $nedocromil
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   32.00   56.00   67.00   70.44   82.00  259.00    2619 
## 
## $placebo
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   29.00   57.00   69.00   71.33   81.00  231.00    3451
# Overall summary of wbc across all treatment groups.
summary(df[["wbc"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   27.00   56.00   68.00   70.87   81.00  259.00    8699
# Blank out high outliers: any wbc value above Q3 + 1.5 * IQR becomes NA.
# The quartiles are computed from the data instead of being typed in by hand
# (previously 56 and 81, copied from the summary output), so the cut-off
# stays correct if the data change.
# NOTE: this overwrites df$wbc in place, so run this chunk only once per
# session; a second run would recompute the fence on already-trimmed data.
wbc_q <- quantile(df$wbc, probs = c(0.25, 0.75), na.rm = TRUE)
wbc_upper_fence <- wbc_q[[2]] + 1.5 * (wbc_q[[2]] - wbc_q[[1]])
# which() skips NA comparisons, so rows that are already missing stay as is.
df$wbc[which(df$wbc > wbc_upper_fence)] <- NA
# Boxplot of the trimmed wbc values, one box per treatment group.
ggplot(df, aes(TG, wbc)) +
  geom_boxplot(
    color = "red",
    fill = "green"
  ) +
  labs(
    title = "White Blood Cell Count based on Treatment Group",
    x = "Treatment Group",
    y = "White Blood Cell Count (1000 cells/ul)"
  )
## Warning: Removed 8728 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

#No apparent differences in wbc between treatment groups (judged visually from the boxplot; no statistical test was run)

Import and Wrangle Opioid_Phenotype Data (Q12)

# Load the opioid phenotype data set; readr guesses the column types.
df.o <- readr::read_csv(file = "data/Opioid_Phenotype.csv")
## Rows: 38 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): Rat_ID, Sex, Strain, Treatment
## dbl (10): Day1, Day2, Day3, Day4, Day5, Day6, Day7, Day8, Day9, Day10
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Reshape to long format: one row per rat per day. The Day1-Day10 columns are
# selected by name prefix (previously positions 5:14) so the reshape does not
# depend on the column order of the file.
df.o.l <- pivot_longer(
  data = df.o,
  cols = starts_with("Day"),
  names_to = "day",
  values_to = "o_presses"
)

Question 13

Question 13 notes and explanations here…

# Data frame restricted to male rats.
df.o.m <- df.o.l %>% filter(Sex == "male")
# Mean o_presses for each strain.
with(df.o.m, tapply(o_presses, Strain, mean))
##     ACI      BN     SHR 
##  8.9625 12.3000 16.4750
#8.9625

Bonus (Q14)