# Load the CAMP data set. The column-type message printed by read_csv() is
# kept below for reference.
df <- read_csv(file = "data/CAMP_3280.csv")
## Rows: 9947 Columns: 28
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): TG, ETHNIC
## dbl (26): TX, id, age_rz, GENDER, hemog, PREFEV, PREFVC, PREFF, PREPF, POSFE...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Turn the coded categorical columns into labelled factors. Each column is
# recoded in one fct_recode() call, written as "new label" = "old code".
df$TX <- fct_recode(
  as.factor(df$TX),
  "budesonide" = "0",
  "nedocromil" = "1",
  "budesonide placebo" = "2",
  "nedocromil placebo" = "3"
)
df$TG <- fct_recode(
  as.factor(df$TG),
  "budesonide" = "A",
  "nedocromil" = "B",
  "placebo" = "C"
)
df$GENDER <- fct_recode(
  as.factor(df$GENDER),
  "female" = "0",
  "male" = "1"
)
df$ETHNIC <- fct_recode(
  as.factor(df$ETHNIC),
  "black" = "b",
  "hispanic" = "h",
  "other" = "o",
  "white" = "w"
)

# Household yes/no columns all share the same coding: 1 = yes, 2 = no.
df$anypet <- fct_recode(as.factor(df$anypet), "yes" = "1", "no" = "2")
df$woodstove <- fct_recode(as.factor(df$woodstove), "yes" = "1", "no" = "2")
df$dehumid <- fct_recode(as.factor(df$dehumid), "yes" = "1", "no" = "2")
df$parent_smokes <- fct_recode(
  as.factor(df$parent_smokes),
  "yes" = "1",
  "no" = "2"
)
df$any_smokes <- fct_recode(as.factor(df$any_smokes), "yes" = "1", "no" = "2")

# Derived columns.
# camp: "low" when POSFVCPP is below 75, otherwise "normal" (NA stays NA).
df$camp <- with(df, ifelse(POSFVCPP < 75, "low", "normal"))
# dFEV: percent change from PREFEV to POSFEV, relative to PREFEV.
df$dFEV <- with(df, (POSFEV - PREFEV) / PREFEV * 100)
# Reshape the lung-function measurements to long format.
#
# Columns are selected by name instead of by position (originally 8:15, then
# 8:11 of the intermediate table), so the reshape cannot silently pick up the
# wrong columns if the column order changes or a column is added upstream.
# NOTE(review): the names below follow the PRE*/POS* pattern visible in the
# column specification and used elsewhere in this script -- confirm against
# names(df) before relying on them.

# Step 1: the eight PRE/POS measurement columns -> `test` / `liters`.
df.l1 <- pivot_longer(
  data = df,
  cols = c(PREFEV, PREFVC, PREFF, PREPF, POSFEV, POSFVC, POSFF, POSPF),
  names_to = "test",
  values_to = "liters"
)

# Step 2: the four *PP columns -> `predicted` / `percent`.
# NOTE(review): because this pivots the already-long df.l1, every `test` row
# is repeated once for each of the four *PP columns; confirm that this
# crossed layout is what is wanted.
df.l <- pivot_longer(
  data = df.l1,
  cols = c(PREFEVPP, PREFVCPP, POSFEVPP, POSFVCPP),
  names_to = "predicted",
  values_to = "percent"
)
Question 1 notes and explanations here…
#month of follow up = long, pre/post bronchodilator = wide
Question 2 notes and characteristics here (no code needed)…
#1. Variable names use underscores instead of spaces.
#2. Variable names do not start with numbers
#3. Some column names are all uppercase (e.g. GENDER, ETHNIC) while others are lowercase -> inconsistent, not good
#4. No special characters
#5. No empty cells
Question 3 notes and explanations here…
#6 measures, each recorded both PRE and POS bronchodilator: FEV, FVC, FF, PF, FEVPP, FVCPP
Question 4 notes and explanations here…
# One row per distinct id / GENDER / dehumid combination, so a participant's
# repeated visits are not counted more than once.
df_unique <- distinct(df, id, GENDER, dehumid)
# Keep only rows with no missing value in any of the three columns.
df_unique1 <- df_unique[complete.cases(df_unique), ]
# Males whose dehumid answer is "yes".
df.dehumid <- df_unique1 %>% filter(dehumid == "yes" & GENDER == "male")
# 76
Question 5 notes and explanations here…
# Mean POSFEV (post FEV1) for each value of visitc, ignoring missing values.
with(df, tapply(POSFEV, visitc, mean, na.rm = TRUE))
## 0 2 4 12 16 24 28 36
## 1.808862 1.870573 1.898087 2.063120 2.116611 2.282950 2.358638 2.526866
## 40 44 48 52 56 60 64 72
## 2.603704 2.605000 2.788770 2.894299 2.977554 3.068783 2.944211 3.283833
## 84 96 108 120
## 3.492565 3.701359 3.811414 3.754789
# Mean at visitc 0: 1.808862
Question 6 notes and explanations here…
# Restrict to observations from hispanic participants.
df.h <- df %>% filter(ETHNIC == "hispanic")
# Keep rows where POSFEVPP >= 80 OR wbc >= 100. Because `TRUE | NA` is TRUE,
# a row with missing wbc is still kept when POSFEVPP >= 80.
# NOTE(review): if the question requires BOTH conditions, `|` should be `&`
# -- confirm against the question wording.
df.h1 <- df.h %>% filter(POSFEVPP >= 80 | wbc >= 100)
# 933
Question 7 notes and explanations here…
#In wrangling code chunk
Question 8 notes and explanations here…
#Also in old chunk
# Five-number summary (plus mean and NA count) of dFEV, the percent change
# in FEV1 between PRE and POS.
summary(df[["dFEV"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -47.661 3.932 7.186 9.262 11.977 115.447 37
# 1st quartile: 3.932
Question 9 notes and explanations here…
# Keep only pre/post FEV1 and the follow-up day count.
# Columns are selected by name rather than by position (originally
# c(8, 12, 28)), so the selection does not depend on column order; these are
# the same three columns the filter below uses.
df.9 <- select(df, PREFEV, POSFEV, fdays)
# Restrict to the first two years (fdays <= 730) and drop rows where either
# FEV1 measurement is missing.
df.9.1 <- filter(df.9, fdays <= 2 * 365, !is.na(POSFEV), !is.na(PREFEV))
# 3717
Question 10 notes and explanations here…
# Keep baseline rows only (fdays equal to 0). fdays is numeric, so it is
# compared with the number 0 instead of the string "0", which only worked
# through implicit coercion.
df.b <- filter(df, fdays == 0)

# Bar graph of POSFEV by anypet, split by GENDER: bar height is the group
# mean, error bars span mean - sd to mean + sd.
ggplot(df.b, aes(anypet, POSFEV, fill = GENDER)) +
  geom_bar(
    stat = "summary",
    fun = mean,
    position = position_dodge(.95)
  ) +
  geom_errorbar(
    stat = "summary",
    fun = mean,
    fun.min = function(x) mean(x) - sd(x),
    fun.max = function(x) mean(x) + sd(x),
    position = position_dodge(.95)
  ) +
  labs(
    title = "Baseline Post-bronchodilator FEV1 by Gender and Pet Status",
    x = "Does the Child have a Pet at Home?",
    y = "Baseline Post-bronchodilator FEV1 (L)"
  )
# Inspect the distribution of wbc (quartiles, median, NA count), first within
# each treatment group and then overall, before deciding on an outlier cutoff.
with(df, tapply(wbc, TG, summary))
## $budesonide
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 27.0 56.0 68.0 70.7 80.0 177.0 2629
##
## $nedocromil
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 32.00 56.00 67.00 70.44 82.00 259.00 2619
##
## $placebo
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 29.00 57.00 69.00 71.33 81.00 231.00 3451
summary(df[["wbc"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 27.00 56.00 68.00 70.87 81.00 259.00 8699
# Remove high outliers: wbc values above Q3 + 1.5 * IQR are set to NA.
# The quartiles are computed from the data rather than typed in from the
# summary() printout (56 and 81), so the cutoff cannot go stale if the data
# change. On the current data this gives the same cutoff of 118.5.
# NOTE: this overwrites df$wbc in place, so re-running this step would
# recompute the quartiles on the already-trimmed values.
wbc_quartiles <- quantile(df$wbc, probs = c(0.25, 0.75), na.rm = TRUE)
wbc_upper_fence <- wbc_quartiles[[2]] +
  1.5 * (wbc_quartiles[[2]] - wbc_quartiles[[1]])
df$wbc <- ifelse(df$wbc > wbc_upper_fence, NA, df$wbc)

# Boxplot of wbc by treatment group, to compare the groups visually.
ggplot(df, aes(TG, wbc)) +
  geom_boxplot(
    color = "red",
    fill = "green"
  ) +
  labs(
    title = "White Blood Cell Count based on Treatment Group",
    x = "Treatment Group",
    y = "White Blood Cell Count (1000 cells/ul)"
  )
## Warning: Removed 8728 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
# The boxes look similar across treatment groups (visual comparison only; no
# statistical test is run here).
# Load the opioid phenotype data. The column-type message printed by
# read_csv() is kept below for reference.
df.o <- read_csv("data/Opioid_Phenotype.csv")
## Rows: 38 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Rat_ID, Sex, Strain, Treatment
## dbl (10): Day1, Day2, Day3, Day4, Day5, Day6, Day7, Day8, Day9, Day10
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Reshape to long format: one row per rat per day.
# The Day1..Day10 columns are selected by name prefix rather than by position
# (originally 5:14), so the reshape does not depend on column order.
df.o.l <- pivot_longer(
  data = df.o,
  cols = starts_with("Day"),
  names_to = "day",
  values_to = "o_presses"
)
Question 13 notes and explanations here…
# Long-format rows for male rats only.
df.o.m <- df.o.l %>% filter(Sex == "male")
# Mean o_presses per strain, pooled over all rats and days.
with(df.o.m, tapply(o_presses, Strain, mean))
## ACI BN SHR
## 8.9625 12.3000 16.4750
# ACI: 8.9625