Homework 5

Author

Andrea Nunez

Published

October 4, 2023


Setup Code

#==============================================================================#
# Setup Options
#==============================================================================#

# remove all objects if restarting script
rm(list=ls())

# options
options(
  tibble.width = Inf,  # print all columns
  scipen = 999         # remove scientific notation
)

#==============================================================================#
# Install Packages
#==============================================================================#

# In R, we first have to download the packages we want from the online 
# repository called CRAN. 

# Once installed, you have to load it in each session with the library() 
# function. 

# download package
#install.packages("DT")
#install.packages("lubridate")
#install.packages("tidyverse")

#==============================================================================#
# Packages
#==============================================================================#

# Here we must load the packages for the current environment to make them
# accessible. 

# load libraries
library(DT)
library(plotly)
Loading required package: ggplot2
Warning: package 'ggplot2' was built under R version 4.1.3

Attaching package: 'plotly'
The following object is masked from 'package:ggplot2':

    last_plot
The following object is masked from 'package:stats':

    filter
The following object is masked from 'package:graphics':

    layout
library(cowplot)
Warning: package 'cowplot' was built under R version 4.1.3
library(lubridate)
Warning: package 'lubridate' was built under R version 4.1.3

Attaching package: 'lubridate'
The following object is masked from 'package:cowplot':

    stamp
The following objects are masked from 'package:base':

    date, intersect, setdiff, union
library(tidyverse)
Warning: package 'tidyverse' was built under R version 4.1.3
Warning: package 'tibble' was built under R version 4.1.3
Warning: package 'tidyr' was built under R version 4.1.3
Warning: package 'readr' was built under R version 4.1.3
Warning: package 'purrr' was built under R version 4.1.3
Warning: package 'dplyr' was built under R version 4.1.3
Warning: package 'stringr' was built under R version 4.1.3
Warning: package 'forcats' was built under R version 4.1.3
-- Attaching core tidyverse packages ------------------------ tidyverse 2.0.0 --
v dplyr   1.1.2     v stringr 1.5.0
v forcats 1.0.0     v tibble  3.2.1
v purrr   1.0.1     v tidyr   1.3.0
v readr   2.1.4     
-- Conflicts ------------------------------------------ tidyverse_conflicts() --
x dplyr::filter()    masks plotly::filter(), stats::filter()
x dplyr::lag()       masks stats::lag()
x lubridate::stamp() masks cowplot::stamp()
i Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
#==============================================================================#
# Set Paths
#==============================================================================#

# set all paths
path_main    <- "C:\\Users\\anune\\Downloads\\ANS500AB\\AndreaNunez"
path_data    <- str_c(path_main, "Data/", sep="")
path_plots   <- str_c(path_main, "Plots/", sep="")
path_scripts <- str_c(path_main, "Scripts/", sep="")

# NOTE: str_c() is from the stringr package, which is a part of 'tidyverse'
# this is equivalent to paste() in base R, but we'll try to use tidyverse
# functions when possible in this class. 

# set working directory
#setwd(path_main)

# NOTE: We cannot set the working directory in quarto with this method
# as we do in an R script. Use the root.dir option in YAML. 

#==============================================================================#
# Set Inputs
#==============================================================================#

# set file names here
data_file_milk <- "Sheep_Milk_Wide.csv"
data_file_wt   <- "Sheep_Weight_Wide.csv"

#==============================================================================#
# Check session info again
#==============================================================================#

# check sessionInfo
sessionInfo()
R version 4.1.2 (2021-11-01)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 10 x64 (build 22621)

Matrix products: default

locale:
[1] LC_COLLATE=English_United States.1252 
[2] LC_CTYPE=English_United States.1252   
[3] LC_MONETARY=English_United States.1252
[4] LC_NUMERIC=C                          
[5] LC_TIME=English_United States.1252    

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] forcats_1.0.0   stringr_1.5.0   dplyr_1.1.2     purrr_1.0.1    
 [5] readr_2.1.4     tidyr_1.3.0     tibble_3.2.1    tidyverse_2.0.0
 [9] lubridate_1.9.2 cowplot_1.1.1   plotly_4.10.2   ggplot2_3.4.2  
[13] DT_0.20        

loaded via a namespace (and not attached):
 [1] compiler_4.1.2    pillar_1.9.0      tools_4.1.2       digest_0.6.29    
 [5] timechange_0.2.0  viridisLite_0.4.0 jsonlite_1.8.4    evaluate_0.14    
 [9] lifecycle_1.0.3   gtable_0.3.0      pkgconfig_2.0.3   rlang_1.1.0      
[13] cli_3.6.1         rstudioapi_0.15.0 yaml_2.2.1        xfun_0.29        
[17] fastmap_1.1.0     httr_1.4.6        withr_2.5.0       knitr_1.37       
[21] hms_1.1.3         generics_0.1.1    vctrs_0.6.1       htmlwidgets_1.5.4
[25] grid_4.1.2        tidyselect_1.2.0  glue_1.6.2        data.table_1.14.2
[29] R6_2.5.1          fansi_0.5.0       rmarkdown_2.11    tzdb_0.2.0       
[33] magrittr_2.0.3    scales_1.2.1      htmltools_0.5.2   colorspace_2.0-2 
[37] utf8_1.2.2        stringi_1.7.6     lazyeval_0.2.2    munsell_0.5.0    


Question 1

Use read_delim() (must use) to read in both the milk and weight data from the sheep dataset we worked on in week 1. Please read them in how they are and do not alter them prior to reading in.

library(tidyverse)
library(car)
Warning: package 'car' was built under R version 4.1.3
Loading required package: carData

Attaching package: 'car'
The following object is masked from 'package:dplyr':

    recode
The following object is masked from 'package:purrr':

    some
library(readr)
library(DT)
library(readr)

data_weigth <- read_delim(
  file = "C:\\Users\\anune\\Downloads\\ANS500AB\\AndreaNunez\\Sheep_Weight_Wide.csv",
  delim = ",",      
  skip  = 1,       
  col_names = c("EweType","Diet","Wk0","Wk1","Wk2", "Wk3","Wk4","Wk5","Wk6","Wk7"),
          col_types = "ffnnnnnnnn", # Set Types
          na = c(""))


datatable(data_weigth)
data_milk <- read_delim(
  file = "C:\\Users\\anune\\Downloads\\ANS500AB\\AndreaNunez\\Sheep_Milk_Wide.csv",
  delim = ",",      
  skip  = 1,       
  col_names = c("EweType","Diet","Wk0","Wk1","Wk2", "Wk3","Wk4","Wk5","Wk6","Wk7"),
          col_types = "ffnnnnnnnn", # Set Types
          na = c(""))

datatable(data_milk)


Question 2

Part a

We do not have a unique ID in this dataset, but later we will need to join the datasets.

Create a unique identifier for each row in the wide format datasets. I suggest issing an operator such as 1:nrow(data) to get a unique sequence and then convert to a factor or character variable.

Also use relocate() to move this new ID column to the 1st column.

data_weigth <- data_weigth %>%
  mutate(ID = as.character(1:nrow(data_weigth)))

data_weigth <- data_weigth %>%
  relocate(ID, .before = 1)

#now with the milk data set 
data_milk <- data_milk %>%
  mutate(ID = as.character(1:nrow(data_milk)))

data_milk <- data_weigth %>%
  relocate(ID, .before = 1)

Part b

Please pivot the milk dataset to the current ‘wide’ format to the ‘long’ format. I suggest saving this dataset as another name for later and then printing.

data_long_milk <- data_milk %>%
  pivot_longer(
    cols      = Wk0:Wk7, 
    names_to = "Week",        
    values_to = "MilkProduction"       
  )

print(data_long_milk)
# A tibble: 240 x 5
   ID    EweType Diet  Week  MilkProduction
   <chr> <fct>   <fct> <chr>          <dbl>
 1 1     1       1     Wk0             67.5
 2 1     1       1     Wk1             68  
 3 1     1       1     Wk2             68  
 4 1     1       1     Wk3             66.5
 5 1     1       1     Wk4             69.5
 6 1     1       1     Wk5             69.5
 7 1     1       1     Wk6             72  
 8 1     1       1     Wk7             77.5
 9 2     1       1     Wk0             86  
10 2     1       1     Wk1             88  
# i 230 more rows

Part c

Please pivot the weight dataset to the current ‘wide’ format to the ‘long’ format. I suggest saving this dataset as another name for later and then printing.

data_long_weigth <- data_weigth %>%
pivot_longer(
  cols      = Wk0:Wk7,
  names_to  = "Week", 
  values_to = "Weight"
)

print(data_long_weigth)
# A tibble: 240 x 5
   ID    EweType Diet  Week  Weight
   <chr> <fct>   <fct> <chr>  <dbl>
 1 1     1       1     Wk0     67.5
 2 1     1       1     Wk1     68  
 3 1     1       1     Wk2     68  
 4 1     1       1     Wk3     66.5
 5 1     1       1     Wk4     69.5
 6 1     1       1     Wk5     69.5
 7 1     1       1     Wk6     72  
 8 1     1       1     Wk7     77.5
 9 2     1       1     Wk0     86  
10 2     1       1     Wk1     88  
# i 230 more rows


Question 3

Part a

Take the long milk data produced in question 2 and make histogram of Milk for week 0. Change the bar color. Save this plot for later.

Print with ggplotly() from the plotly package.

data_long_milk0 <- data_long_milk %>%
  filter(Week == "Wk0") %>%
  ggplot(aes(x = MilkProduction)) +    
  geom_histogram(fill = "#4D6291", color = "white")

ggplotly(data_long_milk0)
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Part b

Take the long milk data produced in question 2 and make histogram of Milk for week 7. Change the bar color to another unique color. Save this plot for later.

Print with ggplotly() from the plotly package.

data_long_milk7 <- data_long_milk %>%
  filter(Week == "Wk7") %>%
  ggplot(aes(x = MilkProduction)) +    
  geom_histogram(fill = "#BD6C99", color = "#F3A469")

ggplotly(data_long_milk7)
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Part c

Take the long weight data produced in question 2 and make histogram of Milk for week 0. Change the bar color to another unique color. Save this plot for later.

Print with ggplotly() from the plotly package.

data_long_weigth0 <- data_long_weigth %>%
filter(Week == "Wk0") %>%
ggplot(aes(x=Weight)) +    
   geom_histogram(fill = "#CCEDB1", color = "black")

ggplotly(data_long_weigth0)
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Part d

y Take the long weight data produced in question 2 and make histogram of Milk for week 7. Change the bar color to another unique color. Save this plot for later.

Print with ggplotly() from the plotly package.

data_long_weigth7 <- data_long_weigth %>%
filter(Week == "Wk7") %>%
ggplot(aes(x=Weight)) +    
   geom_histogram(fill = "#EEC9E5", color = "black")
ggplotly(data_long_weigth7)
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Part e

Use cowplot to plot all 4 into one plot. All 4 should have different bar colors. Use labels “A”, “B”, “C”, “D” for parts A-D respectively in the plot. A and B are from milk and C and D are from weight.

Don’t use ggplotly()

library(ggplot2)
library(cowplot)


pA <- data_long_milk0

pB <- data_long_milk7


pC <- data_long_weigth0
  

  
pD <- data_long_weigth7


plot_grid(pA, pB, pC, pD,
          nrow   = 2, 
          labels = c('A', 'B', 'C','D'))
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.


Question 4

Part a

Please join the 2 long datasets you created in question 2. Use what columns you need.

data_long_milk1 <- data_long_milk %>%
  select(ID, EweType, Diet, Week, MilkProduction)


datatable(data_long_milk1)
data_long_wg1 <- data_long_weigth %>%
  select(ID, EweType, Diet, Week, Weight)


datatable(data_long_wg1)
data_joined <- inner_join(data_long_milk1, data_long_wg1, by = c("ID", "EweType", "Diet", "Week"))

data_joined
# A tibble: 240 x 6
   ID    EweType Diet  Week  MilkProduction Weight
   <chr> <fct>   <fct> <chr>          <dbl>  <dbl>
 1 1     1       1     Wk0             67.5   67.5
 2 1     1       1     Wk1             68     68  
 3 1     1       1     Wk2             68     68  
 4 1     1       1     Wk3             66.5   66.5
 5 1     1       1     Wk4             69.5   69.5
 6 1     1       1     Wk5             69.5   69.5
 7 1     1       1     Wk6             72     72  
 8 1     1       1     Wk7             77.5   77.5
 9 2     1       1     Wk0             86     86  
10 2     1       1     Wk1             88     88  
# i 230 more rows


Question 5

Part a

Use the dataset from Question 4 (should have milk and weight data in long format).

Use a for loop to loop over all 3 diets.

  • Print a message with the current diet number
  • Print a scatterplot of Milk production on Weight for each diet
  • Add a linear line to the scatterplot
  • Add at minimum a title and subtitle, the subtitle should change with each diet (1,2,3)
unique_diets <- unique(data_joined$Diet)

scatterplots <- list()

for (diet in unique_diets) {
 
  diet_data <- subset(data.joined, Diet == diet)
  
  plot <- ggplot(diet_data, aes(x = Weight, y = MilkProduction)) +
    geom_point() +
    geom_smooth(method = "lm", se = FALSE) +
    labs(
      title = paste("Scatterplot of Milk Production on Weight (Diet", diet, ")"),
      subtitle = paste("Diet Number:", diet)
    )
  
  print(plot)
  
  scatterplots[[diet]] <- plot
}
Error in subset(data.joined, Diet == diet): object 'data.joined' not found