Homework 4

Author

AndreNunez

Published

September 27, 2023


Setup Code

#==============================================================================#
# Setup Options
#==============================================================================#

# NOTE: the former `rm(list=ls())` call was removed. It is not needed when the
# document is rendered, and when the chunk is run interactively it silently
# deletes every object in the user's workspace.

# session-wide options
options(
  tibble.width = Inf,  # print all columns of a tibble
  scipen = 999         # strongly prefer fixed over scientific notation
)

#==============================================================================#
# Install Packages
#==============================================================================#

# In R, we first have to download the packages we want from the online 
# repository called CRAN. 

# Once installed, you have to load it in each session with the library() 
# function. 

# download package
#install.packages("DT")
#install.packages("lubridate")
#install.packages("tidyverse")

#==============================================================================#
# Packages
#==============================================================================#

# Here we must load the packages for the current environment to make them
# accessible. 

# load libraries
library(DT)
library(lubridate)
Warning: package 'lubridate' was built under R version 4.1.3

Attaching package: 'lubridate'
The following objects are masked from 'package:base':

    date, intersect, setdiff, union
library(tidyverse)
Warning: package 'tidyverse' was built under R version 4.1.3
Warning: package 'ggplot2' was built under R version 4.1.3
Warning: package 'tibble' was built under R version 4.1.3
Warning: package 'tidyr' was built under R version 4.1.3
Warning: package 'readr' was built under R version 4.1.3
Warning: package 'purrr' was built under R version 4.1.3
Warning: package 'dplyr' was built under R version 4.1.3
Warning: package 'stringr' was built under R version 4.1.3
Warning: package 'forcats' was built under R version 4.1.3
-- Attaching core tidyverse packages ------------------------ tidyverse 2.0.0 --
v dplyr   1.1.2     v readr   2.1.4
v forcats 1.0.0     v stringr 1.5.0
v ggplot2 3.4.2     v tibble  3.2.1
v purrr   1.0.1     v tidyr   1.3.0
-- Conflicts ------------------------------------------ tidyverse_conflicts() --
x dplyr::filter() masks stats::filter()
x dplyr::lag()    masks stats::lag()
i Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
#==============================================================================#
# Set Paths
#==============================================================================#

# Project root directory, written with a trailing slash so that sub-folder
# names can be appended directly. This previously held the full path of the
# CSV file, which turned every derived path into something like
# ".../swine_data_MSU.csvData/".
path_main    <- "C:/Users/anune/Downloads/ANS500AB/AndreaNunez/"
path_data    <- str_c(path_main, "Data/")
path_plots   <- str_c(path_main, "Plots/")
path_scripts <- str_c(path_main, "Scripts/")

# NOTE: str_c() is from the stringr package, which is a part of 'tidyverse'.
# With its default separator it behaves like paste0() in base R, but we'll try
# to use tidyverse functions when possible in this class.

# set working directory
#setwd(path_main)

# NOTE: We cannot set the working directory in quarto with this method
# as we do in an R script. Use the root.dir option in YAML.

#==============================================================================#
# Set Inputs
#==============================================================================#

# name of the input data file (file name only, no directory)
data_file <- "swine_data_MSU.csv"

#==============================================================================#
# Check session info again
#==============================================================================#

# record the R version and attached package versions used for this render
sessionInfo()
R version 4.1.2 (2021-11-01)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 10 x64 (build 22621)

Matrix products: default

locale:
[1] LC_COLLATE=English_United States.1252 
[2] LC_CTYPE=English_United States.1252   
[3] LC_MONETARY=English_United States.1252
[4] LC_NUMERIC=C                          
[5] LC_TIME=English_United States.1252    

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] forcats_1.0.0   stringr_1.5.0   dplyr_1.1.2     purrr_1.0.1    
 [5] readr_2.1.4     tidyr_1.3.0     tibble_3.2.1    ggplot2_3.4.2  
 [9] tidyverse_2.0.0 lubridate_1.9.2 DT_0.20        

loaded via a namespace (and not attached):
 [1] compiler_4.1.2    pillar_1.9.0      tools_4.1.2       digest_0.6.29    
 [5] jsonlite_1.8.4    evaluate_0.14     lifecycle_1.0.3   gtable_0.3.0     
 [9] timechange_0.2.0  pkgconfig_2.0.3   rlang_1.1.0       cli_3.6.1        
[13] rstudioapi_0.15.0 yaml_2.2.1        xfun_0.29         fastmap_1.1.0    
[17] withr_2.5.0       knitr_1.37        hms_1.1.3         generics_0.1.1   
[21] vctrs_0.6.1       htmlwidgets_1.5.4 grid_4.1.2        tidyselect_1.2.0 
[25] glue_1.6.2        R6_2.5.1          fansi_0.5.0       rmarkdown_2.11   
[29] tzdb_0.2.0        magrittr_2.0.3    scales_1.2.1      htmltools_0.5.2  
[33] colorspace_2.0-2  utf8_1.2.2        stringi_1.7.6     munsell_0.5.0    


Question 1

Read in the MSU swine dataset. You can find it in the Data/ folder.

library(tidyverse)
library(car)
Warning: package 'car' was built under R version 4.1.3
Loading required package: carData

Attaching package: 'car'
The following object is masked from 'package:dplyr':

    recode
The following object is masked from 'package:purrr':

    some
library(readr)
library(DT)
library(readr)

# Read the MSU swine data. The file's own header row is skipped (skip = 1) and
# replaced by the column names given here.
#
# The column types are spelled out by name. The previous compact string
# "cddindfiic" listed only 10 types for the 12 columns, so it was unclear how
# the last two were handled; they are now explicitly left to readr's guessing.
# NOTE(review): the render still reported "parsing issues" for this file; run
# problems(data_MSU) to see which cells do not match the declared types.
data_MSU <- read_delim(
  file = "C:\\Users\\anune\\Downloads\\ANS500AB\\AndreaNunez\\swine_data_MSU.csv",
  delim = ",",
  skip = 1,
  col_names = c("ID", "wt_birth", "car_wt", "num_ribs", "car_bf10", "car_lma",
                "sex", "perc_duroc", "sire", "dam", "litter", "slgdt_cd"),
  col_types = cols(
    ID         = col_character(),
    wt_birth   = col_double(),
    car_wt     = col_double(),
    num_ribs   = col_integer(),
    car_bf10   = col_number(),
    car_lma    = col_double(),
    sex        = col_factor(),
    perc_duroc = col_integer(),
    sire       = col_integer(),
    dam        = col_character(),
    litter     = col_guess(),   # type not specified originally -- TODO confirm
    slgdt_cd   = col_guess()    # type not specified originally -- TODO confirm
  )
)
Warning: One or more parsing issues, call `problems()` on your data frame for details,
e.g.:
  dat <- vroom(...)
  problems(dat)
# Show the imported data as an interactive DT table.
data_MSU %>% datatable()

Question 2

Part a

Start with a simple bar chart counting the number of offspring per sire. Change all the bar colors to your favorite color.

# Number of offspring per sire, with every bar in one colour.
# The earlier version also counted by sex and mapped fill to sex, but the
# constant fill below overrode that mapping, so neither had any visible effect.
data_MSU %>%
  count(sire) %>%                    # one row per sire; n = number of offspring
  ggplot(aes(x = sire, y = n)) +     # sire on the x axis, count on the y axis
  geom_col(fill = "#93ABE1")         # constant fill colour for all bars

Part b

Copy part a and change the fill color by Sex.

# Offspring counts per sire and sex; bar segments are filled by sex.
ggplot(
  data = count(data_MSU, sire, sex),
  mapping = aes(x = sire, y = n, fill = sex)
) +
  geom_col() +
  scale_fill_manual(values = c("M" = "red", "F" = "blue"))

Part c

Copy part b and change the fill color to your two favorite colors (by sex).

# Same chart as part b, with a custom two-colour palette keyed by sex.
ggplot(
  data = count(data_MSU, sire, sex),
  mapping = aes(x = sire, y = n, fill = sex)
) +
  geom_col() +
  scale_fill_manual(values = c("M" = "#93ABE1", "F" = "#FFA07A"))

Part d

Copy part c and now facet by Sex (along with the fill by Sex).

# Same chart as part c, split into one panel per sex.
ggplot(
  data = count(data_MSU, sire, sex),
  mapping = aes(x = sire, y = n, fill = sex)
) +
  geom_col() +
  scale_fill_manual(values = c("M" = "#93ABE1", "F" = "#FFA07A")) +
  facet_wrap(facets = ~ sex)

Part e

Copy part d and now add all 5 labels to the plot.

# Faceted bar chart of offspring per sire with title, subtitle, axis labels
# and caption. The caption typo ("numer") is corrected.
data_MSU %>%
  count(sire, sex) %>%                       # n = offspring per sire and sex
  ggplot(aes(x = sire, y = n, fill = sex)) +
  geom_col() +
  scale_fill_manual(values = c("M" = "#537a59", "F" = "#FFA07A")) +
  facet_wrap(~ sex) +                        # one panel per sex
  labs(
    title = "Bar chart of offspring per sire",
    subtitle = "MSU swine data set",
    x = "Sire",
    y = "Count (n)",
    caption = "Count the number of offspring per sire utilizing the MSU swine data set"
  )

Part f

Copy part e and change the color of the title to your favorite color. The theme option you need is plot.title.

# Same chart as part e, with the plot title restyled through theme().
# The caption typo ("numer") is corrected and the theme() call is tidied.
data_MSU %>%
  count(sire, sex) %>%                       # n = offspring per sire and sex
  ggplot(aes(x = sire, y = n, fill = sex)) +
  geom_col() +
  scale_fill_manual(values = c("M" = "#537a59", "F" = "#FFA07A")) +
  facet_wrap(~ sex) +                        # one panel per sex
  labs(
    title = "Bar chart of offspring per sire",
    subtitle = "MSU swine data set",
    x = "Sire",
    y = "Count (n)",
    caption = "Count the number of offspring per sire utilizing the MSU swine data set"
  ) +
  # plot.title controls only the main title text
  theme(plot.title = element_text(size = 12, color = "darkblue"))

Question 3

Part a

Please create a simple scatterplot of carcass LMA on carcass BF.

# Scatterplot of carcass LMA (y) against carcass BF (x).
data_MSU %>%
  # Drop rows missing either measurement. tidyr::fill() was used here before,
  # but it copies the previous row's value into each gap, which plots points
  # that were never actually measured.
  drop_na(car_bf10, car_lma) %>%
  ggplot(aes(x = car_bf10, y = car_lma)) +
  geom_point()

Part b

Copy part a and now color the points by sex.

# Scatterplot of carcass LMA (y) against carcass BF (x), points coloured by sex.
data_MSU %>%
  # Drop rows missing either measurement. tidyr::fill() was used here before,
  # but it copies the previous row's value into each gap, which plots points
  # that were never actually measured.
  drop_na(car_bf10, car_lma) %>%
  ggplot(aes(x = car_bf10, y = car_lma, color = sex)) +
  geom_point() +
  scale_color_manual(values = c("M" = "#537a59", "F" = "#FFA07A"))

Part c

Copy part b and add a smooth line (will be 2 separate lines).

# Scatterplot coloured by sex with one smooth (loess) curve per sex.
data_MSU %>%
  # remove incomplete rows up front instead of letting each layer drop them
  # with a warning
  drop_na(car_bf10, car_lma) %>%
  ggplot(aes(x = car_bf10, y = car_lma, color = sex)) +
  geom_point() +
  geom_smooth(
    mapping = aes(group = sex),  # one curve per sex
    method = "loess",            # the method ggplot2 chose automatically before
    formula = y ~ x,
    se = FALSE,                  # no confidence band
    color = "purple",            # constant colour, so both curves are purple
    linewidth = 1,               # `size` is deprecated for lines since ggplot2 3.4.0
    linetype = "solid"
  ) +
  scale_color_manual(values = c("M" = "#537a59", "F" = "#FFA07A"))
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
i Please use `linewidth` instead.
`geom_smooth()` using method = 'loess'
Warning: Removed 13 rows containing non-finite values (`stat_smooth()`).
Warning: Removed 13 rows containing missing values (`geom_point()`).

Part d

Copy part c and change the lines to linear (straight) lines.

# Scatterplot coloured by sex with one straight (linear model) line per sex.
data_MSU %>%
  # remove incomplete rows up front instead of letting each layer drop them
  # with a warning
  drop_na(car_bf10, car_lma) %>%
  ggplot(aes(x = car_bf10, y = car_lma, color = sex)) +
  geom_point() +
  geom_smooth(
    mapping = aes(group = sex),  # one line per sex
    method = "lm",               # linear fit gives straight lines
    formula = y ~ x,
    se = FALSE,                  # no confidence band
    color = "purple",            # constant colour, so both lines are purple
    linewidth = 1,               # `size` is deprecated for lines since ggplot2 3.4.0
    linetype = "solid"
  ) +
  scale_color_manual(values = c("M" = "#537a59", "F" = "#FFA07A"))
Warning: Removed 13 rows containing non-finite values (`stat_smooth()`).
Warning: Removed 13 rows containing missing values (`geom_point()`).

Part e

Copy part d and change the point shapes by Sire.

# The shape aesthetic needs a discrete variable, so store sire as a factor.
# This changes data_MSU for the rest of the document.
data_MSU$sire <- as.factor(data_MSU$sire)

# Scatterplot coloured by sex, point shape by sire, one linear line per sex.
data_MSU %>%
  # remove incomplete rows up front instead of letting each layer drop them
  # with a warning
  drop_na(car_bf10, car_lma) %>%
  ggplot(aes(x = car_bf10, y = car_lma, color = sex)) +
  # shape is mapped on the point layer only; when it was in the global aes,
  # the smoothing layer dropped it with a warning
  geom_point(aes(shape = sire)) +
  stat_smooth(
    mapping = aes(group = sex),  # one line per sex
    method = "lm",
    formula = y ~ x,
    se = FALSE,                  # no confidence band
    color = "purple",            # constant colour, so both lines are purple
    linewidth = 2,               # `size` is deprecated for lines since ggplot2 3.4.0
    linetype = "solid"
  ) +
  scale_color_manual(values = c("M" = "#537a59", "F" = "#FFA07A"))
Warning: Removed 13 rows containing non-finite values (`stat_smooth()`).
Warning: The following aesthetics were dropped during statistical transformation: shape
i This can happen when ggplot fails to infer the correct grouping structure in
  the data.
i Did you forget to specify a `group` aesthetic or to convert a numerical
  variable into a factor?
Warning: Removed 13 rows containing missing values (`geom_point()`).

Question 4

Part a

Create a boxplot of birth weight on sire.

# Boxplot of birth weight (wt_birth) grouped by sire.
ggplot(data = data_MSU, mapping = aes(x = sire, y = wt_birth)) +
  geom_boxplot()

Part b

Copy part a and change the fill and outline colors of the boxplots.

# Boxplot of birth weight by sire with custom fill and outline colours.
ggplot(data = data_MSU, mapping = aes(x = sire, y = wt_birth)) +
  geom_boxplot(
    fill = "#6B4226",   # box interior
    color = "#2E86C1"   # box outline and whiskers
  )

Part c

Copy part b and facet by sex.

# Boxplot of birth weight by sire, one panel per sex.
ggplot(data = data_MSU, mapping = aes(x = sire, y = wt_birth)) +
  geom_boxplot(
    fill = "#6B4226",   # box interior
    color = "#2E86C1"   # box outline and whiskers
  ) +
  facet_wrap(facets = ~ sex)

Question 5

Part a

Start by making a table of the average backfat by sire as well as the SD.

# Mean and standard deviation of car_bf10 for each sire; missing values are
# excluded from both statistics.
summarise(
  group_by(data_MSU, sire),
  AvgBF = mean(car_bf10, na.rm = TRUE),
  SD = sd(car_bf10, na.rm = TRUE)
)
# A tibble: 6 x 3
  sire  AvgBF    SD
  <fct> <dbl> <dbl>
1 311    27.1  7.66
2 326    24.7  8.00
3 328    23.8  7.20
4 330    24.9  6.34
5 331    25.9  7.70
6 335    20.8  5.74

Part b

Now plot the average with SD in a bar chart.

# Bar chart of mean car_bf10 per sire with error bars of +/- one SD.
ggplot(
  data = summarise(
    group_by(data_MSU, sire),
    AvgBF = mean(car_bf10, na.rm = TRUE),
    SD = sd(car_bf10, na.rm = TRUE)
  ),
  mapping = aes(x = sire, y = AvgBF)
) +
  # geom_col() draws bar heights straight from the y values
  geom_col(fill = "#537a59", color = "black", width = 0.6) +
  geom_errorbar(
    mapping = aes(ymin = AvgBF - SD, ymax = AvgBF + SD),
    width = 0.2,
    color = "blue"
  )