Datasets

We’ll be using the titanic.rda, Aus_athletes.rda, and cdc.txt datasets.

# Load package(s)
library(ggplot2)
library(tidyverse)
## -- Attaching packages ------------------------------------------------- tidyverse 1.2.1 --
## v tibble  2.1.1       v purrr   0.3.2  
## v tidyr   0.8.3       v dplyr   0.8.0.1
## v readr   1.3.1       v stringr 1.4.0  
## v tibble  2.1.1       v forcats 0.4.0
## -- Conflicts ---------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(dplyr)
library(cowplot)
## 
## Attaching package: 'cowplot'
## The following object is masked from 'package:ggplot2':
## 
##     ggsave
# Load datasets: each .rda file restores its saved object(s) into the workspace
load("data/titanic.rda")
load("data/Aus_athletes.rda")

# Read the pipe-delimited cdc file and convert genhlth to a factor whose
# levels run from "Excellent" down to "Poor", with capitalised labels
cdc <- mutate(
  read_delim("data/cdc.txt", delim = "|"),
  genhlth = factor(
    genhlth,
    levels = c("excellent", "very good", "good", "fair", "poor"),
    labels = c("Excellent", "Very Good", "Good", "Fair", "Poor")
  )
)
## Parsed with column specification:
## cols(
##   genhlth = col_character(),
##   exerany = col_double(),
##   hlthplan = col_double(),
##   smoke100 = col_double(),
##   height = col_double(),
##   weight = col_double(),
##   wtdesire = col_double(),
##   age = col_double(),
##   gender = col_character()
## )
# Fix the random seed so the same subset is drawn on every run
set.seed(8221984)

# Draw a random subset of 1000 rows from cdc
cdc_small <- sample_n(cdc, size = 1000)

Above, I have loaded the ggplot2, tidyverse, dplyr, and cowplot packages, along with the titanic.rda, Aus_athletes.rda, and cdc.txt datasets.

Exercise 1

# Facet strip text: class values are shown unchanged, while the survived
# codes "0" and "1" are spelled out as "died" and "survived"
labels <- c(
  "1st" = "1st",
  "2nd" = "2nd",
  "3rd" = "3rd",
  "0" = "died",
  "1" = "survived"
)

# Bar chart of passenger counts by sex, with one panel per combination of
# survived (rows) and class (columns). The legend is switched off because
# the x axis already names each bar.
ggplot(titanic, aes(x = sex, fill = sex)) +
  geom_bar(show.legend = FALSE) +
  scale_fill_manual(values = c("#D55E00D0", "#0072B2D0")) +
  facet_grid(
    survived ~ class,
    scales = "free",
    labeller = as_labeller(labels)
  ) +
  labs(x = NULL) +
  theme_minimal()

Here, I have created a bar plot faceted with facet_grid by the survived and class variables. I believe this lets us compare, at a single glance, how many passengers of each sex died and survived within each class.

Exercise 2

# Sports played by BOTH sexes: count the distinct sex values seen for each
# sport and keep the sports where that count is exactly two
both_sports <- Aus_athletes %>%
  group_by(sport) %>%
  summarise(n = n_distinct(sex)) %>%
  filter(n == 2) %>%
  pull(sport)

# Keep only athletes in those sports, then merge the two track events
# into a single "track" category (all other sport names are left untouched)
athletes_dat <- Aus_athletes %>%
  filter(sport %in% both_sports) %>%
  mutate(
    sport = case_when(
      sport %in% c("track (400m)", "track (sprint)") ~ "track",
      TRUE ~ sport
    )
  )

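Above, the athletes_dat dataset is formed from the Aus_athletes dataset by keeping only the sports played by both sexes and merging the two track events ("track (400m)" and "track (sprint)") into a single "track" category.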

# Panel a: bar chart of the number of athletes of each sex.
# The y axis is fixed to 0-95 with no expansion so the bars sit on the axis.
plot.a <- ggplot(athletes_dat, aes(x = sex, fill = sex)) +
  geom_bar(show.legend = FALSE) +
  scale_x_discrete(breaks = c("f", "m"), labels = c("female", "male")) +
  scale_y_continuous(
    limits = c(0, 95),
    breaks = c(0, 25, 50, 75),
    expand = c(0, 0)
  ) +
  scale_fill_manual(values = c("#D55E00D0", "#0072B2D0")) +
  labs(x = NULL, y = "number") +
  theme_minimal()
           
# Panel b: scatter plot of wcc against rcc, with points coloured by sex
# (legend hidden; the colours match the other panels)
plot.b <- ggplot(athletes_dat, aes(x = rcc, y = wcc, colour = sex)) +
  geom_point(size = 3, show.legend = FALSE) +
  scale_colour_manual(values = c("#D55E00D0", "#0072B2D0")) +
  labs(x = "RBC count", y = "WBC count") +
  theme_minimal()

# Panel c: box plots of percent body fat (pcBfat) per sport, split by sex.
# Fill and colour share the same breaks and labels so that ggplot2 draws a
# single combined legend, placed inside the panel at the top-right corner.
plot.c <- ggplot(athletes_dat, aes(sport, pcBfat)) +
  geom_boxplot(aes(fill = sex, colour = sex), width = 0.5) +
  scale_fill_manual(
    values = c("#D55E0040", "#0072B240"),
    breaks = c("f", "m"),
    labels = c("female", "male")
  ) +
  scale_colour_manual(
    values = c("#D55E00", "#0072B2"),
    breaks = c("f", "m"),
    labels = c("female", "male")
  ) +
  # The fill override must live inside override.aes; passed directly to
  # guide_legend() it is not a recognised argument and has no effect.
  guides(colour = guide_legend(
    override.aes = list(
      colour = NA,
      fill = c("#D55E0040", "#0072B240")
    )
  )) +
  labs(x = NULL, y = "% body fat") +
  theme_minimal() +
  # theme() must follow theme_minimal(), which would otherwise reset it
  theme(
    legend.justification = c(1, 1),
    legend.position = c(1, 1),
    legend.title = element_blank(),
    legend.direction = "horizontal"
  )

# Combine plots
# cowplot is already attached by library(cowplot) in the setup code, so no
# require() call is needed here (require() would also fail silently).
#
# Vertically align the left axis of the wide bottom panel (plot.c) with the
# left axis of the first top-row panel (plot.a). The axis argument takes the
# letter "l" (left), and alignment has to be done on the individual panels
# before the top row is assembled.
plots <- align_plots(plot.c, plot.a, align = "v", axis = "l")

# Top row: the aligned version of plot.a next to plot.b
top_row_graphs <- plot_grid(plots[[2]], plot.b, labels = c("", ""))

# Stack the top row above the aligned version of plot.c, giving the bottom
# panel slightly more height
plot_grid(
  top_row_graphs, plots[[1]],
  labels = c("", ""),
  ncol = 1,
  rel_heights = c(1, 1.2)
)

Above, I have combined the three plots (plot.a, plot.b, and plot.c), all built from the athletes_dat dataset, into a single figure using the plot_grid function from the cowplot package.

Exercise 3

# Modify data
# Weight_Loss_Gain_in_Pounds: desired weight minus current weight, so a
#   negative value means the respondent wants to lose weight.
# genhlth_f: genhlth with its levels reordered from "Poor" to "Excellent",
#   which sets the left-to-right order of the facet columns.
cdc_small <- cdc_small %>%
  mutate(
    Weight_Loss_Gain_in_Pounds = wtdesire - weight,
    genhlth_f = factor(
      genhlth,
      levels = c("Poor", "Fair", "Good", "Very Good", "Excellent")
    )
  )

Above, I have modified the cdc_small dataset by adding a new variable, “Weight_Loss_Gain_in_Pounds”, equal to the wtdesire variable minus the weight variable, and a new factor, genhlth_f, that orders general health from Poor to Excellent.

# Facet strip text: health categories are shown unchanged, while the
# gender codes "f" and "m" are spelled out as "Women" and "Men"
labels3 <- c(
  "Poor" = "Poor",
  "Fair" = "Fair",
  "Good" = "Good",
  "Very Good" = "Very Good",
  "Excellent" = "Excellent",
  "f" = "Women",
  "m" = "Men"
)

# Copy of the data without the gender column. A layer whose data lacks a
# faceting variable is repeated in every panel along that variable, so
# these points form the grey background in both gender rows.
df2 <- cdc_small %>% dplyr::select(-gender)

# Desired weight change against current weight, faceted by gender (rows)
# and general health (columns). The grey layer is added first so that the
# coloured, gender-specific points are drawn on top of it.
ggplot(cdc_small, aes(x = weight, y = Weight_Loss_Gain_in_Pounds)) +
  geom_point(data = df2, colour = "grey80") +
  geom_point(aes(colour = gender), size = 2, show.legend = FALSE) +
  facet_grid(gender ~ genhlth_f, labeller = as_labeller(labels3)) +
  scale_x_continuous(breaks = c(100, 200, 300)) +
  scale_y_continuous(breaks = c(-100, 0)) +
  scale_colour_manual(values = c("#D55E00D0", "#0072B2D0")) +
  labs(x = "Weight(lbs)", y = "Weight Loss/Gain in Pounds") +
  theme_minimal()

Here, I created a new dataset, df2, that drops the gender variable so that all of the points can be drawn in grey in the background of every panel, with only the points for the panel's own gender shown in color. The facet_grid function then splits the plot into panels by gender and general health status.