Datasets

# Load package(s)
library(ggplot2)
library(tidyverse)
library(dplyr)
library(lubridate)

# Load datasets
# Paths are built with here::here() so the files are located relative to the
# project root rather than the current working directory. Each .rda file is
# loaded exactly once; load() restores the saved objects under the names they
# were saved with (presumably `cows` and `tech_stocks`, which are used below).
load(file = here::here("data", "cows.rda"))
load(file = here::here("data", "tech_stocks.rda"))
# The mpg dataset ships with ggplot2 and needs no explicit loading; run
# `?ggplot2::mpg` interactively to read its documentation.

# Read the pipe-delimited cdc file and store genhlth as a factor whose levels
# are listed from "excellent" down to "poor"
cdc <- mutate(
  read_delim(file = "data/cdc.txt", delim = "|"),
  genhlth = factor(
    genhlth,
    levels = c("excellent", "very good", "good", "fair", "poor")
  )
)

# Set seed
# Fixing the random number generator state makes randomised output (such as
# the point positions drawn by geom_jitter() in Exercise 1) reproducible
# from one run to the next.
set.seed(9876)

Above, I loaded four packages (ggplot2, tidyverse, dplyr, and lubridate) and three datasets (mpg, cows, and tech_stocks) that are used to build the layered plots below. I also read in the cdc dataset and set the random seed to 9876.

Exercises

Exercise 1

# Additional dataset for plot
# One row per vehicle class: the number of vehicles (n), the mean highway
# mileage (hwy), and a text label of the form "n = <count>"
class_dat <- summarise(
  group_by(mpg, class),
  n = n(),
  hwy = mean(hwy),
  label = str_c("n = ", n)
)

The code above creates a summary dataset called class_dat from the mpg dataset. It holds the number of vehicles and the mean highway mileage for each class, and it supplies the "n = ..." labels that are added to the jitter plot below.

# Mapping of data
# Jittered highway mileage by vehicle class, with a large semi-transparent red
# point at each class mean and the class sample size printed at y = 10
ggplot(mpg, aes(class, hwy)) +
  geom_jitter(width = 0.1) +
  # `fun` replaces the `fun.y` argument, which is deprecated as of
  # ggplot2 3.3.0
  stat_summary(
    geom = "point", fun = mean, colour = "red",
    size = 5, alpha = 0.6
  ) +
  # Labels are taken from class_dat; x (class) is inherited from ggplot()
  geom_text(data = class_dat, aes(y = 10, label = label)) +
  xlab("Vehicle class") +
  ylab("Highway miles per gallon")

Above, I used geom_jitter to draw jittered points of hwy against class from the mpg dataset. The stat_summary layer adds a larger red point at the mean highway mileage of each class, and geom_text prints the number of vehicles in each class.

Exercise 2

# Graphic dataset
# Per-breed mean and standard error of butterfat, leaving out the "Canadian"
# breed; breed is stored as a factor reordered by descending mean
cow_means <- filter(cows, breed != "Canadian") %>%
  group_by(breed) %>%
  summarise(mean = mean(butterfat), se = sd(butterfat) / sqrt(n())) %>%
  mutate(breed = fct_reorder(factor(breed), desc(mean)))

# Critical value of the standard normal distribution for a two-sided
# 95% confidence interval (the 97.5th percentile)
z_star_95 <- qnorm(p = 0.975)

# Mapping dataset
# Bars show the mean butterfat for each breed; the error bars span the mean
# plus or minus z_star_95 standard errors
ggplot(cow_means, aes(x = breed, y = mean)) +
  geom_bar(
    stat = "identity", position = position_dodge(),
    fill = "#56B4E9"
  ) +
  geom_errorbar(
    aes(ymin = mean - z_star_95 * se, ymax = mean + z_star_95 * se),
    position = position_dodge(0.7), width = 0.1
  ) +
  labs(
    x = "Cattle breed",
    y = "Mean percent butterfat content in milk"
  )

Above, I created a bar chart of the mean butterfat content for each cattle breed. The geom_errorbar layer draws a 95 percent confidence interval for the mean at the top of each bar.

Exercise 3

# percentage increase data
# Sort newest-first and keep the first row per company (its most recent
# observation), then add the percentage change of price relative to
# index_price, a rounded "<value>%" label, and company as a factor ordered
# by that percentage
perc_increase <- tech_stocks %>%
  ungroup(ticker) %>%
  arrange(desc(date)) %>%
  distinct(company, .keep_all = TRUE) %>%
  mutate(perc = 100 * (price - index_price) / index_price) %>%
  mutate(
    label = str_c(round(perc), "%"),
    company = fct_reorder(factor(company), perc)
  )

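Above, the perc_increase dataset is built from the tech_stocks dataset: it keeps the most recent observation for each company and computes the percentage change of its price relative to its index price, which is the quantity shown in the plot below.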

# Horizontal bar chart of the percentage increase for each company, with the
# rounded percentage printed in white on each bar
ggplot(perc_increase, aes(x = company, y = perc)) +
  # No `size` argument: the bars have no outline colour, so an outline width
  # draws nothing (and `size` for lines is deprecated in ggplot2 >= 3.4.0)
  geom_bar(position = "dodge", stat = "identity", fill = "#56B4E9") +
  # x and y are inherited from ggplot(); hjust = 1.1 right-aligns each label
  # slightly short of the bar's end value
  geom_text(aes(label = label), hjust = 1.1, color = "white") +
  coord_flip() +
  # Empty strings blank out both axis titles
  xlab("") +
  ylab("")

Above, geom_bar draws one horizontal bar per tech company showing the percentage increase of its stock price, and geom_text prints that percentage at the end of each bar.

Exercise 4

# 95% CI for weight for genhlth, gender groups
# mean_wt: mean weight in the group
# se:      standard error of that mean
# moe:     t-based 95% margin of error (n - 1 degrees of freedom)
cdc_weight_95ci <- cdc %>%
  group_by(genhlth, gender) %>%
  summarise(
    mean_wt = mean(weight),
    se = sd(weight) / sqrt(n()),
    moe = qt(0.975, n() - 1) * se
  ) %>%
  # summarise() only peels off the last grouping variable, so the result
  # would stay grouped by genhlth; ungroup to return a plain summary table
  ungroup()

# Critical value of the standard normal distribution for a two-sided
# 95% confidence interval (the 97.5th percentile)
z_star_95 <- qnorm(p = 0.975)

The code above computes the mean weight, its standard error, and the 95 percent margin of error for each combination of genhlth and gender in the cdc dataset.

# Mean weight by gender, coloured by general health. The error bars span the
# mean plus or minus z_star_95 standard errors, and the points and error bars
# are dodged by the same amount so that they line up.
ggplot(cdc_weight_95ci, aes(x = gender, y = mean_wt, color = genhlth)) +
  geom_errorbar(
    aes(ymin = mean_wt - z_star_95 * se, ymax = mean_wt + z_star_95 * se),
    position = position_dodge(0.5), width = 0.1
  ) +
  # colour is inherited from the ggplot() mapping
  geom_point(position = position_dodge(0.5)) +
  coord_flip() +
  labs(x = "Gender", y = "Weight(lbs)")

Above, I used geom_errorbar to draw a 95 percent confidence interval for the mean weight of each gender and general-health group. The geom_point layer places a dot at the mean, in the middle of each error bar.