# Load package(s)
library(ggplot2)
library(tidyverse)
library(dplyr)
library(lubridate)
# Load datasets
# Every path is built with here::here() so it is resolved from the project
# root rather than the current working directory. cows.rda and
# tech_stocks.rda were previously loaded twice (once through a relative path
# and once through here::here()); loading each file once is sufficient.
load(file = here::here("data", "cows.rda"))
load(file = here::here("data", "tech_stocks.rda"))

# mpg ships with ggplot2, so it needs no explicit load. Run `?ggplot2::mpg`
# in an interactive session to read its documentation; the help call is kept
# out of the script because it is not part of the analysis.

# Read in the cdc dataset (pipe-delimited text). genhlth is stored as a factor
# with explicit levels so it is ordered from "excellent" to "poor" instead of
# alphabetically.
cdc <- read_delim(file = here::here("data", "cdc.txt"), delim = "|") %>%
  mutate(
    genhlth = factor(
      genhlth,
      levels = c("excellent", "very good", "good", "fair", "poor")
    )
  )

# Set seed so that randomised layers (e.g. geom_jitter) are reproducible
set.seed(9876)
Above, I loaded four packages (ggplot2, tidyverse, dplyr, and lubridate) and three datasets (mpg, cows.rda, and tech_stocks.rda) for use in the layered plots below. I also read in the cdc dataset and set the random seed to 9876.
# Per-class summary of mpg used to annotate the plot: the number of vehicles
# (n), the mean highway mileage (hwy) and a text label of the form
# "n = <count>".
class_dat <- summarise(
  group_by(mpg, class),
  n = n(),
  hwy = mean(hwy),
  label = str_c("n = ", n)
)
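The above code builds an additional data frame, class_dat, from the mpg dataset. It holds the number of vehicles, the mean highway mileage, and a text label for each vehicle class, and it is used to annotate the geom_jitter plot with summarised data points.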
# Highway mileage by vehicle class
# Layers: jittered raw observations, a large translucent red point at the mean
# of each class, and the per-class sample size (from class_dat) printed along
# y = 10.
ggplot(mpg, aes(class, hwy)) +
  geom_jitter(width = 0.1) +
  # `fun` replaces the `fun.y` argument, which was deprecated in ggplot2 3.3.0
  stat_summary(
    geom = "point", fun = mean, colour = "red",
    size = 5, alpha = 0.6
  ) +
  geom_text(data = class_dat, aes(y = 10, label = label)) +
  xlab("Vehicle class") +
  ylab("Highway miles per gallon")
Above, I used geom_jitter to plot jittered points of hwy against class from the mpg dataset. The stat_summary function adds a larger red point marking the mean highway mileage of each class, and geom_text prints the sample size of each class.
# Summary data for the butterfat plot: mean and standard error of butterfat
# for every breed except "Canadian". breed is then turned into a factor whose
# levels are ordered by descending mean.
cow_means <- cows %>%
  filter(breed != "Canadian") %>%
  group_by(breed) %>%
  summarise(mean = mean(butterfat), se = sd(butterfat) / sqrt(n())) %>%
  mutate(breed = fct_reorder(factor(breed), desc(mean)))
# Critical value for a two-sided 95% normal confidence interval: the 0.975
# quantile of the standard normal distribution (about 1.96)
z_star_95 <- stats::qnorm(p = 0.975)
# Mean butterfat by breed with 95% confidence intervals
# Bar height is the breed mean from cow_means; each error bar spans
# mean +/- z_star_95 * se.
ggplot(cow_means, aes(breed, mean)) +
  # geom_col() is the idiomatic form of geom_bar(stat = "identity"). No
  # position adjustment is used: cow_means has one row per breed, so the
  # earlier position_dodge() calls (with mismatched widths) moved nothing.
  geom_col(fill = "#56B4E9") +
  geom_errorbar(
    aes(ymin = mean - z_star_95 * se, ymax = mean + z_star_95 * se),
    width = 0.1
  ) +
  xlab("Cattle breed") +
  ylab("Mean percent butterfat content in milk")
Above, I created a bar chart of mean butterfat content with 95 percent confidence intervals. The geom_errorbar function draws each interval on top of the bar for the corresponding cattle breed.
# Percentage increase data
# For each company, keep the row with the latest date and express the gap
# between price and index_price as a percentage of index_price.
perc_increase <- tech_stocks %>%
  # NOTE(review): tech_stocks presumably arrives grouped by ticker - confirm.
  # A bare ungroup() removes every grouping variable and also accepts
  # ungrouped input, whereas ungroup(ticker) removes only that one variable
  # and is rejected by current dplyr when the data frame is not grouped.
  ungroup() %>%
  # Newest rows first, so distinct() keeps the most recent row per company
  arrange(desc(date)) %>%
  distinct(company, .keep_all = TRUE) %>%
  mutate(
    perc = 100 * (price - index_price) / index_price,
    # Rounded percentage used as the bar label, e.g. "42%"
    label = str_c(round(perc), "%"),
    # Order companies by perc so the bars are drawn in sorted order
    company = fct_reorder(factor(company), perc)
  )
Above, the perc_increase dataset is built from the tech_stocks dataset to supply the data for the plot below.
# Horizontal bar chart of the percentage increase per company. Each bar is
# labelled in white with its rounded percentage, right-aligned just short of
# the bar's value (hjust = 1.1).
ggplot(perc_increase, aes(x = company, y = perc)) +
  # geom_col() is the idiomatic form of geom_bar(stat = "identity").
  # Dropped from the earlier call: position = "dodge" (perc_increase has one
  # row per company, so nothing was dodged) and size = 5 (no outline colour is
  # set, so it drew nothing, and `size` for line widths is deprecated in
  # favour of `linewidth` since ggplot2 3.4.0).
  geom_col(fill = "#56B4E9") +
  # y is inherited from the top-level aes(); only the label needs mapping
  geom_text(aes(label = label), hjust = 1.1, color = "white") +
  coord_flip() +
  xlab("") +
  ylab("")
Above, the geom_bar function draws a bar for each tech company showing the percentage increase of its stock, with the value printed at the end of each bar.
# 95% CI for weight for genhlth, gender groups
# One row per genhlth x gender combination holding the mean weight (mean_wt),
# its standard error (se) and the t-based 95% margin of error (moe).
cdc_weight_95ci <- cdc %>%
  group_by(genhlth, gender) %>%
  summarise(
    mean_wt = mean(weight),
    se = sd(weight) / sqrt(n()),
    moe = qt(0.975, n() - 1) * se
  ) %>%
  # summarise() only peels off the last grouping variable, so without this the
  # result would stay grouped by genhlth
  ungroup()
# Critical value for a two-sided 95% normal confidence interval: the 0.975
# quantile of the standard normal distribution (about 1.96)
z_star_95 <- stats::qnorm(p = 0.975)
The above code computes, for each combination of genhlth and gender in the cdc dataset, the mean weight together with the standard error and margin of error needed for a 95 percent confidence interval.
# Mean weight by gender and general health with 95% confidence intervals
ggplot(cdc_weight_95ci, aes(x = gender, y = mean_wt, color = genhlth)) +
  # Use the t-based margin of error already stored in the moe column instead
  # of recomputing a normal-approximation interval from z_star_95 and se.
  geom_errorbar(
    aes(ymin = mean_wt - moe, ymax = mean_wt + moe),
    width = 0.1,
    position = position_dodge(0.5)
  ) +
  # color is inherited from the top-level aes(); the same dodge width as the
  # error bars keeps every point centred on its own bar
  geom_point(position = position_dodge(0.5)) +
  coord_flip() +
  xlab("Gender") +
  ylab("Weight(lbs)")
Above, I created an error-bar chart with geom_errorbar showing the 95 percent confidence interval for each group. The geom_point function places a dot at the mean, in the middle of each error bar.