library(tidyverse)
## ── Attaching packages ───────────────
## ✔ ggplot2 3.1.0 ✔ purrr 0.3.2
## ✔ tibble 2.1.1 ✔ dplyr 0.8.0.1
## ✔ tidyr 0.8.3 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ────────────────────────
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
Use palettes from RColorBrewer.
# Toy data set used by the bar-chart examples: one row per species with its
# average weight.
dat <- data.frame(
  Species = c("Lions", "Tigers", "Panda Bears"),
  AvgWeight = c(200, 250, 85)
)
# students should copy
# Bar chart of average weight per species, filled with the RColorBrewer
# "Pastel2" palette on a grid-free minimal theme.
ggplot(dat, aes(x = Species, y = AvgWeight, fill = Species)) +
  geom_col() +
  scale_fill_brewer(palette = "Pastel2") +
  # optionally, set the axis and legend titles in labs()
  labs(y = "Average Weight", fill = "Animal") +
  theme_minimal() +
  theme(panel.grid = element_blank())
# talk about choice of palettes:
# sequential, diverging, qualitative
# http://socviz.co/refineplots.html
# Alternatively, use the viridis color palettes,
# which should be familiar from matplotlib.
# Same bar chart as before; only the fill scale changes.
ggplot(dat, aes(x = Species, y = AvgWeight, fill = Species)) +
  geom_col() +
  scale_fill_viridis_d() +
  # optionally, set the axis and legend titles in labs()
  labs(y = "Average Weight", fill = "Animal") +
  theme_minimal() +
  theme(panel.grid = element_blank())
Show as much data as possible relative to the ‘ink’ you use.
Redundant chart elements should be eliminated.
Unless the categories have an inherent order (i.e. ordinal variables), always sort the bars of a bar chart by value. Use reorder() or forcats::fct_reorder().
# talk about chartjunk
# students should copy
# Decluttered bar chart: bars are ordered by weight and each bar carries its
# own value label, so the y-axis text, the grid and the fill legend are dropped.
ggplot(
  data = dat,
  mapping = aes(x = reorder(Species, AvgWeight), y = AvgWeight, fill = Species)
) +
  geom_col(width = .6) +
  scale_fill_brewer(palette = "Accent") +
  theme_minimal() +
  theme(
    panel.grid = element_blank(),
    axis.text.y = element_blank()
  ) +
  # "none" is the supported way to drop a legend; `fill = FALSE` is deprecated.
  guides(fill = "none") +
  # Value label in white, nudged 15 units down from the bar top so it sits
  # inside the bar.
  geom_text(
    aes(label = paste(AvgWeight, "kg")),
    nudge_y = -15,
    color = "white"
  ) +
  labs(x = "", y = "", title = "Average weight of animals")
Using msleep dataset:
Do heavy animals sleep more than light ones?
Which types of animals sleep less than 5 hours a day? Can you name these animals?
# Sleep vs. body weight, highlighting and naming only the animals with the
# longest and the shortest total sleep.
msleep %>%
  mutate(
    # Classify each row by its role instead of by animal name, so the colour
    # mapping below does not depend on how the names happen to sort and still
    # works when several animals tie for an extreme.
    sleep_rank = case_when(
      sleep_total == max(sleep_total) ~ "most",
      sleep_total == min(sleep_total) ~ "least",
      TRUE ~ "other"
    ),
    # Empty label for every non-extreme animal, so only the extremes get text.
    text_labels = ifelse(sleep_rank == "other", "", name)
  ) %>%
  ggplot(aes(x = bodywt, y = sleep_total)) +
  geom_point(aes(color = sleep_rank)) +
  scale_color_manual(
    values = c(other = "grey30", least = "blue", most = "red")
  ) +
  geom_text(aes(label = text_labels), hjust = 0) +
  scale_x_log10(labels = scales::comma) +
  # "none" is the supported way to drop a legend; `color = FALSE` is deprecated.
  guides(color = "none") +
  geom_smooth(
    method = "lm",
    se = FALSE,
    linetype = "dashed",
    color = "gray",
    size = .3
  ) +
  labs(
    title = "Sleep vs. body weight",
    subtitle = "Heavier animals seem to sleep less than lighter ones",
    x = "Body weight",
    y = "Sleep (hours)"
  ) +
  theme_minimal() +
  theme(panel.grid = element_blank())
# MC
# Scatterplot of total sleep against body weight (log10 x-axis) with a linear
# trend line and no confidence band.
ggplot(msleep, aes(bodywt, sleep_total)) +
  geom_point(color = "grey60") +
  geom_smooth(method = "lm", se = FALSE) +
  scale_x_log10(labels = scales::comma) +
  labs(
    title = "Sleep vs. body weight",
    subtitle = "Heavier animals seem to sleep less than lighter ones",
    x = "Body weight",
    y = "Sleep (hours)"
  ) +
  theme_minimal() +
  theme(panel.grid = element_blank())
# MC
# Count the animals that sleep less than 5 hours per `vore` category, with one
# side-by-side bar per conservation status.
msleep %>%
  filter(sleep_total < 5) %>%
  # mutate(vore = forcats::fct_recode(vore, carnivore = "carni", herbivore = "herbi")) %>%
  ggplot(aes(vore, fill = conservation)) +
  geom_bar(position = position_dodge()) +
  theme_minimal() +
  theme(legend.position = "bottom") +
  labs(x = "", title = "Animals that sleep less than 5 hours a day")
# MC
# Name the animals that sleep less than 5 hours and zoom the y-axis to 0-8 h.
msleep %>%
  # Only short sleepers get a label; every other row gets NA, which
  # geom_text() drops with a warning.
  mutate(name_text = ifelse(sleep_total < 5, name, NA)) %>%
  ggplot(aes(x = bodywt, y = sleep_total)) +
  geom_point(color = "grey50") +
  # Argument spelled out in full: `label =` only worked through partial
  # argument matching.
  scale_x_log10(labels = scales::comma) +
  geom_text(
    mapping = aes(label = name_text),
    check_overlap = TRUE,
    nudge_y = 0.4
  ) +
  geom_smooth(method = "lm", se = FALSE) +
  # coord_cartesian() zooms without discarding rows, so the trend line is
  # still fitted on all animals.
  coord_cartesian(ylim = c(0, 8)) +
  ggtitle("Heavier animals seem to sleep **less** than lighter ones") +
  theme_minimal() +
  theme(panel.grid = element_blank())
## Warning: Removed 72 rows containing missing values (geom_text).
Using bank dataset:
Find out which job saves the most money in the bank and plot the result.
Find three more interesting insights through plotting.
# Load the bank data set (semicolon-separated file).
bank <- read.csv("../data/bank.csv", sep = ";")
# MC
# Horizontal bar chart of the median balance per job, sorted by that median,
# with the "retired" bar highlighted in blue.
bank %>%
  group_by(job) %>%
  summarise(med_bal = median(balance)) %>%
  ungroup() %>%
  mutate(
    # Order the job levels by median balance so the bars come out sorted.
    job = forcats::fct_reorder(job, med_bal),
    retired = job == "retired"
  ) %>%
  ggplot(aes(x = job, y = med_bal, fill = retired)) +
  geom_col() +
  coord_flip() +
  # Named values tie each colour to its flag explicitly instead of relying on
  # the FALSE/TRUE ordering.
  scale_fill_manual(values = c("FALSE" = "grey70", "TRUE" = "blue")) +
  # Argument spelled out in full: `label =` only worked through partial
  # argument matching.
  scale_y_continuous(labels = scales::dollar) +
  theme_minimal() +
  theme(panel.grid = element_blank()) +
  labs(
    x = "",
    y = "Median bank balance",
    title = "Retirees have the highest median balance"
  ) +
  # "none" is the supported way to drop a legend; `fill = FALSE` is deprecated.
  guides(fill = "none")
Plot the distribution of home values
Plot how home values in the states MA and TX change over time. Hint: put the date on the x-axis.
Create a scatterplot of Land.Value vs Structure.Cost for Q1 of 2001, and change the symbols so that they correspond to “region”.
Add a regression line to the scatterplot
# Load the land/home value data set.
landdata <- read.csv("../data/landdata-states.csv")
library(ggbeeswarm)
# Distribution of home values per region as jittered points (log10 y-axis).
ggplot(landdata, aes(x = region, y = Home.Value, color = region)) +
  # Small, mostly transparent points.
  geom_quasirandom(size = .1, alpha = .2) +
  scale_y_log10() +
  theme_minimal() +
  theme(panel.grid = element_blank()) +
  # "none" is the supported way to drop a legend; `color = FALSE` is
  # deprecated. The x-axis already names each region.
  guides(color = "none")
# MC
# Histogram of home values on a log10 x-axis, one facet per region.
ggplot(landdata, aes(x = Home.Value, fill = region)) +
  geom_histogram() +
  facet_wrap(vars(region)) +
  scale_x_log10(
    breaks = c(1e05, 1e06),
    # Show the breaks in thousands, e.g. 1e05 -> "100k".
    labels = function(x) paste0(x / 1000, "k")
  ) +
  theme_minimal() +
  theme(panel.grid = element_blank()) +
  # "none" is the supported way to drop a legend; `fill = FALSE` is
  # deprecated. The facet strips already name each region.
  guides(fill = "none")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# MC
# Home values over time for MA and TX, with each line labelled directly at its
# right-hand end instead of through a legend, plus a marker at 2008.
landdata %>%
  filter(State %in% c("MA", "TX")) %>%
  group_by(State) %>%
  mutate(
    max_date = max(Date),
    # Home value at each state's last date; NA on every other row, so the
    # state label is drawn once per state.
    last_home_value = ifelse(Date == max_date, Home.Value, NA)
  ) %>%
  ungroup() %>%
  ggplot(aes(x = Date, y = Home.Value, group = State, color = State)) +
  geom_line() +
  # State label placed 1.8 x-units to the right of the last observation.
  geom_text(aes(x = max_date + 1.8, y = last_home_value, label = State)) +
  geom_vline(xintercept = 2008, linetype = "dashed", color = "grey") +
  # annotate() draws the caption once; a geom_text() with constant
  # coordinates would redraw it for every row of the data.
  annotate(
    "text",
    x = 2007,
    y = 200000,
    label = "2008 subprime mortgage crisis",
    hjust = 1,
    color = "grey"
  ) +
  scale_y_continuous(labels = function(x) paste0(x / 1000, "k")) +
  theme_minimal() +
  theme(panel.grid = element_blank()) +
  # "none" is the supported way to drop a legend; `color = FALSE` is deprecated.
  guides(color = "none") +
  labs(
    title = "House prices in MA and TX",
    x = "",
    y = "Home value"
  )
## Warning: Removed 304 rows containing missing values (geom_text).
# MC
# Land value vs. structure cost for Q1 of 2001, one facet per region, each
# with its own linear trend line.
landdata %>%
  filter(Year == 2001, Qrtr == 1) %>%
  drop_na() %>%
  ggplot(aes(x = Structure.Cost, y = Land.Value, color = region)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  facet_wrap(vars(region), nrow = 1) +
  scale_x_continuous(
    breaks = c(100000, 150000),
    labels = function(x) paste0(x / 1000, "k")
  ) +
  # No break at 0: log10(0) is -Inf, so it can never appear on a log axis.
  scale_y_log10(
    breaks = c(10000, 100000),
    labels = function(x) paste0(x / 1000, "k")
  ) +
  theme_minimal() +
  theme(panel.grid = element_blank()) +
  # "none" is the supported way to drop a legend; `color = FALSE` is
  # deprecated. The facet strips already name each region.
  guides(color = "none") +
  labs(
    x = "Structure cost",
    y = "Land value"
  )