Data Viz: Recreate Plots Sample Solutions

Useful Packages

library(ggplot2)
library(dplyr)
library(tidyr)
library(tidyverse)
library(esquisse)
library(scales)
library(janitor)
library(lubridate)
library(stringr)

Board Games

# Download the TidyTuesday (2019-03-12) board games table straight from GitHub.
board_games <- readr::read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-03-12/board_games.csv"
)

## Rows: 10532 Columns: 22

## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (12): description, image, name, thumbnail, artist, category, compilation...
## dbl (10): game_id, max_players, max_playtime, min_age, min_players, min_play...

## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Exploration helper (disabled): list every title that contains "Catan".
# board_games$name[str_detect(board_games$name, "Catan")]

# Keep only the games that are called out by name on the chart, together with
# the two columns needed to position them.
top5 <- board_games %>%
  filter(
    name %in% c(
      "Catan",
      "Acquire",
      "Twilight Struggle",
      "Connect Four",
      "Mouse Trap"
    )
  ) %>%
  select(name, year_published, average_rating)

#install.packages("ggthemes")
library(ggthemes)

## Warning: package 'ggthemes' was built under R version 3.6.2

# Scatter plot of average user rating against year of publication for every
# game in `board_games`, with a smoothed trend line and the games in `top5`
# highlighted and labelled.
#
# Notes on the construction:
# * `year_published` is numeric, so the axis is a continuous scale with
#   explicit breaks and labels. Supplying numeric limits to a discrete scale
#   makes ggplot2 warn "Continuous limits supplied to discrete scale".
# * Labels are drawn by one `geom_text()` layer driven by `top5` itself, so
#   every highlighted point gets the label that belongs to it regardless of
#   the row order of `top5`. "Catan" is shown as "The Settlers of Catan".
plot <- ggplot(board_games, aes(x = year_published, y = average_rating)) +
  # All games, heavily faded so that dense regions read as darker clouds.
  geom_point(alpha = 0.05) +
  geom_smooth(color = "red", se = FALSE) +
  # Highlighted games: solid points plus a bold label just above each point.
  geom_point(data = top5, color = "black") +
  geom_text(
    data = top5,
    aes(label = if_else(name == "Catan", "The Settlers of Catan", name)),
    vjust = -1,
    fontface = "bold"
  ) +
  scale_x_continuous(
    name = "",
    breaks = seq(1950, 2010, by = 10),
    labels = c("1950", "'60", "'70", "'80", "'90", "'00", "'10")
  ) +
  # Cap the rating axis at 10 and let the lower limit follow the data.
  ylim(NA, 10.0) +
  labs(
    title = "A Golden Age Of Board Games?",
    subtitle = "Average user ratings for board games by original year of production",
    y = "Average user rating"
  ) +
  theme_fivethirtyeight() +
  # Applied after theme_fivethirtyeight() so these settings take precedence.
  theme(
    plot.title = element_text(vjust = 1),
    plot.subtitle = element_text(vjust = 2),
    axis.title.y = element_text(face = "bold")
  )

## Warning: Continuous limits supplied to discrete scale.
## Did you mean `limits = factor(...)` or `scale_*_continuous()`?

# Render the board games chart built above.
print(plot)

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

Measles

library(colorspace)

## Warning: package 'colorspace' was built under R version 3.6.2

library(dslabs)

## Warning: package 'dslabs' was built under R version 3.6.2

# Measles records for every state except Hawaii and Alaska.
# `rate` is cases per 100,000 people, scaled from the number of weeks that
# actually reported up to a 52-week year.
measles <- us_contagious_diseases %>%
  filter(
    disease == "Measles",
    !state %in% c("Hawaii", "Alaska")
  ) %>%
  mutate(rate = count / population * 100000 * 52 / weeks_reporting)

# Heat map of the measles rate: one tile per state and year, with a vertical
# line at 1963 (labelled "Vaccine introduced" through the subtitle).
ggplot(
  measles,
  aes(
    x = year,
    # Reverse the level order of the states that remain after filtering, which
    # flips the top-to-bottom order in which they are drawn.
    y = factor(state, levels = rev(levels(factor(state))))
  )
) +
  geom_tile(aes(fill = rate), color = "white", size = 0.25) +
  # Colour scales that were tried and set aside:
  #   scale_fill_distiller(trans = "sqrt", palette = "Spectral", direction = -1,
  #                        aesthetics = "fill", na.value = "white")
  #   scale_fill_gradient2(low = "deepskyblue", mid = "yellow", high = "red",
  #                        midpoint = 1700, na.value = "white")
  #     (optionally with trans = "sqrt")
  scale_fill_gradientn(
    colours = c(
      "aliceblue", "deepskyblue", "springgreen4",
      "yellow", "gold", "darkgoldenrod1",
      "orange", "red", "firebrick1", "firebrick2",
      "firebrick3", "firebrick4"
    ),
    na.value = "white"
  ) +
  geom_vline(xintercept = 1963, color = "black", size = 1.2) +
  labs(
    title = "Measles",
    subtitle = "Vaccine introduced",
    caption = "Note: CDC data from 2003-2012 comes from its Summary of Notifiable Diseases, which\npublishes yearly rather than weekly and counts confirmed cases as opposed\nto provisional ones.",
    x = "",
    y = ""
  ) +
  theme_classic() +
  theme(
    # Panel and axes: no axis lines or grid, thin tick marks only.
    axis.line = element_blank(),
    axis.ticks = element_line(size = 0.4),
    panel.grid = element_blank(),
    # Plot frame and text.
    plot.background = element_blank(),
    plot.margin = margin(0.7, 0.4, 0.1, 0.2, "cm"),
    plot.title = element_text(size = 12, face = "bold"),
    # NOTE(review): hjust = 0.55 appears chosen to place the subtitle near the
    # 1963 line; re-check if the year range or plot width changes.
    plot.subtitle = element_text(size = 7, hjust = 0.55),
    plot.caption.position = "panel",
    plot.caption = element_text(hjust = 0, size = 7),
    # Legend: untitled, flat colour bar underneath the panel.
    legend.position = "bottom",
    legend.title = element_blank(),
    legend.margin = margin(grid::unit(0, "cm")),
    legend.key.width = grid::unit(0.8, "cm"),
    legend.key.height = grid::unit(0.2, "cm")
  )

California Wild Fires

# Download the CAL FIRE perimeter table published with BuzzFeed News'
# wildfire-trends analysis and add `plot_date`: the alarm date moved onto one
# common reference year, so that fires from different years can share a
# single January-to-December axis.
#
# The reference year must be a leap year. With a non-leap year, an alarm date
# of 29 February is formatted as a calendar date that does not exist, which
# as.Date() converts to NA, and that fire then silently drops out of any plot
# that uses `plot_date`. Rows whose `alarm_date` is missing still get NA.
wildfires <- readr::read_csv(
  "https://raw.githubusercontent.com/BuzzFeedNews/2018-07-wildfire-trends/master/data/calfire_frap.csv"
) %>%
  mutate(plot_date = as.Date(format(alarm_date, "2016-%m-%d")))

## Rows: 14847 Columns: 18

## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (7): state, agency, unit_id, fire_name, inc_num, comments, fire_num
## dbl  (9): objectid, year_, cause, report_ac, gis_acres, c_method, objective,...
## date (2): alarm_date, cont_date

## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

library(lubridate)
library(scales)

# One point per fire: day of the year on the x axis, year on a reversed y
# axis, and point size mapped to `shape_area`.
ggplot(wildfires, aes(x = plot_date, y = year_, size = shape_area)) +
  geom_point(alpha = 0.4, color = "orange") +
  # Monthly breaks, each labelled from the month of its own date.
  # `date_labels` expects a single strftime() format, so a vector of month
  # names only lines up with the breaks by recycling. month.abb is used
  # instead of "%b" so the labels stay English in every locale.
  scale_x_date(
    date_breaks = "1 month",
    labels = function(d) month.abb[as.integer(format(d, "%m"))]
  ) +
  # Explicit breaks: each tick is labelled with its own value instead of
  # pairing fixed label text with whatever breaks `n.breaks` happens to pick.
  scale_y_reverse(breaks = seq(1950, 2010, by = 20)) +
  scale_size(range = c(0.1, 9)) +
  ggtitle("Big fires have gotten more common.") +
  # Dark theme: black canvas, grey horizontal guides only, light grey text.
  theme(
    plot.background = element_rect(fill = "black"),
    panel.background = element_rect(fill = "black"),
    panel.grid = element_blank(),
    panel.grid.major.y = element_line(color = "grey"),
    panel.grid.minor.y = element_line(color = "grey"),
    axis.title = element_blank(),
    axis.text = element_text(color = "lightgrey", face = "bold"),
    legend.position = "none",
    title = element_text(color = "lightgrey")
  )

## Warning: Removed 1617 rows containing missing values (geom_point).

Foul Balls

# Download FiveThirtyEight's foul-ball table and bucket each ball by exit
# velocity: below 90, everything else that was measured, and not measured.
# NOTE(review): a reading of exactly 90 falls in the "> 90 mph" bucket.
foul_balls <- readr::read_csv(
  "https://raw.githubusercontent.com/fivethirtyeight/data/master/foul-balls/foul-balls.csv"
) %>%
  mutate(
    velocity_mag = if_else(
      exit_velocity < 90,
      true = "< 90 mph",
      false = "> 90 mph",
      missing = "Unknown"
    )
  )

## Rows: 906 Columns: 7

## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): matchup, type_of_hit
## dbl  (4): exit_velocity, predicted_zone, camera_zone, used_zone
## date (1): game_date

## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Horizontal stacked bar chart: number of foul balls per predicted zone,
# split by exit-velocity bucket.
ggplot(foul_balls, aes(predicted_zone, fill = velocity_mag)) +
  geom_bar() +
  coord_flip() +
  # Colours are keyed by category name, so each bucket keeps its colour even
  # if the level order changes or a bucket is absent from the data.
  scale_fill_manual(
    values = c(
      "< 90 mph" = "#A0BF77",
      "> 90 mph" = "#73A9AF",
      "Unknown" = "#D3D3D3"
    )
  ) +
  # Reversed so that zone 1 is drawn at the top once the axes are flipped.
  scale_x_continuous(trans = "reverse", breaks = seq(from = 1, to = 7, by = 1)) +
  labs(
    title = "The hardest-hit fouls seem to land in unprotected areas",
    x = "Zone"
  ) +
  # Strip the panel chrome and the count axis; keep a horizontal "Zone" title.
  theme(
    panel.border = element_blank(),
    panel.background = element_blank(),
    axis.line = element_blank(),
    axis.ticks = element_blank(),
    axis.title.x = element_blank(),
    axis.title.y = element_text(angle = 0, hjust = 0.1),
    axis.text.x = element_blank()
  )

Data Viz: Recreate Plots Sample Solutions

Kitada Smalley

Useful Packages

Board Games

Measles

California Wild Fires

Foul Balls