Lesson 5 - ggplot cheatsheet

1. Objective

The objective of this exercise is to create a ggplot cheatsheet for future visualization tasks. We will create the following charts:

  • Bar Chart
  • Line Chart
  • Stacked Area
  • Histogram
  • Density Plot
  • Boxplot
  • Scatterplot

We will modify each chart to apply the principles of design, including adding meaningful axis labels, titles, subtitles, and captions, customizing fonts, removing chart junk, formatting scales, and adding pre-attentive attributes where appropriate.

2. Import Libraries

pacman::p_load(ggplot2, tidyverse, ggthemes, knitr, extrafont, dplyr, scales, lubridate)

3. Import Data & Data Pre-processing

For this exercise, we use the bikeshare dataset provided.

# Folder that holds the lesson's data file. It is kept in a variable and joined
# with file.path() instead of calling setwd(), so reading the CSV does not
# change the session's working directory as a side effect.
data_dir <- "C:/Users/engha/Desktop/Alvin/NYU MSBA/Module 3 - Shanghai/Data Visualization/Lesson 5"

# Read the daily bikeshare records into a tibble (column types are guessed by readr).
data <- read_csv(file.path(data_dir, "bikesharedailydata.csv"))
## Parsed with column specification:
## cols(
##   instant = col_integer(),
##   dteday = col_character(),
##   season = col_integer(),
##   yr = col_integer(),
##   mnth = col_integer(),
##   holiday = col_integer(),
##   weekday = col_integer(),
##   workingday = col_integer(),
##   weathersit = col_integer(),
##   temp = col_double(),
##   atemp = col_double(),
##   hum = col_double(),
##   windspeed = col_double(),
##   casual = col_integer(),
##   registered = col_integer(),
##   cnt = col_integer()
## )

Let’s take a look at the data.

# Preview the first rows of the imported tibble
data %>% head()
## # A tibble: 6 x 16
##   instant dteday season    yr  mnth holiday weekday workingday weathersit
##     <int>  <chr>  <int> <int> <int>   <int>   <int>      <int>      <int>
## 1       1 1/1/11      1     0     1       0       6          0          2
## 2       2 1/2/11      1     0     1       0       0          0          2
## 3       3 1/3/11      1     0     1       0       1          1          1
## 4       4 1/4/11      1     0     1       0       2          1          1
## 5       5 1/5/11      1     0     1       0       3          1          1
## 6       6 1/6/11      1     0     1       0       4          1          1
## # ... with 7 more variables: temp <dbl>, atemp <dbl>, hum <dbl>,
## #   windspeed <dbl>, casual <int>, registered <int>, cnt <int>
# Per-column summary statistics (also reports the number of NA's per column)
data %>% summary()
##     instant         dteday              season            yr        
##  Min.   :  1.0   Length:731         Min.   :1.000   Min.   :0.0000  
##  1st Qu.:183.5   Class :character   1st Qu.:2.000   1st Qu.:0.0000  
##  Median :366.0   Mode  :character   Median :3.000   Median :1.0000  
##  Mean   :366.0                      Mean   :2.499   Mean   :0.5007  
##  3rd Qu.:548.5                      3rd Qu.:3.000   3rd Qu.:1.0000  
##  Max.   :731.0                      Max.   :4.000   Max.   :1.0000  
##                                     NA's   :1                       
##       mnth           holiday           weekday        workingday   
##  Min.   : 1.000   Min.   :0.00000   Min.   :0.000   Min.   :0.000  
##  1st Qu.: 4.000   1st Qu.:0.00000   1st Qu.:1.000   1st Qu.:0.000  
##  Median : 7.000   Median :0.00000   Median :3.000   Median :1.000  
##  Mean   : 6.527   Mean   :0.02873   Mean   :2.997   Mean   :0.684  
##  3rd Qu.:10.000   3rd Qu.:0.00000   3rd Qu.:5.000   3rd Qu.:1.000  
##  Max.   :12.000   Max.   :1.00000   Max.   :6.000   Max.   :1.000  
##  NA's   :1                                                         
##    weathersit         temp             atemp              hum        
##  Min.   :1.000   Min.   :0.05913   Min.   :0.07907   Min.   :0.0000  
##  1st Qu.:1.000   1st Qu.:0.33708   1st Qu.:0.33784   1st Qu.:0.5200  
##  Median :1.000   Median :0.49833   Median :0.48673   Median :0.6267  
##  Mean   :1.395   Mean   :0.49538   Mean   :0.47435   Mean   :0.6279  
##  3rd Qu.:2.000   3rd Qu.:0.65542   3rd Qu.:0.60860   3rd Qu.:0.7302  
##  Max.   :3.000   Max.   :0.86167   Max.   :0.84090   Max.   :0.9725  
##                                                                      
##    windspeed           casual         registered        cnt      
##  Min.   :0.02239   Min.   :   2.0   Min.   :  20   Min.   :  22  
##  1st Qu.:0.13495   1st Qu.: 315.5   1st Qu.:2497   1st Qu.:3152  
##  Median :0.18097   Median : 713.0   Median :3662   Median :4548  
##  Mean   :0.19049   Mean   : 848.2   Mean   :3656   Mean   :4504  
##  3rd Qu.:0.23321   3rd Qu.:1096.0   3rd Qu.:4776   3rd Qu.:5956  
##  Max.   :0.50746   Max.   :3410.0   Max.   :6946   Max.   :8714  
## 

The variables season and mnth each have one missing value, so we drop those two rows.

# Keep only the rows that have no missing value in any column
data1 <- data %>% na.omit()

# Confirm the remaining number of rows and columns
data1 %>% dim()
## [1] 729  16

We are now ready to create the charts.

Let us first create some utility variables to help with the visualizations.

# Shared typography: start from theme_bw() and restyle the title, subtitle,
# axis titles and caption in grey Trebuchet MS.
fonts <- theme_bw() +
  theme(
    plot.title    = element_text(family = "Trebuchet MS", face = "bold", size = 14, color = "#666666"),
    plot.subtitle = element_text(family = "Trebuchet MS", size = 10, color = "#666666"),
    axis.title    = element_text(family = "Trebuchet MS", face = "bold", size = 10, color = "#666666"),
    axis.title.y  = element_text(angle = 0), # print the y-axis title horizontally
    plot.caption  = element_text(family = "Trebuchet MS", size = 8, color = "#666666")
  )

# Shared de-cluttering: no panel border, no grid lines, no tick marks;
# only a light grey axis line is kept.
removeJunk <- theme(
  axis.line        = element_line(colour = "gray"),
  axis.ticks.x     = element_blank(),
  axis.ticks.y     = element_blank(),
  panel.border     = element_blank(),
  panel.grid       = element_blank(),
  panel.grid.minor = element_blank()
)

Bar Chart

# Add an abbreviated-month column as a factor ordered Jan..Dec, so the x axis
# follows the calendar instead of alphabetical order.
data1 <- mutate(data1, month = factor(month.abb[mnth], levels = month.abb))

# Total rentals per calendar month (both years combined). `mnth` is kept next to
# `month` so the summary still carries the numeric month column.
userMnthlyTotal <- data1 %>% 
  group_by(mnth, month) %>% 
  summarize(total = sum(cnt)) %>% 
  ungroup()

# pre-attentive attribute: dashed reference line at the average monthly total
userAverage <- mean(userMnthlyTotal$total)
attributeMean <- geom_hline(yintercept = userAverage, size=1, color="#999999", linetype="dashed")

# plot bar chart
# The bars are drawn from the monthly summary (one bar per month) rather than by
# stacking every daily row, so the plotted heights and the reference line come
# from the same table.
ggplot(userMnthlyTotal, aes(x=month, y=total)) +
  geom_col(fill = "seagreen3") + # one bar per month, height = monthly total
  labs(title="NUMBER OF BIKE RENTALS BY MONTH", # add labels
       subtitle="This chart shows the number of bike rentals by month over the period Jan 2011-Dec 2012. 
The warmer months of May-Oct see above-average rentals.",
       caption="Source: Alvin Eng, for NYU MSBA Data Visualization by Professor K. Sosulski.",
       x = "Month",
       y="Rentals") +
  fonts + 
  removeJunk +
  scale_y_continuous(labels = comma) + # format scale
  attributeMean +
  # Place the label just above the reference line; deriving y from userAverage
  # keeps the label attached to the line if the data change.
  annotate("text", x=12, y=userAverage * 1.1, label="Average", color="#666666", size=3)

Line Chart

# Parse the month/day/year text in `dteday` into Date values for the time axis
data1[["date"]] <- mdy(data1[["dteday"]])

# Daily rentals as a line with point markers, plus a loess trend with its
# default confidence band.
ggplot(data = data1, mapping = aes(x = date, y = cnt, group = 1)) +
  geom_line(colour = "seagreen3") +
  geom_point(colour = "seagreen3", size = 1) +
  geom_smooth(method = loess) +
  # axes: thousands separators on y, a tick every four months on x
  scale_x_date(
    limits = as.Date(c("2011-01-01", "2012-12-31")),
    date_breaks = "4 month",
    labels = date_format("%b-%y")
  ) +
  scale_y_continuous(labels = comma) +
  labs(
    x = "Date",
    y = "Rentals",
    title = "DAILY BIKE RENTALS",
    subtitle = "This chart shows the number of daily bike rentals over the period Jan 2011-Dec 2012. 
While daily rentals fluctuate significantly, rental demand has generally trended higher over time.",
    caption = "Source: Alvin Eng, for NYU MSBA Data Visualization by Professor K. Sosulski."
  ) +
  # shared look: fonts first, then strip borders/grid/ticks
  fonts +
  removeJunk

Stacked Area

# Daily rentals as a filled area, overlaid with a loess trend line
# (no confidence band).
ggplot(data = data1, mapping = aes(x = date, y = cnt, group = 1)) +
  geom_area(colour = "seagreen1", fill = "seagreen3") +
  geom_smooth(method = loess, se = FALSE, colour = "limegreen") +
  # axes: thousands separators on y, a tick every four months on x
  scale_x_date(
    limits = as.Date(c("2011-01-01", "2012-12-31")),
    date_breaks = "4 month",
    labels = date_format("%b-%y")
  ) +
  scale_y_continuous(labels = comma) +
  labs(
    x = "Date",
    y = "Rentals",
    title = "DAILY BIKE RENTALS",
    subtitle = "This chart shows the number of daily bike rentals over the period Jan 2011-Dec 2012. 
While daily rentals fluctuate significantly, rental demand has generally trended higher over time.",
    caption = "Source: Alvin Eng, for NYU MSBA Data Visualization by Professor K. Sosulski."
  ) +
  # shared look: fonts first, then strip borders/grid/ticks
  fonts +
  removeJunk

Histogram

# Histogram binning parameters.
# bin_width: each bar covers 100 rentals (as stated in the subtitle).
# max_rentals: the largest daily count rounded UP to the next thousand. Using
#   ceiling() rather than round() guarantees the last break is never below the
#   maximum, so no observation falls outside the bins. The variable is also no
#   longer called `max`, which shadowed base::max().
bin_width <- 100
max_rentals <- ceiling(max(data1$cnt) / 1000) * 1000

# plot histogram
ggplot(data1, aes(x=cnt)) +
  geom_histogram(breaks=seq(0, max_rentals, by=bin_width), fill="seagreen3") +
  labs(title="HISTOGRAM OF BIKE RENTALS",
       subtitle="This histogram shows the distribution of daily bike rentals over the period Jan 2011-Dec 2012. 
Each bin represents 100 rentals. There was one day with less than 100 rentals.",
       caption="Source: Alvin Eng, for NYU MSBA Data Visualization by Professor K. Sosulski.",
       x = "Rentals",
       y="Frequency") +
  fonts +
  removeJunk +
  scale_x_continuous(labels = comma) # format scale

Density Plot

# Kernel density of daily rentals, with a dashed marker at 4,500 rentals
# (the main peak called out in the subtitle).
ggplot(data = data1, mapping = aes(x = cnt)) +
  geom_density(fill = "seagreen3") +
  geom_vline(xintercept = 4500, linetype = "dashed", colour = "#999999", size = 1) +
  scale_x_continuous(labels = comma) +
  labs(
    x = "Rentals",
    y = "Density",
    title = "DENSITY DISTRIBUTION OF BIKE RENTALS",
    subtitle = "This chart shows the density distribution of daily bike rentals over the period Jan 2011-Dec 2012. 
We see a trimodal distribution, with a peak around 4,500 rentals and two lower peaks.",
    caption = "Source: Alvin Eng, for NYU MSBA Data Visualization by Professor K. Sosulski."
  ) +
  # shared look: fonts first, then strip borders/grid/ticks
  fonts +
  removeJunk

Boxplot

# Distribution of daily rentals for each calendar month (one box per month).
# Uses the `month` column of data1 for the x axis.
ggplot(data1, aes(x=month, y=cnt)) +
  geom_boxplot(fill="white", color="seagreen3") +
  labs(title="BOXPLOT OF BIKE RENTALS",
       subtitle="This boxplot shows the key summary statistics of daily bike rentals over the period Jan 2011-Dec 2012. 
The highest median daily rentals was observed in Jul while Mar and Oct saw noticeably wide ranges.",
       caption="Source: Alvin Eng, for NYU MSBA Data Visualization by Professor K. Sosulski.",
       x = "Month",
       y="Rentals") +
  fonts +
  removeJunk +
  scale_y_continuous(labels = comma) # format scale, consistent with the other rental-count axes

Scatterplot

# Recode the numeric season codes 1-4 into a labelled factor in one step.
# Indexing a label vector by the code avoids the column silently turning into
# character part-way through, and the explicit levels make the legend follow
# the order below instead of alphabetical order.
# NOTE(review): the code-to-name mapping is carried over unchanged from the
# original; confirm it against the dataset's codebook.
season_labels <- c("Winter", "Spring", "Summer", "Fall")
data1$season <- factor(season_labels[data1$season], levels = season_labels)

# plot scatterplot
# aes(col=season) refers to the `season` column of data1, so no separate
# global copy of the column is needed.
ggplot(data1, aes(x=temp, y=cnt)) +
  geom_point(aes(col=season)) +
  geom_smooth(method=lm, color="grey", se=FALSE) + # straight-line fit, no confidence band
  labs(title="HOW TEMPERATURE AFFECTS BIKE RENTALS",
       subtitle="This chart plots daily bike rentals against (normalized) temperature over the period Jan 2011-Dec 2012. 
We see a positive relationship; on warmer days rental demand increases.",
       caption="Source: Alvin Eng, for NYU MSBA Data Visualization by Professor K. Sosulski.",
       x = "Temperature (Normalized)",
       y="Rentals",
       colour="Season") + # capitalised legend title
  fonts + 
  removeJunk +
  scale_y_continuous(labels = comma) # format scale

Alvin Eng

8 October 2017