Lesson 5 - ggplot cheatsheet
1. Objective
The objective of this exercise is to create a ggplot cheatsheet for future visualization tasks. We will create the following charts:
- Bar Chart
- Line Chart
- Stacked Area
- Histogram
- Density Plot
- Boxplot
- Scatterplot
We will modify each chart to apply the principles of design, including adding meaningful axis labels, titles, subtitles, and captions, customizing fonts, removing chart junk, formatting scales, and adding pre-attentive attributes where appropriate.
2. Import Libraries
pacman::p_load(ggplot2, tidyverse, ggthemes, knitr, extrafont, dplyr, scales, lubridate)
3. Import Data & Data Pre-processing
For this exercise, we use the bikeshare dataset provided.
# Make the lesson folder the working directory so the CSV can be referenced by
# its bare file name.
# NOTE(review): this absolute path is machine-specific; adjust it before
# running the document on another computer.
setwd(dir = "C:/Users/engha/Desktop/Alvin/NYU MSBA/Module 3 - Shanghai/Data Visualization/Lesson 5")
# Read the daily bikeshare records into a tibble; readr guesses the column types.
data <- read_csv(file = "bikesharedailydata.csv")
## Parsed with column specification:
## cols(
## instant = col_integer(),
## dteday = col_character(),
## season = col_integer(),
## yr = col_integer(),
## mnth = col_integer(),
## holiday = col_integer(),
## weekday = col_integer(),
## workingday = col_integer(),
## weathersit = col_integer(),
## temp = col_double(),
## atemp = col_double(),
## hum = col_double(),
## windspeed = col_double(),
## casual = col_integer(),
## registered = col_integer(),
## cnt = col_integer()
## )
Let’s take a look at the data.
# Preview the first six rows of the raw data.
head(data, n = 6L)
## # A tibble: 6 x 16
## instant dteday season yr mnth holiday weekday workingday weathersit
## <int> <chr> <int> <int> <int> <int> <int> <int> <int>
## 1 1 1/1/11 1 0 1 0 6 0 2
## 2 2 1/2/11 1 0 1 0 0 0 2
## 3 3 1/3/11 1 0 1 0 1 1 1
## 4 4 1/4/11 1 0 1 0 2 1 1
## 5 5 1/5/11 1 0 1 0 3 1 1
## 6 6 1/6/11 1 0 1 0 4 1 1
## # ... with 7 more variables: temp <dbl>, atemp <dbl>, hum <dbl>,
## # windspeed <dbl>, casual <int>, registered <int>, cnt <int>
# Per-column summary statistics; also reveals which columns contain NA's.
summary(object = data)
## instant dteday season yr
## Min. : 1.0 Length:731 Min. :1.000 Min. :0.0000
## 1st Qu.:183.5 Class :character 1st Qu.:2.000 1st Qu.:0.0000
## Median :366.0 Mode :character Median :3.000 Median :1.0000
## Mean :366.0 Mean :2.499 Mean :0.5007
## 3rd Qu.:548.5 3rd Qu.:3.000 3rd Qu.:1.0000
## Max. :731.0 Max. :4.000 Max. :1.0000
## NA's :1
## mnth holiday weekday workingday
## Min. : 1.000 Min. :0.00000 Min. :0.000 Min. :0.000
## 1st Qu.: 4.000 1st Qu.:0.00000 1st Qu.:1.000 1st Qu.:0.000
## Median : 7.000 Median :0.00000 Median :3.000 Median :1.000
## Mean : 6.527 Mean :0.02873 Mean :2.997 Mean :0.684
## 3rd Qu.:10.000 3rd Qu.:0.00000 3rd Qu.:5.000 3rd Qu.:1.000
## Max. :12.000 Max. :1.00000 Max. :6.000 Max. :1.000
## NA's :1
## weathersit temp atemp hum
## Min. :1.000 Min. :0.05913 Min. :0.07907 Min. :0.0000
## 1st Qu.:1.000 1st Qu.:0.33708 1st Qu.:0.33784 1st Qu.:0.5200
## Median :1.000 Median :0.49833 Median :0.48673 Median :0.6267
## Mean :1.395 Mean :0.49538 Mean :0.47435 Mean :0.6279
## 3rd Qu.:2.000 3rd Qu.:0.65542 3rd Qu.:0.60860 3rd Qu.:0.7302
## Max. :3.000 Max. :0.86167 Max. :0.84090 Max. :0.9725
##
## windspeed casual registered cnt
## Min. :0.02239 Min. : 2.0 Min. : 20 Min. : 22
## 1st Qu.:0.13495 1st Qu.: 315.5 1st Qu.:2497 1st Qu.:3152
## Median :0.18097 Median : 713.0 Median :3662 Median :4548
## Mean :0.19049 Mean : 848.2 Mean :3656 Mean :4504
## 3rd Qu.:0.23321 3rd Qu.:1096.0 3rd Qu.:4776 3rd Qu.:5956
## Max. :0.50746 Max. :3410.0 Max. :6946 Max. :8714
##
The variables season and mnth each have one missing value, so we drop those two rows.
# Keep only the rows without any missing value, then check the new dimensions.
data1 <- data %>% na.omit()
data1 %>% dim()
## [1] 729 16
We are now ready to create the charts.
Let us first create some utility variables to help with the visualizations.
# customize font: start from theme_bw() and set Trebuchet MS in dark gray for
# the title, subtitle, axis titles and caption; the y-axis title is drawn
# horizontally (angle = 0).
fonts <- theme_bw() +
  theme(
    plot.title = element_text(
      size = 14, family = "Trebuchet MS", color = "#666666", face = "bold"
    ),
    plot.subtitle = element_text(
      size = 10, family = "Trebuchet MS", color = "#666666"
    ),
    axis.title = element_text(
      size = 10, family = "Trebuchet MS", color = "#666666", face = "bold"
    ),
    axis.title.y = element_text(angle = 0),
    plot.caption = element_text(
      size = 8, family = "Trebuchet MS", color = "#666666"
    )
  )
# remove chart junk: drop the panel border, all grid lines and the tick marks,
# leaving only a gray axis line.
removeJunk <- theme(
  axis.line = element_line(colour = "gray"),
  axis.ticks.x = element_blank(),
  axis.ticks.y = element_blank(),
  panel.border = element_blank(),
  panel.grid = element_blank(),
  panel.grid.minor = element_blank()
)
Bar Chart
# create another column with abbreviated months, stored as a factor ordered
# Jan-Dec so the x axis follows the calendar instead of the alphabet
# (the boxplot further down also uses this column)
data1 <- mutate(data1, month = factor(month.abb[mnth], levels = month.abb))
# monthly totals: aggregate once so that each bar is drawn as a single
# rectangle instead of a stack of one segment per day
userMnthlyTotal <- data1 %>%
  group_by(mnth) %>%
  summarize(total = sum(cnt)) %>%
  mutate(month = factor(month.abb[mnth], levels = month.abb))
# pre-attentive attribute: dashed reference line at the average monthly total
userAverage <- mean(userMnthlyTotal$total)
attributeMean <- geom_hline(yintercept = userAverage, size = 1, color = "#999999", linetype = "dashed")
# plot bar chart
ggplot(userMnthlyTotal, aes(x = month, y = total)) +
  geom_col(fill = "seagreen3") + # one bar per month, height = monthly total
  labs(
    title = "NUMBER OF BIKE RENTALS BY MONTH", # add labels
    subtitle = "This chart shows the number of bike rentals by month over the period Jan 2011-Dec 2012.
The warmer months of May-Oct see above-average rentals.",
    caption = "Source: Alvin Eng, for NYU MSBA Data Visualization by Professor K. Sosulski.",
    x = "Month",
    y = "Rentals"
  ) +
  fonts +
  removeJunk +
  scale_y_continuous(labels = comma) + # format scale
  attributeMean +
  # label for the reference line, placed over the last month (Dec)
  annotate("text", x = 12, y = 300000, label = "Average", color = "#666666", size = 3)
Line Chart
# Parse the month/day/year strings in dteday into a proper Date column.
data1[["date"]] <- mdy(data1[["dteday"]])
# Daily rentals drawn as a line with small point markers, plus a loess trend.
data1 %>%
  ggplot(aes(x = date, y = cnt, group = 1)) +
  geom_line(color = "seagreen3") +
  geom_point(color = "seagreen3", size = 1) +
  geom_smooth(method = loess) +
  # x axis: a tick every four months, labelled like "Jan-11"
  scale_x_date(
    limits = as.Date(c("2011-01-01", "2012-12-31")),
    date_breaks = "4 month",
    labels = date_format("%b-%y")
  ) +
  # y axis: thousands separator
  scale_y_continuous(labels = comma) +
  labs(
    x = "Date",
    y = "Rentals",
    title = "DAILY BIKE RENTALS",
    subtitle = "This chart shows the number of daily bike rentals over the period Jan 2011-Dec 2012.
While daily rentals fluctuate significantly, rental demand has generally trended higher over time.",
    caption = "Source: Alvin Eng, for NYU MSBA Data Visualization by Professor K. Sosulski."
  ) +
  fonts +
  removeJunk
Stacked Area
# Area chart of daily rentals with a loess trend line drawn on top
# (no confidence band, since se = FALSE).
ggplot(data = data1, mapping = aes(x = date, y = cnt, group = 1)) +
  geom_area(fill = "seagreen3", color = "seagreen1") +
  geom_smooth(method = loess, color = "limegreen", se = FALSE) +
  # shared look: custom fonts first, then strip the chart junk
  fonts +
  removeJunk +
  # axis formatting: thousands separator on y, four-monthly "Jan-11" ticks on x
  scale_y_continuous(labels = comma) +
  scale_x_date(
    labels = date_format("%b-%y"),
    date_breaks = "4 month",
    limits = as.Date(c("2011-01-01", "2012-12-31"))
  ) +
  labs(
    title = "DAILY BIKE RENTALS",
    subtitle = "This chart shows the number of daily bike rentals over the period Jan 2011-Dec 2012.
While daily rentals fluctuate significantly, rental demand has generally trended higher over time.",
    caption = "Source: Alvin Eng, for NYU MSBA Data Visualization by Professor K. Sosulski.",
    y = "Rentals",
    x = "Date"
  )
Histogram
# Upper edge of the histogram: the largest daily count rounded UP to the next
# thousand. Rounding to the *nearest* thousand could round down and leave the
# highest values outside the last bin. The variable is deliberately not called
# `max`, so it does not shadow base::max().
max_rentals <- ceiling(max(data1$cnt) / 1000) * 1000
# width of each bin, in rentals
bin_width <- 100
# plot histogram
ggplot(data1, aes(x = cnt)) +
  geom_histogram(breaks = seq(0, max_rentals, by = bin_width), fill = "seagreen3") +
  labs(
    title = "HISTOGRAM OF BIKE RENTALS",
    subtitle = "This histogram shows the distribution of daily bike rentals over the period Jan 2011-Dec 2012.
Each bin represents 100 rentals. There was one day with less than 100 rentals.",
    caption = "Source: Alvin Eng, for NYU MSBA Data Visualization by Professor K. Sosulski.",
    x = "Rentals",
    y = "Frequency"
  ) +
  fonts +
  removeJunk +
  scale_x_continuous(labels = comma) # thousands separator on the x axis
Density Plot
# Kernel density of daily rentals, with a dashed vertical marker at 4,500
# (the peak called out in the subtitle).
data1 %>%
  ggplot(aes(x = cnt)) +
  geom_density(fill = "seagreen3") +
  geom_vline(xintercept = 4500, size = 1, color = "#999999", linetype = "dashed") +
  scale_x_continuous(labels = comma) +
  labs(
    x = "Rentals",
    y = "Density",
    title = "DENSITY DISTRIBUTION OF BIKE RENTALS",
    subtitle = "This chart shows the density distribution of daily bike rentals over the period Jan 2011-Dec 2012.
We see a trimodal distribution, with a peak around 4,500 rentals and two lower peaks.",
    caption = "Source: Alvin Eng, for NYU MSBA Data Visualization by Professor K. Sosulski."
  ) +
  fonts +
  removeJunk
Boxplot
# One box per calendar month showing the distribution of daily rentals.
# Relies on the `month` factor column added to data1 for the bar chart.
ggplot(data1, aes(x = month, y = cnt)) +
  geom_boxplot(fill = "white", color = "seagreen3") +
  labs(
    title = "BOXPLOT OF BIKE RENTALS",
    subtitle = "This boxplot shows the key summary statistics of daily bike rentals over the period Jan 2011-Dec 2012.
The highest median daily rentals was observed in Jul while Mar and Oct saw noticeably wide ranges.",
    caption = "Source: Alvin Eng, for NYU MSBA Data Visualization by Professor K. Sosulski.",
    x = "Month",
    y = "Rentals"
  ) +
  fonts +
  removeJunk
Scatterplot
# Replace the numeric season codes with readable labels. The first assignment
# turns the integer column into character; the later comparisons still match
# because R compares the remaining codes as strings ("2", "3", "4").
data1$season[data1$season == 1] <- "Winter"
data1$season[data1$season == 2] <- "Spring"
data1$season[data1$season == 3] <- "Summer"
data1$season[data1$season == 4] <- "Fall"
# NOTE(review): aes(col = season) below should resolve to the data1 column, so
# this standalone copy looks redundant; kept so that the set of objects created
# by this chunk is unchanged.
season <- data1$season
# plot scatterplot: rentals against temperature, points colored by season,
# with a straight-line (lm) fit and no confidence band
ggplot(data1, aes(x = temp, y = cnt)) +
  geom_point(aes(col = season)) +
  geom_smooth(method = lm, color = "grey", se = FALSE) +
  labs(
    title = "HOW TEMPERATURE AFFECTS BIKE RENTALS",
    subtitle = "This chart plots daily bike rentals against (normalized) temperature over the period Jan 2011-Dec 2012.
We see a positive relationship; on warmer days rental demand increases.",
    caption = "Source: Alvin Eng, for NYU MSBA Data Visualization by Professor K. Sosulski.",
    x = "Temperature (Normalized)",
    y = "Rentals"
  ) +
  fonts +
  removeJunk +
  scale_y_continuous(labels = comma) # thousands separator on the y axis