Sarah Hosking
October 19, 2017
Example: Anscombe’s quartet
## x1 x2 x3 x4 y1 y2 y3 y4
## mean 9.000 9.000 9.000 9.000 7.501 7.501 7.50 7.501
## std_dev 3.317 3.317 3.317 3.317 2.032 2.032 2.03 2.031
≠# Why Ggplot2?
It’s all about layers.
tail(airparif.tidy)## date year month day heure pollutant value
## 245615 2017-08-13 2017 08 13 19 CO 300
## 245616 2017-08-13 2017 08 13 20 CO 500
## 245617 2017-08-13 2017 08 13 21 CO 300
## 245618 2017-08-13 2017 08 13 22 CO 300
## 245619 2017-08-13 2017 08 13 23 CO 300
## 245620 2017-08-13 2017 08 13 24 CO 200
PM25 = Particulate matter < 2.5 µmPM10 = Particulate matter < 10 µmO3 = OzoneNO2 = Nitrogen Dioxide (Azote)CO = Carbon monoxideWhat is the shape of the data?
# create plot
hg <- ggplot(data = airparif.tidy, aes(x = value))
hg # add geom
hg <- hg + geom_histogram()
hg# add facets
hg +
facet_wrap(~pollutant)# fix x-axis
hg + facet_wrap(~pollutant, scales = 'free_x')Not tidy
## date year month day heure PM10 PM25 NO2 O3 CO
## 2 2011-07-21 2011 07 21 1 7 NA 12 NA NA
## 3 2011-07-21 2011 07 21 2 6 NA 10 NA NA
## 4 2011-07-21 2011 07 21 3 7 NA 10 NA NA
## 5 2011-07-21 2011 07 21 4 7 NA 14 NA NA
## 6 2011-07-21 2011 07 21 5 10 NA 30 NA NA
## 7 2011-07-21 2011 07 21 6 10 NA 36 NA NA
Tidy
## date year month day heure pollutant value
## 1 2011-07-21 2011 07 21 1 PM10 7
## 2 2011-07-21 2011 07 21 2 PM10 6
## 3 2011-07-21 2011 07 21 3 PM10 7
## 4 2011-07-21 2011 07 21 4 PM10 7
## 5 2011-07-21 2011 07 21 5 PM10 10
## 6 2011-07-21 2011 07 21 6 PM10 10
%+% operatorWill colour make pollutants easier to differentiate?
# update data layer with new mapping
hg <- hg %+% aes(fill = pollutant)
# plot
hg + facet_wrap(~pollutant, scales = 'free_x')Do distributions change from month to month?
hg + facet_grid(month~pollutant, scales = 'free_x')How do the values change over the year?
# create boxplot var
bp <- ggplot(airparif.tidy, aes(month, value, fill = pollutant)) +
geom_boxplot()
# we'll use this a lot
fw <- facet_wrap(~pollutant, scales = 'free_y')
# monthly boxplot
bp + fw# hourly boxplot
bp.h <- bp %+% aes(x = as.factor(heure))
bp.h + fwHow bad are these pollution spikes?
# add horizontal line & levels df
hl <- geom_hline(data = levels_h.long,
aes(yintercept = value, group = pollutant), linetype = 2)
# without hlines
bp.h + fw# with hlines
bp.h + fw + hlHow correlated are the different pollutants?
# create a sample of 1000 obs
set.seed(888)
sample <- subset(airparif, select = c(PM10, PM25, NO2, O3, CO, year))
sample <- sample[sample(1:nrow(sample),1000),]# import library
library(GGally)
# look at correlations
ggpairs(data=sample, # data.frame with variables
title="Pollutant correlations") # title of the plotggpairs(data = sample,
lower = list(continuous = wrap("smooth", alpha=1/5, shape = I('.'),
colour ='blue')),
title="Pollutant correlations")Which months have the lowest pollution?
# data layer
hm <- ggplot(data = ap.scaled.tidy,
aes(x = pollutant, y = as.factor(month(date))))
# add geom
hm <- hm +
geom_tile(aes(fill = scale_value))
# add colour scale & plot
hm + scale_fill_gradient2(low="blue", high="darkorange", guide="colorbar")Which hours of the day are the least polluted?
# update df and y var
hm.h <- hm %+% ap.h.scaled.tidy + aes(y=hour)
# add colour scale & plot
hm.h + scale_fill_gradient2(low="blue", high="darkorange", guide="colorbar")How did pollution fluctuate throughout the day?
# create line plot
p <- ggplot(dec2016, aes(heure, value)) +
geom_line(colour = 'darkgrey')
p + fw# change default grouping
p <- p %+% aes(group = date)
p + fwWhen did particulate pollution spike?
# Dec 2016 particulate data
dec2016_pm <- dec2016 %>%
filter(pollutant %in% c('PM10', 'PM25'))
# find index of max value in Dec
max_val <- which.max(dec2016_pm[,'value'])
# find date
max_date <- dec2016_pm[max_val, 'date']
# filter on this date
dec2016_pm_max <- dec2016 %>%
filter(date == max_date)Was it a big spike for the other pollutants?
# pollution spike df
p_spike <- geom_line(aes(heure, value,
group = pollutant,
colour = pollutant))
# plot pollution spike day
ggplot(data = dec2016_pm_max) +
p_spike +
fw# create layer for other days
p_other <- geom_line(data = dec2016,
aes(heure, value, group = date),
colour = 'darkgrey')
# combine & plot
spag <- ggplot(data = dec2016_pm_max) +
p_other +
p_spike +
fw
spag# add threshold lines
spag + hlaes()facet_wrap & facet_gridggpairs correlation matrixgeom_tile heatmaps%+%Cookbook for R - Graphs: http://www.cookbook-r.com/Graphs/
Ggplots cheatsheet: http://www.rstudio.com/wp-content/uploads/2015/03/ggplot2-cheatsheet.pdf
Ggally tutorial: http://koaning.io/ggally-explore-all-the-things.html
Data wrangling with Dplyr & Tidyr: https://s3.amazonaws.com/udacity-hosted-downloads/ud651/DataWranglingWithR.pdf