Introduction

Whats Covered

Part A:

  • Statistical plots
    • Aesthetics review,
    • box plots, density plots
    • multiple groups/variables
  • Plots for specific data types (Part 1)
    • graphics of large data
    • Ternary plots
    • Network plots
    • Diagnostic plots

Part B:

  • Plots for specific data types (Part 2)
    • choropleths
    • cartographic maps
    • animations

Part C:

  • ggplot2 internals
    • grid graphics, grid grapshics in ggplot2
    • ggplot objects
    • gridExtra
  • Data Munging and Visualization Case Study
    • Bag plot case study, weather case study

Libraries and Data

source("create_datasets.R")
load('data/test_datasets.RData')

library(readr)
library(dplyr)
library(ggplot2)
library(purrr)

library(grid)
library(gtable)
library(aplpack)

   


ggplot2 internals


Grid Graphics

– Viewport basics (1)

# the grid library is loaded

# Draw rectangle in null viewport
grid.rect(gp = gpar(fill = "grey90"))

# Write text in null viewport
grid.text("null viewport")

# Draw a line
grid.lines(x = c(0, 0.75), y = c(0.25, 1),
          gp = gpar(lty = 2, col = "red"))

– Viewport basics (2)

# Populate null viewport
grid.rect(gp = gpar(fill = "grey90"))
grid.text("null viewport")
grid.lines(x = c(0,0.75), y = c(0.25, 1),
           gp = gpar(lty = 2, col = "red"))

# Create new viewport: vp
vp <- viewport(x = 0.5, y = 0.5, width = 0.5, height = 0.5, just = "center")

# Push vp
pushViewport(vp)

# Populate new viewport with rectangle
grid.rect(gp = gpar(fill = "blue"))

– Build a plot from scratch (1)

# 1 - Create plot viewport: pvp
mar <- c(5, 4, 2, 2)
pvp <- plotViewport(mar)

# 2 - Push pvp
pushViewport(pvp)

# 3 - Add rectangle
grid.rect(gp = gpar(fill = "grey80"))

# Create data viewport: dvp
dvp <- dataViewport(xData = mtcars$wt, yData = mtcars$mpg)

# 4 - Push dvp
pushViewport(dvp)

# Add two axes
grid.xaxis()
grid.yaxis()

– Build a plot from scratch (2)

# Work from before
pushViewport(plotViewport(c(5, 4, 2, 2)))
grid.rect(gp = gpar())
pushViewport(dataViewport(xData = mtcars$wt, yData = mtcars$mpg))
grid.xaxis()
grid.yaxis()

# 1 - Add text to x axis
grid.text("Weight", y = unit(-3, "lines"))

# 2 - Add text to y axis
grid.text("MPG", x = unit(-3, "lines"), rot = 90)

# 3 - Add points
grid.points(x = mtcars$wt, y = mtcars$mpg, pch = 16)

– Modifying a plot with grid.edit

# Work from before
pushViewport(plotViewport(c(5, 4, 2, 2)))
grid.rect(gp = gpar())
pushViewport(dataViewport(xData = mtcars$wt, yData = mtcars$mpg))
grid.xaxis()
grid.yaxis()

# Work from before - add names
grid.text("Weight", y = unit(-3, "lines"), name = "xaxis")
grid.text("MPG", x = unit(-3, "lines"), rot = 90, name = "yaxis")
grid.points(x = mtcars$wt, y = mtcars$mpg, pch = 16, name = "datapoints")

# Edit "xaxis"
grid.edit("xaxis", label = "Weight (1000 lbs)")

# Edit "yaxis"
grid.edit("yaxis", label = "Miles/(US) gallon")

# Edit "datapoints"
grid.edit("datapoints", 
  gp = gpar(col = "#C3212766", cex = 2))

Grid Graphics in ggplot2

– Exploring the gTable

# A simple plot p
p <- ggplot(mtcars, aes(x = wt, y = mpg, col = factor(cyl))) + geom_point()

# Create gtab with ggplotGrob()
gtab <- ggplotGrob(p)

# Print out gtab
gtab
## TableGrob (10 x 9) "layout": 18 grobs
##     z         cells       name                                  grob
## 1   0 ( 1-10, 1- 9) background        rect[plot.background..rect.80]
## 2   5 ( 5- 5, 3- 3)     spacer                        zeroGrob[NULL]
## 3   7 ( 6- 6, 3- 3)     axis-l    absoluteGrob[GRID.absoluteGrob.56]
## 4   3 ( 7- 7, 3- 3)     spacer                        zeroGrob[NULL]
## 5   6 ( 5- 5, 4- 4)     axis-t                        zeroGrob[NULL]
## 6   1 ( 6- 6, 4- 4)      panel               gTree[panel-1.gTree.36]
## 7   9 ( 7- 7, 4- 4)     axis-b    absoluteGrob[GRID.absoluteGrob.49]
## 8   4 ( 5- 5, 5- 5)     spacer                        zeroGrob[NULL]
## 9   8 ( 6- 6, 5- 5)     axis-r                        zeroGrob[NULL]
## 10  2 ( 7- 7, 5- 5)     spacer                        zeroGrob[NULL]
## 11 10 ( 4- 4, 4- 4)     xlab-t                        zeroGrob[NULL]
## 12 11 ( 8- 8, 4- 4)     xlab-b titleGrob[axis.title.x..titleGrob.39]
## 13 12 ( 6- 6, 2- 2)     ylab-l titleGrob[axis.title.y..titleGrob.42]
## 14 13 ( 6- 6, 6- 6)     ylab-r                        zeroGrob[NULL]
## 15 14 ( 6- 6, 8- 8)  guide-box                     gtable[guide-box]
## 16 15 ( 3- 3, 4- 4)   subtitle  zeroGrob[plot.subtitle..zeroGrob.77]
## 17 16 ( 2- 2, 4- 4)      title     zeroGrob[plot.title..zeroGrob.76]
## 18 17 ( 9- 9, 4- 4)    caption   zeroGrob[plot.caption..zeroGrob.78]
# Extract the grobs from gtab: gtab
g <- gtab$grob
 
# Draw only the legend
legend_index <- which(vapply(g, inherits, what = "gtable", logical(1)))
grid::grid.draw(g[[legend_index]])

– Modifying the gTable

  • This seems to not work in the notebook. Its very weird. Something I will need to dig into more later
    • It seems to work fine in Rstudio though
  • Also, the grid library seems really flaky. None of its functions were working in this code chunk
    • They work fine in some of the above code chuncks
    • I reloaded it here and that seems to help.
  • Over messing with the grid graphics seems like flaky business.
library(grid)

# Code from before
p <- ggplot(mtcars, aes(x = wt, y = mpg, col = factor(cyl))) + geom_point()
gtab <- ggplotGrob(p)
g <- gtab$grobs
legend_index <- which(vapply(g, inherits, what = "gtable", logical(1)))
grid::grid.draw(g[[legend_index]])

# the gtable library is loaded

# 1 - Show layout of legend grob
## this is not printing in the html but it shows fine in the console. 
gtable_show_layout(g[[legend_index]])

# Create text grob
my_text <- grid::textGrob(label = "Motor Trend, 1974", gp = gpar(fontsize = 7, col = "gray25"))

# 2 - Use gtable_add_grob to modify original gtab
new_legend <- gtable_add_grob(
  gtab$grobs[[legend_index]], my_text, 3, 2)

# 3 - Update in gtab
gtab$grobs[[legend_index]] <- new_legend

# 4 - Draw gtab
grid::grid.draw(gtab)

ggplot Objects

– Exploring ggplot objects

# Simple plot p
p <- ggplot(mtcars, aes(x = wt, y = mpg, col = factor(cyl))) + geom_point()

# Examine class() and names()
class(p)
## [1] "gg"     "ggplot"
names(p)
## [1] "data"        "layers"      "scales"      "mapping"     "theme"      
## [6] "coordinates" "facet"       "plot_env"    "labels"
# Print the scales sub-list
p$scales$scales
## list()
# Update p
p <- p +
  scale_x_continuous("Length", limits = c(4, 8), expand = c(0, 0)) +
  scale_y_continuous("Width", limits = c(2, 4.5), expand = c(0, 0))

# Print the scales sub-list
p$scales$scales
## [[1]]
## <ScaleContinuousPosition>
##  Range:  
##  Limits:    4 --    8
## 
## [[2]]
## <ScaleContinuousPosition>
##  Range:  
##  Limits:    2 --  4.5

– ggplot_build and ggplot_gtable

# Box plot of mtcars: p
p <- ggplot(mtcars, aes(x = factor(cyl), y = wt)) + geom_boxplot()

# Create pbuild
pbuild <- ggplot_build(p)

# a list of 3 elements
names(pbuild)
## [1] "data"   "layout" "plot"
# Print out each element in pbuild
# In the exercise they have panel as an option. But its `plot now`
pbuild$data
## [[1]]
##    ymin  lower middle   upper ymax            outliers notchupper
## 1 1.513 1.8850  2.200 2.62250 3.19                       2.551336
## 2 2.620 2.8225  3.215 3.44000 3.46                       3.583761
## 3 3.170 3.5325  3.755 4.01375 4.07 5.250, 5.424, 5.345   3.958219
##   notchlower x PANEL group ymin_final ymax_final  xmin  xmax weight colour
## 1   1.848664 1     1     1      1.513      3.190 0.625 1.375      1 grey20
## 2   2.846239 2     1     2      2.620      3.460 1.625 2.375      1 grey20
## 3   3.551781 3     1     3      3.170      5.424 2.625 3.375      1 grey20
##    fill size alpha shape linetype
## 1 white  0.5    NA    19    solid
## 2 white  0.5    NA    19    solid
## 3 white  0.5    NA    19    solid
pbuild$layout
## <ggproto object: Class Layout>
##     facet: <ggproto object: Class FacetNull, Facet>
##         compute_layout: function
##         draw_back: function
##         draw_front: function
##         draw_labels: function
##         draw_panels: function
##         finish_data: function
##         init_scales: function
##         map: function
##         map_data: function
##         params: list
##         render_back: function
##         render_front: function
##         render_panels: function
##         setup_data: function
##         setup_params: function
##         shrink: TRUE
##         train: function
##         train_positions: function
##         train_scales: function
##         vars: function
##         super:  <ggproto object: Class FacetNull, Facet>
##     finish_data: function
##     get_scales: function
##     map: function
##     map_position: function
##     panel_layout: data.frame
##     panel_ranges: list
##     panel_scales: list
##     render: function
##     render_labels: function
##     reset_scales: function
##     setup: function
##     train_position: function
##     train_ranges: function
##     xlabel: function
##     ylabel: function
##     super:  <ggproto object: Class Layout>
## This should show the plot. Not working in the markdown doc though. 
pbuild$plot

# Create gtab from pbuild
gtab <- ggplot_gtable(pbuild)

# Draw gtab
grid::grid.draw(gtab)

– Extracting details

  • This is cool becasue we can get exact values of the box plot.
  • There are many ways to calculate quartiles and if we do it ourselves outside the plot, its likely we will have different values
# Box plot of mtcars: p
p <- ggplot(mtcars, aes(x = factor(cyl), y = wt)) + geom_boxplot()

# Build pdata
pdata <- ggplot_build(p)$data

# confirm that the first element of the list is a data frame
class(pdata[[1]])
## [1] "data.frame"
# Isolate this data frame
my_df <- pdata[[1]]

# The x labels
my_df$group <- ggplot_build(p)$layout$panel_ranges[[1]]$x.labels

# Print out specific variables
my_df[c(1:6, 11)]
##    ymin  lower middle   upper ymax            outliers group
## 1 1.513 1.8850  2.200 2.62250 3.19                         4
## 2 2.620 2.8225  3.215 3.44000 3.46                         6
## 3 3.170 3.5325  3.755 4.01375 4.07 5.250, 5.424, 5.345     8

gridExtra

– Arranging plots (1)

# Add a theme (legend at the bottom)
g1 <- ggplot(mtcars, aes(wt, mpg, col = cyl)) +
  geom_point(alpha = 0.5) +
  theme(legend.position = "bottom")

# Add a theme (no legend)
g2 <- ggplot(mtcars, aes(disp, fill = cyl)) +
  geom_histogram(position = "identity", alpha = 0.5, binwidth = 20) +
  theme(legend.position = "none")

# Load gridExtra
library(gridExtra)

# Call grid.arrange()
grid.arrange(g1, g2, ncol = 2)

– Arranging plots (2)

# ggplot2, grid and gridExtra have been loaded for you
# Definitions of g1 and g2
g1 <- ggplot(mtcars, aes(wt, mpg, col = cyl)) +
  geom_point() +
  theme(legend.position = "bottom")

g2 <- ggplot(mtcars, aes(disp, fill = cyl)) +
  geom_histogram(binwidth = 20) +
  theme(legend.position = "none")

# Extract the legend from g1
my_legend <- ggplotGrob(g1)$grobs[[legend_index]]  

# Create g1_noleg
g1_noleg <- g1 + 
    theme(legend.position = "none")

# Calculate the height: legend_height
legend_height <- sum(my_legend$heights)

# Arrange g1_noleg, g2 and my_legend
grid.arrange(
  g1_noleg, g2, my_legend,
  layout_matrix = matrix(
    c(1,3,2,3), 
    ncol = 2),
    heights = unit.c(unit(1, "npc") - legend_height, legend_height))

   


Data Munging and Visualization Case Study


Case Study I - Bag Plot

– Base package bag plot

# test_datasets.RData has been loaded

# Call bagplot() on test_data
# The aplpack library has been loaded
head(ch5_test_data)
##      x   y
## 1 2560  97
## 2 2345 114
## 3 1845  81
## 4 2260  91
## 5 2440 113
## 6 2285  97
bagplot(ch5_test_data)

# Call compute.bagplot on test_data, assign to bag
bag <- compute.bagplot(ch5_test_data)

# Display information
bag$hull.loop
##     x   y
##  3690 146
##  2840 107
##  1900  73
##  1845  81
##  3325 231
##  3610 232
##  3735 202
##  3735 181
bag$hull.bag
##        [,1]      [,2]
##    2965.409 133.34581
##    2780.925 121.99004
##    2690.199 116.59625
##    2460.339 103.15300
##    2423.851 101.45165
##    2416.798 101.15107
##    2332.435  98.08723
##    2329.436  98.01789
## sl 2318.892  98.01833
## sl 2297.342  98.02599
## sl 2297.341  98.02648
## sl 2297.184  99.03550
## sl 2311.571 104.81678
##    2312.502 105.04681
##    2331.535 109.10241
##    2359.803 114.74815
##    2379.444 118.65044
##    2396.732 121.60977
##    2558.743 142.23486
##    2574.610 143.96787
##    2618.128 148.28650
##    2647.898 150.86817
##    2674.115 153.07863
##    2741.581 158.07621
##    2750.780 158.71717
##    3026.607 177.86184
##    3059.711 180.12607
##    3226.141 189.73602
##    3287.371 192.79199
##    3425.012 199.59414
##    3448.124 199.76179
## sr 3490.523 200.06172
##    3509.671 199.07448
##    3500.493 195.92219
##    3435.869 177.44535
##    3259.740 155.69116
##    3247.534 154.37051
##    3218.173 151.54236
##    3183.859 148.31374
##    3138.247 144.31759
bag$pxy.outlier
##         x   y
## [1,] 3320 305
## [2,] 3310 302
## [3,] 3855 305
## [4,] 3850 302
# Highlight components
points(bag$hull.loop, col = "green", pch = 16)
points(bag$hull.bag, col = "orange", pch = 16)
points(bag$pxy.outlier, col = "purple", pch = 16)

– Multilayer ggplot2 bag plot

# bag and ch5_test_data are available

# Create data frames from matrices
hull.loop <- data.frame(x = bag$hull.loop[,1], y = bag$hull.loop[,2])
hull.bag <- data.frame(x = bag$hull.bag[,1], y = bag$hull.bag[,2])
pxy.outlier <- data.frame(x = bag$pxy.outlier[,1], y = bag$pxy.outlier[,2])

# Finish the ggplot command
ggplot(ch5_test_data, aes(x = x,  y = y)) +
  geom_polygon(data = hull.loop, fill = "green") +
  geom_polygon(data = hull.bag, fill = "orange") +
  geom_point(data = pxy.outlier, col = "purple", pch = 16, cex = 1.5)

– Creating ggproto functions

# ggproto for StatLoop (hull.loop)
StatLoop <- ggproto(
  "StatLoop", 
  Stat,
  required_aes = c("x", "y"),
  compute_group = function(data, scales) {
    bag <- compute.bagplot(x = data$x, y = data$y)
    data.frame(x = bag$hull.loop[,1], y = bag$hull.loop[,2])
  })

# ggproto for StatBag (hull.bag)
StatBag <- ggproto(
  "StatBag", 
  Stat,
  required_aes = c("x", "y"),
  compute_group = function(data, scales) {
   bag <- compute.bagplot(x = data$x, y = data$y)
   data.frame(x = bag$hull.bag[,1], y = bag$hull.bag[,2])
  })

# ggproto for StatOut (pxy.outlier)
StatOut <- ggproto(
  "StatOut", 
  Stat,
  required_aes = c("x", "y"),
  compute_group = function(data, scales) {
   bag <- compute.bagplot(x = data$x, y = data$y)
   data.frame(x = bag$pxy.outlier[,1], y = bag$pxy.outlier[,2])
  })

– Creating stat_bag()

  • This all works fine in the class console.
  • But locally I get the most vague error possible.
# StatLoop, StatBag and StatOut are available

# Combine ggproto objects in layers to build stat_bag()
stat_bag <- function(mapping = NULL, data = NULL, geom = polygon,
                     position = "identity", na.rm = FALSE, show.legend = NA,
                     inherit.aes = TRUE, loop = FALSE, ...) {
  list(
    # StatLoop layer
    layer(
      stat = StatLoop, data = data, mapping = mapping, geom = geom, 
      position = position, show.legend = show.legend, inherit.aes = inherit.aes,
      params = list(na.rm = na.rm, alpha = 0.35, col = NA, ...)
    ),
    # StatBag layer
    layer(
      stat = StatBag, data = data, mapping = mapping, geom = geom, 
      position = position, show.legend = show.legend, inherit.aes = inherit.aes,
      params = list(na.rm = na.rm, alpha = 0.35, col = NA, ...)
    ),
    # StatOut layer
    layer(
      stat = StatOut, data = data, mapping = mapping, geom = "point", 
      position = position, show.legend = show.legend, inherit.aes = inherit.aes,
      params = list(na.rm = na.rm, alpha = 0.7, col = NA, shape = 21, ...)
    )
  )
}

– Use stat_bag()

  • The stat_bag function is not working locally
  • I get a `Error: object of type ‘closure’ is not subsettable
    • This is the most vague error. I have no idea what is wrong in the multiple prior steps.
  • Maybe I should gind a tutorial on this later when I want to use something like this and hope I can get it working right.
  • After reading a bunch of discussions, it looks like some libraries may be out of sync.
# hull.loop, hull.bag and pxy.outlier are available
# stat_bag, ch5_test_data and ch5_test_data2 are available

# Previous method
ggplot(ch5_test_data, aes(x = x,  y = y)) +
  geom_polygon(data = hull.loop, fill = "green") +
  geom_polygon(data = hull.bag, fill = "orange") +
  geom_point(data = pxy.outlier, col = "purple", pch = 16, cex = 1.5)

# stat_bag
ggplot(data = ch5_test_data, aes(x = x, y = y)) +
  stat_bag(ch5_test_data, fill = 'black')
## Error: Mapping must be created by `aes()` or `aes_()`
# stat_bag on test_data2
ggplot(ch5_test_data2, aes(x = x, y = y, fill = treatment)) +
  stat_bag()
## Error: object of type 'closure' is not subsettable

Case Study II - Weather (Part 1)

– Step 1: Read in data and examine

# Import weather data
weather_data_urls <- c(
  'https://assets.datacamp.com/production/course_862/datasets/NYNEWYOR.txt',
  'https://assets.datacamp.com/production/course_862/datasets/FRPARIS.txt',
  'https://assets.datacamp.com/production/course_862/datasets/ILREYKJV.txt',
  'https://assets.datacamp.com/production/course_862/datasets/UKLONDON.txt'
)
weather_data_files <- c("data/NYNEWYOR.txt","data/FRPARIS.txt", "data/ILREYKJV.txt", "data/UKLONDON.txt")

download.file(weather_data_urls, weather_data_files)  
# Check out the NY weather data
weather <- read.fwf('data/NYNEWYOR.txt',
                    header = F,
                    col.names = c("month", "day", "year", "temp"),
                    widths = c(14, 14, 13, 4))

# Check structure of weather
str(weather)
## 'data.frame':    7824 obs. of  4 variables:
##  $ month: num  1 1 1 1 1 1 1 1 1 1 ...
##  $ day  : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ year : num  1995 1995 1995 1995 1995 ...
##  $ temp : num  44 41 28 31 21 27 42 35 34 29 ...
# Create past with two filter() calls
past <- weather %>%
  filter(!(month == 2 & day == 29)) %>%
  filter(year != 2016)
  
# Check structure of past
str(past)
## 'data.frame':    7665 obs. of  4 variables:
##  $ month: num  1 1 1 1 1 1 1 1 1 1 ...
##  $ day  : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ year : num  1995 1995 1995 1995 1995 ...
##  $ temp : num  44 41 28 31 21 27 42 35 34 29 ...

– Step 2: Summarize history

# Create new version of past
past_summ <- past %>%
  group_by(year) %>%
  mutate(yearday = 1:length(day)) %>%
  ungroup() %>%
  filter(temp != -99) %>%
  group_by(yearday) %>%
  mutate(max = max(temp),
         min = min(temp),
         avg = mean(temp),
         CI_lower = Hmisc::smean.cl.normal(temp)[2],
         CI_upper = Hmisc::smean.cl.normal(temp)[3]) %>%
  ungroup()

# Structure of past_summ
str(past_summ)
## Classes 'tbl_df', 'tbl' and 'data.frame':    7645 obs. of  10 variables:
##  $ month   : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ day     : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ year    : num  1995 1995 1995 1995 1995 ...
##  $ temp    : num  44 41 28 31 21 27 42 35 34 29 ...
##  $ yearday : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ max     : num  51 48 57 55 56 62 52 57 54 47 ...
##  $ min     : num  17 15 16 15 21 14 14 12 21 8.5 ...
##  $ avg     : num  35.6 35.4 34.9 35.1 35.9 ...
##  $ CI_lower: num  31 31.6 29.7 29.9 31.9 ...
##  $ CI_upper: num  40.1 39.2 40 40.4 39.9 ...
head(past_summ)
## # A tibble: 6 x 10
##   month   day  year  temp yearday   max   min      avg CI_lower CI_upper
##   <dbl> <dbl> <dbl> <dbl>   <int> <dbl> <dbl>    <dbl>    <dbl>    <dbl>
## 1     1     1  1995    44       1    51    17 35.57143 31.00682 40.13604
## 2     1     2  1995    41       2    48    15 35.38095 31.57395 39.18796
## 3     1     3  1995    28       3    57    16 34.85714 29.73285 39.98144
## 4     1     4  1995    31       4    55    15 35.14286 29.85929 40.42642
## 5     1     5  1995    21       5    56    21 35.90476 31.89258 39.91695
## 6     1     6  1995    27       6    62    14 35.95238 31.26040 40.64437
Hmisc::smean.cl.normal(past$temp)[2]
##    Lower 
## 54.69111
table(past$temp)
## 
## -99 8.5  10  11  12  13  14  15  16  17  18  19  20  21  22  23  24  25 
##  20   1   3   2   7   8   8   9   6   7  12  17  20  39  23  18  46  38 
##  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40  41  42  43 
##  41  54  69  83  76  82  77 107 109 112 126 113 126 148 144 137 137 135 
##  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60  61 
## 146 133 135 135 152 132 128 143 133 125 125 123 137 148 144 151 119 121 
##  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79 
## 130 127 109 124 119 117 136 119 146 136 177 187 188 180 175 165 158 106 
##  80  81  82  83  84  85  86  87  88  89  91  92  93 
##  98  90  80  53  42  33  27  19  11  11   4   6   2
table(past_summ$temp)
## 
## 8.5  10  11  12  13  14  15  16  17  18  19  20  21  22  23  24  25  26 
##   1   3   2   7   8   8   9   6   7  12  17  20  39  23  18  46  38  41 
##  27  28  29  30  31  32  33  34  35  36  37  38  39  40  41  42  43  44 
##  54  69  83  76  82  77 107 109 112 126 113 126 148 144 137 137 135 146 
##  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60  61  62 
## 133 135 135 152 132 128 143 133 125 125 123 137 148 144 151 119 121 130 
##  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80 
## 127 109 124 119 117 136 119 146 136 177 187 188 180 175 165 158 106  98 
##  81  82  83  84  85  86  87  88  89  91  92  93 
##  90  80  53  42  33  27  19  11  11   4   6   2

– Step 3: Plot history

# Adapt historical plot
ggplot(past_summ, aes(x = yearday, y = temp)) +
  geom_point(
    col = "#EED8AE", 
    alpha = 0.3,
    shape = 16) +
  geom_linerange(
    aes(ymin = CI_lower, ymax = CI_upper), 
    col = "#8B7E66")

– Step 4: Plot present

# weather and past are available in your workspace

# Create present
present <- weather %>%
  filter(!(month == 2 & day == 29)) %>%
  filter(year == max(year)) %>%
  group_by(year) %>%
  mutate(yearday = 1:length(day)) %>%
  ungroup() %>%
  filter(temp != -99)

# Add geom_line to ggplot command
ggplot(past_summ, aes(x = yearday, y = temp)) + 
  geom_point(
    col = "#EED8AE", 
    alpha = 0.3, 
    shape = 16) +
  geom_linerange(
    aes(ymin = CI_lower, ymax = CI_upper), 
    col = "#8B7E66") + 
  geom_line(data = present)

– Step 5: Find new record highs

# Create past_highs
past_highs <- past_summ %>%
  group_by(yearday) %>%
  summarise(past_high = max(temp))

# Create record_high
record_high <- present %>%
  left_join(past_highs) %>%
  filter(temp > past_high)
  
record_high
## # A tibble: 11 x 6
##    month   day  year  temp yearday past_high
##    <dbl> <dbl> <dbl> <dbl>   <int>     <dbl>
##  1     1    10  2016    50      10        47
##  2     2     4  2016    54      35        47
##  3     2    21  2016    55      52        54
##  4     3     9  2016    54      68        52
##  5     3    10  2016    70      69        56
##  6     3    11  2016    61      70        56
##  7     3    25  2016    55      84        54
##  8     4     1  2016    62      91        60
##  9     4    22  2016    70     112        66
## 10     5    26  2016    82     146        80
## 11     5    28  2016    79     148        78
# Add record_high information to plot
ggplot(past_summ, aes(x = yearday, y = temp)) + 
  geom_point(col = "#EED8AE", alpha = 0.3, shape = 16) +
  geom_linerange(aes(ymin = CI_lower, ymax = CI_upper), col = "#8B7E66") +
  geom_line(data = present) +
  geom_point(data = record_high, col = "#CD2626")

– Step 6: Efficiently calculate record highs and lows

# Create past_extremes
past_extremes <- past_summ %>%
  group_by(yearday) %>%
  summarise(past_low = min(temp),
            past_high = max(temp))

# Create record_high_low
record_high_low <- present %>%
  left_join(past_extremes) %>%
  mutate(record = ifelse(temp < past_low, 
                         "#0000CD",
                         ifelse(temp > past_high, 
                                "#CD2626", 
                                "#00000000")))

# Structure of record_high_low
str(record_high_low)
## Classes 'tbl_df', 'tbl' and 'data.frame':    153 obs. of  8 variables:
##  $ month    : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ day      : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ year     : num  2016 2016 2016 2016 2016 ...
##  $ temp     : num  41 37 40 33 19 32 39 40 43 50 ...
##  $ yearday  : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ past_low : num  17 15 16 15 21 14 14 12 21 8.5 ...
##  $ past_high: num  51 48 57 55 56 62 52 57 54 47 ...
##  $ record   : chr  "#00000000" "#00000000" "#00000000" "#00000000" ...
head(record_high_low)
## # A tibble: 6 x 8
##   month   day  year  temp yearday past_low past_high    record
##   <dbl> <dbl> <dbl> <dbl>   <int>    <dbl>     <dbl>     <chr>
## 1     1     1  2016    41       1       17        51 #00000000
## 2     1     2  2016    37       2       15        48 #00000000
## 3     1     3  2016    40       3       16        57 #00000000
## 4     1     4  2016    33       4       15        55 #00000000
## 5     1     5  2016    19       5       21        56   #0000CD
## 6     1     6  2016    32       6       14        62 #00000000
# Add point layer of record_high_low
p <- ggplot(past_summ, aes(x = yearday, y = temp)) + 
  geom_point(col = "#EED8AE", alpha = 0.3, shape = 16) +
  geom_linerange(aes(ymin = CI_lower, ymax = CI_upper), col = "#8B7E66") +
  geom_line(data = present) +
  geom_point(data = record_high_low, aes(col = record)) +
  scale_color_identity()
p

– Step 7: Custom legend

# Finish the function draw_pop_legend
draw_pop_legend <- function(x = 0.6, y = 0.2, width = 0.2, height = 0.2, fontsize = 10) {
  
  # Finish viewport() function
  pushViewport(viewport(x = x, y = y, width = width, height = height, just = "center"))

  legend_labels <- c("Past record high",
                     "95% CI range",
                     "Current year",
                     "Past years",
                     "Past record low")

  legend_position <- c(0.9, 0.7, 0.5, 0.2, 0.1)
  
  # Finish grid.text() function
  grid.text(label = legend_labels, x = 0.12, y = legend_position, 
            just = "left", 
            gp = gpar(fontsize = fontsize, col = "grey20"))
  
  # Position dots, rectangle and line
  point_position_y <- c(0.1, 0.2, 0.9)
  point_position_x <- rep(0.06, length(point_position_y))
  grid.points(x = point_position_x, y = point_position_y, pch = 16,
              gp = gpar(col = c("#0000CD", "#EED8AE", "#CD2626")))
  grid.rect(x = 0.06, y = 0.5, width = 0.06, height = 0.4,
            gp = gpar(col = NA, fill = "#8B7E66"))
  grid.lines(x = c(0.03, 0.09), y = c(0.5, 0.5),
             gp = gpar(col = "black", lwd = 3))
  
  # Add popViewport() for bookkeeping
  popViewport()
}

# Plotting object p, from previous exercise
p

# Call draw_pop_legend()
draw_pop_legend()

Case Study II - Weather (Part 2)

– Step 1: clean_weather()

# Finish the clean_weather function
clean_weather <- function(file) {
  weather <- read.fwf(file,
                      header = FALSE,
                      col.names = c("month", "day", "year", "temp"),
                      widths = c(14, 14, 13, 4))
  weather %>%
    filter(!(month == 2 & day == 29)) %>%
    group_by(year) %>%
    mutate(yearday = 1:length(day)) %>%
    ungroup() %>%
    filter(temp != -99)
}

# Import NYNEWYOR.txt: my_data
my_data <- clean_weather('data/NYNEWYOR.txt')
ny_weather_data_url <- 'https://assets.datacamp.com/production/course_862/datasets/NYNEWYOR.txt'
download.file(ny_weather_data_url, 'data/NYNEWYOR.txt')

– Step 2: Historical data

# Create the stats object
StatHistorical <- ggproto("StatHistorical", Stat,
                    compute_group = function(data, scales, params) {
                      data <- data %>%
                        filter(year != max(year)) %>%
                        group_by(x) %>%
                        mutate(ymin = Hmisc::smean.cl.normal(y)[3],
                               ymax = Hmisc::smean.cl.normal(y)[2]) %>%
                        ungroup()
                    },
                    required_aes = c("x", "y", "year"))

# Create the layer
stat_historical <- function(mapping = NULL, data = NULL, geom = "point",
                            position = "identity", na.rm = FALSE, show.legend = NA, 
                            inherit.aes = TRUE, ...) {
  list(
    layer(
      stat = "identity", data = data, mapping = mapping, geom = geom,
      position = position, show.legend = show.legend, inherit.aes = inherit.aes,
      params = list(na.rm = na.rm, col = "#EED8AE", alpha = 0.3, shape = 16, ...)
    ),
    layer(
      stat = StatHistorical, data = data, mapping = mapping, geom = "linerange",
      position = position, show.legend = show.legend, inherit.aes = inherit.aes,
      params = list(na.rm = na.rm, col = "#8B7E66", ...)
    )
  )
}

# Build the plot
my_data <- clean_weather("data/NYNEWYOR.txt")
ggplot(my_data, aes(x = yearday, y = temp, year = year)) +
  stat_historical()

– Step 3: Present data

# Create the stats object
StatPresent <- ggproto("StatPresent", Stat,
                       compute_group = function(data, scales, params) {
                         data <- filter(data, year == max(year))
                       },
                       required_aes = c("x", "y", "year"))

# Create the layer
stat_present <- function(mapping = NULL, data = NULL, geom = "line",
                       position = "identity", na.rm = FALSE, show.legend = NA, 
                       inherit.aes = TRUE, ...) {
  layer(
    stat = StatPresent, data = data, mapping = mapping, geom = geom,
    position = position, show.legend = show.legend, inherit.aes = inherit.aes,
    params = list(na.rm = na.rm, ...)
  )
}

# Build the plot
my_data <- clean_weather("data/NYNEWYOR.txt")
ggplot(my_data, aes(x = yearday, y = temp, year = year)) +
  stat_historical() +
  stat_present()

– Step 4: Extremes

# Create the stats object
StatExtremes <- ggproto("StatExtremes", Stat,
                        compute_group = function(data, scales, params) {
                          
                          present <- data %>%
                            filter(year == max(year)) 
                          
                          past <- data %>%
                            filter(year != max(year)) 
                          
                          past_extremes <- past %>%
                            group_by(x) %>%
                            summarise(past_low = min(y),
                                      past_high = max(y))
                          
                          # transform data to contain extremes
                          data <- present %>%
                            left_join(past_extremes) %>%
                            mutate(record = ifelse(y < past_low, 
                                                   "#0000CD", 
                                                   ifelse(y > past_high, 
                                                          "#CD2626", 
                                                          "#00000000")))
                        },
                        required_aes = c("x", "y", "year"))

# Create the layer
stat_extremes <- function(mapping = NULL, data = NULL, geom = "point",
                          position = "identity", na.rm = FALSE, show.legend = NA, 
                          inherit.aes = TRUE, ...) {
  layer(
    stat = StatExtremes, data = data, mapping = mapping, geom = geom,
    position = position, show.legend = show.legend, inherit.aes = inherit.aes,
    params = list(na.rm = na.rm, ...)
  )
}

# Build the plot
my_data <- clean_weather("data/NYNEWYOR.txt")
ggplot(my_data, aes(x = yearday, y = temp, year = year)) +
  stat_historical() +
  stat_present() +
  stat_extremes(aes(col = ..record..)) +
  scale_color_identity() # Colour specification

– Step 5: Re-use plotting style

# File paths of all datasets
my_files <- c("data/NYNEWYOR.txt","data/FRPARIS.txt", "data/ILREYKJV.txt", "data/UKLONDON.txt")

# Build my_data with a for loop
my_data <- NULL
for (file in my_files) {
  temp <- clean_weather(file)
  temp$id <- sub(".txt", "", file)
  my_data <- rbind(my_data, temp)
}

str(my_data)
## Classes 'tbl_df', 'tbl' and 'data.frame':    31017 obs. of  6 variables:
##  $ month  : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ day    : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ year   : num  1995 1995 1995 1995 1995 ...
##  $ temp   : num  44 41 28 31 21 27 42 35 34 29 ...
##  $ yearday: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ id     : chr  "data/NYNEWYOR" "data/NYNEWYOR" "data/NYNEWYOR" "data/NYNEWYOR" ...
# Build the final plot, from scratch!
ggplot(my_data, aes(x = yearday, y = temp, year = year)) +
  stat_historical() +
  stat_present() +
  stat_extremes(aes(col = ..record..)) +
  scale_color_identity() +  # specify colour here
  facet_wrap(~ id, ncol = 2)