Introduction

What's Covered

Part A:

  • Statistical plots
    • Aesthetics review,
    • box plots, density plots
    • multiple groups/variables
  • Plots for specific data types (Part 1)
    • graphics of large data
    • Ternary plots
    • Network plots
    • Diagnostic plots

Part B:

  • Plots for specific data types (Part 2)
    • choropleths
    • cartographic maps
    • animations

Part C:

  • ggplot2 internals
    • grid graphics, grid graphics in ggplot2
    • ggplot objects
    • gridExtra
  • Data Munging and Visualization Case Study
    • Bag plot case study, weather case study

Libraries and Data

# Run the dataset-creation script, then load the objects saved in
# data/test_datasets.RData into the workspace
source(file = "create_datasets.R")
load(file = "data/test_datasets.RData")

library(readr)
library(dplyr)
library(ggplot2)
library(purrr)

library(maps)
library(ggmap)
library(ggthemes)
library(viridis)
library(rgdal)
library(ggfortify)
library(animation)
library(gganimate)
library(gapminder)
library(car)

   


Plots for specific data types (Part 2)


Choropleths

– Working with maps from the maps package: USA

# library(maps)
# library(ggmap)

# Turn the "usa" outline into a data frame with map_data(), then check
# its structure
usa <- map_data(map = "usa")

str(object = usa)
## 'data.frame':    7243 obs. of  6 variables:
##  $ long     : num  -101 -101 -101 -101 -101 ...
##  $ lat      : num  29.7 29.7 29.7 29.6 29.6 ...
##  $ group    : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ order    : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ region   : chr  "main" "main" "main" "main" ...
##  $ subregion: chr  NA NA NA NA ...
head(x = usa)
##        long      lat group order region subregion
## 1 -101.4078 29.74224     1     1   main      <NA>
## 2 -101.3906 29.74224     1     2   main      <NA>
## 3 -101.3620 29.65056     1     3   main      <NA>
## 4 -101.3505 29.63911     1     4   main      <NA>
## 5 -101.3219 29.63338     1     5   main      <NA>
## 6 -101.3047 29.64484     1     6   main      <NA>
table(usa[["group"]])
## 
##    1    2    3    4    5    6    7    8    9   10 
## 6886   36   30   16   10  168   17   17   19   44
# Draw every outline group as its own polygon on a map projection
ggplot(data = usa, mapping = aes(x = long, y = lat, group = group)) +
  geom_polygon() +
  theme_nothing() +
  coord_map()

– Working with maps from the maps package: adding points

# usa already exists; read the city table with readr's read_tsv()
# library(readr)
cities <- read_tsv(
  file = "https://assets.datacamp.com/production/course_862/datasets/US_Cities.txt"
)

head(x = usa)
##        long      lat group order region subregion
## 1 -101.4078 29.74224     1     1   main      <NA>
## 2 -101.3906 29.74224     1     2   main      <NA>
## 3 -101.3620 29.65056     1     3   main      <NA>
## 4 -101.3505 29.63911     1     4   main      <NA>
## 5 -101.3219 29.63338     1     5   main      <NA>
## 6 -101.3047 29.64484     1     6   main      <NA>
head(x = cities)
## # A tibble: 6 x 5
##         City      State Pop_est     lat      long
##        <chr>      <chr>   <int>   <dbl>     <dbl>
## 1     Eugene     Oregon  163460 44.0567 -123.1162
## 2      Salem     Oregon  164549 44.9237 -123.0231
## 3  Hillsboro     Oregon  102347 45.5167 -122.9833
## 4 Santa Rosa California  174972 38.4468 -122.7061
## 5   Portland     Oregon  632309 45.5370 -122.6500
## 6  Vancouver Washington  172860 45.6372 -122.5965
## Need this to get the theme_map() and scale_color_viridis() functions
# library(ggthemes)
# library(viridis)

# Plot 1: overlay the cities on the outline, sizing each point by population
ggplot(data = usa, mapping = aes(x = long, y = lat, group = group)) +
  geom_polygon() +
  geom_point(
    data = cities,
    mapping = aes(group = State, size = Pop_est),
    colour = "red",
    shape = 16,
    alpha = 0.6
  ) +
  theme_map() +
  coord_map()

# Sort ascending by population so the largest cities come last in the data
cities_arr <- cities %>%
  arrange(Pop_est)

# Same map as plot 1, but population now drives point colour instead of size
ggplot(data = usa, mapping = aes(x = long, y = lat, group = group)) +
  geom_polygon(fill = "grey90") +
  geom_point(
    data = cities_arr,
    mapping = aes(group = State, colour = Pop_est),
    size = 2,
    shape = 16
  ) +
  scale_color_viridis() +
  theme_map() +
  coord_map()

  • Pretty much only New York shows up as a yellow dot.
  • LA is the green dot.
  • Adjusting the scale here would help.
## New York really tips the scale
## The colors might look better on a log scale
# Ten most populous cities, largest first
head(arrange(cities_arr, desc(Pop_est)), n = 10)
## # A tibble: 10 x 5
##            City        State Pop_est     lat      long
##           <chr>        <chr>   <int>   <dbl>     <dbl>
##  1     New York     New York 8550405 40.6643  -73.9385
##  2  Los Angeles   California 3971883 34.0194 -118.4108
##  3      Chicago     Illinois 2720546 41.8376  -87.6818
##  4      Houston        Texas 2296224 29.7805  -95.3863
##  5 Philadelphia Pennsylvania 1567442 40.0094  -75.1333
##  6      Phoenix      Arizona 1563025 33.5722 -112.0880
##  7  San Antonio        Texas 1469845 29.4724  -98.5251
##  8    San Diego   California 1394928 32.8153 -117.1350
##  9       Dallas        Texas 1300092 32.7757  -96.7967
## 10     San Jose   California 1026908 37.2969 -121.8193

– State choropleth

# pop and all required packages are available

# Get the "state" map as a data frame via map_data() and preview it
state <- map_data(map = "state")
head(x = state)
##        long      lat group order  region subregion
## 1 -87.46201 30.38968     1     1 alabama      <NA>
## 2 -87.48493 30.37249     1     2 alabama      <NA>
## 3 -87.52503 30.37249     1     3 alabama      <NA>
## 4 -87.53076 30.33239     1     4 alabama      <NA>
## 5 -87.57087 30.32665     1     5 alabama      <NA>
## 6 -87.58806 30.32665     1     6 alabama      <NA>
# One polygon per group, filled by region name, with white borders
ggplot(
  data = state,
  mapping = aes(x = long, y = lat, group = group, fill = region)
) +
  geom_polygon(colour = "white") +
  theme_nothing() +
  coord_map()

# Merge state and pop: state2
# Name the join key explicitly so an unexpected extra shared column cannot
# silently change how the tables are matched.
state2 <- merge(state, pop, by = "region")
# merge() re-sorts rows by the key and does not promise to keep the vertex
# sequence; restore it so geom_polygon() connects points in drawing order.
state2 <- state2[order(state2$group, state2$order), ]
head(state2)
##    region      long      lat group order subregion Pop_est
## 1 alabama -87.46201 30.38968     1     1      <NA> 4858979
## 2 alabama -87.48493 30.37249     1     2      <NA> 4858979
## 3 alabama -87.52503 30.37249     1     3      <NA> 4858979
## 4 alabama -87.53076 30.33239     1     4      <NA> 4858979
## 5 alabama -87.57087 30.32665     1     5      <NA> 4858979
## 6 alabama -87.58806 30.32665     1     6      <NA> 4858979
# Choropleth: fill each state polygon by its population estimate
ggplot(
  data = state2,
  mapping = aes(x = long, y = lat, group = group, fill = Pop_est)
) +
  geom_polygon(colour = "white") +
  theme_map() +
  coord_map()

– Map from shapefiles

# Read the DEU_adm1 layer from the shapefiles in data/shape_files: germany
# library(rgdal)
germany <- readOGR(dsn = "data/shape_files", layer = "DEU_adm1")
## OGR data source with driver: ESRI Shapefile 
## Source: "data/shape_files", layer: "DEU_adm1"
## with 16 features
## It has 16 fields
# Flatten the spatial object into a data frame that ggplot2 can draw: bundes
bundes <- fortify(model = germany)

# Outline map of Germany: blue polygons with white borders
ggplot(data = bundes, mapping = aes(x = long, y = lat, group = group)) +
  geom_polygon(colour = "white", fill = "blue") +
  theme_nothing() +
  coord_map()

– Choropleth from shapefiles

# germany, bundes and unemp are available
# Preview bundes before the names are added: it has an `id` column but no
# state name yet
head(bundes)
##       long      lat order  hole piece id group
## 1 9.650460 49.77634     1 FALSE     1  0   0.1
## 2 9.650968 49.76515     2 FALSE     1  0   0.1
## 3 9.656839 49.76145     3 FALSE     1  0   0.1
## 4 9.640400 49.75014     4 FALSE     1  0   0.1
## 5 9.652028 49.74276     5 FALSE     1  0   0.1
## 6 9.652208 49.73903     6 FALSE     1  0   0.1
# re-add state names to bundes
# `id` goes through as.numeric() first so the factor levels sort numerically
# rather than as text.
bundes$state <- factor(as.numeric(bundes$id))
# Relabel those levels, in order, with the names stored in germany$NAME_1.
# NOTE(review): assumes the sorted ids line up with the row order of
# germany — confirm against the shapefile.
levels(bundes$state) <- germany$NAME_1

# Confirm the new `state` column
head(bundes)
##       long      lat order  hole piece id group             state
## 1 9.650460 49.77634     1 FALSE     1  0   0.1 Baden-Württemberg
## 2 9.650968 49.76515     2 FALSE     1  0   0.1 Baden-Württemberg
## 3 9.656839 49.76145     3 FALSE     1  0   0.1 Baden-Württemberg
## 4 9.640400 49.75014     4 FALSE     1  0   0.1 Baden-Württemberg
## 5 9.652028 49.74276     5 FALSE     1  0   0.1 Baden-Württemberg
## 6 9.652208 49.73903     6 FALSE     1  0   0.1 Baden-Württemberg
head(unemp)
##                state unemployment
## 1             Bayern          3.7
## 2  Baden-Württemberg          4.0
## 3    Rheinland-Pfalz          5.4
## 4             Hessen          5.8
## 5      Niedersachsen          6.5
## 6 Schleswig-Holstein          6.7
# Merge bundes and unemp: bundes_unemp
# `state` is the column the two tables share; name it explicitly as the key.
bundes_unemp <- merge(bundes, unemp, by = "state")
# merge() re-sorts rows by the key and does not promise to keep the vertex
# sequence; restore it so each polygon is traced in drawing order.
bundes_unemp <- bundes_unemp[order(bundes_unemp$group, bundes_unemp$order), ]

# Update the ggplot call: fill each state polygon by its unemployment value
ggplot(bundes_unemp, aes(x = long, y = lat, group = group, fill = unemployment)) +
  geom_polygon() +
  coord_map() +
  theme_map()

Cartographic Maps

– Different templates

# Load the ggmap package
# library(ggmap)

# Fetch a zoom-13 base map for London using get_map()'s default source
london_map_13 <- get_map(location = "London, England", zoom = 13)

# Render the downloaded map
ggmap(london_map_13)

# Same location and zoom, this time requesting the Stamen "toner" template
ggmap(
  get_map(
    location = "London, England",
    source = "stamen",
    maptype = "toner",
    zoom = 13
  )
)

– Mapping points onto a cartographic map

# Landmarks to place on the map (ggmap is already loaded)
london_sites <- c(
  "Tower of London, London",
  "Buckingham Palace, London",
  "Tower Bridge, London",
  "Westminster Abbey, London",
  "Queen Elizabeth Olympic Park, London"
)

# Geocode every landmark: xx
xx <- geocode(location = london_sites)

# Label column: the landmark name without the ", London" suffix
xx$location <- sub(pattern = ", London", replacement = "", x = london_sites)

# Toner base map of London at zoom 13
london_ton_13 <- get_map(
  location = "London, England",
  zoom = 13,
  maptype = "toner",
  source = "stamen"
)

# Overlay the landmarks, one colour per location
ggmap(london_ton_13) +
  geom_point(data = xx, mapping = aes(colour = location), size = 6)

– Using a bounding box

# london_sites and ggmap are available

# Rebuild xx: geocode the landmarks and derive the short labels
xx <- geocode(location = london_sites)
xx$location <- sub(pattern = ", London", replacement = "", x = london_sites)
# Break the fifth label over two lines
xx$location[5] <- "Queen Elizabeth\nOlympic Park"

# Bounding box around the landmark coordinates, with f = 0.3: bbox
bbox <- make_bbox(lon = xx$lon, lat = xx$lat, f = 0.3)

# Fetch the toner map again, this time for the bounding box
london_ton_13 <- get_map(
  location = bbox,
  zoom = 13,
  maptype = "toner",
  source = "stamen"
)

# Points, as in the previous exercise
ggmap(london_ton_13) +
  geom_point(data = xx, mapping = aes(colour = location), size = 6)

# Text labels instead of points
ggmap(london_ton_13) +
  geom_label(
    data = xx,
    mapping = aes(label = location),
    colour = "#E41A1C",
    fill = "grey90",
    fontface = "bold",
    size = 4
  )

– Combine cartographic and choropleth maps

# bundes is available, as are all required packages

# Base map for "Germany" at zoom 6
germany_06 <- get_map(location = "Germany", zoom = 6)

# Draw the bundes outlines (red border, no fill) over the base map
ggmap(germany_06) +
  geom_polygon(
    data = bundes,
    mapping = aes(x = long, y = lat, group = group),
    colour = "red",
    fill = NA
  ) +
  coord_map()

Animations

– The population pyramid

  • First, I needed to install ImageMagick with Homebrew
  • Once the gif is created I can just load it into the doc like an image
# Read the Japan population table and look at its structure
japan <- read_tsv(
  file = "https://assets.datacamp.com/production/course_862/datasets/japanPOP.txt"
)
str(object = japan)
## Classes 'tbl_df', 'tbl' and 'data.frame':    8282 obs. of  4 variables:
##  $ AGE : int  0 1 2 3 4 5 6 7 8 9 ...
##  $ POP : int  -572954 -581748 -585239 -582223 -568788 -571899 -590530 -602349 -612527 -620373 ...
##  $ time: int  2010 2010 2010 2010 2010 2010 2010 2010 2010 2010 ...
##  $ SEX : chr  "Male" "Male" "Male" "Male" ...
##  - attr(*, "spec")=List of 2
##   ..$ cols   :List of 4
##   .. ..$ AGE : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ POP : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ time: list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ SEX : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   ..$ default: list()
##   .. ..- attr(*, "class")= chr  "collector_guess" "collector"
##   ..- attr(*, "class")= chr "col_spec"
# Draw one population pyramid per time point and save the frames as a GIF
# library(animation)

saveGIF(
  expr = {
    # One frame for each distinct value of japan$time
    for (i in unique(japan$time)) {
      # Rows for the current time point: data
      data <- subset(japan, time == i)

      # Female bars are added first, then male bars; the frame title is the
      # time value
      p <- ggplot(
        data = data,
        mapping = aes(x = AGE, y = POP, fill = SEX, width = 1)
      ) +
        geom_bar(data = data[data$SEX == "Female", ], stat = "identity") +
        geom_bar(data = data[data$SEX == "Male", ], stat = "identity") +
        coord_flip() +
        ggtitle(i)

      # Plots are not auto-printed inside a loop, so print explicitly
      print(p)
    }
  },
  movie.name = "pyramid.gif",
  interval = 0.1
)
## [1] TRUE

– Animations with gganimate

## from the car library
head(Vocab)
##          year    sex education vocabulary
## 20040001 2004 Female         9          3
## 20040002 2004 Female        14          6
## 20040003 2004   Male        14          9
## 20040005 2004 Female        17          8
## 20040008 2004   Male        14          1
## 20040010 2004   Male        14          7
# Update the static plot
# One fitted line per year; `frame` and `cumulative` are extra aesthetics
# that are passed on for gganimate() to use. TRUE is spelled out because
# `T` is an ordinary variable that can be reassigned.
p <- ggplot(Vocab, aes(x = education, y = vocabulary,
                       color = year, group = year,
                       frame = year, cumulative = TRUE)) +
  stat_smooth(method = "lm", se = FALSE, size = 3)

# Call gganimate on p
# From the gganimate library
animation <- gganimate(p, filename = "vocab.gif", interval = 0.5)

– Exploring Gapminder (Extra)

And one more because this is fun and I think it's pretty powerful

# library(gapminder)
head(x = gapminder)
## # A tibble: 6 x 6
##       country continent  year lifeExp      pop gdpPercap
##        <fctr>    <fctr> <int>   <dbl>    <int>     <dbl>
## 1 Afghanistan      Asia  1952  28.801  8425333  779.4453
## 2 Afghanistan      Asia  1957  30.332  9240934  820.8530
## 3 Afghanistan      Asia  1962  31.997 10267083  853.1007
## 4 Afghanistan      Asia  1967  34.020 11537966  836.1971
## 5 Afghanistan      Asia  1972  36.088 13079460  739.9811
## 6 Afghanistan      Asia  1977  38.438 14880372  786.1134
# Make the black-and-white theme the default for the plots that follow
theme_set(theme_bw())

# GDP per capita (log10 axis) against life expectancy, with `frame = year`
# passed on for gganimate()
p <- ggplot(
  data = gapminder,
  mapping = aes(
    x = gdpPercap,
    y = lifeExp,
    size = log(pop),
    colour = continent,
    frame = year
  )
) +
  scale_x_log10() +
  geom_point()


# Write the animation to gapminder.gif
animation <- gganimate(
  p,
  filename = "gapminder.gif",
  interval = 0.5,
  ani.width = 800,
  ani.height = 600
)