# this line specifies default options for all R chunks
knitr::opts_chunk$set(echo=T, highlight=T)

# suppress scientific notation
options(scipen=100)

# install helper package that loads and installs other packages, if needed
if (!require("pacman")) install.packages("pacman", repos = "http://lib.stat.cmu.edu/R/CRAN/")
## Loading required package: pacman
# install and load required packages
# flexdashboard not required for today, but make sure you can install and load it
pacman::p_load(pacman,tidyverse, ggthemes, magrittr, knitr, lubridate, gridExtra, RColorBrewer, 
               flexdashboard, maps, usdata, countrycode, mapproj, shadowtext)

# verify packages
p_loaded()
##  [1] "shadowtext"    "mapproj"       "countrycode"   "usdata"       
##  [5] "maps"          "flexdashboard" "RColorBrewer"  "gridExtra"    
##  [9] "lubridate"     "knitr"         "magrittr"      "ggthemes"     
## [13] "forcats"       "stringr"       "dplyr"         "purrr"        
## [17] "readr"         "tidyr"         "tibble"        "ggplot2"      
## [21] "tidyverse"     "pacman"

Plan for This Week

In-class Zooming Option

  • If you want to attend remotely, you are required to email me BEFORE CLASS to ask permission and explain why you can’t be in the classroom.

  • If you are IN CLASS, and want to connect by Zoom to see the screen better, I will allow that.

  • Last week, I mistakenly clicked ‘Office Hours’ instead of Lecture so Lecture 10 was not recorded.

  • Please let me know ASAP if the Lecture Zoom does not work so I can correct this issue in class.


Updates

  • Quiz 2 grades and solutions are posted

    • We will spend a little time on the solutions today
  • Proposal grades will be posted by Thursday

  • Project presentations are 2 weeks from today.

  • HW 5 - Part 2 is due Friday, 4/22 at midnight.

  • Thu. 4/7: Skills/Concepts Review for Quiz 2

  • Today - More on Geographic data


Quiz 2

  • Detailed solutions are posted.

  • Common challenges among students (a short sketch follows this list):

    • Using ifelse to create a categorical variable

    • Converting date text to a date variable

    • Using gsub to remove nuisance text and converting numeric text to a number

  • Common successes

    • Most students are comfortable with joins and stacking data using bind_rows

      • There are many other types of joins and data-stacking skills

      • These commands are very useful and versatile
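
A minimal sketch of the three challenge skills above, using made-up data (the dataset and variable names here are hypothetical):

# hypothetical data: dates and dollar amounts stored as text
quiz_demo <- tibble(date_text  = c("4/10/2022", "4/11/2022"),
                    gross_text = c("$1,234", "$567"))

quiz_demo <- quiz_demo |>
  mutate(gross    = gsub("$", "", gross_text, fixed = T),             # remove nuisance $
         gross    = gsub(",", "", gross, fixed = T) |> as.numeric(),  # remove commas, convert text to number
         date     = mdy(date_text),                                   # lubridate: date text -> Date
         size_cat = ifelse(gross >= 1000, "large", "small"))          # ifelse -> categorical variable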


World Data

# load world map polygons and drop the subregion column
world <- map_data("world") |>
  select(!subregion)

intbxo <- read_csv("intnl_bxo.csv", show_col_types = F, skip=12,
                   col_names = c("region","date","num_releases",
                                 "num_1_release","dist", 
                                 "wknd_gross"),
                   col_select = c(region, wknd_gross)) |>
  
  filter(!is.na(wknd_gross)) |>

  mutate(wknd_gross = gsub("$", "", wknd_gross, fixed = T),
         wknd_gross = gsub(",", "", wknd_gross, fixed = T) |> as.numeric())

world_bxo_data <- left_join(intbxo, world) |>
  filter(!is.na(wknd_gross))
## Joining, by = "region"
world_bxo_data$continent = countrycode(sourcevar = world_bxo_data$region,
                                       origin = "country.name",
                                       destination = "continent")
## Warning in countrycode_convert(sourcevar = sourcevar, origin = origin, destination = dest, : Some values were not matched unambiguously: Central America, Middle East, Serbia and Montenegro
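
Optionally, the regions flagged in the warning above can be assigned by hand. This is a sketch using countrycode's custom_match argument; the continent assignments below are our own choices:

# custom_match supplies manual matches for values countrycode cannot resolve
world_bxo_data$continent = countrycode(sourcevar = world_bxo_data$region,
                                       origin = "country.name",
                                       destination = "continent",
                                       custom_match = c("Central America" = "Americas",
                                                        "Middle East" = "Asia",
                                                        "Serbia and Montenegro" = "Europe"))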

Choropleth Country Plot w/ Labels

Example - Asia

  • Most of the plot code below is review

    • There are a few new details:

      • shadowtext labels (see below)

      • modifying size of text elements (required trial and error)

  • NOTES:

    • The R package shadowtext includes the command geom_shadowtext

    • shadowtext is useful for creating visible labels for all countries regardless of color

    • Deciding on units ($1000) and transformation (log) took some trial and error.

# create asia box office dataset with location for name labels
asia_bxo_data <- world_bxo_data |>
  filter(continent=="Asia") |>
  mutate(wknd_gross = wknd_gross/1000)
         
# create dataset of country names with median lat and long for position
asia_nms <- asia_bxo_data |>
  select(region, long, lat, group, continent) |>
  group_by(continent, region) |>
  summarize(nm_x=median(long, na.rm=T),
            nm_y=median(lat, na.rm=T)) |>
  filter(!is.na(nm_x) | !is.na(nm_y))
## `summarise()` has grouped output by 'continent'. You can override using the
## `.groups` argument.
# merge datasets using an inner_join
asia_bxo_data <- inner_join(asia_bxo_data, asia_nms)
## Joining, by = c("region", "continent")
# create Asia box office plot with country labels
(asia_bxo_map <- asia_bxo_data |>
                   ggplot(aes(x=long, y=lat,
                              group=group,
                              fill=wknd_gross)) +
                   geom_polygon() +
                   theme_map() +
                   coord_map("albers", lat0 = 39, lat1 = 45) +
    
                   labs(fill= "$1000",
                        title="Weekend Gross ($ Thousands) in Asian Countries",
                        subtitle="Weekend Ending April 10, 2022 - Data are Log-transformed",
                        caption="Data Source: https://www.boxofficemojo.com/") +
    
                   scale_fill_continuous(type = "viridis",  trans="log",
                                         breaks =c(1,10,100,1000,10000)) +
    
  # add country labels with shadowtext
  # size determined by trial and error
                   geom_shadowtext(aes(x=nm_x, y=nm_y,
                                       label=region),
                                  color="white",
                                  check_overlap = T,
                                  show.legend = F,
                                  size=4) + 
                   
  # adjust size of all map text
                   theme(plot.title = element_text(size = 25),
                         plot.subtitle = element_text(size = 20),
                         plot.caption = element_text(size = 20),
                         legend.text = element_text(size = 12),
                         legend.title = element_text(size = 20)))


In-class Exercise

  • If your project includes data that can be mapped, this exercise will be helpful

  • Details like text size, shadowtext, and data transformations can all greatly improve data clarity.


Europe Box Office Map

  • Copy and paste R code from previous chunk

  • Convert it to a map of Europe

  • This requires going through code line by line and making the following changes:

    • change asia to euro

    • change “Asia” to “Europe”

    • country labels in map may need to be slightly smaller

      • examine map in html (knitted) file

      • try size=3 if current labels are too big


Europe Map without Log (LN) Transformation

# create europe box office dataset with location for name labels
euro_bxo_data <- world_bxo_data |>
  filter(continent=="Europe") |>
  mutate(wknd_gross = wknd_gross/1000)
         
# create dataset of country names with median lat and long for position
euro_nms <- euro_bxo_data |>
  select(region, long, lat, group, continent) |>
  group_by(continent, region) |>
  summarize(nm_x=median(long, na.rm=T),
            nm_y=median(lat, na.rm=T)) |>
  filter(!is.na(nm_x) | !is.na(nm_y))
## `summarise()` has grouped output by 'continent'. You can override using the
## `.groups` argument.
# merge datasets using an inner_join
euro_bxo_data <- inner_join(euro_bxo_data, euro_nms)
## Joining, by = c("region", "continent")
# create Europe box office plot with country labels
(euro_bxo_map <- euro_bxo_data |>
                   ggplot(aes(x=long, y=lat,
                              group=group,
                              fill=wknd_gross)) +
                   geom_polygon() +
                   theme_map() +
                   coord_map("albers", lat0 = 39, lat1 = 45) +
    
                   labs(fill= "$1000",
                        title="Weekend Gross ($ Thousands) in European Countries",
                        subtitle="Weekend Ending April 10, 2022",
                        caption="Data Source: https://www.boxofficemojo.com/") +
    
                   scale_fill_continuous(type = "viridis") +
    
  # add country labels with shadowtext
  # size determined by trial and error
                   geom_shadowtext(aes(x=nm_x, y=nm_y,
                                       label=region),
                                  color="white",
                                  check_overlap = T,
                                  show.legend = F,
                                  size=3) + 
                   
  # adjust size of all map text
                   theme(plot.title = element_text(size = 20),
                         plot.subtitle = element_text(size = 15),
                         plot.caption = element_text(size = 15),
                         legend.text = element_text(size = 10),
                         legend.title = element_text(size = 15)))


Europe Map with Log (LN) Transformation

# create europe box office dataset with location for name labels
euro_bxo_data <- world_bxo_data |>
  filter(continent=="Europe") |>
  mutate(wknd_gross = wknd_gross/1000)
         
# create dataset of country names with median lat and long for position
euro_nms <- euro_bxo_data |>
  select(region, long, lat, group, continent) |>
  group_by(continent, region) |>
  summarize(nm_x=median(long, na.rm=T),
            nm_y=median(lat, na.rm=T)) |>
  filter(!is.na(nm_x) | !is.na(nm_y))
## `summarise()` has grouped output by 'continent'. You can override using the
## `.groups` argument.
# merge datasets using an inner_join
euro_bxo_data <- inner_join(euro_bxo_data, euro_nms)
## Joining, by = c("region", "continent")
# create Europe box office plot with country labels
(euro_bxo_map <- euro_bxo_data |>
                   ggplot(aes(x=long, y=lat,
                              group=group,
                              fill=wknd_gross)) +
                   geom_polygon() +
                   theme_map() +
                   coord_map("albers", lat0 = 39, lat1 = 45) +
    
                   labs(fill= "$1000",
                        title="Weekend Gross ($ Thousands) in European Countries",
                        subtitle="Weekend Ending April 10, 2022 - Data are Log-transformed",
                        caption="Data Source: https://www.boxofficemojo.com/") +
    
                   scale_fill_continuous(type = "viridis",  trans="log",
                                         breaks =c(1,10,100,1000,5000)) +
    
  # add country labels with shadowtext
  # size determined by trial and error
                   geom_shadowtext(aes(x=nm_x, y=nm_y,
                                       label=region),
                                  color="white",
                                  check_overlap = T,
                                  show.legend = F,
                                  size=3) + 
                   
  # adjust size of all map text
                   theme(plot.title = element_text(size = 20),
                         plot.subtitle = element_text(size = 15),
                         plot.caption = element_text(size = 15),
                         legend.text = element_text(size = 10),
                         legend.title = element_text(size = 15)))


Material Added Wed. 4/20/2022

  • On Tuesday (4/19), a student asked me about creating state maps

  • Together (during office hours) we drafted a map of their project data.

  • There were details that I knew would take more time…

    • Yesterday, I tinkered…

    • Creating a composed visualization takes time

  • This code could be useful for any project that has data by state.

  • For example

    • Average costs and expenditures by state

    • Demographics

    • Voting records

    • Sports/Arts/Entertainment investments and expenditures

    • etc.


State Map Tips

  • state_stats is a great dataset in the usdata package

  • state labels were created using the state abbreviations

# state polygons (from R)
us_states <- map_data("state") |>
  select(long:region) |>
  rename("state" = "region")

# many useful variables in this dataset
state_abbr <- state_stats |>
  select(state, abbr) |>
  mutate(state = tolower(state))

# data by county (aggregated by state)
state_pop <- county_2019 |>
  select(state, pop) |>
  mutate(state=tolower(state),
         popM = pop/1000000) |>
  group_by(state) |>
  summarize(st_popM = sum(popM, na.rm=T)) |>
  full_join(state_abbr)
## Joining, by = "state"
# used left_join because the lat and long info
# is missing for Hawaii and Alaska
statepop_map <- left_join(us_states, state_pop) 
## Joining, by = "state"
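
As a quick sanity check (a sketch, not part of the original workflow), anti_join lists the rows of state_pop that have no matching polygons:

# states with population data but no polygons in map_data("state")
# (likely Alaska and Hawaii, which map_data("state") does not include)
state_pop |>
  anti_join(us_states, by = "state") |>
  select(state, abbr)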

Adding State Midpoint Lat and Long

  • Using the median of each state's lat and long data did not work

    • States are oddly shaped and small.
  • Instead I googled ‘state midpoint lat and long’ and found this website

    • Copied data and pasted into Excel

    • Saved as .csv file named state_coords.csv (included)

    • state_coords did not include DC so I googled that too and added it.

    • Added state_coords to state_match to verify agreement and added DC there.

      • New dataset created: state_match_check
    • Final dataset for plot created: statepop_map

# https://www.latlong.net/category/states-236-14.html

state_coords <- read_csv("state_coords.csv", show_col_types = F,
                         col_names = c("state", "m_lat", "m_long")) |>
  mutate(state = gsub(", USA", "", state, fixed=T),
         state = gsub(", the USA", "", state, fixed=T),
         state = gsub(", the US", "", state, fixed=T),
         state = tolower(state))

# save values for dc
state <- "district of columbia"
m_lat <- 38.9072
m_long <- -77.0369

# create dc dataset with 1 observation
dc <- tibble(state, m_lat, m_long)

# add dc to state_coords
state_coords <- bind_rows(state_coords, dc)

# remove dc values and tibble from Global Environment
rm(dc, state, m_lat, m_long)

statepop_map <- left_join(statepop_map, state_coords)
## Joining, by = "state"

State Population Plot

  • Similar to plots from Tuesday with a few changes

    • Added borders to states by adding color="darkgrey" to geom_polygon command.

    • Used State abbreviations for state labels.

    • Made State text labels smaller (Size = 2)

    • Changed breaks for log scaled population legend

  • These details seem minor but they take time and trial and error.

  • There was an engagement question asking for additional clarification on the log transformation

    • I show both the Europe and US maps with and without it to clarify the benefit

    • If you have right-skewed data, a log transformation is very helpful.

    • Reminder: log in R is LN, the natural log (see the quick check after this list)
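
A quick console check (a minimal example) confirming that log defaults to the natural log:

log(exp(1))          # 1: log() is base e (natural log)
log10(100)           # 2: base-10 log
log(100, base = 10)  # also 2: the base can be set explicitly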

US State Pop. Map

# plot of un-logged data
# no transformation or breaks statement added

(st_pop <- statepop_map |>
  ggplot(aes(x=long, y=lat, group=group, fill=st_popM)) +
    geom_polygon(color="darkgrey") +
   
    theme_map() +
    coord_map("albers", lat0 = 39, lat1 = 45) +
   
    scale_fill_continuous(type = "viridis")+
   
    theme(legend.position = "bottom",
          legend.key.width = unit(1, "cm")) +
    
# add state abbreviations with shadowtext
# size determined by trial and error
    geom_shadowtext(aes(x=m_long, y=m_lat,
                        label=abbr),
                        color="white",
                        check_overlap = T,
                        show.legend = F,
                        size=2) + 
   
    labs(fill= "Pop. in Millions", title="Population by State",
         subtitle="Unit is 1 Million People",
         caption= "Not Shown: HI: 1.42 Million   AK: 0.74 Million
         
         Data Source: https://CRAN.R-project.org/package=usdata"))


US State Pop. Map with Log (LN) Transformation

# plot of logged data
# transformation and breaks statement added

(st_lpop <- statepop_map |>
  ggplot(aes(x=long, y=lat, group=group, fill=st_popM)) +
    geom_polygon(color="darkgrey") +
   
    theme_map() +
    coord_map("albers", lat0 = 39, lat1 = 45) +
   
    scale_fill_continuous(type = "viridis", trans="log",
                          # 0 cannot appear on a log scale, so breaks start at 1
                          breaks=c(1,2,3,5,10,20,30))+
   
    theme(legend.position = "bottom",
          legend.key.width = unit(1, "cm")) +
    
# add state abbreviations with shadowtext
# size determined by trial and error
    geom_shadowtext(aes(x=m_long, y=m_lat,
                        label=abbr),
                        color="white",
                        check_overlap = T,
                        show.legend = F,
                        size=2) + 
   
    labs(fill= "Pop. in Millions", title="Population by State",
         subtitle="Unit is 1 Million People - Date are Log-transformed",
         caption= "Not Shown: HI: 1.42 Million   AK: 0.74 Million
         
         Data Source: https://CRAN.R-project.org/package=usdata"))


Managing Project Files

  • At this point, much of this should be review

  • Next week, I will spend a little time walking you through a consulting project

    • Project involved segmenting and rejoining data in multiple ways

    • Documentation is key

    • Take good notes

    • I use a Markdown notebook for documenting work that I will not show to a client

      (not to be printed)

For your project (see the example folder layout after this list):

  • code_data_output folder should have:

    • all raw .csv files needed

      • no data management should be done in Excel
    • any .png or other graphics files needed

    • dashboard.Rmd file

    • dashboard.html file

  • Outer folder should have

    • code_data_output folder (see above)

    • Complete and accurate README.txt file

    • .Rproj file that is functional

    • Other files can be stored in the project's outer folder if needed

  • Examples

    • HW5 - Part 1 Solution

    • This lecture
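
An example folder layout (file names other than those listed above are hypothetical):

my_project/                      <- outer folder with a functional .Rproj file
├── my_project.Rproj
├── README.txt                   <- complete and accurate README
└── code_data_output/
    ├── dashboard.Rmd
    ├── dashboard.html
    ├── movie_data.csv           <- all raw .csv files needed (no Excel data management)
    └── logo.png                 <- any .png or other graphics files needed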


RPubs

  • If you want to publish your dashboard or any HTML file you create in R, you can do so for free.

  • RStudio provides a free, public web publishing service called RPubs.

  • I am not requiring students to use it for their projects, but it is useful if you want to post something online and provide the link to it.

  • As an in-class exercise, I am asking you each to create an account and publish your HW 5 dashboard file (see below).

    • It is useful to do this with your example dashboard so you can see how this changes the appearance of your panels and text.

Lecture 24 In-class Exercise

  1. Open your HW 5 - Part 1.Rmd file and knit it to create your dashboard.

    • Make sure this file has your name in the header.

    • It is okay if you haven’t revised it yet for HW 5 - Part 2

      • Reminder: HW 5 - Part 2 is due tomorrow.
  2. Click the RPubs icon, create a free account, and publish your HTML file.

    • If RStudio asks to install additional packages to complete the publishing process, click Yes.
  3. Submit the link to your published file on Blackboard.

  4. A Link to your published file must be submitted by Friday 4/22/22 at midnight to count for class participation today.


Next Week:

Additional Topics

  • Ask me questions about your project (Others may benefit)

  • I have 1-2 Examples

  • I will also demo knitting R Markdown to different formats (a sample YAML header follows this list):

    • Word document with Table of Contents

    • PowerPoint presentation

    • NOTE: HTML is ideal and flexible (and can easily be published online), but other formats are sometimes appropriate.
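
A minimal sketch of an R Markdown YAML header offering these formats; the output options are standard rmarkdown settings, and the title is hypothetical:

---
title: "My Project"
output:
  html_document: default           # ideal and flexible; can be published online
  word_document:
    toc: true                      # Word document with a Table of Contents
  powerpoint_presentation: default # PowerPoint presentation
---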

Skillset Terminology

  • Now that you are (almost) done with BUA 455, and even more so when you graduate, you have a very useful set of skills

  • I will spend a little time talking about how to explain those skills to other people

  • Preview: It took me decades to figure out how to talk about what I do.

    • Increased interest in Data Science and Analytics has resulted in better terminology.