# this line specifies default options for all R chunks
knitr::opts_chunk$set(echo=T, highlight=T)
# suppress scientific notation
options(scipen=100)
# install helper package that loads and installs other packages, if needed
if (!require("pacman")) install.packages("pacman", repos = "http://lib.stat.cmu.edu/R/CRAN/")
## Loading required package: pacman
# install and load required packages
# flexdashboard not required for today, but make sure you can install and load it
pacman::p_load(pacman,tidyverse, ggthemes, magrittr, knitr, lubridate, gridExtra, RColorBrewer,
flexdashboard, maps, usdata, countrycode, mapproj, shadowtext)
# verify packages
p_loaded()
## [1] "shadowtext" "mapproj" "countrycode" "usdata"
## [5] "maps" "flexdashboard" "RColorBrewer" "gridExtra"
## [9] "lubridate" "knitr" "magrittr" "ggthemes"
## [13] "forcats" "stringr" "dplyr" "purrr"
## [17] "readr" "tidyr" "tibble" "ggplot2"
## [21] "tidyverse" "pacman"
If you want to attend remotely, you are required to email me BEFORE CLASS to ask permission and explain why you can’t be in the classroom.
If you are IN CLASS, and want to connect by Zoom to see the screen better, I will allow that.
Last week, I mistakenly clicked ‘Office Hours’ instead of Lecture so Lecture 10 was not recorded.
Please let me know ASAP if the Lecture Zoom does not work so I can correct this issue in class.
Quiz 2 grades and solutions are posted
Proposal grades will be posted by Thursday
Project presentations are 2 weeks from today.
HW 5 - Part 2 is due Friday, 4/22 at midnight.
Thu. 4/7: Skills/Concepts Review for Quiz 2
Today - More on Geographic data
Detailed solutions are posted.
Common challenges among students (a brief sketch follows this list):
Using ifelse to create a categorical variable
converting date text to a date variable
using gsub to remove nuisance text and converting numeric text to a number
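Here is a minimal sketch of those three steps on a made-up tibble (the column names and values are hypothetical, not from the HW data):
# minimal sketch: ifelse, date conversion, and gsub cleanup on a made-up tibble
demo <- tibble(score = c(45, 90, 72),
               date_text = c("2022-04-01", "2022-04-08", "2022-04-15"),
               gross_text = c("$1,200", "$350", "$9,876"))
demo <- demo |>
  mutate(pass_fail = ifelse(score >= 60, "Pass", "Fail"),            # categorical variable from a condition
         date = as.Date(date_text),                                  # date text to a date variable
         gross = gsub("$", "", gross_text, fixed = TRUE),            # remove nuisance $
         gross = gsub(",", "", gross, fixed = TRUE) |> as.numeric()) # remove commas, convert to number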
Common successes:
Most students are comfortable with joins and with stacking data using bind_rows
There are many other types of joins and data-stacking skills (a brief sketch follows this list)
These commands are very useful and versatile
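As a refresher, a minimal sketch of bind_rows and two join types on made-up tibbles (all names and values here are hypothetical):
# stack two quarters of made-up data, then attach extra info with joins
q1 <- tibble(region = c("Japan", "France"), gross = c(100, 200))
q2 <- tibble(region = c("Japan", "Brazil"), gross = c(150, 80))
stacked <- bind_rows(q1, q2)              # stacks rows; columns are matched by name

info <- tibble(region = c("Japan", "France"), capital = c("Tokyo", "Paris"))
left_join(q2, info, by = "region")        # keeps all rows of q2; capital is NA for Brazil
inner_join(q2, info, by = "region")       # keeps only regions that appear in both tibbles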
# world map polygons (drop the subregion column)
world <- map_data("world") |>
  select(!subregion)
# read international box office data, keep only region and weekend gross,
# then remove $ and commas and convert the gross to a number
intbxo <- read_csv("intnl_bxo.csv", show_col_types = F, skip=12,
                   col_names = c("region","date","num_releases",
                                 "num_1_release","dist",
                                 "wknd_gross"),
                   col_select = c(region, wknd_gross)) |>
  filter(!is.na(wknd_gross)) |>
  mutate(wknd_gross = gsub("$", "", wknd_gross, fixed = T),
         wknd_gross = gsub(",", "", wknd_gross, fixed = T) |> as.numeric())
# join the box office data to the world map polygons (joins by region)
world_bxo_data <- left_join(intbxo, world) |>
  filter(!is.na(wknd_gross))
## Joining, by = "region"
# add a continent variable based on the country name
world_bxo_data$continent = countrycode(sourcevar = world_bxo_data$region,
origin = "country.name",
destination = "continent")
## Warning in countrycode_convert(sourcevar = sourcevar, origin = origin, destination = dest, : Some values were not matched unambiguously: Central America, Middle East, Serbia and Montenegro
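The warning above means those three region names get an NA continent. If you wanted to assign them by hand, one option is countrycode's custom_match argument; a minimal sketch (the continent assignments here are my own assumptions, not part of the lecture code):
# optional: assign continents manually for the region names that did not match
world_bxo_data$continent = countrycode(sourcevar = world_bxo_data$region,
                                       origin = "country.name",
                                       destination = "continent",
                                       custom_match = c("Central America" = "Americas",
                                                        "Middle East" = "Asia",
                                                        "Serbia and Montenegro" = "Europe"))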
Most of the plot code below is review
There are a few new details:
shadowtext labels (see below)
modifying size of text elements (required trial and error)
NOTES:
The R package shadowtext includes the command geom_shadowtext
shadowtext is useful for creating visible labels for all countries regardless of color
Deciding on units ($1000) and transformation (log) took some trial and error.
# create asia box office dataset with location for name labels
asia_bxo_data <- world_bxo_data |>
filter(continent=="Asia") |>
mutate(wknd_gross = wknd_gross/1000)
# create dataset of country names with median lat and long for position
asia_nms <- asia_bxo_data |>
select(region, long, lat, group, continent) |>
group_by(continent, region) |>
summarize(nm_x=median(long, na.rm=T),
nm_y=median(lat, na.rm=T)) |>
filter(!is.na(nm_x) | !is.na(nm_y))
## `summarise()` has grouped output by 'continent'. You can override using the
## `.groups` argument.
# merge datasets using an inner_join
asia_bxo_data <- inner_join(asia_bxo_data, asia_nms)
## Joining, by = c("region", "continent")
# create Asia box office plot with country labels
(asia_bxo_map <- asia_bxo_data |>
ggplot(aes(x=long, y=lat,
group=group,
fill=wknd_gross)) +
geom_polygon() +
theme_map() +
coord_map("albers", lat0 = 39, lat1 = 45) +
labs(fill= "$1000",
title="Weekend Gross ($ Thousands) in Asian Countries",
subtitle="Weekend Ending April 10, 2022 - Data are Log-transformed",
caption="Data Source: https://www.boxofficemojo.com/") +
scale_fill_continuous(type = "viridis", trans="log",
breaks =c(1,10,100,1000,10000)) +
# add country labels with shadowtext
# size determined by trial and error
geom_shadowtext(aes(x=nm_x, y=nm_y,
label=region),
color="white",
check_overlap = T,
show.legend = F,
size=4) +
# adjust size of all map text
theme(plot.title = element_text(size = 25),
plot.subtitle = element_text(size = 20),
plot.caption = element_text(size = 20),
legend.text = element_text(size = 12),
legend.title = element_text(size = 20)))
If your project includes data that can be mapped, this exercise will be helpful
Details like text size, shadowtext, and data transformations can all greatly improve data clarity.
Copy and paste the R code from the previous chunk
Convert it to a map of Europe
This requires going through the code line by line and making the following changes:
change asia to euro
change “Asia” to “Europe”
country labels in the map may need to be slightly smaller
examine the map in the html (knitted) file and try size=3 if the current labels are too big
# create Europe box office dataset with location for name labels
euro_bxo_data <- world_bxo_data |>
filter(continent=="Europe") |>
mutate(wknd_gross = wknd_gross/1000)
# create dataset of country names with median lat and long for position
euro_nms <- euro_bxo_data |>
select(region, long, lat, group, continent) |>
group_by(continent, region) |>
summarize(nm_x=median(long, na.rm=T),
nm_y=median(lat, na.rm=T)) |>
filter(!is.na(nm_x) | !is.na(nm_y))
## `summarise()` has grouped output by 'continent'. You can override using the
## `.groups` argument.
# merge datasets using an inner_join
euro_bxo_data <- inner_join(euro_bxo_data, euro_nms)
## Joining, by = c("region", "continent")
# create Europe box office plot with country labels
(euro_bxo_map <- euro_bxo_data |>
ggplot(aes(x=long, y=lat,
group=group,
fill=wknd_gross)) +
geom_polygon() +
theme_map() +
coord_map("albers", lat0 = 39, lat1 = 45) +
labs(fill= "$1000",
title="Weekend Gross ($ Thousands) in European Countries",
subtitle="Weekend Ending April 10, 2022",
caption="Data Source: https://www.boxofficemojo.com/") +
scale_fill_continuous(type = "viridis") +
# add country labels with shadowtext
# size determined by trial and error
geom_shadowtext(aes(x=nm_x, y=nm_y,
label=region),
color="white",
check_overlap = T,
show.legend = F,
size=3) +
# adjust size of all map text
theme(plot.title = element_text(size = 20),
plot.subtitle = element_text(size = 15),
plot.caption = element_text(size = 15),
legend.text = element_text(size = 10),
legend.title = element_text(size = 15)))
# create Europe box office dataset with location for name labels
euro_bxo_data <- world_bxo_data |>
filter(continent=="Europe") |>
mutate(wknd_gross = wknd_gross/1000)
# create dataset of country names with median lat and long for position
euro_nms <- euro_bxo_data |>
select(region, long, lat, group, continent) |>
group_by(continent, region) |>
summarize(nm_x=median(long, na.rm=T),
nm_y=median(lat, na.rm=T)) |>
filter(!is.na(nm_x) | !is.na(nm_y))
## `summarise()` has grouped output by 'continent'. You can override using the
## `.groups` argument.
# merge datasets using an inner_join
euro_bxo_data <- inner_join(euro_bxo_data, euro_nms)
## Joining, by = c("region", "continent")
# create Europe box office plot with country labels
(euro_bxo_map <- euro_bxo_data |>
ggplot(aes(x=long, y=lat,
group=group,
fill=wknd_gross)) +
geom_polygon() +
theme_map() +
coord_map("albers", lat0 = 39, lat1 = 45) +
labs(fill= "$1000",
title="Weekend Gross ($ Thousands) in European Countries",
subtitle="Weekend Ending April 10, 2022 - Data are Log-transformed",
caption="Data Source: https://www.boxofficemojo.com/") +
scale_fill_continuous(type = "viridis", trans="log",
breaks =c(1,10,100,1000,5000)) +
# add country labels with shadowtext
# size determined by trial and error
geom_shadowtext(aes(x=nm_x, y=nm_y,
label=region),
color="white",
check_overlap = T,
show.legend = F,
size=3) +
# adjust size of all map text
theme(plot.title = element_text(size = 20),
plot.subtitle = element_text(size = 15),
plot.caption = element_text(size = 15),
legend.text = element_text(size = 10),
legend.title = element_text(size = 15)))
On Tuesday (4/19), a student asked me about creating state maps
Together (during office hours) we drafted a map of their project data.
There were details that I knew would take more time…
Yesterday, I tinkered…
Creating a composed visualization takes time
This code could be useful for any project that has data by state.
For example:
Average costs and expenditures by state
Demographics
Voting records
Sports/Arts/Entertainment investments and expenditures
etc.
state_stats is a great dataset in the usdata package
State labels were created using the abbreviations (abbr) from state_stats
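If you want a quick look at what state_stats contains before using it (assuming the packages loaded above):
# quick look at the variables in the usdata state_stats dataset
glimpse(state_stats)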
# state polygons (from R)
us_states <- map_data("state") |>
select(long:region) |>
rename("state" = "region")
# many useful variables in this dataset
state_abbr <- state_stats |>
select(state, abbr) |>
mutate(state = tolower(state))
# data by county (aggregated by state)
state_pop <- county_2019 |>
select(state, pop) |>
mutate(state=tolower(state),
popM = pop/1000000) |>
group_by(state) |>
summarize(st_popM = sum(popM, na.rm=T)) |>
full_join(state_abbr)
## Joining, by = "state"
# used left join because the lat and long info
# is missing for Hawaii and Alaska
statepop_map <- left_join(us_states, state_pop)
## Joining, by = "state"
Using the median of each state's lat and long data did not work
Instead I googled ‘state midpoint lat and long’ and found this website (the URL is in the code comment below)
Copied the data and pasted it into Excel
Saved it as a .csv file named state_coords.csv (included)
state_coords did not include DC, so I googled that too and added it.
Added state_coords to state_match to verify agreement and added DC there (state_match_check)
Final dataset for plot created: statepop_map
# https://www.latlong.net/category/states-236-14.html
state_coords <- read_csv("state_coords.csv", show_col_types = F,
col_names = c("state", "m_lat", "m_long")) |>
mutate(state = gsub(", USA", "", state, fixed=T),
state = gsub(", the USA", "", state, fixed=T),
state = gsub(", the US", "", state, fixed=T),
state = tolower(state))
# save values for dc
state <- "district of columbia"
m_lat <- 38.9072
m_long <- -77.0369
# create dc dataset with 1 observation
dc <- tibble(state, m_lat, m_long)
# add dc to state_coords
state_coords <- bind_rows(state_coords, dc)
# remove dc values and tibble from Global Environment
rm(dc, state, m_lat, m_long)
statepop_map <- left_join(statepop_map, state_coords)
## Joining, by = "state"
Similar to plots from Tuesday with a few changes:
Added borders to states by adding color="darkgrey" to the geom_polygon command.
Used state abbreviations for state labels.
Made state text labels smaller (size = 2)
Changed breaks for the log-scaled population legend
These details seem minor but they take time and trial and error.
There was an engagement question asking for additional clarification on the log transformation
I show both the Europe and US maps with and without it to clarify the benefit
If you have right-skewed data, a log transformation is very helpful.
Reminder: log in R is LN, the natural log (a quick check follows below)
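A quick check of that reminder, plus the base-10 alternative if you ever need it:
# log() in R is the natural log (base e); use log10() or log2() for other bases
log(exp(1))   # 1 (natural log)
log(100)      # about 4.6052
log10(100)    # 2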
# plot of un-logged data
# no transformation or breaks statement in the fill scale
(st_pop <- statepop_map |>
ggplot(aes(x=long, y=lat, group=group, fill=st_popM)) +
geom_polygon(color="darkgrey") +
theme_map() +
coord_map("albers", lat0 = 39, lat1 = 45) +
scale_fill_continuous(type = "viridis")+
theme(legend.position = "bottom",
legend.key.width = unit(1, "cm")) +
# add state abbreviations with shadowtext
# size determined by trial and error
geom_shadowtext(aes(x=m_long, y=m_lat,
label=abbr),
color="white",
check_overlap = T,
show.legend = F,
size=2) +
labs(fill= "Pop. in Millions", title="Population by State",
subtitle="Unit is 1 Million People",
caption= "Not Shown: HI: 1.42 Million AK: 0.74 Million
Data Source: https://CRAN.R-project.org/package=usdata"))
# plot of logged data
# transformation and breaks statement added
(st_lpop <- statepop_map |>
ggplot(aes(x=long, y=lat, group=group, fill=st_popM)) +
geom_polygon(color="darkgrey") +
theme_map() +
coord_map("albers", lat0 = 39, lat1 = 45) +
scale_fill_continuous(type = "viridis", trans="log",
breaks=c(0,1,2,3,5,10,20,30))+
theme(legend.position = "bottom",
legend.key.width = unit(1, "cm")) +
# add state abbreviations with shadowtext
# size determined by trial and error
geom_shadowtext(aes(x=m_long, y=m_lat,
label=abbr),
color="white",
check_overlap = T,
show.legend = F,
size=2) +
labs(fill= "Pop. in Millions", title="Population by State",
subtitle="Unit is 1 Million People - Date are Log-transformed",
caption= "Not Shown: HI: 1.42 Million AK: 0.74 Million
Data Source: https://CRAN.R-project.org/package=usdata"))
At this point, much of this should be review
Next week, I will spend a little time walking you through a consulting project
Project involved segmenting and rejoining data in multiple ways
Documentation is key
Take good notes
I use a Markdown notebook for documenting work that I will not show to a client (not to be printed)
For your project:
code_data_output folder should have:
all raw .csv files needed
any .png or other graphics files needed
dashboard.Rmd file
dashboard.html file
Outer folder should have
code_data_output folder (see above)
Complete and accurate README.txt file
.Rproj file that is functional
Other files can be stored in the project's outer folder if needed
Examples:
HW5 - Part 1 Solution
This lecture
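A sketch of the expected layout (the data and graphics file names are placeholders; your .Rproj file will have your project's name):
project_folder/                # outer folder
  project.Rproj                # functional R project file
  README.txt                   # complete and accurate
  code_data_output/
    dashboard.Rmd
    dashboard.html
    raw_data_1.csv             # all raw .csv files needed (placeholder name)
    figure_1.png               # any graphics files needed (placeholder name)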
If you want to publish your dashboard or any HTML file you create in R, you can do so for free.
R has a public online repository called RPubs.
I am not requiring students to use it for their projects, but it is useful if you want to post something online and provide the link to it.
As an in class exercise, I am asking you each to create an account and publish your HW 5 dashboard file (see below).
Open your HW 5 - Part 1.Rmd file and knit it to create your dashboard.
Make sure this file has your name in the header.
It is okay if you haven’t revised it yet for HW 5 - Part 2
Click the Rpubs icon, create a free account, and publish your html file.
Yes, submit the link to your published file on Blackboard.
A Link to your published file must be submitted by Friday 4/22/22 at midnight to count for class participation today.
Ask me questions about your project (Others may benefit)
I have 1-2 Examples
I will also demo knitting R Markdown to different formats:
Word document with Table of Contents
Powerpoint presentation
NOTE: HTML is ideal and flexible (and can easily be published online), but other formats are sometimes appropriate.
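If you prefer to knit from the console instead of the Knit button, a minimal sketch (the file name is a placeholder):
# render one R Markdown file to different output formats
library(rmarkdown)
render("my_report.Rmd", output_format = word_document(toc = TRUE))   # Word document with a table of contents
render("my_report.Rmd", output_format = powerpoint_presentation())   # PowerPoint presentation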
Now that you are (almost) done with BUA 455, and even more so when you graduate, you have a very useful set of skills
I will spend a little time talking about how to explain those skills to other people
Preview: It took me decades to figure out how to talk about what I do.