# Default options applied to every R chunk in this document:
# echo the code and syntax-highlight it.
knitr::opts_chunk$set(
  echo = TRUE,
  highlight = TRUE
)

# Session options.
#   scipen = 100 suppresses scientific notation in printed numbers.
#   getSymbols.warning4.0 = FALSE -- NOTE(review): presumably silences the
#   quantmod getSymbols() notice; quantmod is not loaded in the visible code,
#   so confirm this option is still needed.
options(
  scipen = 100,
  getSymbols.warning4.0 = FALSE
)

# Install the helper package (pacman) only if it is missing.
# requireNamespace() only checks availability and does not attach the package;
# pacman is attached later by pacman::p_load().
# The package is downloaded over https rather than plain http.
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman", repos = "https://cloud.r-project.org")
}
# Install (when missing) and attach every package used in these notes.
# pacman itself is listed first, followed by all other packages.
pacman::p_load(
  char = c(
    "pacman", "tidyverse", "ggthemes", "magrittr", "knitr", "lubridate",
    "gridExtra", "RColorBrewer", "flexdashboard", "maps", "usdata",
    "countrycode", "mapproj", "shadowtext", "grid"
  )
)

# List the packages that are currently attached
# (comment this call out in finished documents).
pacman::p_loaded()
## [1] "shadowtext" "mapproj" "countrycode" "usdata"
## [5] "maps" "flexdashboard" "RColorBrewer" "gridExtra"
## [9] "knitr" "magrittr" "ggthemes" "lubridate"
## [13] "forcats" "stringr" "dplyr" "purrr"
## [17] "readr" "tidyr" "tibble" "ggplot2"
## [21] "tidyverse" "pacman"
HW 5 - Part 2 is due Wednesday, 4/19.
For Question 9, you should document steps to update Panel 6.
Data for this panel are posted on Blackboard for HW 5 - Part 2
There is a 2 day grace period, if needed.
Quiz 2 grading is progressing (I hope to be done this weekend).
Projects should be progressing as well.
If you have data management or project management questions, get in touch with me or course TAs ASAP.
Plan out tasks and how to accomplish them.
We are here to help with tasks where you might be stymied, but don’t wait until the last day.
Presentations will be on 4/25 and 4/27
One day off from standard schedule so today’s notes represent two lectures.
Notes are streamlined and I will not cover them all in detail
Rather than deleting notes and code that might be useful to some students all notes are provided.
There were a few questions about formatting so I added formatting details to plots
There are many options for formatting depending on data and visualization goal
Also see: Final Side-by-Side Boxplots and Code from Week 2 (Slide 13)
# World polygon coordinates (one row per polygon vertex).
# The subregion column is not used in these notes, so it is dropped.
world <- map_data("world") |>
  select(!subregion)

# International box office data: import and tidy.
#   * skip = 7 skips the first seven lines of the file.
#   * Only the Area and Weekend Gross columns are kept; they are selected by
#     name (and renamed in the same step) instead of by position.
#   * "$" and "," are stripped so the gross can be converted to a number.
#     Values that still cannot be converted become NA (as.numeric() warns).
#   * The NA filter runs AFTER the conversion so that both originally missing
#     values and values that failed to convert are removed here.
intbxo <- read_csv("intl_bxo.csv", show_col_types = FALSE, skip = 7) |>
  select(region = `Area`, wknd_gross = `Weekend Gross`) |>
  mutate(
    wknd_gross = gsub("$", "", wknd_gross, fixed = TRUE),
    wknd_gross = gsub(",", "", wknd_gross, fixed = TRUE) |> as.numeric()
  ) |>
  filter(!is.na(wknd_gross))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `wknd_gross = as.numeric(gsub(",", "", wknd_gross, fixed = T))`.
## Caused by warning:
## ! NAs introduced by coercion
# Join the box office data to the world polygon data.
# The join key is stated explicitly so the join cannot silently change if
# either dataset later gains another column with a shared name.
# A left join keeps every box office row, including regions that have no
# matching polygons (their long/lat/group/order values are NA).
world_bxo_data <- left_join(intbxo, world, by = join_by(region)) |>
  filter(!is.na(wknd_gross))

# Look up the continent for each region name.
# Region names that countrycode cannot match receive NA and are listed in the
# warning below.
world_bxo_data$continent <- countrycode(
  sourcevar = world_bxo_data$region,
  origin = "country.name",
  destination = "continent"
)
## Warning: Some values were not matched unambiguously: Central America, Middle East Other, Serbia and Montenegro, West Indies

# Preview the first three rows of the joined data.
head(world_bxo_data, 3)
## # A tibble: 3 × 7
## region wknd_gross long lat group order continent
## <chr> <dbl> <dbl> <dbl> <dbl> <int> <chr>
## 1 Mexico 21797271 -91.7 18.7 970 60731 Americas
## 2 Mexico 21797271 -91.8 18.7 970 60732 Americas
## 3 Mexico 21797271 -91.8 18.7 970 60733 Americas
Most of the plot code that follows is review
There are a few new details:
shadowtext
labels (see below)
modifying size of text elements (mentioned but not emphasized)
NOTES:
The R package shadowtext includes the command geom_shadowtext.
shadowtext is useful for creating visible labels for all countries regardless of color.
Deciding on units ($1000) and transformation (log) took some trial and error.
# Asia box office dataset.
#   Gross      = weekend gross converted to an integer (as.integer() truncates
#                any fractional part)
#   wknd_gross = weekend gross rescaled to thousands
asia_bxo_data <- world_bxo_data |>
  filter(continent == "Asia") |>
  mutate(
    Gross = as.integer(wknd_gross),
    wknd_gross = wknd_gross / 1000
  )

# One row per country with the position used for its text label:
# the median longitude and latitude of the country's polygon vertices.
#   * .groups = "drop" returns an ungrouped result (and avoids the grouping
#     message from summarize()).
#   * A label needs BOTH coordinates, so rows are kept only when nm_x AND nm_y
#     are non-missing.
asia_nms <- asia_bxo_data |>
  group_by(continent, region) |>
  summarize(
    nm_x = median(long, na.rm = TRUE),
    nm_y = median(lat, na.rm = TRUE),
    .groups = "drop"
  ) |>
  filter(!is.na(nm_x) & !is.na(nm_y))

# Add the label positions to the map data.
# An inner join also removes countries that have no label position.
asia_bxo_data <- inner_join(
  asia_bxo_data,
  asia_nms,
  by = join_by(region, continent)
)
# Choropleth map of weekend gross (in $ thousands) for Asian countries.
#   * Fill uses the viridis palette on a log-transformed scale.
#   * Country names are drawn with geom_shadowtext() at the median positions
#     (nm_x, nm_y); check_overlap = TRUE skips labels that would overlap.
#   * The plot border uses `linewidth`, which replaced the deprecated `size`
#     argument of element_rect() in ggplot2 3.4.0.
asia_bxo_map <- asia_bxo_data |>
  ggplot(aes(x = long, y = lat, group = group, fill = wknd_gross)) +
  geom_polygon() +
  theme_map() +
  coord_map("albers", lat0 = 39, lat1 = 45) +
  labs(
    fill = "Gross ($1K)",
    title = "Weekend Gross ($ Thousands) in Asian Countries",
    subtitle = "Weekend Ending 4/9/2023 - Data are Log-transformed",
    caption = "Data Source: https://www.boxofficemojo.com"
  ) +
  scale_fill_continuous(
    type = "viridis",
    trans = "log",
    breaks = c(1, 10, 100, 1000, 10000)
  ) +
  geom_shadowtext(
    aes(x = nm_x, y = nm_y, label = region),
    color = "white",
    check_overlap = TRUE,
    show.legend = FALSE,
    size = 4
  ) +
  theme(
    plot.title = element_text(size = 20),
    plot.subtitle = element_text(size = 15),
    plot.caption = element_text(size = 10),
    legend.text = element_text(size = 12),
    legend.title = element_text(size = 15),
    # adds a border around the whole plot
    plot.background = element_rect(colour = "darkgrey", fill = NA, linewidth = 2)
  )
Creates data for Europe Map
# Europe box office dataset (Russia is excluded).
#   Gross      = weekend gross converted to an integer (as.integer() truncates
#                any fractional part)
#   wknd_gross = weekend gross rescaled to thousands
euro_bxo_data <- world_bxo_data |>
  filter(continent == "Europe", region != "Russia") |>
  mutate(
    Gross = as.integer(wknd_gross),
    wknd_gross = wknd_gross / 1000
  )

# One row per country with the position used for its text label:
# the median longitude and latitude of the country's polygon vertices.
#   * .groups = "drop" returns an ungrouped result (and avoids the grouping
#     message from summarize()).
#   * A label needs BOTH coordinates, so rows are kept only when nm_x AND nm_y
#     are non-missing.
euro_nms <- euro_bxo_data |>
  group_by(continent, region) |>
  summarize(
    nm_x = median(long, na.rm = TRUE),
    nm_y = median(lat, na.rm = TRUE),
    .groups = "drop"
  ) |>
  filter(!is.na(nm_x) & !is.na(nm_y))

# Add the label positions to the map data.
# An inner join also removes countries that have no label position.
euro_bxo_data <- inner_join(
  euro_bxo_data,
  euro_nms,
  by = join_by(region, continent)
)
Students will create this map, or an interactive one, by adapting the Asia Map code
Examples of Data that can be plotted by state
Average costs and expenditures by state of specific goods or services
Demographic data
Voting and tax information
Sports/Arts/Entertainment/Education investments and expenditures
Will also show a map of data filtered by region
# State polygons (from R): keep the columns long through region and rename
# region to state so it can serve as the join key.
us_states <- map_data("state") |>
  select(long:region) |>
  rename(state = region)

# State abbreviations (state_stats has many other useful variables).
# State names are lower-cased to match the polygon data.
state_abbr <- state_stats |>
  select(state, abbr) |>
  mutate(state = tolower(state))

# County populations aggregated to state totals, in millions,
# with the state abbreviation added.
state_pop <- county_2019 |>
  select(state, pop) |>
  mutate(
    state = tolower(state),
    popM = pop / 1000000
  ) |>
  group_by(state) |>
  summarize(st_popM = sum(popM, na.rm = TRUE)) |>
  full_join(state_abbr, by = join_by(state))

# A left join from the polygon data filters to the lower 48 states:
# lat/long are not available for HI and AK.
# Join keys are stated explicitly so the joins cannot silently change if a
# dataset later gains another column with a shared name.
statepop_map <- left_join(us_states, state_pop, by = join_by(state))
In the previous maps (by country) country labels were added to the static map using each polygon’s (country) median latitude and longitude
Medians don’t work well for U.S. because many states are oddly shaped and small.
Alternative: use centroid for each state polygon
Centroid is another term for midpoint
Saved data as .csv file named state_coords.csv
(included)
Data did not include D.C. but those coordinates were found elsewhere
Added state_coords to state_match to verify agreement and added DC there.
New dataset created: state_match_check
Final dataset for plot created:
state2019pop_map
# State midpoints (centroids).
#   * Column names are supplied explicitly via col_names.
#   * The ", USA" / ", the USA" / ", the US" suffixes are removed and the names
#     are lower-cased so they match the state names in statepop_map.
#   * The D.C. row is appended directly as a one-row tibble. No temporary
#     objects are created in (or removed from) the global environment, so an
#     existing object named state, m_lat, or m_long is never overwritten.
state_coords <- read_csv(
  "state_coords.csv",
  show_col_types = FALSE,
  col_names = c("state", "m_lat", "m_long")
) |>
  mutate(
    state = gsub(", USA", "", state, fixed = TRUE),
    state = gsub(", the USA", "", state, fixed = TRUE),
    state = gsub(", the US", "", state, fixed = TRUE),
    state = tolower(state)
  ) |>
  bind_rows(
    tibble(
      state = "district of columbia",
      m_lat = 38.9072,
      m_long = -77.0369
    )
  )

# Add the centroids to the map data (explicit join key).
statepop_map <- left_join(statepop_map, state_coords, by = join_by(state))
Similar to plots from Tuesday with a few changes
Added borders to states by adding color="darkgrey" to the geom_polygon command.
Used State abbreviations for state labels.
Made State text labels smaller (Size = 2)
Changed breaks for log scaled population legend
These details seem minor but they take time and trial and error.
# State population map (millions of people, untransformed scale).
#   * States are outlined in dark grey.
#   * State abbreviations are drawn at each state's centroid (m_long, m_lat);
#     check_overlap = TRUE skips labels that would overlap.
#   * The plot border uses `linewidth`, which replaced the deprecated `size`
#     argument of element_rect() in ggplot2 3.4.0.
st_pop <- statepop_map |>
  ggplot(aes(x = long, y = lat, group = group, fill = st_popM)) +
  geom_polygon(color = "darkgrey") +
  theme_map() +
  coord_map("albers", lat0 = 39, lat1 = 45) +
  scale_fill_continuous(type = "viridis") +
  geom_shadowtext(
    aes(x = m_long, y = m_lat, label = abbr),
    color = "white",
    check_overlap = TRUE,
    show.legend = FALSE,
    size = 4
  ) +
  labs(
    fill = "Pop. in Millions",
    title = "Population by State",
    subtitle = "Unit is 1 Million People",
    caption = "Not Shown: HI: 1.42 Million AK: 0.74 Million\nData Source: https://CRAN.R-project.org/package=usdata"
  ) +
  theme(
    legend.position = "bottom",
    legend.key.width = unit(1, "cm"),
    plot.background = element_rect(colour = "darkgrey", fill = NA, linewidth = 2),
    plot.title = element_text(size = 20),
    plot.subtitle = element_text(size = 15),
    plot.caption = element_text(size = 15),
    legend.text = element_text(size = 15),
    legend.title = element_text(size = 15)
  )
# State population map (millions of people, log-transformed fill scale).
#   * Legend breaks start at 1: a break at 0 cannot be placed on a log scale
#     because log(0) is -Inf, so it has been removed.
#   * State abbreviations are drawn at each state's centroid (m_long, m_lat);
#     check_overlap = TRUE skips labels that would overlap.
#   * The plot border uses `linewidth`, which replaced the deprecated `size`
#     argument of element_rect() in ggplot2 3.4.0.
st_lpop <- statepop_map |>
  ggplot(aes(x = long, y = lat, group = group, fill = st_popM)) +
  geom_polygon(color = "darkgrey") +
  theme_map() +
  coord_map("albers", lat0 = 39, lat1 = 45) +
  scale_fill_continuous(
    type = "viridis",
    trans = "log",
    breaks = c(1, 2, 3, 5, 10, 20, 35)
  ) +
  geom_shadowtext(
    aes(x = m_long, y = m_lat, label = abbr),
    color = "white",
    check_overlap = TRUE,
    show.legend = FALSE,
    size = 4
  ) +
  labs(
    fill = "Pop. in Millions",
    title = "Population by State",
    subtitle = "Unit is 1 Million People - Log Transformed",
    caption = "Not Shown: HI: 1.42 Million AK: 0.74 Million\nData Source: https://CRAN.R-project.org/package=usdata"
  ) +
  theme(
    legend.position = "bottom",
    legend.key.width = unit(1, "cm"),
    plot.background = element_rect(colour = "darkgrey", fill = NA, linewidth = 2),
    plot.title = element_text(size = 20),
    plot.subtitle = element_text(size = 15),
    plot.caption = element_text(size = 15),
    legend.text = element_text(size = 15),
    legend.title = element_text(size = 15)
  )
In this class, we visualize data using ggplot, hchart, and dygraph.
If you want to explore (but not present) data, you can also use base graphics for quick plots
ggplot
# Quick exploratory histograms (base graphics) to check for right skew.
# statepop_map has one row per polygon vertex, so plotting its st_popM column
# directly would count each state once per vertex. The values are reduced to
# one per state first. local() keeps the helper objects out of the global
# environment.
local({
  st_popM_by_state <- unique(statepop_map[c("state", "st_popM")])$st_popM

  op <- par(mfrow = c(2, 1))          # stacks base graph plots
  hist(st_popM_by_state, main = "")
  hist(log(st_popM_by_state), main = "")
  par(op)                             # restores the previous layout setting
})
# Import the education-by-state data.
# The first three lines of the file are skipped and column names are supplied
# here instead.
edu <- read_csv(
  "education by state.csv",
  skip = 3,
  show_col_types = FALSE,
  col_names = c(
    "state", "pop_over_25", "pop_hs", "pct_hs",
    "pop_bachelor", "pct_bachelor",
    "pop_advanced", "pct_advanced"
  )
)

# Bachelor's degree data for ten northeastern states.
#   state          = trimmed and lower-cased state name
#   pop_bachelor1K = bachelor's degree population in thousands
#   pct_bachelor   = percentage with the "%" sign removed, as a number
# glimpse() prints a summary and passes the data through to edu1.
edu1 <- edu |>
  select(state, pop_bachelor, pct_bachelor) |>
  mutate(
    state = tolower(str_trim(state)),
    pop_bachelor1K = pop_bachelor / 1000,
    pct_bachelor = as.numeric(gsub("%", "", pct_bachelor, fixed = TRUE))
  ) |>
  filter(
    state %in% c(
      "maine", "massachusetts", "connecticut", "rhode island",
      "vermont", "new hampshire", "new york", "new jersey", "pennsylvania",
      "delaware"
    )
  ) |>
  glimpse()
## Rows: 10
## Columns: 4
## $ state <chr> "vermont", "rhode island", "pennsylvania", "new york", …
## $ pop_bachelor <dbl> 172272, 260275, 2917402, 5166218, 2551765, 368237, 2181…
## $ pct_bachelor <dbl> 38.66, 34.84, 32.31, 37.81, 41.22, 37.58, 44.98, 33.19,…
## $ pop_bachelor1K <dbl> 172.272, 260.275, 2917.402, 5166.218, 2551.765, 368.237…
TurningPoint Session ID: bua455s22
What exploratory plot command (base R code shown) is good for checking if the variable you want to plot is right skewed and might need to be log transformed?
Based on the histogram for the northeastern area of the U.S, which includes only 10 states, do these data appear skewed?
In the chunk below we start from scratch with state data. This chunk does not depend on the data being imported and managed in a previous chunk.
# State polygons (from R), with region renamed to state for joining.
us_states <- map_data("state") |>
  select(long:region) |>
  rename(state = region)

# State abbreviations, with lower-cased state names to match the other data.
state_abbr <- state_stats |>
  select(state, abbr) |>
  mutate(state = tolower(state))

# Left joins starting from edu1 maintain the filter to NE states.
# Every join names its key explicitly so it cannot silently change if a
# dataset later gains another column with a shared name.
edu1 <- left_join(edu1, state_abbr, by = join_by(state))
edu_NE_map <- left_join(edu1, us_states, by = join_by(state))

# State midpoints (centroids), re-imported from the .csv file.
# The ", USA" / ", the USA" / ", the US" suffixes are removed and the names
# are lower-cased so they match the state names in edu_NE_map.
state_coords <- read_csv(
  "state_coords.csv",
  show_col_types = FALSE,
  col_names = c("state", "m_lat", "m_long")
) |>
  mutate(
    state = gsub(", USA", "", state, fixed = TRUE),
    state = gsub(", the USA", "", state, fixed = TRUE),
    state = gsub(", the US", "", state, fixed = TRUE),
    state = tolower(state)
  )

# Add the centroids; the left join again maintains the filter to NE states.
edu_NE_map <- left_join(edu_NE_map, state_coords, by = join_by(state))
# NE states: population with a bachelor's degree (in 1000s),
# shown on a log-transformed viridis fill scale.
# State abbreviations are drawn at each state's centroid.
ne_edu_pop <- ggplot(
  data = edu_NE_map,
  mapping = aes(x = long, y = lat, group = group, fill = pop_bachelor1K)
) +
  geom_polygon(color = "darkgrey") +
  geom_shadowtext(
    mapping = aes(x = m_long, y = m_lat, label = abbr),
    color = "white",
    check_overlap = TRUE,
    show.legend = FALSE,
    size = 4
  ) +
  scale_fill_continuous(
    type = "viridis",
    trans = "log",                      # log transformation
    breaks = c(100, 500, 1000, 5000)
  ) +
  coord_map("albers", lat0 = 39, lat1 = 45) +
  labs(
    fill = "Unit: 1000 People",
    title = "NE States: Pop. with a Bachelor's Degree"
  ) +
  theme_map() +
  theme(
    legend.position = "bottom",
    legend.key.width = unit(1, "cm"),
    plot.title = element_text(size = 20),
    plot.subtitle = element_text(size = 15),
    plot.caption = element_text(size = 15),
    legend.text = element_text(size = 15),
    legend.title = element_text(size = 15)
  )
# NE states: percent of the population with a bachelor's degree.
# Percent data are plotted on an untransformed viridis fill scale.
# State abbreviations are drawn at each state's centroid.
ne_edu_pct <- ggplot(
  data = edu_NE_map,
  mapping = aes(x = long, y = lat, group = group, fill = pct_bachelor)
) +
  geom_polygon(color = "darkgrey") +
  geom_shadowtext(
    mapping = aes(x = m_long, y = m_lat, label = abbr),
    color = "white",
    check_overlap = TRUE,
    show.legend = FALSE,
    size = 4
  ) +
  scale_fill_continuous(
    type = "viridis",                   # no transformation needed
    breaks = c(32, 34, 36, 38, 40, 42, 44)
  ) +
  coord_map("albers", lat0 = 39, lat1 = 45) +
  labs(
    fill = "Unit: %",
    title = "NE States: Percent with a Bachelor's Degree"
  ) +
  theme_map() +
  theme(
    legend.position = "bottom",
    legend.key.width = unit(1, "cm"),
    plot.title = element_text(size = 20),
    plot.subtitle = element_text(size = 15),
    plot.caption = element_text(size = 15),
    legend.text = element_text(size = 15),
    legend.title = element_text(size = 15)
  )
Some of this should be review
Next week, we will talk about managing a long term consulting project
Managing files over time
Segmenting and rejoining poorly formatted data
Documenting steps as you progress
Addressing client needs as they evolve and update requests
Documentation is key
I use Markdown files for everything, even work I don’t present to a client.
All raw .csv files needed
NO data management should be done in Excel.
Any .png or other graphics files needed
Dashboard .rmd file (R Markdown file)
Dashboard .html file (Dashboard presentation)
NOTE: As you progress in data management (after this course), code, data, and output may be in separate folders
code_data_output folder (see above)
Complete and accurate README.txt file
.Rproj file that is functional
Other files can be stored in project Outer file if needed.
If other files are stored in this outer folder they must be catalogued in the README file
If you want to publish your dashboard or any HTML file you create in R, you can do so for free.
R has a public online repository called RPubs.
RPubs is very useful if you want to post an html file online and provide the link to it.
This is particularly useful for work like the project dashboards.
As an in class exercise, I will ask you each to create an account and publish your HW 5 - Part 1 dashboard html file
This exercise will be useful because it allows you to see how this publication process works
You will see how publishing changes the appearance of your panels and text.
Once you post your final dashboard you may want to include it as a link in your resume and/or LinkedIn profile.
1.
Open your HW 5 - Part 1.Rmd file and
knit it to create your dashboard.
Make sure this file has your name in the header.
It is okay if you haven’t revised it yet for HW 5 - Part 2
If you don’t have HW 5 - Part 1 done, you can use your html file from another HW Assignment, e.g. HW 4.
2.
Click the Rpubs icon, create a free
account, and publish your html file.
Yes
3.
Submit the link to your published
file on Blackboard.
4.
A link to your published file must
be submitted by Friday 4/14/23 at midnight to count for class
participation for today’s lecture.
Ask me questions about your project (Others may benefit)
I have some essential and some optional topics including
details and recommendations for writing both project memos.
managing a consulting project from beginning to end.
formatting complex tables using the gt
package.
Time permitting, I will also demonstrate and discuss
knitting R Markdown to different formats: word, Powerpoint, etc.
Next Generation RStudio publishing using Quarto
Now that you are (almost) done with BUA 455 and more so, when you graduate you have a very useful set of skills
Explaining these skills to others is a challenge
I will spend a little time talking about how to explain those skills to other people
Preview: It took me decades to figure out how to talk about what I do.
Increased interest in Data Science and Analytics has resulted in better terminology
.bg-azure.b--dark_cyan.ba.bw2.br3.shadow-5.ph2[
HW 5 - Part 2
More with Geographic Data
Project Management
Publishing Work on RPubs
You may submit an ‘Engagement Question’ about each lecture until midnight on the day of the lecture. A minimum of four submissions are required during the semester.