# this chunk sets default options for all R chunks
knitr::opts_chunk$set(echo=T, highlight=T)
## Setup ====
# suppress scientific notation
# suppress getSymbols warning
options(scipen=100,
"getSymbols.warning4.0"=FALSE)
# install and load packages we'll need
if (!require("pacman")) install.packages("pacman", repos = "http://cran.us.r-project.org")
## Loading required package: pacman
p_load(tidyverse, ggthemes, magrittr, lubridate, gridExtra,
flexdashboard, knitr, RColorBrewer, maps, usdata, mapproj,
shadowtext)
# tidyverse - a large suite of packages that work together
# ggthemes - smaller add-on for tidyverse graphics package, ggplot2
# magrittr - needed for piping
# lubridate - needed for dealing with dates
# gridExtra - useful for making grids of plots in ggplot
# flexdashboard - allows for creating a dashboard html file from an R Markdown file
# knitr - needed for creating tables using kable
# RColorBrewer - needed for modifying color palette
# maps - a package with map data by country, us state, and us county
# usdata - us census information
# mapproj - useful for including map projections in plots
# shadowtext - useful for creating readable text labels on maps and plots.
# verify packages
# remove # in front of p_loaded if needed
# p_loaded()
Only ONE group was in agreement about preferring to go on Tuesday.
Most groups do not want to go next week.
Some group responses were baffling.
This first chunk creates the datasets we will use for the plot.
# state2abbr is in usdata package
# map_data is in maps package
us_states <- map_data("state") |>
select(long, lat, group, order, region) |>
rename("state" = "region",
"lon" = "long") |>
mutate(st_abbr = state2abbr(state))
# filter data to states of interest by abbreviation
us_west <- us_states |>
filter(st_abbr %in% c("OR", "CA", "WA", "ID", "NV", "AZ")) |>
glimpse()
## Rows: 1,749
## Columns: 6
## $ lon <dbl> -114.6374, -114.6431, -114.6030, -114.5744, -114.5858, -114.59~
## $ lat <dbl> 35.01918, 35.10512, 35.12231, 35.17961, 35.23690, 35.28274, 35~
## $ group <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,~
## $ order <int> 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 21~
## $ state <chr> "arizona", "arizona", "arizona", "arizona", "arizona", "arizon~
## $ st_abbr <chr> "AZ", "AZ", "AZ", "AZ", "AZ", "AZ", "AZ", "AZ", "AZ", "AZ", "A~
# create separate data set of state abbreviations at median lat and lon of each state
us_west_nms <- us_west |>
select(st_abbr, lon, lat, group) |>
group_by(st_abbr) |>
summarize(lon=median(lon, na.rm=T),
lat=median(lat, na.rm=T))
# import college data
# create state abbreviation
# filter rows and select variables
college <- read.csv("college.csv") |>
rename("st_abbr" = "state") |>
filter(st_abbr %in% c("OR", "CA", "WA", "ID", "NV", "AZ")) |>
select(name, city, st_abbr, lat, lon, undergrads) |>
# create factor variable (optional but may be useful)
mutate(ug_size_F = ifelse(undergrads < 5000, "< 5K", NA),
ug_size_F = ifelse(undergrads >= 5000 & undergrads < 10000, "5K - 10K", ug_size_F),
ug_size_F = ifelse(undergrads >= 10000 & undergrads < 20000, "10K - 20K", ug_size_F),
ug_size_F = ifelse(undergrads >= 20000 & undergrads < 30000, "20K - 30K", ug_size_F),
ug_size_F = ifelse(undergrads >= 30000, "> 30K", ug_size_F),
# specifies order of factor variable with levels statement
ug_size_F = factor(ug_size_F, levels = c("< 5K", "5K - 10K", "10K - 20K",
"20K - 30K", "> 30K"))) |>
glimpse()
## Rows: 122
## Columns: 7
## $ name <chr> "Arizona State University-Polytechnic", "Arizona State Univ~
## $ city <chr> "Mesa", "Glendale", "Phoenix", "Tempe", "Phoenix", "Prescot~
## $ st_abbr <chr> "AZ", "AZ", "AZ", "AZ", "AZ", "AZ", "CA", "CA", "CA", "CA",~
## $ lat <dbl> 33.41518, 33.53865, 33.44838, 33.42551, 33.44838, 34.54002,~
## $ lon <dbl> -111.8315, -112.1860, -112.0740, -111.9400, -112.0740, -112~
## $ undergrads <int> 3726, 3280, 9113, 39316, 620, 454, 1635, 983, 7018, 1293, 2~
## $ ug_size_F <fct> < 5K, < 5K, 5K - 10K, > 30K, < 5K, < 5K, < 5K, < 5K, 5K - 1~
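As an aside, the chained ifelse() statements above can be collapsed into a single cut() call. This is just a sketch of an equivalent approach, assuming the same breakpoints and labels:
# cut() builds the same ordered factor in one call
# right = FALSE makes each interval closed on the left, e.g. [5000, 10000)
college <- college |>
  mutate(ug_size_F = cut(undergrads,
                         breaks = c(-Inf, 5000, 10000, 20000, 30000, Inf),
                         labels = c("< 5K", "5K - 10K", "10K - 20K",
                                    "20K - 30K", "> 30K"),
                         right = FALSE))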
# summarize data to find number of colleges in each state
college_smry <- college |>
group_by(st_abbr) |>
summarize(no_colleges = n())
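Equivalently, dplyr's count() collapses the group_by()/summarize() pair into one call; a minimal sketch:
# same counts in one call; name = sets the count column's name
college_smry <- college |>
  count(st_abbr, name = "no_colleges")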
# merge college counts with us_west data to add fill data to state polygons
us_west <- full_join(us_west, college_smry) |>
glimpse()
## Joining, by = "st_abbr"
## Rows: 1,749
## Columns: 7
## $ lon <dbl> -114.6374, -114.6431, -114.6030, -114.5744, -114.5858, -11~
## $ lat <dbl> 35.01918, 35.10512, 35.12231, 35.17961, 35.23690, 35.28274~
## $ group <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2~
## $ order <int> 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215~
## $ state <chr> "arizona", "arizona", "arizona", "arizona", "arizona", "ar~
## $ st_abbr <chr> "AZ", "AZ", "AZ", "AZ", "AZ", "AZ", "AZ", "AZ", "AZ", "AZ"~
## $ no_colleges <int> 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6~
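The "Joining, by = 'st_abbr'" message appears because full_join() guessed the join key from the shared column name. Supplying by explicitly documents the merge and silences the message:
# explicit join key; same result, no guessing message
us_west <- full_join(us_west, college_smry, by = "st_abbr")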
This second chunk creates the map plot with multiple layers of data.
# layer 1 is a polygon layer of the 6 western states
# fill is the number of colleges
(us_west_colleges <- ggplot(data=us_west) +
geom_polygon(aes(x=lon, y=lat,
group=group,
fill=no_colleges)) +
# layer 2 is a scatterplot of colleges with point size showing the number of undergrads
# can use original numeric data or created factor variable
# alpha determines transparency
geom_point(data=college, aes(x=lon, y=lat,
size=ug_size_F), col="lightgreen", alpha=.33) +
# layer 3 add labels for each state using shadowtext
geom_shadowtext(data=us_west_nms, aes(x=lon, y=lat, label=st_abbr),
color="white", check_overlap = T) +
# theme_map() is useful for map plots
theme_map() +
# coord_map projects the lat and lon data correctly
coord_map("albers", lat0 = 39, lat1 = 45) +
# viridis changes the color scheme
scale_fill_continuous(type = "viridis") +
# legend.position = "right" moves the legend to the right
theme(legend.position = "right") +
# labs statement labels all components from ALL map layers
labs(fill="Number of Colleges",
size="Number of Undergrads",
title="Map of Colleges in Western States",
subtitle="Example of Map with Multiple Data Layers",
caption="Data Sources: R and LinkedIn Learning"))
## Warning: Using size for a discrete variable is not advised.
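This warning appears because size is mapped to the factor ug_size_F. As the comments in the chunk note, the numeric data also work; one way to avoid the warning is to swap layer 2 for a version that maps size to the numeric undergrads column (a drop-in replacement, with the rest of the plot unchanged):
# mapping size to the numeric variable gives a continuous size legend
geom_point(data=college, aes(x=lon, y=lat,
           size=undergrads), col="lightgreen", alpha=.33) +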
***
The United States geographic data is from the R package maps.
The United States abbreviations command (state2abbr) is from the R package usdata.
A note about the ‘College’ data: college.csv is a dataset that was created by Professor Mike Chapple (University of Notre Dame) from publicly available data sources.
His LinkedIn Learning course was recorded in 2018, but most of the concepts and code are still very useful.
Professor Chapple explains the concepts in a straightforward way and goes into some plot formatting details that are difficult to find information on.
The project description has detailed guidance on what is expected for each component of the project. If you have additional questions, please let me know.
I have created two basic templates for the memos which, when combined with the project description, should provide sufficient guidance.
On Tuesday I covered how to find citation information for R and R packages, but I did not mention RStudio.
Below, I review how to cite R, R packages, and RStudio.
# This chunk header includes eval=FALSE to prevent an error. See below.
# citing R
citation()
# citing a specific package like maps
citation("maps")
# citing RStudio
# This code can be run from R Markdown but will cause an error
# when you knit the R Markdown file.
RStudio.Version()
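If you keep references in BibTeX, utils::toBibtex() converts a citation object into an entry you can paste into a .bib file:
# BibTeX versions of the same citations
toBibtex(citation())
toBibtex(citation("maps"))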
Some topics, like web scraping, were not feasible for this course this semester.
You should be aware of the terminology and know that web scraping is reasonable to do, but it requires a little more coding knowledge and patience.
Web scraping means using code to ‘scrape’ data from the HTML source of a website where data are displayed.
You can see the HTML code for any website by typing Ctrl+Shift+C (may be Cmd+Shift+C for Macs).
Here is a tutorial that walks you through the web scraping process.
Here is an Rpubs demo based on the tutorial.
Sports data analysts commonly use web scraping, and a graduate class here at SU, in the Falk MS curriculum, covers it.
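For the curious, here is a minimal web scraping sketch using the rvest package. The URL is a hypothetical placeholder; swap in the page you actually want, and check the site's terms of use first:
# read the page's HTML, then pull out any tables it contains
p_load(rvest)
page <- read_html("https://example.com/some-page-with-a-table")
# html_table() returns every HTML table on the page as a list of tibbles
tables <- html_table(page)
tables[[1]]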
If you want to publish your dashboard or any HTML file you create in R, you can do so for free.
R has a public online repository called RPubs.
I am not requiring students to use it for their projects, but it is useful if you want to post something online and provide the link to it.
As a final in-class exercise, I am asking you each to create an account and publish one html file (see below).
Select any HTML file you have created in this class from an R Markdown file.
The file should be one that has your name on it.
Any of the html files you created in HW 3, HW 4, or HW 5 will work.
Click the RPubs icon, create a free account, and publish your html file.
Submit the link to your published file on Blackboard.
A link to your published file (any html file with your name) must be submitted by Tuesday, 12/7/21, at midnight to count for class participation.
NOTES:
You are welcome and encouraged (but not required) to publish your group project dashboard on RPubs.
This may be helpful (or not) when presenting your dashboard to the class.
If you are going to go further in data science, you are likely to spend some time on GitHub, which is free and useful.
GitHub is an online code sharing and code development platform.
Many R packages start as development code on GitHub and over time they are refined and published.
Some of you may have already found data or code there.
Once you create a free account, you can learn more about how it works in this tutorial.
Two great resources for expanding your skills are:
Not free, but well worth the subscription fee if you want to learn new skills.
DataCamp has also published an excellent paper that I show to my students to give them a framework for describing their analytical skills to potential employers.
I will give students time during class next week to complete evaluations and I will send out multiple emails.
Evaluations for this course (and all courses) are very important so that we can continue to improve the material.
We are grateful for your insights and help.