title: “R Week 07 Assignment” author: “William Cornejo” date: “3/22/2025” format: html: toc: true toc-location: left code-fold: true code-summary: “Show the code” code-tools: true —
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(stringr)
library(magrittr)
##
## Attaching package: 'magrittr'
##
## The following object is masked from 'package:purrr':
##
## set_names
##
## The following object is masked from 'package:tidyr':
##
## extract
library(sf)
## Linking to GEOS 3.11.2, GDAL 3.8.2, PROJ 9.3.1; sf_use_s2() is TRUE
library(sp)
library(classInt)
library(RColorBrewer)
library(ggmap)
## ℹ Google's Terms of Service: <https://mapsplatform.google.com>
## Stadia Maps' Terms of Service: <https://stadiamaps.com/terms-of-service/>
## OpenStreetMap's Tile Usage Policy: <https://operations.osmfoundation.org/policies/tiles/>
## ℹ Please cite ggmap if you use it! Use `citation("ggmap")` for details.
##
## Attaching package: 'ggmap'
##
##
## The following object is masked from 'package:magrittr':
##
## inset
library(leaflet)
library(ggrepel)
library(tmap)
library(mapview)
library(ggplot2)
library(ggpubr)
library(patchwork)
# For webscraping
library(rvest)
##
## Attaching package: 'rvest'
##
## The following object is masked from 'package:readr':
##
## guess_encoding
library(httr)
library(polite)
library(jsonlite)
##
## Attaching package: 'jsonlite'
##
## The following object is masked from 'package:purrr':
##
## flatten
library(spatstat)
## Loading required package: spatstat.data
## Loading required package: spatstat.univar
## spatstat.univar 3.1-2
## Loading required package: spatstat.geom
## spatstat.geom 3.3-6
##
## Attaching package: 'spatstat.geom'
##
## The following object is masked from 'package:patchwork':
##
## area
##
## The following objects are masked from 'package:ggpubr':
##
## border, rotate
##
## Loading required package: spatstat.random
## spatstat.random 3.3-3
## Loading required package: spatstat.explore
## Loading required package: nlme
##
## Attaching package: 'nlme'
##
## The following object is masked from 'package:dplyr':
##
## collapse
##
## spatstat.explore 3.4-2
## Loading required package: spatstat.model
## Loading required package: rpart
## spatstat.model 3.3-5
## Loading required package: spatstat.linnet
## spatstat.linnet 3.2-5
##
## spatstat 3.3-2
## For an introduction to spatstat, type 'beginner'
library(writexl)
parse_phish_html() selects the purchase detail html tag and grabs the venue, city, state, or country of the concert. Then the accompanying coordinates are gathered and a spatial feature object is created.
count_shows_by_state() groups the concert spatial points by the state polygons, and creates a new sf object which shows concerts by state.
#1. Create concert sf points
parse_phish_html <- function(phish_html) {
# Get html container with necessary information
location <- phish_html %>% html_nodes("div.purchase-details") %>% html_text()
# Lists to hold concert info, will be used as columns to create sf points.
venues <- character()
cities <- character()
states <- character()
countries <- character()
for (i in location) {
# Clean every row entry, which is a string
clean_text <- i %>%
str_trim() %>%
str_replace_all("\\t|\\n", "") %>%
str_replace(" map$", "")
# Separate string by commas
split_data <- str_split(clean_text, ", ", simplify = TRUE)
# String will be further split based on if a lowercase directly precedes
# a uppercase letter. It just happens in different elements so each case is
# handled. Once split again then the data is added to the correct list.
if(length(split_data) == 2){
# For certain cases like
#"Phish Phamily Phrolic at Anastasios' HouseNJ" "United States"
vs <- str_split(split_data[1], "(?<=[a-z])(?=[A-Z])", simplify=TRUE)
venues <- c(venues, vs[1])
cities <- c(cities, 'Not Available')
states <- c(states, vs[2])
countries <- c(countries, split_data[2])
}
if (length(split_data) == 5) {
ec <- str_split(split_data[3], "(?<=[a-z])(?=[A-Z])", simplify=TRUE)
venues <- c(venues, paste(split_data[1], ", ", split_data[2], " at " , ec[1]))
cities <- c(cities, ec[2])
states <- c(states, split_data[4])
countries <- c(countries, split_data[5])
} else if (length(split_data) == 3) {
vc <- str_split(split_data[1], "(?<=[a-z])(?=[A-Z])", simplify=TRUE)
venues <- c(venues, vc[1])
cities <- c(cities, vc[2])
states <- c(states, split_data[2])
countries <- c(countries, split_data[3])
} else if (length(split_data) == 4) {
ec <- str_split(split_data[2], "(?<=[a-z])(?=[A-Z])", simplify=TRUE)
venues <- c(venues, paste(split_data[1], " at ", ec[1]))
cities <- c(cities, ec[2])
states <- c(states, split_data[3])
countries <- c(countries, split_data[4])
}
}
# Get list of html containers with coordinates
coords_url <- phish_html %>%
html_nodes("div.purchase-show-location") %>%
html_nodes("a") %>%
html_attr("href")
# Clean entries
clean_coords <- sub(".*to:", "", coords_url)
latlon <- str_split(clean_coords, "\\+", simplify = TRUE)
# Create df with everything collected
concert_df <- data.frame(
venue = venues,
city = cities,
state = states,
country = countries,
lat = as.numeric(latlon[, 1]),
lon = as.numeric(latlon[, 2]),
stringsAsFactors = FALSE
)
# Create sf from df and coords
concert_sf <- st_as_sf(concert_df, coords = c("lon", "lat"), crs = 4326)
return(concert_sf)
}
#2. State merge
count_shows_by_state <- function(concert_sf, states_sf, state_column = "STATE_ABBR") {
# Join each concert point to a state polygon
concert_with_state <- st_join(concert_sf, states_sf)
# Count the number of shows per state
show_counts <- concert_with_state %>%
group_by(across(all_of(state_column))) %>%
summarize(show_count = n(), .groups = "drop")
# Drop geometry for joining with full state geometries
show_counts_df <- st_drop_geometry(show_counts)
# Join show counts back to the full state shapefile
states_joined <- left_join(states_sf, show_counts_df, by = state_column)
return(states_joined)
}
# GEOJSON file with State polygons
states_sf <- st_read("data/data/US_State_Boundaries.geojson")
## Reading layer `US_State_Boundaries' from data source
## `C:\Users\wcornejo\Documents\gtech78520\data\data\US_State_Boundaries.geojson'
## using driver `GeoJSON'
## Simple feature collection with 53 features and 16 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -179.1474 ymin: 17.6744 xmax: 179.7784 ymax: 71.38921
## Geodetic CRS: WGS 84
# Remove Alaska and Hawaii since there aren't concerts
states_sf_filt <- states_sf %>%
filter(!(STATE_ABBR %in% c("AK", "HI")))
# Years, use 90, 91, 92
phish_90 <- GET("https://phish.com/tours/1990")
phish_91 <- GET("https://phish.com/tours/1991")
phish_92 <- GET("https://phish.com/tours/1992")
# Text data
phish_text90 <- content(phish_90, "text", encoding = "UTF-8")
phish_text91 <- content(phish_91, "text", encoding = "UTF-8")
phish_text92 <- content(phish_92, "text", encoding = "UTF-8")
# Read text data
phish_html90 <- read_html(phish_text90)
phish_html91 <- read_html(phish_text91)
phish_html92 <- read_html(phish_text92)
# Create spatial point layer for each year
phish90_sf <- parse_phish_html(phish_html90)
phish91_sf <- parse_phish_html(phish_html91)
phish92_sf <- parse_phish_html(phish_html92)
# Some of 1992 are not in US, so remove them
phish92_sf_filt <- phish92_sf %>%
filter(country == 'United States')
# Aggregated over three years object
all_phish_sf <- rbind(phish90_sf, phish91_sf, phish92_sf_filt)
# Mutate data
phish90_sfa <- phish90_sf %>%
mutate(year = 1990)
phish91_sfa <- phish91_sf %>%
mutate(year = 1991)
phish92_sfa <- phish92_sf_filt %>%
mutate(year = 1992)
# Combine all into one
all_phish_sfa <- rbind(phish90_sfa, phish91_sfa, phish92_sfa)
# Write data to .xlsx file
all_phish_df <- all_phish_sfa %>%
st_drop_geometry() %>% #
select(year, city, state, country, venue) %>%
arrange(year, state, city)
write_xlsx(all_phish_df, "phish_concerts_1990_1992.xlsx")
phish90_states <- count_shows_by_state(phish90_sf, states_sf_filt)
phish91_states <- count_shows_by_state(phish91_sf, states_sf_filt)
phish92_states <- count_shows_by_state(phish92_sf_filt, states_sf_filt)
all_phish_states <- count_shows_by_state(all_phish_sf, states_sf_filt)
Below are plots for Phish concerts in 1990. This process will be repeated for years 1990-92, with a fourth aggregated map.
ggplot(phish90_states) +
geom_sf(aes(fill = show_count), color = "white") +
scale_fill_viridis_c(
option = "plasma",
na.value = "grey90",
name = "Number of Shows"
) +
theme_minimal() +
labs(
title = "Phish Concerts by U.S. State (1990)",
x = "Longitude",
y = "Latitude"
)+
theme(
panel.grid = element_blank(),
axis.text = element_blank(),
axis.ticks = element_blank(),
axis.title = element_blank(),
legend.position = "right"
)
ggplot(phish91_states) +
geom_sf(aes(fill = show_count), color = "white") +
scale_fill_viridis_c(
option = "plasma",
na.value = "grey90",
name = "Number of Shows"
) +
theme_minimal() +
labs(
title = "Phish Concerts by U.S. State (1991)",
x = "Longitude",
y = "Latitude"
)+
theme(
panel.grid = element_blank(),
axis.text = element_blank(),
axis.ticks = element_blank(),
axis.title = element_blank(),
legend.position = "right"
)
ggplot(phish92_states) +
geom_sf(aes(fill = show_count), color = "white") +
scale_fill_viridis_c(
option = "plasma",
na.value = "grey90",
name = "Number of Shows"
) +
theme_minimal(base_size=12) +
labs(
title = "Phish Concerts by U.S. State (1992)",
x = "Longitude",
y = "Latitude"
)+
theme(
panel.grid = element_blank(),
axis.text = element_blank(),
axis.ticks = element_blank(),
axis.title = element_blank(),
legend.position = "right"
)
mapview(all_phish_states,
zcol = "show_count",
legend = TRUE,
layer.name = "Number of Shows",
na.color = "gray90",
popup = leafpop::popupTable(
all_phish_states,
zcol = c("NAME", "show_count"),
feature.id = FALSE,
row.numbers = FALSE
))