---
title: "R Week 07 Assignment"
author: "William Cornejo"
date: "3/22/2025"
format:
  html:
    toc: true
    toc-location: left
    code-fold: true
    code-summary: "Show the code"
    code-tools: true
---

Setting up packages

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(stringr)
library(magrittr)
## 
## Attaching package: 'magrittr'
## 
## The following object is masked from 'package:purrr':
## 
##     set_names
## 
## The following object is masked from 'package:tidyr':
## 
##     extract
library(sf)
## Linking to GEOS 3.11.2, GDAL 3.8.2, PROJ 9.3.1; sf_use_s2() is TRUE
library(sp)
library(classInt)
library(RColorBrewer)
library(ggmap)
## ℹ Google's Terms of Service: <https://mapsplatform.google.com>
##   Stadia Maps' Terms of Service: <https://stadiamaps.com/terms-of-service/>
##   OpenStreetMap's Tile Usage Policy: <https://operations.osmfoundation.org/policies/tiles/>
## ℹ Please cite ggmap if you use it! Use `citation("ggmap")` for details.
## 
## Attaching package: 'ggmap'
## 
## 
## The following object is masked from 'package:magrittr':
## 
##     inset
library(leaflet)
library(ggrepel)
library(tmap)
library(mapview)

library(ggplot2)
library(ggpubr)
library(patchwork)

# For web scraping
library(rvest)
## 
## Attaching package: 'rvest'
## 
## The following object is masked from 'package:readr':
## 
##     guess_encoding
library(httr)
library(polite)
library(jsonlite)
## 
## Attaching package: 'jsonlite'
## 
## The following object is masked from 'package:purrr':
## 
##     flatten
library(spatstat)
## Loading required package: spatstat.data
## Loading required package: spatstat.univar
## spatstat.univar 3.1-2
## Loading required package: spatstat.geom
## spatstat.geom 3.3-6
## 
## Attaching package: 'spatstat.geom'
## 
## The following object is masked from 'package:patchwork':
## 
##     area
## 
## The following objects are masked from 'package:ggpubr':
## 
##     border, rotate
## 
## Loading required package: spatstat.random
## spatstat.random 3.3-3
## Loading required package: spatstat.explore
## Loading required package: nlme
## 
## Attaching package: 'nlme'
## 
## The following object is masked from 'package:dplyr':
## 
##     collapse
## 
## spatstat.explore 3.4-2
## Loading required package: spatstat.model
## Loading required package: rpart
## spatstat.model 3.3-5
## Loading required package: spatstat.linnet
## spatstat.linnet 3.2-5
## 
## spatstat 3.3-2 
## For an introduction to spatstat, type 'beginner'
library(writexl) 

Functions to work with web data

parse_phish_html() selects the purchase-details HTML nodes and extracts each show's venue, city, state, and country. It then pulls the coordinates out of the accompanying map links and builds an sf point object.

count_shows_by_state() joins the concert points to the state polygons and returns a new sf object with the number of shows per state.
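
The two fiddly parts of the parsing are that the site runs adjacent fields together with no separator, and that the coordinates live inside a map link. A quick sketch of both rules, using a venue string from the 1990 listing and a hypothetical href shaped like the pattern the cleaning code strips:

# Fields run together where a lowercase letter meets an uppercase one,
# so split at that boundary with a lookbehind/lookahead pair.
str_split("Phish Phamily Phrolic at Anastasios' HouseNJ",
          "(?<=[a-z])(?=[A-Z])", simplify = TRUE)
# yields: "Phish Phamily Phrolic at Anastasios' House"  "NJ"

# Coordinates sit after "to:" in the href and are "+"-separated
# (this href is made up for illustration).
href <- "https://maps.google.com/maps?daddr=to:44.4759+-73.2121"
str_split(sub(".*to:", "", href), "\\+", simplify = TRUE)
# yields: "44.4759"  "-73.2121"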

#1. Create concert sf points
parse_phish_html <- function(phish_html) {
  # Get html container with necessary information
  location <- phish_html %>% html_nodes("div.purchase-details") %>% html_text()
  
  # Character vectors to hold concert info; used as columns for the sf points.
  venues <- character()
  cities <- character()
  states <- character()
  countries <- character()
  
  for (i in location) {
    # Clean every row entry, which is a string
    clean_text <- i %>%
      str_trim() %>%
      str_replace_all("\\t|\\n", "") %>%
      str_replace(" map$", "")
    
    # Separate string by commas
    split_data <- str_split(clean_text, ", ", simplify = TRUE)
    # The string is split again wherever a lowercase letter directly
    # precedes an uppercase one. The run-together fields land in different
    # elements depending on how many commas were present, so each length
    # is handled separately; after the second split, the pieces go into
    # the matching vector.
    if (length(split_data) == 2) {
      # For entries like
      # "Phish Phamily Phrolic at Anastasios' HouseNJ" "United States"
      vs <- str_split(split_data[1], "(?<=[a-z])(?=[A-Z])", simplify = TRUE)
      venues <- c(venues, vs[1])
      cities <- c(cities, "Not Available")
      states <- c(states, vs[2])
      countries <- c(countries, split_data[2])
    } else if (length(split_data) == 3) {
      vc <- str_split(split_data[1], "(?<=[a-z])(?=[A-Z])", simplify = TRUE)
      venues <- c(venues, vc[1])
      cities <- c(cities, vc[2])
      states <- c(states, split_data[2])
      countries <- c(countries, split_data[3])
    } else if (length(split_data) == 4) {
      ec <- str_split(split_data[2], "(?<=[a-z])(?=[A-Z])", simplify = TRUE)
      venues <- c(venues, paste0(split_data[1], " at ", ec[1]))
      cities <- c(cities, ec[2])
      states <- c(states, split_data[3])
      countries <- c(countries, split_data[4])
    } else if (length(split_data) == 5) {
      ec <- str_split(split_data[3], "(?<=[a-z])(?=[A-Z])", simplify = TRUE)
      venues <- c(venues, paste0(split_data[1], ", ", split_data[2], " at ", ec[1]))
      cities <- c(cities, ec[2])
      states <- c(states, split_data[4])
      countries <- c(countries, split_data[5])
    }
  }
  
  # Get list of html containers with coordinates
  coords_url <- phish_html %>%
    html_nodes("div.purchase-show-location") %>% 
    html_nodes("a") %>% 
    html_attr("href") 
  
  # Keep only the "lat+lon" part after "to:", then split into lat and lon
  clean_coords <- sub(".*to:", "", coords_url)
  latlon <- str_split(clean_coords, "\\+", simplify = TRUE)
  # Create df with everything collected
  concert_df <- data.frame(
    venue = venues,
    city = cities,
    state = states,
    country = countries,
    lat = as.numeric(latlon[, 1]),
    lon = as.numeric(latlon[, 2]),
    stringsAsFactors = FALSE
  )
  # Create sf from df and coords
  concert_sf <- st_as_sf(concert_df, coords = c("lon", "lat"), crs = 4326)
  return(concert_sf)
}

#2. State merge
count_shows_by_state <- function(concert_sf, states_sf, state_column = "STATE_ABBR") {
  # Join each concert point to a state polygon
  concert_with_state <- st_join(concert_sf, states_sf)
  
  # Count the number of shows per state
  show_counts <- concert_with_state %>%
    group_by(across(all_of(state_column))) %>%
    summarize(show_count = n(), .groups = "drop")
  
  # Drop geometry for joining with full state geometries
  show_counts_df <- st_drop_geometry(show_counts)
  
  # Join show counts back to the full state shapefile
  states_joined <- left_join(states_sf, show_counts_df, by = state_column)
  
  return(states_joined)
}
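
st_join() above defaults to a left join on st_intersects, so every concert keeps a row, and a point that falls outside all polygons gets an NA state and simply doesn't count toward any state. A toy check of that behavior, with a made-up unit square standing in for a state:

# Hypothetical one-"state" layer: a unit square labeled "SQ"
sq <- st_sf(
  STATE_ABBR = "SQ",
  geometry = st_sfc(st_polygon(list(rbind(
    c(0, 0), c(1, 0), c(1, 1), c(0, 1), c(0, 0)
  ))))
)
# One point inside the square, one outside
pts <- st_sf(geometry = st_sfc(st_point(c(0.5, 0.5)), st_point(c(2, 2))))
st_join(pts, sq)$STATE_ABBR
# yields: "SQ" NA -- the outside point keeps its row but matches no polygon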

Read data from the Phish tours pages for 1990, 1991, and 1992.

# GEOJSON file with State polygons
states_sf <- st_read("data/data/US_State_Boundaries.geojson")
## Reading layer `US_State_Boundaries' from data source 
##   `C:\Users\wcornejo\Documents\gtech78520\data\data\US_State_Boundaries.geojson' 
##   using driver `GeoJSON'
## Simple feature collection with 53 features and 16 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -179.1474 ymin: 17.6744 xmax: 179.7784 ymax: 71.38921
## Geodetic CRS:  WGS 84
# Remove Alaska and Hawaii since no shows were played there
states_sf_filt <- states_sf %>%
  filter(!(STATE_ABBR %in% c("AK", "HI")))

# Years, use 90, 91, 92
phish_90 <- GET("https://phish.com/tours/1990")
phish_91 <- GET("https://phish.com/tours/1991")
phish_92 <- GET("https://phish.com/tours/1992")

# Text data
phish_text90 <- content(phish_90, "text", encoding = "UTF-8")
phish_text91 <- content(phish_91, "text", encoding = "UTF-8")
phish_text92 <- content(phish_92, "text", encoding = "UTF-8")

# Read text data
phish_html90 <- read_html(phish_text90)
phish_html91 <- read_html(phish_text91)
phish_html92 <- read_html(phish_text92)

# Create spatial point layer for each year
phish90_sf <- parse_phish_html(phish_html90)
phish91_sf <- parse_phish_html(phish_html91)
phish92_sf <- parse_phish_html(phish_html92)

# Some 1992 shows were outside the US, so remove them
phish92_sf_filt <- phish92_sf %>%
  filter(country == 'United States')
# Aggregated over three years object
all_phish_sf <- rbind(phish90_sf, phish91_sf, phish92_sf_filt)
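
The polite package is attached in the setup chunk but never used. For completeness, a minimal sketch of the same fetch through a polite session, which checks robots.txt and rate-limits between requests (the _alt name is just to avoid clobbering the object above):

# bow() opens a session for the host; scrape() fetches and parses it,
# returning the same kind of xml2 document that read_html() produces,
# so parse_phish_html() would work on it unchanged.
session90 <- bow("https://phish.com/tours/1990")
phish_html90_alt <- scrape(session90)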

Making an Excel spreadsheet

# Mutate data
phish90_sfa <- phish90_sf %>%
  mutate(year = 1990)
phish91_sfa <- phish91_sf %>%
  mutate(year = 1991)
phish92_sfa <- phish92_sf_filt %>%
  mutate(year = 1992)

# Combine all into one
all_phish_sfa <- rbind(phish90_sfa, phish91_sfa, phish92_sfa)

# Write data to .xlsx file
all_phish_df <- all_phish_sfa %>%
  st_drop_geometry() %>%  # writexl needs a plain data frame, not sf
  select(year, city, state, country, venue) %>%
  arrange(year, state, city)
write_xlsx(all_phish_df, "phish_concerts_1990_1992.xlsx")
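
A quick round-trip check that the file wrote correctly (assumes the readxl package is installed; it is not part of the setup chunk above):

# Read the spreadsheet back and peek at the first rows
head(readxl::read_xlsx("phish_concerts_1990_1992.xlsx"))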

Merge concert data with state geojson

phish90_states <- count_shows_by_state(phish90_sf, states_sf_filt)
phish91_states <- count_shows_by_state(phish91_sf, states_sf_filt)
phish92_states <- count_shows_by_state(phish92_sf_filt, states_sf_filt)
all_phish_states <- count_shows_by_state(all_phish_sf, states_sf_filt)

Plots

Below are choropleth maps of Phish concerts by state for each year from 1990 to 1992, followed by an interactive map aggregated over all three years.

1990

ggplot(phish90_states) +
  geom_sf(aes(fill = show_count), color = "white") +
  scale_fill_viridis_c(
    option = "plasma",
    na.value = "grey90",
    name = "Number of Shows"
  ) +
  theme_minimal() +
  labs(
    title = "Phish Concerts by U.S. State (1990)",
    x = "Longitude",
    y = "Latitude"
  ) +
  theme(
    panel.grid = element_blank(),
    axis.text = element_blank(),
    axis.ticks = element_blank(),
    axis.title = element_blank(),
    legend.position = "right"
  )

1991

ggplot(phish91_states) +
  geom_sf(aes(fill = show_count), color = "white") +
  scale_fill_viridis_c(
    option = "plasma",
    na.value = "grey90",
    name = "Number of Shows"
  ) +
  theme_minimal() +
  labs(
    title = "Phish Concerts by U.S. State (1991)",
    x = "Longitude",
    y = "Latitude"
  ) +
  theme(
    panel.grid = element_blank(),
    axis.text = element_blank(),
    axis.ticks = element_blank(),
    axis.title = element_blank(),
    legend.position = "right"
  )

1992

ggplot(phish92_states) +
  geom_sf(aes(fill = show_count), color = "white") +
  scale_fill_viridis_c(
    option = "plasma",
    na.value = "grey90",
    name = "Number of Shows"
  ) +
  theme_minimal(base_size = 12) +
  labs(
    title = "Phish Concerts by U.S. State (1992)",
    x = "Longitude",
    y = "Latitude"
  ) +
  theme(
    panel.grid = element_blank(),
    axis.text = element_blank(),
    axis.ticks = element_blank(),
    axis.title = element_blank(),
    legend.position = "right"
  )

Concerts aggregated over 1990-1992

# popupTable() comes from the separate leafpop package (called via ::)
mapview(all_phish_states, 
        zcol = "show_count",
        legend = TRUE,
        layer.name = "Number of Shows",
        na.color = "gray90",
        popup = leafpop::popupTable(  
          all_phish_states,
          zcol = c("NAME", "show_count"), 
          feature.id = FALSE,
          row.numbers = FALSE
        ))

Further Analysis

The five states with the most concerts across the three years, tracked year by year.

phish_yearly_state <- all_phish_sfa %>%
  st_drop_geometry() %>%
  group_by(year, state) %>%
  summarize(show_count = n(), .groups = "drop")
# Five states with the most shows overall; keep all of their yearly rows
top5 <- phish_yearly_state %>%
  count(state, wt = show_count, name = "total") %>%
  slice_max(total, n = 5) %>%
  pull(state)
top5_states <- phish_yearly_state %>%
  filter(state %in% top5)

ggplot(top5_states, aes(x = year, y = show_count, group = state, color = state)) +
  geom_line() +
  geom_point() +
  theme_minimal() +
  labs(
    title = "Phish Concerts by State (1990–1992)",
    x = "Year",
    y = "Number of Shows"
  )

Get the five most-visited venues over the three years.

top_venues <- all_phish_sf %>%
  st_drop_geometry() %>%  # only counts are needed, so drop the geometry
  count(venue, sort = TRUE) %>%
  head(5)

ggplot(top_venues, aes(x = reorder(venue, n), y = n)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(
    title = "Top 5 Most Visited Venues (1990–1992)",
    x = "Venue",
    y = "Number of Concerts"
  ) +
  theme_minimal()