#Load required packages
library(dplyr)
library(tidyr)
library(stringr)
library(ggplot2)
library(sf)
# Phish concert data ----
# Read the exported spreadsheet; stringsAsFactors = FALSE keeps text columns
# as character vectors.
concert_data <- read.csv("PhishConcert - Sheet1.csv", stringsAsFactors = FALSE)

# Display the structure of the loaded data
str(concert_data)
cat("Loaded", nrow(concert_data), "concert records\n")

# Display the first few rows of the data. print() is explicit so the preview
# also appears when the script is run through source().
print(head(concert_data))
# Parse the date column and extract the year.
# NOTE(review): assumes the CSV has a `concerts` column holding dates in a
# format as.Date() parses by default (e.g. "YYYY-MM-DD") -- confirm against
# the file.
phish_data <- concert_data %>%
  mutate(
    date = as.Date(concerts),
    year = as.numeric(format(date, "%Y"))
  ) %>%
  # Keep only the years of interest; rows with a missing year are dropped too.
  filter(year >= 1992 & year <= 1994)
# Clean the text fields. Each argument sits on its own line because a trailing
# `#` comment swallows everything that follows it on the same line.
phish_data <- phish_data %>%
  mutate(
    # Remove any extra quotes or commas from city and venue
    city = str_replace_all(city, '"|,', ''),
    venue = str_replace_all(venue, '"|,', ''),
    # Upper-case the state codes and country names so comparisons are uniform
    state = toupper(state),
    country = toupper(country)
  )
# Filter for US concerts only (country was upper-cased during cleaning)
us_concerts <- phish_data %>%
  filter(country == "USA")

# Display the cleaned data
print(phish_data)

# Check for any issues with state codes
print(unique(phish_data$state))
# Count concerts by state and year, largest counts first within each year.
# Only state/year combinations that actually occur get a row.
state_counts <- us_concerts %>%
  group_by(state, year) %>%
  summarise(concert_count = n(), .groups = "drop") %>%
  arrange(year, desc(concert_count))

# Display the counts
print(state_counts)
# Get US state boundaries as an sf object.
# us_map() belongs to the usmap package, which this script never attaches, so
# it is called through its namespace.
us_states <- st_as_sf(usmap::us_map("states"))
#' Build a choropleth of concerts per U.S. state for a single year.
#'
#' @param year_value Year to plot; compared against the `year` column of
#'   `counts`.
#' @param counts Data frame with `state`, `year` and `concert_count` columns.
#'   Defaults to the script-level `state_counts`.
#' @param states_sf sf object of state polygons with an `abbr` column.
#'   Defaults to the script-level `us_states`.
#' @return A ggplot object; the caller is responsible for printing it.
create_map_for_year <- function(year_value,
                                counts = state_counts,
                                states_sf = us_states) {
  # Filter data for the specific year
  year_data <- counts %>%
    filter(year == year_value)

  # Join the counts onto the state polygons
  map_data <- states_sf %>%
    left_join(year_data, by = c("abbr" = "state"))

  # States without a matching row had no concerts that year
  map_data$concert_count[is.na(map_data$concert_count)] <- 0

  # NOTE(review): the caption still says "Mock data" although the script reads
  # a CSV file -- confirm the intended wording.
  ggplot(map_data) +
    geom_sf(aes(fill = concert_count), color = "white", size = 0.2) +
    scale_fill_viridis_c(
      option = "plasma",
      name = "Number of Concerts",
      direction = -1,
      guide = guide_colorbar(barwidth = 10, barheight = 0.5),
      # pretty_breaks() lives in scales, which is not attached by the script
      breaks = scales::pretty_breaks()
    ) +
    labs(
      title = paste("Phish Concerts in", year_value),
      subtitle = "Number of concerts per U.S. state",
      caption = "Data source: Mock data based on ConcertArchives.org"
    ) +
    theme_minimal() +
    theme(
      plot.title = element_text(size = 16, face = "bold"),
      plot.subtitle = element_text(size = 12),
      legend.position = "bottom",
      axis.text = element_blank(),
      axis.title = element_blank(),
      panel.grid = element_blank()
    )
}
# Create maps for each year
map_1992 <- create_map_for_year(1992)
map_1993 <- create_map_for_year(1993)
map_1994 <- create_map_for_year(1994)

# Display the maps
print(map_1992)
print(map_1993)
print(map_1994)
# Calculate the total concerts by state across all years
total_by_state <- state_counts %>%
  group_by(state) %>%
  summarise(total_concerts = sum(concert_count), .groups = "drop") %>%
  arrange(desc(total_concerts))

# Display the top 10 states
top_10_states <- total_by_state %>%
  head(10)
print(top_10_states)
# Lookup table: state abbreviation -> region, used for the regional
# percentages.
# Derived from base R's `state.region`, which is index-aligned with
# `state.abb`. The previous hand-typed vector was not in `state.abb` order
# (it labelled AL "Northeast", AK "South", GA "West", ...).
# `state.region` calls the Midwest "North Central"; it is renamed to keep the
# labels this script has always used.
# Note: `state.abb` covers the 50 states only, so DC has no region here.
census_region <- as.character(state.region)
census_region[census_region == "North Central"] <- "Midwest"

regions <- data.frame(
  state = state.abb,
  region = census_region,
  stringsAsFactors = FALSE
)
# Join region data with state totals and compute each region's share
region_counts <- total_by_state %>%
  left_join(regions, by = "state") %>%
  # Remove any states that did not match a region (e.g. codes not in `regions`)
  filter(!is.na(region)) %>%
  group_by(region) %>%
  summarise(total_concerts = sum(total_concerts), .groups = "drop") %>%
  # Percentages are relative to the matched states only
  mutate(percentage = total_concerts / sum(total_concerts) * 100) %>%
  arrange(desc(percentage))

# Display regional distribution
print(region_counts)
# Create a bar chart of regional distribution, bars ordered by share.
# The plot is stored and printed explicitly, like the maps, so it is also
# drawn when the script is run through source() (which does not autoprint).
regional_bar_chart <- ggplot(
  region_counts,
  aes(x = reorder(region, -percentage), y = percentage)
) +
  geom_bar(stat = "identity", fill = "steelblue") +
  geom_text(aes(label = sprintf("%.1f%%", percentage)), vjust = -0.5) +
  labs(
    title = "Regional Distribution of Phish Concerts (1992-1994)",
    x = "Region",
    y = "Percentage of Concerts"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 0, hjust = 0.5))

print(regional_bar_chart)
# Calculate the year-over-year change in concert counts by state.
# `state_counts` only has rows for state/year pairs with at least one concert,
# so lag() on it would compare non-adjacent years (e.g. 1994 against 1992) and
# would never report a drop to zero. complete() first adds the missing
# state/year pairs with a count of 0, using the years present in the data.
yoy_change <- state_counts %>%
  complete(state, year, fill = list(concert_count = 0L)) %>%
  arrange(state, year) %>%
  group_by(state) %>%
  mutate(
    prev_year_count = lag(concert_count),
    yoy_change = concert_count - prev_year_count,
    # A percentage is undefined when the previous year had no concerts
    yoy_percent = ifelse(
      prev_year_count > 0,
      round((concert_count - prev_year_count) / prev_year_count * 100, 1),
      NA
    )
  ) %>%
  # The first year of each state has no predecessor to compare against
  filter(!is.na(yoy_change)) %>%
  ungroup()
# Display states with the largest increase and decrease
top_increases <- yoy_change %>%
  arrange(desc(yoy_change)) %>%
  head(5)

top_decreases <- yoy_change %>%
  arrange(yoy_change) %>%
  head(5)

print("States with largest increase in concerts:")
print(top_increases)

print("States with largest decrease in concerts:")
print(top_decreases)
# Create a faceted map showing all years.
# Every state needs one row per year before the join. Otherwise states with
# no concerts join with year = NA (producing an extra "NA" facet) and each
# yearly panel only draws the states that hosted a concert that year.
complete_counts <- state_counts %>%
  complete(state = us_states$abbr, year, fill = list(concert_count = 0L))

all_years_data <- us_states %>%
  left_join(complete_counts, by = c("abbr" = "state"))

# Replace any remaining NA values with 0
all_years_data$concert_count[is.na(all_years_data$concert_count)] <- 0

# Create the faceted map
faceted_map <- ggplot(all_years_data) +
  geom_sf(aes(fill = concert_count), color = "white", size = 0.2) +
  scale_fill_viridis_c(
    option = "plasma",
    name = "Number of Concerts",
    direction = -1,
    guide = guide_colorbar(barwidth = 10, barheight = 0.5),
    # pretty_breaks() lives in scales, which is not attached by the script
    breaks = scales::pretty_breaks()
  ) +
  facet_wrap(~ year, ncol = 3) +
  labs(
    title = "Phish Concerts (1992-1994)",
    subtitle = "Geographic distribution across U.S. states",
    caption = "Data source: Phish concert data"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 16, face = "bold"),
    plot.subtitle = element_text(size = 12),
    legend.position = "bottom",
    axis.text = element_blank(),
    axis.title = element_blank(),
    panel.grid = element_blank(),
    strip.text = element_text(size = 12, face = "bold")
  )

print(faceted_map)
# Print a plain-text summary of the findings.
# Restored from a mangled paste: typographic quotes are straight quotes again,
# `\(` / `\)` are `$` again, and line breaks were re-added so each statement
# prints on its own line.
cat("\n--- SUMMARY OF FINDINGS ---\n")
cat(
  "Total number of US concerts analyzed:",
  sum(total_by_state$total_concerts),
  "\n"
)
cat(
  "Top concert states (1992-1994):",
  paste(head(top_10_states$state, 5), collapse = ", "),
  "\n"
)
cat("\nRegional distribution:\n")
# Vectorised over regions; prints nothing when region_counts has no rows
# (a `1:nrow()` loop would iterate over c(1, 0) in that case).
cat(
  sprintf("  %s: %.1f%%\n", region_counts$region, region_counts$percentage),
  sep = ""
)
# Save the cleaned records and the per-state/year counts to the working
# directory (file names use straight quotes; typographic quotes do not parse).
write.csv(phish_data, "phish_cleaned_data_1992_1994.csv", row.names = FALSE)
write.csv(state_counts, "phish_state_counts_1992_1994.csv", row.names = FALSE)
# Wide summary: one row per state, one column per year (0 where a state had
# no concerts that year), plus a row total, sorted by that total.
summary_table <- pivot_wider(
  state_counts,
  names_from = year,
  values_from = concert_count,
  values_fill = 0
)
summary_table <- mutate(
  summary_table,
  Total = rowSums(across(where(is.numeric)))
)
summary_table <- arrange(summary_table, desc(Total))

print(summary_table)
# Final status message. Only the two CSV files are written by this script; the
# maps are printed to the graphics device, not saved, so the message no longer
# claims otherwise.
cat("\nAnalysis complete. Data files have been saved.\n")