knitr::opts_chunk$set(echo = TRUE)
# Install packages if not already
if (!require(tidyverse)) install.packages("tidyverse")
## Loading required package: tidyverse
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
if (!require(lubridate)) install.packages("lubridate")

library(tidyverse)
library(lubridate)
# Read in the actual CSV file
data <- read.csv("DodgersData.csv")

# Convert variables
data$day_of_week <- factor(data$day_of_week, 
                           levels = c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"))

# Create a "Promotion" variable if any promo occurred (cap, shirt, fireworks, bobblehead)
data <- data %>%
  mutate(promotion = if_else(cap == "YES" | shirt == "YES" | fireworks == "YES" | bobblehead == "YES", "Yes", "No"))
ggplot(data, aes(x = day_of_week, y = attend, fill = day_of_week)) +
  geom_boxplot(show.legend = FALSE) +
  labs(title = "Attendance by Day of the Week",
       x = "Day",
       y = "Attendance") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplot(data, aes(x = temp, y = attend)) +
  geom_point(color = "darkred", alpha = 0.6) +
  geom_smooth(method = "lm", color = "blue") +
  labs(title = "Relationship Between Temperature and Attendance",
       x = "Temperature (°F)",
       y = "Attendance") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

data %>%
  mutate(month = factor(month, levels = c("APR", "MAY", "JUN", "JUL", "AUG", "SEP", "OCT"))) %>%
  group_by(month) %>%
  summarize(total_attendance = sum(attend)) %>%
  ggplot(aes(x = month, y = total_attendance, group = 1)) +
  geom_line(color = "darkgreen", linewidth = 1) +
  geom_point(color = "purple", size = 3) +
  labs(title = "Total Attendance by Month",
       x = "Month", y = "Total Attendance") +
  theme_minimal()

ggplot(data, aes(x = promotion, y = attend, fill = promotion)) +
  geom_boxplot(show.legend = FALSE) +
  labs(title = "Attendance with vs Without Promotions",
       x = "Promotion Day?",
       y = "Attendance") +
  theme_minimal()

top_opponents <- data %>%
  group_by(opponent) %>%
  summarise(avg_attendance = mean(attend)) %>%
  arrange(desc(avg_attendance)) %>%
  top_n(8, avg_attendance)

data %>%
  filter(opponent %in% top_opponents$opponent) %>%
  ggplot(aes(x = reorder(opponent, -attend), y = attend, fill = opponent)) +
  geom_boxplot(show.legend = FALSE) +
  labs(title = "Attendance by Opponent (Top 8)",
       x = "Opponent", y = "Attendance") +
  theme_minimal()

model <- lm(attend ~ temp + day_of_week + promotion, data = data)
summary(model)
## 
## Call:
## lm(formula = attend ~ temp + day_of_week + promotion, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -17676.2  -4040.7   -440.4   3909.7  15121.0 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          28724.60    6999.72   4.104 0.000106 ***
## temp                    73.72      92.55   0.797 0.428310    
## day_of_weekTuesday    7215.01    2851.05   2.531 0.013571 *  
## day_of_weekWednesday  2607.21    2682.41   0.972 0.334322    
## day_of_weekThursday   2161.33    3558.55   0.607 0.545521    
## day_of_weekFriday    -4205.56    3259.57  -1.290 0.201102    
## day_of_weekSaturday   7380.09    2634.37   2.801 0.006529 ** 
## day_of_weekSunday     5368.76    2691.70   1.995 0.049878 *  
## promotionYes         10460.05    2067.22   5.060 3.09e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6570 on 72 degrees of freedom
## Multiple R-squared:  0.4357, Adjusted R-squared:  0.373 
## F-statistic: 6.948 on 8 and 72 DF,  p-value: 9.546e-07