knitr::opts_chunk$set(echo = TRUE)
# Install packages if not already
if (!require(tidyverse)) install.packages("tidyverse")
## Loading required package: tidyverse
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## âś” dplyr     1.1.4     âś” readr     2.1.5
## âś” forcats   1.0.0     âś” stringr   1.5.1
## âś” ggplot2   3.5.1     âś” tibble    3.2.1
## âś” lubridate 1.9.4     âś” tidyr     1.3.1
## âś” purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## âś– dplyr::filter() masks stats::filter()
## âś– dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
if (!require(lubridate)) install.packages("lubridate")

library(tidyverse)
library(lubridate)
# Read in the actual CSV file
data <- read.csv("DodgersData.csv")

Introduction

For this tutorial, I expanded on Discussion 5 which used the data DodgersData.csv to analyze data of various Dodger game characteristics including days of the week, number of attendees, opponents, etc. In this enhanced version of the original Dodgers baseball game attendance analysis, several improvements have been made to increase the depth and insight of the data analysis. A new variable, promotion, was created to analyze the effect of promotional events (such as cap giveaways, bobbleheads, or fireworks) on attendance. Additionally, new visualizations were introduced to analyze attendance by opponent and to evaluate the impact of promotions. A linear regression model was also added to predict attendance based on temperature, day of the week, and promotions. These enhancements provide a more comprehensive view of the factors influencing game attendance and improve the overall analytical depth of the original tutorial.

# Convert variables
data$day_of_week <- factor(data$day_of_week, 
                           levels = c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"))

# Create a "Promotion" variable if any promo occurred (cap, shirt, fireworks, bobblehead)
data <- data %>%
  mutate(promotion = if_else(cap == "YES" | shirt == "YES" | fireworks == "YES" | bobblehead == "YES", "Yes", "No"))
ggplot(data, aes(x = day_of_week, y = attend, fill = day_of_week)) +
  geom_boxplot(show.legend = FALSE) +
  labs(title = "Attendance by Day of the Week",
       x = "Day",
       y = "Attendance") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplot(data, aes(x = temp, y = attend)) +
  geom_point(color = "darkred", alpha = 0.6) +
  geom_smooth(method = "lm", color = "blue") +
  labs(title = "Relationship Between Temperature and Attendance",
       x = "Temperature (°F)",
       y = "Attendance") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

data %>%
  mutate(month = factor(month, levels = c("APR", "MAY", "JUN", "JUL", "AUG", "SEP", "OCT"))) %>%
  group_by(month) %>%
  summarize(total_attendance = sum(attend)) %>%
  ggplot(aes(x = month, y = total_attendance, group = 1)) +
  geom_line(color = "darkgreen", linewidth = 1) +
  geom_point(color = "purple", size = 3) +
  labs(title = "Total Attendance by Month",
       x = "Month", y = "Total Attendance") +
  theme_minimal()

ggplot(data, aes(x = promotion, y = attend, fill = promotion)) +
  geom_boxplot(show.legend = FALSE) +
  labs(title = "Attendance with vs Without Promotions",
       x = "Promotion Day?",
       y = "Attendance") +
  theme_minimal()

top_opponents <- data %>%
  group_by(opponent) %>%
  summarise(avg_attendance = mean(attend)) %>%
  arrange(desc(avg_attendance)) %>%
  top_n(8, avg_attendance)

data %>%
  filter(opponent %in% top_opponents$opponent) %>%
  ggplot(aes(x = reorder(opponent, -attend), y = attend, fill = opponent)) +
  geom_boxplot(show.legend = FALSE) +
  labs(title = "Attendance by Opponent (Top 8)",
       x = "Opponent", y = "Attendance") +
  theme_minimal()

model <- lm(attend ~ temp + day_of_week + promotion, data = data)
summary(model)
## 
## Call:
## lm(formula = attend ~ temp + day_of_week + promotion, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -17676.2  -4040.7   -440.4   3909.7  15121.0 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          28724.60    6999.72   4.104 0.000106 ***
## temp                    73.72      92.55   0.797 0.428310    
## day_of_weekTuesday    7215.01    2851.05   2.531 0.013571 *  
## day_of_weekWednesday  2607.21    2682.41   0.972 0.334322    
## day_of_weekThursday   2161.33    3558.55   0.607 0.545521    
## day_of_weekFriday    -4205.56    3259.57  -1.290 0.201102    
## day_of_weekSaturday   7380.09    2634.37   2.801 0.006529 ** 
## day_of_weekSunday     5368.76    2691.70   1.995 0.049878 *  
## promotionYes         10460.05    2067.22   5.060 3.09e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6570 on 72 degrees of freedom
## Multiple R-squared:  0.4357, Adjusted R-squared:  0.373 
## F-statistic: 6.948 on 8 and 72 DF,  p-value: 9.546e-07

Analysis

The analysis explored several key variables that may influence game attendance, including day of the week, temperature, promotional events, and opposing teams. Using boxplots and scatterplots, we visually examined how attendance varied across different categories and conditions. I also added a new promotion variable to capture whether any promotional item was offered during a game. Lastly, a linear regression model was built to identify which factors had the strongest statistical relationship with attendance figures.

Interpretations, Findings, and Conclusion

The findings suggest that promotional events significantly boost attendance, with games offering giveaways like bobbleheads or fireworks drawing larger crowds. Weekend games also showed higher attendance compared to weekdays. Higher temperatures appeared to be somewhat associated with better attendance due to fans preferring to attend games in nicer weather. The regression model confirmed that both promotional events and the day of the week are strong predictors of attendance. These insights can help the Dodgers’ marketing team better schedule promotional campaigns and target high-impact game days to maximize fan turnout and revenue