Loading data

rm(list=ls())# clear memory

library(lattice) # Graphics Package
library(ggplot2) # Graphical Package

library(readr)

DodgersData <- read_csv("dodgers_randomized.csv")
## Rows: 81 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (9): month, day_of_week, opponent, skies, day_night, cap, shirt, firewor...
## dbl (3): day, attend, temp
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Data cleanup and exploratory analysis

Evaluate the Structure and Re-Level the factor variables for “Day Of Week”” and “Month”” in the right order

# Check the structure for Dorder Data
str(DodgersData)
## spc_tbl_ [81 × 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ month      : chr [1:81] "APR" "APR" "APR" "APR" ...
##  $ day        : num [1:81] 29 26 8 44 18 2 2 12 14 33 ...
##  $ attend     : num [1:81] 43749 34074 42033 24249 43873 ...
##  $ day_of_week: chr [1:81] "Tuesday" "Wednesday" "Thursday" "Friday" ...
##  $ opponent   : chr [1:81] "Pirates" "Pirates" "Pirates" "Padres" ...
##  $ temp       : num [1:81] 79 93 78 86 72 59 68 77 84 77 ...
##  $ skies      : chr [1:81] "Clear" "Cloudy" "Cloudy" "Cloudy" ...
##  $ day_night  : chr [1:81] "Day" "Night" "Night" "Night" ...
##  $ cap        : chr [1:81] "NO" "NO" "NO" "NO" ...
##  $ shirt      : chr [1:81] "NO" "NO" "NO" "NO" ...
##  $ fireworks  : chr [1:81] "NO" "NO" "NO" "YES" ...
##  $ bobblehead : chr [1:81] "NO" "NO" "NO" "NO" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   month = col_character(),
##   ..   day = col_double(),
##   ..   attend = col_double(),
##   ..   day_of_week = col_character(),
##   ..   opponent = col_character(),
##   ..   temp = col_double(),
##   ..   skies = col_character(),
##   ..   day_night = col_character(),
##   ..   cap = col_character(),
##   ..   shirt = col_character(),
##   ..   fireworks = col_character(),
##   ..   bobblehead = col_character()
##   .. )
##  - attr(*, "problems")=<pointer: 0x55b763d4c260>
head(DodgersData)
## # A tibble: 6 × 12
##   month   day attend day_of_week opponent  temp skies  day_night cap   shirt
##   <chr> <dbl>  <dbl> <chr>       <chr>    <dbl> <chr>  <chr>     <chr> <chr>
## 1 APR      29  43749 Tuesday     Pirates     79 Clear  Day       NO    NO   
## 2 APR      26  34074 Wednesday   Pirates     93 Cloudy Night     NO    NO   
## 3 APR       8  42033 Thursday    Pirates     78 Cloudy Night     NO    NO   
## 4 APR      44  24249 Friday      Padres      86 Cloudy Night     NO    NO   
## 5 APR      18  43873 Saturday    Padres      72 Cloudy Night     NO    NO   
## 6 APR       2  42101 Sunday      Padres      59 Clear  Day       NO    NO   
## # ℹ 2 more variables: fireworks <chr>, bobblehead <chr>
# Convert to factors
DodgersData$day_of_week <- factor(DodgersData$day_of_week)
DodgersData$month <- factor(DodgersData$month)

# Evaluate the factor levels for day_of_week
levels(DodgersData$day_of_week)
## [1] "Friday"    "Monday"    "Saturday"  "Sunday"    "Thursday"  "Tuesday"  
## [7] "Wednesday"
# Evaluate the factor levels for month
levels(DodgersData$month)
## [1] "APR" "AUG" "JUL" "JUN" "MAY" "OCT" "SEP"
# First 10 rows of the data frame
head(DodgersData, 10)
## # A tibble: 10 × 12
##    month   day attend day_of_week opponent   temp skies  day_night cap   shirt
##    <fct> <dbl>  <dbl> <fct>       <chr>     <dbl> <chr>  <chr>     <chr> <chr>
##  1 APR      29  43749 Tuesday     Pirates      79 Clear  Day       NO    NO   
##  2 APR      26  34074 Wednesday   Pirates      93 Cloudy Night     NO    NO   
##  3 APR       8  42033 Thursday    Pirates      78 Cloudy Night     NO    NO   
##  4 APR      44  24249 Friday      Padres       86 Cloudy Night     NO    NO   
##  5 APR      18  43873 Saturday    Padres       72 Cloudy Night     NO    NO   
##  6 APR       2  42101 Sunday      Padres       59 Clear  Day       NO    NO   
##  7 APR       2  36895 Monday      Braves       68 Cloudy Night     NO    NO   
##  8 APR      12  52119 Tuesday     Braves       77 Cloudy Night     NO    NO   
##  9 APR      14  53269 Wednesday   Braves       84 Cloudy Night     NO    NO   
## 10 APR      33  32586 Friday      Nationals    77 Clear  Night     NO    NO   
## # ℹ 2 more variables: fireworks <chr>, bobblehead <chr>

Let R identify the temperature, the attendance, the opponent, and the promotion (i.e., bobblehead) for the 20th home game of the season.

DodgersData[20, c("temp", "attend", "opponent", "bobblehead")]
## # A tibble: 1 × 4
##    temp attend opponent bobblehead
##   <dbl>  <dbl> <chr>    <chr>     
## 1    80  40389 Snakes   YES

Let R identify the average value for attendance.

meanattend <- mean(DodgersData$attend)
meanattend
## [1] 39395.8

Let R identify the number of promotions

promotions <- sum(DodgersData$bobblehead=="YES")
promotions
## [1] 11
## in-class notes

EDA

# Exploratory Analysis

# Create separate dataset for EDA only, to allow numeric analysis without disrupting the original character data in the models
EDAdata <- DodgersData


# 0. Convert promotion columns from "YES"/"NO" to numeric 1/0
EDAdata$cap        <- ifelse(EDAdata$cap == "YES", 1, 0)
EDAdata$shirt      <- ifelse(EDAdata$shirt == "YES", 1, 0)
EDAdata$fireworks  <- ifelse(EDAdata$fireworks == "YES", 1, 0)
EDAdata$bobblehead <- ifelse(EDAdata$bobblehead == "YES", 1, 0)


# 1. Count each promotion type
promotion_counts <- colSums(EDAdata[, c("cap", "shirt", "fireworks", "bobblehead")])
promotion_counts
##        cap      shirt  fireworks bobblehead 
##          2          3         14         11
# 2. Count games by month
table(DodgersData$month)
## 
## APR AUG JUL JUN MAY OCT SEP 
##  12  15  12   9  18   3  12
EDAdata$month <- factor(EDAdata$month)
table(EDAdata$month)
## 
## APR AUG JUL JUN MAY OCT SEP 
##  12  15  12   9  18   3  12
# 3. Count games by day of week
EDAdata$day_of_week <- factor(EDAdata$day_of_week)
table(EDAdata$day_of_week)
## 
##    Friday    Monday  Saturday    Sunday  Thursday   Tuesday Wednesday 
##        13        12        13        13         5        13        12
# 4. Attendance distribution and sellouts (assuming 56,000 capacity)
summary(EDAdata$attend)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   19933   33545   39462   39396   43949   63600
max(EDAdata$attend)
## [1] 63600
sum(EDAdata$attend >= 56000)
## [1] 1
# 5. Number of bobblehead promotions on Tuesday nights
sum(EDAdata$bobblehead == 1 &
    EDAdata$day_of_week == "Tuesday" &
    EDAdata$day_night == "Night")
## [1] 6

Exploratory analysis

The results show that in 2012 there were a few promotions (see the last four columns):

Cap Shirt Fireworks Bobblehead

We have data from April to September for games played in the Day or Night under Clear or Cloudy Skys.

Dodger Stadium has a capacity of about 56,000. Looking at the entire (sample) data shows that the stadium filled up only once in 2012. There were only two cap promotions, three shirt promotions - not enough data for any inferences. Fireworks and Bobblehead promotions have happened a few times, 14 and 11 respectively.

Furthermore, of the eleven bobblehead promotions, most of them (six) were on Tuesday nights.

Evaluate Attendance by Weather

#Evaluate attendance by weather
ggplot(DodgersData, aes(x=temp, y=attend/1000, color=fireworks)) +
geom_point() +
facet_wrap(day_night~skies) +
ggtitle("Dodgers Attendance By Temperature By Time of Game and Skies") +
theme(plot.title = element_text(lineheight=3, face="bold", color="black", size=10)) +
xlab("Temperature (Degree Farenheit)") +
ylab("Attendance (Thousands)")

Strip Plot of Attendance by opponent or visiting team

#Strip Plot of Attendance by opponent or visiting team
ggplot(DodgersData, aes(x=attend/1000, y=opponent, color=day_night)) +
geom_point() +
ggtitle("Dodgers Attendance By Opponent") +
theme(plot.title = element_text(lineheight=3, face="bold", color="black", size=10)) +
xlab("Attendance (Thousands)") +
ylab("Opponent (Visiting Team)")

Design Predictive Model - our original model

To advise the management if promotions impact attendance we will need to identify if there is a positive effect, and if there is a positive effect how much of an effect it is.

# Create a model with the bobblehead variable entered last
my.model <- {attend ~ month + day_of_week + bobblehead}
# use the full data set to obtain an estimate of the increase in
# attendance due to bobbleheads, controlling for other factors
my.model.fit <- lm(my.model, data = DodgersData) # use all available data
print(summary(my.model.fit))
## 
## Call:
## lm(formula = my.model, data = DodgersData)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -16867.1  -5248.7   -303.9   5389.6  18274.3 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          38486.15    3360.34  11.453   <2e-16 ***
## monthAUG             -1474.05    3414.68  -0.432    0.667    
## monthJUL              3137.52    3664.33   0.856    0.395    
## monthJUN               -49.29    3883.35  -0.013    0.990    
## monthMAY              1575.13    3255.95   0.484    0.630    
## monthOCT             -2851.23    5750.23  -0.496    0.622    
## monthSEP             -1501.57    3582.83  -0.419    0.676    
## day_of_weekMonday    -4390.30    3559.25  -1.233    0.222    
## day_of_weekSaturday   -212.04    3471.18  -0.061    0.951    
## day_of_weekSunday     5264.44    3448.61   1.527    0.132    
## day_of_weekThursday   -858.40    4804.90  -0.179    0.859    
## day_of_weekTuesday    -843.06    3817.56  -0.221    0.826    
## day_of_weekWednesday  2326.59    3531.98   0.659    0.512    
## bobbleheadYES         2831.00    3438.27   0.823    0.413    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8697 on 67 degrees of freedom
## Multiple R-squared:  0.157,  Adjusted R-squared:  -0.006614 
## F-statistic: 0.9596 on 13 and 67 DF,  p-value: 0.4992

Re-level factors for interpretability

# Re-level factors for interpretability
DodgersData$month <- factor(DodgersData$month,
levels = c("APR","MAY","JUN","JUL","AUG","SEP","OCT"))
DodgersData$day_of_week <- factor(DodgersData$day_of_week,
levels = c("Monday","Tuesday","Wednesday",
"Thursday","Friday","Saturday","Sunday"))
# Define model formula
my.model2 <- attend ~ month + day_of_week + bobblehead
my.model.fit2 <-lm(my.model2, data = DodgersData)
print(summary(my.model.fit2))
## 
## Call:
## lm(formula = my.model2, data = DodgersData)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -16867.1  -5248.7   -303.9   5389.6  18274.3 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          34095.85    3583.63   9.514 4.63e-14 ***
## monthMAY              1575.13    3255.95   0.484  0.63013    
## monthJUN               -49.29    3883.35  -0.013  0.98991    
## monthJUL              3137.52    3664.33   0.856  0.39492    
## monthAUG             -1474.05    3414.68  -0.432  0.66736    
## monthSEP             -1501.57    3582.83  -0.419  0.67648    
## monthOCT             -2851.23    5750.23  -0.496  0.62163    
## day_of_weekTuesday    3547.24    3839.99   0.924  0.35892    
## day_of_weekWednesday  6716.89    3572.57   1.880  0.06444 .  
## day_of_weekThursday   3531.90    4954.02   0.713  0.47836    
## day_of_weekFriday     4390.30    3559.25   1.233  0.22170    
## day_of_weekSaturday   4178.26    3626.65   1.152  0.25338    
## day_of_weekSunday     9654.74    3562.19   2.710  0.00853 ** 
## bobbleheadYES         2831.00    3438.27   0.823  0.41321    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8697 on 67 degrees of freedom
## Multiple R-squared:  0.157,  Adjusted R-squared:  -0.006614 
## F-statistic: 0.9596 on 13 and 67 DF,  p-value: 0.4992

Add fireworks as a second promotion variable

# Define model formula
my.model3 <- attend ~ month + day_of_week + fireworks + bobblehead
my.model.fit3 <-lm(my.model3, data = DodgersData)
print(summary(my.model.fit3))
## 
## Call:
## lm(formula = my.model3, data = DodgersData)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -16874.0  -5178.3   -268.1   5447.5  18266.1 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          34117.15    3617.84   9.430 7.53e-14 ***
## monthMAY              1569.60    3280.86   0.478   0.6339    
## monthJUN               -54.48    3912.81  -0.014   0.9889    
## monthJUL              3058.32    3790.61   0.807   0.4227    
## monthAUG             -1474.05    3440.23  -0.428   0.6697    
## monthSEP             -1505.93    3609.95  -0.417   0.6779    
## monthOCT             -2842.98    5793.95  -0.491   0.6253    
## day_of_weekTuesday    3538.93    3869.77   0.915   0.3638    
## day_of_weekWednesday  6636.58    3703.45   1.792   0.0777 .  
## day_of_weekThursday   3507.84    4997.92   0.702   0.4852    
## day_of_weekFriday     3498.89   10322.77   0.339   0.7357    
## day_of_weekSaturday   4163.92    3657.11   1.139   0.2590    
## day_of_weekSunday     9647.20    3589.78   2.687   0.0091 ** 
## fireworksYES           878.95    9544.70   0.092   0.9269    
## bobbleheadYES         2845.46    3467.56   0.821   0.4148    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8762 on 66 degrees of freedom
## Multiple R-squared:  0.1571, Adjusted R-squared:  -0.02173 
## F-statistic: 0.8784 on 14 and 66 DF,  p-value: 0.5847

Predicted attendance for a Saturday afternoon in July with no promotion

# Saturday afternoon in July WITH bobblehead
new_yes <- data.frame(month = "JUL",
                      day_of_week = "Saturday",
                      bobblehead = "YES")
# Saturday afternoon in July WITHOUT bobblehead
new_no <- data.frame(month = "JUL",
                     day_of_week = "Saturday",
                     bobblehead = "NO")
pred_yes <- predict(my.model.fit2, newdata = new_yes,
                    interval = "prediction")
pred_no <- predict(my.model.fit2, newdata = new_no,
                   interval = "prediction")
## cat("With bobblehead: ", round(pred_yes[1]), "\n")
cat("Without bobblehead: ", round(pred_no[1]), "\n")
## Without bobblehead:  41412

Recommendations

I recommend raising ticket prices for games played in the daytime during summer to maximize revenue. The coefficients in the regression model are higher for the months of May, June, and July; and the scatter plots show higher attendance when the temperatures are higher, with daytime games having higher attendance than nighttime games. This suggests higher demand and willingness to pay for games played at these times.