rm(list=ls())# clear memory
library(lattice) # Graphics Package
library(ggplot2) # Graphical Package
library(readr)
DodgersData <- read_csv("dodgers_randomized.csv")
## Rows: 81 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (9): month, day_of_week, opponent, skies, day_night, cap, shirt, firewor...
## dbl (3): day, attend, temp
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Evaluate the Structure and Re-Level the factor variables for “Day Of Week”” and “Month”” in the right order
# Check the structure for Dorder Data
str(DodgersData)
## spc_tbl_ [81 × 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ month : chr [1:81] "APR" "APR" "APR" "APR" ...
## $ day : num [1:81] 29 26 8 44 18 2 2 12 14 33 ...
## $ attend : num [1:81] 43749 34074 42033 24249 43873 ...
## $ day_of_week: chr [1:81] "Tuesday" "Wednesday" "Thursday" "Friday" ...
## $ opponent : chr [1:81] "Pirates" "Pirates" "Pirates" "Padres" ...
## $ temp : num [1:81] 79 93 78 86 72 59 68 77 84 77 ...
## $ skies : chr [1:81] "Clear" "Cloudy" "Cloudy" "Cloudy" ...
## $ day_night : chr [1:81] "Day" "Night" "Night" "Night" ...
## $ cap : chr [1:81] "NO" "NO" "NO" "NO" ...
## $ shirt : chr [1:81] "NO" "NO" "NO" "NO" ...
## $ fireworks : chr [1:81] "NO" "NO" "NO" "YES" ...
## $ bobblehead : chr [1:81] "NO" "NO" "NO" "NO" ...
## - attr(*, "spec")=
## .. cols(
## .. month = col_character(),
## .. day = col_double(),
## .. attend = col_double(),
## .. day_of_week = col_character(),
## .. opponent = col_character(),
## .. temp = col_double(),
## .. skies = col_character(),
## .. day_night = col_character(),
## .. cap = col_character(),
## .. shirt = col_character(),
## .. fireworks = col_character(),
## .. bobblehead = col_character()
## .. )
## - attr(*, "problems")=<pointer: 0x55b763d4c260>
head(DodgersData)
## # A tibble: 6 × 12
## month day attend day_of_week opponent temp skies day_night cap shirt
## <chr> <dbl> <dbl> <chr> <chr> <dbl> <chr> <chr> <chr> <chr>
## 1 APR 29 43749 Tuesday Pirates 79 Clear Day NO NO
## 2 APR 26 34074 Wednesday Pirates 93 Cloudy Night NO NO
## 3 APR 8 42033 Thursday Pirates 78 Cloudy Night NO NO
## 4 APR 44 24249 Friday Padres 86 Cloudy Night NO NO
## 5 APR 18 43873 Saturday Padres 72 Cloudy Night NO NO
## 6 APR 2 42101 Sunday Padres 59 Clear Day NO NO
## # ℹ 2 more variables: fireworks <chr>, bobblehead <chr>
# Convert to factors
DodgersData$day_of_week <- factor(DodgersData$day_of_week)
DodgersData$month <- factor(DodgersData$month)
# Evaluate the factor levels for day_of_week
levels(DodgersData$day_of_week)
## [1] "Friday" "Monday" "Saturday" "Sunday" "Thursday" "Tuesday"
## [7] "Wednesday"
# Evaluate the factor levels for month
levels(DodgersData$month)
## [1] "APR" "AUG" "JUL" "JUN" "MAY" "OCT" "SEP"
# First 10 rows of the data frame
head(DodgersData, 10)
## # A tibble: 10 × 12
## month day attend day_of_week opponent temp skies day_night cap shirt
## <fct> <dbl> <dbl> <fct> <chr> <dbl> <chr> <chr> <chr> <chr>
## 1 APR 29 43749 Tuesday Pirates 79 Clear Day NO NO
## 2 APR 26 34074 Wednesday Pirates 93 Cloudy Night NO NO
## 3 APR 8 42033 Thursday Pirates 78 Cloudy Night NO NO
## 4 APR 44 24249 Friday Padres 86 Cloudy Night NO NO
## 5 APR 18 43873 Saturday Padres 72 Cloudy Night NO NO
## 6 APR 2 42101 Sunday Padres 59 Clear Day NO NO
## 7 APR 2 36895 Monday Braves 68 Cloudy Night NO NO
## 8 APR 12 52119 Tuesday Braves 77 Cloudy Night NO NO
## 9 APR 14 53269 Wednesday Braves 84 Cloudy Night NO NO
## 10 APR 33 32586 Friday Nationals 77 Clear Night NO NO
## # ℹ 2 more variables: fireworks <chr>, bobblehead <chr>
DodgersData[20, c("temp", "attend", "opponent", "bobblehead")]
## # A tibble: 1 × 4
## temp attend opponent bobblehead
## <dbl> <dbl> <chr> <chr>
## 1 80 40389 Snakes YES
meanattend <- mean(DodgersData$attend)
meanattend
## [1] 39395.8
promotions <- sum(DodgersData$bobblehead=="YES")
promotions
## [1] 11
## in-class notes
# Exploratory Analysis
# Create separate dataset for EDA only, to allow numeric analysis without disrupting the original character data in the models
EDAdata <- DodgersData
# 0. Convert promotion columns from "YES"/"NO" to numeric 1/0
EDAdata$cap <- ifelse(EDAdata$cap == "YES", 1, 0)
EDAdata$shirt <- ifelse(EDAdata$shirt == "YES", 1, 0)
EDAdata$fireworks <- ifelse(EDAdata$fireworks == "YES", 1, 0)
EDAdata$bobblehead <- ifelse(EDAdata$bobblehead == "YES", 1, 0)
# 1. Count each promotion type
promotion_counts <- colSums(EDAdata[, c("cap", "shirt", "fireworks", "bobblehead")])
promotion_counts
## cap shirt fireworks bobblehead
## 2 3 14 11
# 2. Count games by month
table(DodgersData$month)
##
## APR AUG JUL JUN MAY OCT SEP
## 12 15 12 9 18 3 12
EDAdata$month <- factor(EDAdata$month)
table(EDAdata$month)
##
## APR AUG JUL JUN MAY OCT SEP
## 12 15 12 9 18 3 12
# 3. Count games by day of week
EDAdata$day_of_week <- factor(EDAdata$day_of_week)
table(EDAdata$day_of_week)
##
## Friday Monday Saturday Sunday Thursday Tuesday Wednesday
## 13 12 13 13 5 13 12
# 4. Attendance distribution and sellouts (assuming 56,000 capacity)
summary(EDAdata$attend)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 19933 33545 39462 39396 43949 63600
max(EDAdata$attend)
## [1] 63600
sum(EDAdata$attend >= 56000)
## [1] 1
# 5. Number of bobblehead promotions on Tuesday nights
sum(EDAdata$bobblehead == 1 &
EDAdata$day_of_week == "Tuesday" &
EDAdata$day_night == "Night")
## [1] 6
The results show that in 2012 there were a few promotions (see the last four columns):
Cap Shirt Fireworks Bobblehead
We have data from April to September for games played in the Day or Night under Clear or Cloudy Skys.
Dodger Stadium has a capacity of about 56,000. Looking at the entire (sample) data shows that the stadium filled up only once in 2012. There were only two cap promotions, three shirt promotions - not enough data for any inferences. Fireworks and Bobblehead promotions have happened a few times, 14 and 11 respectively.
Furthermore, of the eleven bobblehead promotions, most of them (six) were on Tuesday nights.
#Evaluate attendance by weather
ggplot(DodgersData, aes(x=temp, y=attend/1000, color=fireworks)) +
geom_point() +
facet_wrap(day_night~skies) +
ggtitle("Dodgers Attendance By Temperature By Time of Game and Skies") +
theme(plot.title = element_text(lineheight=3, face="bold", color="black", size=10)) +
xlab("Temperature (Degree Farenheit)") +
ylab("Attendance (Thousands)")
#Strip Plot of Attendance by opponent or visiting team
ggplot(DodgersData, aes(x=attend/1000, y=opponent, color=day_night)) +
geom_point() +
ggtitle("Dodgers Attendance By Opponent") +
theme(plot.title = element_text(lineheight=3, face="bold", color="black", size=10)) +
xlab("Attendance (Thousands)") +
ylab("Opponent (Visiting Team)")
To advise the management if promotions impact attendance we will need to identify if there is a positive effect, and if there is a positive effect how much of an effect it is.
# Create a model with the bobblehead variable entered last
my.model <- {attend ~ month + day_of_week + bobblehead}
# use the full data set to obtain an estimate of the increase in
# attendance due to bobbleheads, controlling for other factors
my.model.fit <- lm(my.model, data = DodgersData) # use all available data
print(summary(my.model.fit))
##
## Call:
## lm(formula = my.model, data = DodgersData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16867.1 -5248.7 -303.9 5389.6 18274.3
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 38486.15 3360.34 11.453 <2e-16 ***
## monthAUG -1474.05 3414.68 -0.432 0.667
## monthJUL 3137.52 3664.33 0.856 0.395
## monthJUN -49.29 3883.35 -0.013 0.990
## monthMAY 1575.13 3255.95 0.484 0.630
## monthOCT -2851.23 5750.23 -0.496 0.622
## monthSEP -1501.57 3582.83 -0.419 0.676
## day_of_weekMonday -4390.30 3559.25 -1.233 0.222
## day_of_weekSaturday -212.04 3471.18 -0.061 0.951
## day_of_weekSunday 5264.44 3448.61 1.527 0.132
## day_of_weekThursday -858.40 4804.90 -0.179 0.859
## day_of_weekTuesday -843.06 3817.56 -0.221 0.826
## day_of_weekWednesday 2326.59 3531.98 0.659 0.512
## bobbleheadYES 2831.00 3438.27 0.823 0.413
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8697 on 67 degrees of freedom
## Multiple R-squared: 0.157, Adjusted R-squared: -0.006614
## F-statistic: 0.9596 on 13 and 67 DF, p-value: 0.4992
# Re-level factors for interpretability
DodgersData$month <- factor(DodgersData$month,
levels = c("APR","MAY","JUN","JUL","AUG","SEP","OCT"))
DodgersData$day_of_week <- factor(DodgersData$day_of_week,
levels = c("Monday","Tuesday","Wednesday",
"Thursday","Friday","Saturday","Sunday"))
# Define model formula
my.model2 <- attend ~ month + day_of_week + bobblehead
my.model.fit2 <-lm(my.model2, data = DodgersData)
print(summary(my.model.fit2))
##
## Call:
## lm(formula = my.model2, data = DodgersData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16867.1 -5248.7 -303.9 5389.6 18274.3
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34095.85 3583.63 9.514 4.63e-14 ***
## monthMAY 1575.13 3255.95 0.484 0.63013
## monthJUN -49.29 3883.35 -0.013 0.98991
## monthJUL 3137.52 3664.33 0.856 0.39492
## monthAUG -1474.05 3414.68 -0.432 0.66736
## monthSEP -1501.57 3582.83 -0.419 0.67648
## monthOCT -2851.23 5750.23 -0.496 0.62163
## day_of_weekTuesday 3547.24 3839.99 0.924 0.35892
## day_of_weekWednesday 6716.89 3572.57 1.880 0.06444 .
## day_of_weekThursday 3531.90 4954.02 0.713 0.47836
## day_of_weekFriday 4390.30 3559.25 1.233 0.22170
## day_of_weekSaturday 4178.26 3626.65 1.152 0.25338
## day_of_weekSunday 9654.74 3562.19 2.710 0.00853 **
## bobbleheadYES 2831.00 3438.27 0.823 0.41321
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8697 on 67 degrees of freedom
## Multiple R-squared: 0.157, Adjusted R-squared: -0.006614
## F-statistic: 0.9596 on 13 and 67 DF, p-value: 0.4992
# Define model formula
my.model3 <- attend ~ month + day_of_week + fireworks + bobblehead
my.model.fit3 <-lm(my.model3, data = DodgersData)
print(summary(my.model.fit3))
##
## Call:
## lm(formula = my.model3, data = DodgersData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16874.0 -5178.3 -268.1 5447.5 18266.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34117.15 3617.84 9.430 7.53e-14 ***
## monthMAY 1569.60 3280.86 0.478 0.6339
## monthJUN -54.48 3912.81 -0.014 0.9889
## monthJUL 3058.32 3790.61 0.807 0.4227
## monthAUG -1474.05 3440.23 -0.428 0.6697
## monthSEP -1505.93 3609.95 -0.417 0.6779
## monthOCT -2842.98 5793.95 -0.491 0.6253
## day_of_weekTuesday 3538.93 3869.77 0.915 0.3638
## day_of_weekWednesday 6636.58 3703.45 1.792 0.0777 .
## day_of_weekThursday 3507.84 4997.92 0.702 0.4852
## day_of_weekFriday 3498.89 10322.77 0.339 0.7357
## day_of_weekSaturday 4163.92 3657.11 1.139 0.2590
## day_of_weekSunday 9647.20 3589.78 2.687 0.0091 **
## fireworksYES 878.95 9544.70 0.092 0.9269
## bobbleheadYES 2845.46 3467.56 0.821 0.4148
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8762 on 66 degrees of freedom
## Multiple R-squared: 0.1571, Adjusted R-squared: -0.02173
## F-statistic: 0.8784 on 14 and 66 DF, p-value: 0.5847
# Saturday afternoon in July WITH bobblehead
new_yes <- data.frame(month = "JUL",
day_of_week = "Saturday",
bobblehead = "YES")
# Saturday afternoon in July WITHOUT bobblehead
new_no <- data.frame(month = "JUL",
day_of_week = "Saturday",
bobblehead = "NO")
pred_yes <- predict(my.model.fit2, newdata = new_yes,
interval = "prediction")
pred_no <- predict(my.model.fit2, newdata = new_no,
interval = "prediction")
## cat("With bobblehead: ", round(pred_yes[1]), "\n")
cat("Without bobblehead: ", round(pred_no[1]), "\n")
## Without bobblehead: 41412
I recommend raising ticket prices for games played in the daytime during summer to maximize revenue. The coefficients in the regression model are higher for the months of May, June, and July; and the scatter plots show higher attendance when the temperatures are higher, with daytime games having higher attendance than nighttime games. This suggests higher demand and willingness to pay for games played at these times.