# Faith, Brian MSDS 457 Marketing & Revenue Analytics
library (ggplot2)
library(lattice)
library(plyr)
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.1.1
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
mlbattend <-read.csv("C:/Users/Brian Faith/Desktop/Northwestern/MSDS 401 Q2/R Data/mlbattend.csv")
str(mlbattend)
## 'data.frame': 21 obs. of 7 variables:
## $ year : int 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 ...
## $ time_pnine : int 173 177 174 172 166 167 166 169 171 170 ...
## $ runs_pg : num 10.17 10.28 9.55 9.24 9.46 ...
## $ pitches_pa : num 3.74 3.76 3.73 3.74 3.74 3.77 3.74 3.76 3.77 3.81 ...
## $ pitchers_pg: num 3.56 3.54 3.63 3.63 3.67 3.76 3.71 3.85 3.97 3.92 ...
## $ attendance : int 70139380 71358907 72581101 67944389 67630052 73022972 74915268 76043902 79484718 78624315 ...
## $ attend_pg : int 28887 29377 29881 28006 27831 30075 30816 31306 32696 32382 ...
summary(mlbattend)
## year time_pnine runs_pg pitches_pa
## Min. :1999 Min. :166.0 Min. : 8.130 Min. :3.730
## 1st Qu.:2004 1st Qu.:170.0 1st Qu.: 8.770 1st Qu.:3.760
## Median :2009 Median :173.0 Median : 9.240 Median :3.820
## Mean :2009 Mean :174.2 Mean : 9.196 Mean :3.808
## 3rd Qu.:2014 3rd Qu.:178.0 3rd Qu.: 9.590 3rd Qu.:3.830
## Max. :2019 Max. :185.0 Max. :10.280 Max. :3.930
## pitchers_pg attendance attend_pg
## Min. :3.540 Min. :67630052 Min. :27831
## 1st Qu.:3.710 1st Qu.:71358907 1st Qu.:29377
## Median :3.920 Median :73159044 Median :30131
## Mean :3.908 Mean :72953538 Mean :30030
## 3rd Qu.:3.990 3rd Qu.:74027037 3rd Qu.:30451
## Max. :4.410 Max. :79484718 Max. :32696
teamyoy <-read.csv("C:/Users/Brian Faith/Desktop/Northwestern/MSDS 401 Q2/R Data/teamyoy.csv")
royals <-read.csv("C:/Users/Brian Faith/Desktop/Northwestern/MSDS 401 Q2/R Data/royals.csv")
str(royals)
## 'data.frame': 161 obs. of 22 variables:
## $ season : int 2018 2018 2018 2018 2018 2018 2018 2018 2018 2018 ...
## $ game : int 1 2 8 9 10 11 12 13 21 22 ...
## $ dayofweek : chr "Thursday" "Saturday" "Monday" "Tuesday" ...
## $ month : chr "March" "March" "April" "April" ...
## $ day : int 29 31 9 10 11 12 13 14 24 25 ...
## $ opp : chr "CHW" "CHW" "SEA" "SEA" ...
## $ attendance : int 36517 17564 12324 14850 14314 14714 15011 15876 16555 13389 ...
## $ temp : int 45 45 42 60 71 79 69 42 73 50 ...
## $ skies : chr "Cloudy" "Clear" "Cloudy" "Clear" ...
## $ day_night : chr "Day" "Night" "Night" "Night" ...
## $ opptype : chr "DIV" "DIV" "AL" "AL" ...
## $ interlg : chr "NO" "NO" "NO" "NO" ...
## $ div : chr "YES" "YES" "NO" "NO" ...
## $ amlg : chr "NO" "NO" "YES" "YES" ...
## $ wins : int 0 0 3 3 3 3 3 3 5 5 ...
## $ losses : int 1 2 5 6 7 8 9 10 16 17 ...
## $ winpct : num 0 0 0.375 0.333 0.3 0.273 0.25 0.231 0.238 0.227 ...
## $ giveaway : chr "NO" "NO" "NO" "NO" ...
## $ giveaway_50th: chr "NO" "NO" "NO" "NO" ...
## $ fireworks : chr "NO" "NO" "NO" "NO" ...
## $ bobblehead : chr "NO" "NO" "NO" "NO" ...
## $ theme : chr "NO" "NO" "NO" "NO" ...
# Table 1 Attendance Correlation Summary Table
cor(mlbattend)
## year time_pnine runs_pg pitches_pa pitchers_pg
## year 1.00000000 0.6833377 -0.6190586 0.94587944 0.95416273
## time_pnine 0.68333773 1.0000000 -0.2189719 0.74784246 0.66500291
## runs_pg -0.61905859 -0.2189719 1.0000000 -0.43491475 -0.41122903
## pitches_pa 0.94587944 0.7478425 -0.4349147 1.00000000 0.92865790
## pitchers_pg 0.95416273 0.6650029 -0.4112290 0.92865790 1.00000000
## attendance 0.05526261 -0.2322568 -0.1781325 -0.06681834 0.05746695
## attend_pg 0.05128909 -0.2341930 -0.1748932 -0.06991366 0.05384182
## attendance attend_pg
## year 0.05526261 0.05128909
## time_pnine -0.23225681 -0.23419297
## runs_pg -0.17813250 -0.17489322
## pitches_pa -0.06681834 -0.06991366
## pitchers_pg 0.05746695 0.05384182
## attendance 1.00000000 0.99992992
## attend_pg 0.99992992 1.00000000
# Figure 1 Game Duration by Year Scatterplot
plot(mlbattend$year, mlbattend$time_pnine, las = 1, xlab = " ",
ylab = "Time Per 9 Innings (mins)", main = "Game Duration by Year",
col = "goldenrod1", pch = 20, cex = 3, xaxp =c(1999,2019,5))
lines(lowess(mlbattend$year, mlbattend$time_pnine), col = "red", lty = 2)

# Figure 2 Attendance Year over Year by Team
barplot(teamyoy$yoy_diff/1000 ~ teamyoy$team,
xlab = "Attendance Change (thousands)", ylab = " ",
main = "2018-2019 Attendance Change",horiz = TRUE,
col = ifelse((teamyoy$yoy_diff/1000)<0,"red3","seagreen"), xlim = c(-600, 600),las = 1,
cex.names = .7)

# Max and Min YoY Change Teams
with(teamyoy, teamyoy$team[which.max(teamyoy$yoy_diff)])
## [1] "PHI"
max(teamyoy$yoy_diff)
## [1] 569297
with(teamyoy, teamyoy$team[which.min(teamyoy$yoy_diff)])
## [1] "TOR"
min(teamyoy$yoy_diff)
## [1] -575137
# Environmental Analysis
royals$dayofweek <- with(data=royals,
ifelse ((dayofweek == "Monday"),1,
ifelse ((dayofweek == "Tuesday"),2,
ifelse ((dayofweek == "Wednesday"),3,
ifelse ((dayofweek == "Thursday"),4,
ifelse ((dayofweek == "Friday"),5,
ifelse ((dayofweek == "Saturday"),6,7)))))))
royals$dayofweek <- factor(royals$dayofweek, levels=1:7,
labels=c("Mon", "Tue", "Wed", "Thur", "Fri", "Sat", "Sun"))
# Figure 3 Box Plot of Royals Attendance by Day of Week
with(data=royals,plot(dayofweek, (attendance/1000),
xlab = " ", ylab = "Attendance (thousands)",
main = "Royals Attendance by Day of Week 2018-2019",
col = "dodgerblue", las = 1))

royals$month <- with(data=royals,
ifelse ((month == "March"),3,
ifelse ((month == "April"),4,
ifelse ((month == "May"),5,
ifelse ((month == "June"),6,
ifelse ((month == "July"),7,
ifelse ((month == "August"),8,9)))))))
royals$month <- factor(royals$month, levels=3:9,
labels = c("March","April", "May", "June", "July", "Aug", "Sept"))
# Figure 4 Box plot of Attendance by Month
with(data=royals,plot(month,(attendance/1000),
xlab = " ",ylab = "Attendance (thousands)",
main = "Royals Attendance by Month 2018-2019",
col = "goldenrod1", las = 1))

by(royals[, 7], royals[,'month'], median)
## royals[, "month"]: March
## [1] 17564
## ------------------------------------------------------------
## royals[, "month"]: April
## [1] 14782
## ------------------------------------------------------------
## royals[, "month"]: May
## [1] 20620.5
## ------------------------------------------------------------
## royals[, "month"]: June
## [1] 20889
## ------------------------------------------------------------
## royals[, "month"]: July
## [1] 18379
## ------------------------------------------------------------
## royals[, "month"]: Aug
## [1] 18385
## ------------------------------------------------------------
## royals[, "month"]: Sept
## [1] 17875
cor(royals$attendance, royals$temp)
## [1] 0.2445549
# Figure 5 Weather, Fireworks, and Attendance Scatter Plot Lattice
group.labels <- c("No Fireworks","Fireworks")
group.symbols <- c(21,24)
group.colors <- c("black","black")
group.fill <- c("black","red")
xyplot(attendance/1000 ~ temp | skies + day_night,
data = royals, groups = fireworks, pch = group.symbols,
aspect = 1, cex = 1.5, col = group.colors, fill = group.fill,
layout = c(2, 2), type = c("p","g"),
strip=strip.custom(strip.levels=TRUE,strip.names=FALSE, style=1),
xlab = "Temperature (Degrees Fahrenheit)",
ylab = "Attendance (thousands)",
main = "Weather, Fireworks, and Attendance",
key = list(space = "right",
text = list(rev(group.labels),col = rev(group.colors)),
points = list(pch = rev(group.symbols), col = rev(group.colors),
fill = rev(group.fill))))

# Figure 6 Attendance by Visiting Team
group.labels <- c("Day","Night")
group.symbols <- c(1,8)
group.symbols.size <- c(1.25,1)
bwplot(opp ~ attendance/1000, data = royals, groups = day_night,
xlab = "Attendance (thousands)", ylab = " ",
main = "Attendance by Opponent and Game Time, 2018-2019",
panel = function(x, y, groups, subscripts, ...)
{panel.grid(h = (length(levels(royals$opp)) - 1), v = -1)
panel.stripplot(x, y, groups = groups, subscripts = subscripts,
cex = group.symbols.size, pch = group.symbols,
col = "darkblue")},
key = list(space = "right",
text = list(group.labels,col = "black"),
points = list(pch = group.symbols, cex = group.symbols.size,
col = "darkblue")))

# Figure 7 Attendance by Visiting Team by Opponent Type Category
royals$opptype <- with(data=royals,
ifelse ((opptype == "INT"),1,
ifelse ((opptype == "DIV"),2,3)))
royals$opptype <- factor(royals$opptype, levels=1:3,
labels = c("Interleague","Division", "American League"))
group.labels <- c("Day","Night")
group.symbols <- c(1,8)
group.symbols.size <- c(1.25,1)
bwplot(opptype ~ attendance/1000, data = royals, groups = day_night,
xlab = "Attendance (thousands)", ylab = " ",
main = "Attendance by Opponent Type",
panel = function(x, y, groups, subscripts, ...)
{panel.grid(h = (length(levels(royals$opptype)) - 1), v = -1)
panel.stripplot(x, y, groups = groups, subscripts = subscripts,
cex = group.symbols.size, pch = group.symbols, col = "darkblue")},
key = list(space = "right",
text = list(group.labels,col = "black"),
points = list(pch = group.symbols, cex = group.symbols.size,
col = "darkblue")))

# Promotional Analysis
# Summary tables for promotional variables by day of week
with(royals, table(bobblehead,dayofweek))
## dayofweek
## bobblehead Mon Tue Wed Thur Fri Sat Sun
## NO 15 24 25 19 23 22 25
## YES 0 0 0 0 1 7 0
with(royals, table(giveaway,dayofweek))
## dayofweek
## giveaway Mon Tue Wed Thur Fri Sat Sun
## NO 15 12 25 14 16 22 21
## YES 0 12 0 5 8 7 4
with(royals, table(giveaway_50th, dayofweek))
## dayofweek
## giveaway_50th Mon Tue Wed Thur Fri Sat Sun
## NO 15 24 25 19 24 24 20
## YES 0 0 0 0 0 5 5
with(royals, table(theme,dayofweek))
## dayofweek
## theme Mon Tue Wed Thur Fri Sat Sun
## NO 15 23 22 19 12 19 16
## YES 0 1 3 0 12 10 9
# Bobbleheads
# Train and Test regimen for model validation
set.seed(1234) # set seed for repeatability of training-and-test split
training_test <- c(rep(1,length=trunc((2/3)*nrow(royals))),
rep(2,length=(nrow(royals) - trunc((2/3)*nrow(royals)))))
royals$training_test <- sample(training_test) # random permutation
royals$training_test <- factor(royals$training_test,
levels=c(1,2), labels=c("TRAIN","TEST"))
royals.train <- subset(royals, training_test == "TRAIN")
print(str(royals.train)) # check training data frame
## 'data.frame': 107 obs. of 23 variables:
## $ season : int 2018 2018 2018 2018 2018 2018 2018 2018 2018 2018 ...
## $ game : int 1 2 9 22 23 24 25 26 31 33 ...
## $ dayofweek : Factor w/ 7 levels "Mon","Tue","Wed",..: 4 6 2 3 4 5 6 6 4 6 ...
## $ month : Factor w/ 7 levels "March","April",..: 1 1 2 2 2 2 2 2 3 3 ...
## $ day : int 29 31 10 25 26 27 28 28 3 5 ...
## $ opp : chr "CHW" "CHW" "SEA" "MIL" ...
## $ attendance : int 36517 17564 14850 13389 18315 15395 16971 16070 28866 20708 ...
## $ temp : int 45 45 60 50 67 75 64 67 76 83 ...
## $ skies : chr "Cloudy" "Clear" "Clear" "Cloudy" ...
## $ day_night : chr "Day" "Night" "Night" "Night" ...
## $ opptype : Factor w/ 3 levels "Interleague",..: 2 2 3 1 2 2 2 2 2 2 ...
## $ interlg : chr "NO" "NO" "NO" "YES" ...
## $ div : chr "YES" "YES" "NO" "NO" ...
## $ amlg : chr "NO" "NO" "YES" "NO" ...
## $ wins : int 0 0 3 5 5 5 5 6 9 10 ...
## $ losses : int 1 2 6 17 18 19 20 20 22 23 ...
## $ winpct : num 0 0 0.333 0.227 0.217 0.208 0.2 0.231 0.29 0.303 ...
## $ giveaway : chr "NO" "NO" "NO" "NO" ...
## $ giveaway_50th: chr "NO" "NO" "NO" "NO" ...
## $ fireworks : chr "NO" "NO" "NO" "NO" ...
## $ bobblehead : chr "NO" "NO" "NO" "NO" ...
## $ theme : chr "NO" "NO" "NO" "NO" ...
## $ training_test: Factor w/ 2 levels "TRAIN","TEST": 1 1 1 1 1 1 1 1 1 1 ...
## NULL
royals.test <- subset(royals, training_test == "TEST")
print(str(royals.test)) # check test data frame
## 'data.frame': 54 obs. of 23 variables:
## $ season : int 2018 2018 2018 2018 2018 2018 2018 2018 2018 2018 ...
## $ game : int 8 10 11 12 13 21 27 32 41 44 ...
## $ dayofweek : Factor w/ 7 levels "Mon","Tue","Wed",..: 1 3 4 5 6 2 7 5 1 5 ...
## $ month : Factor w/ 7 levels "March","April",..: 2 2 2 2 2 2 2 3 3 3 ...
## $ day : int 9 11 12 13 14 24 29 4 14 18 ...
## $ opp : chr "SEA" "SEA" "LAA" "LAA" ...
## $ attendance : int 12324 14314 14714 15011 15876 16555 23892 24648 14174 26433 ...
## $ temp : int 42 71 79 69 42 73 69 77 82 84 ...
## $ skies : chr "Cloudy" "Cloudy" "Clear" "Cloudy" ...
## $ day_night : chr "Night" "Day" "Night" "Night" ...
## $ opptype : Factor w/ 3 levels "Interleague",..: 3 3 3 3 3 1 2 2 3 3 ...
## $ interlg : chr "NO" "NO" "NO" "NO" ...
## $ div : chr "NO" "NO" "NO" "NO" ...
## $ amlg : chr "YES" "YES" "YES" "YES" ...
## $ wins : int 3 3 3 3 3 5 7 10 13 14 ...
## $ losses : int 5 7 8 9 10 16 20 22 28 30 ...
## $ winpct : num 0.375 0.3 0.273 0.25 0.231 0.238 0.259 0.313 0.317 0.318 ...
## $ giveaway : chr "NO" "NO" "YES" "NO" ...
## $ giveaway_50th: chr "NO" "NO" "NO" "NO" ...
## $ fireworks : chr "NO" "NO" "NO" "NO" ...
## $ bobblehead : chr "NO" "NO" "NO" "NO" ...
## $ theme : chr "NO" "NO" "NO" "NO" ...
## $ training_test: Factor w/ 2 levels "TRAIN","TEST": 2 2 2 2 2 2 2 2 2 2 ...
## NULL
# Specify model with bobblehead entered last
my.model <- {attendance ~ month + dayofweek + bobblehead}
# fit the model to the training set
train.model.fit <- lm(my.model, data = royals.train)
# summary of model fit to the training set
print(summary(train.model.fit))
##
## Call:
## lm(formula = my.model, data = royals.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11708.2 -3102.7 -346.2 2634.8 14071.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 21999 2890 7.611 2.19e-11 ***
## monthApril -9785 2695 -3.631 0.000462 ***
## monthMay -3377 2717 -1.243 0.216968
## monthJune -1754 2670 -0.657 0.512941
## monthJuly -3237 2639 -1.227 0.223040
## monthAug -3731 2679 -1.393 0.166996
## monthSept -6552 2721 -2.408 0.017997 *
## dayofweekTue 1115 1904 0.586 0.559512
## dayofweekWed -1029 1887 -0.546 0.586657
## dayofweekThur 2405 1924 1.249 0.214630
## dayofweekFri 3155 1960 1.610 0.110774
## dayofweekSat 3242 1904 1.703 0.091880 .
## dayofweekSun 1365 1901 0.718 0.474722
## bobbleheadYES 3161 2624 1.205 0.231430
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4626 on 93 degrees of freedom
## Multiple R-squared: 0.347, Adjusted R-squared: 0.2557
## F-statistic: 3.801 on 13 and 93 DF, p-value: 6.907e-05
# training set predictions from the model fit to the training set
royals.train$predict_attendance <- predict(train.model.fit)
# test set predictions from the model fit to the training set
royals.test$predict_attendance <- predict(train.model.fit,
newdata = royals.test)
# compute the proportion of response variance
# accounted for when predicting out-of-sample
cat("\n","Proportion of Test Set Variance Accounted for: ",
round((with(royals.test,cor(attendance,predict_attendance)^2)),
digits=3),"\n",sep="")
##
## Proportion of Test Set Variance Accounted for: 0.257
# merge the training and test sets for plotting
royals.plotting.frame <- rbind(royals.train,royals.test)
# Figure 9 Generate predictive modeling visual - Bobbleheads
group.labels <- c("No Bobbleheads","Bobbleheads")
group.symbols <- c(21,24)
group.colors <- c("black","black")
group.fill <- c("black","dodgerblue")
xyplot(predict_attendance/1000 ~ attendance/1000 | training_test,
data = royals.plotting.frame, groups = bobblehead, cex = 1.25,
pch = group.symbols, col = group.colors, fill = group.fill,
layout = c(2, 1), xlim = c(5,40), ylim = c(5,40),
aspect=1, type = c("p","g"),
panel=function(x,y, ...)
{panel.xyplot(x,y,...)
panel.segments(8,8,39,39,col="black",cex=2)
},
strip=function(...) strip.default(..., style=1),
xlab = "Actual Attendance (thousands)",
ylab = "Predicted Attendance (thousands)",
main = "Regression Model, Bobbleheads and Attendance",
key = list(space = "right",
text = list(rev(group.labels),col = rev(group.colors)),
points = list(pch = rev(group.symbols),
col = rev(group.colors),
fill = rev(group.fill))))

# Bobblehead
# use the full data set to obtain an estimate of the increase in
# attendance due to bobbleheads, controlling for other factors
my.model.fit <- lm(my.model, data = royals) # use all available data
print(summary(my.model.fit))
##
## Call:
## lm(formula = my.model, data = royals)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10164.8 -3020.2 -183.8 2210.9 15718.5
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 18745.1 2469.0 7.592 3.35e-12 ***
## monthApril -6262.2 2302.0 -2.720 0.00731 **
## monthMay -451.2 2342.5 -0.193 0.84753
## monthJune 354.8 2314.1 0.153 0.87835
## monthJuly -327.1 2298.2 -0.142 0.88701
## monthAug -989.3 2323.9 -0.426 0.67095
## monthSept -3562.5 2298.9 -1.550 0.12338
## dayofweekTue 781.4 1519.6 0.514 0.60787
## dayofweekWed 339.9 1508.2 0.225 0.82200
## dayofweekThur 2826.3 1616.4 1.749 0.08246 .
## dayofweekFri 4568.4 1525.7 2.994 0.00323 **
## dayofweekSat 4952.7 1558.9 3.177 0.00181 **
## dayofweekSun 2674.7 1523.6 1.756 0.08125 .
## bobbleheadYES 1969.5 1922.8 1.024 0.30738
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4609 on 147 degrees of freedom
## Multiple R-squared: 0.3278, Adjusted R-squared: 0.2684
## F-statistic: 5.515 on 13 and 147 DF, p-value: 4.293e-08
# tests statistical significance of the bobblehead promotion
# type I anova computes sums of squares for sequential tests
print(anova(my.model.fit))
## Analysis of Variance Table
##
## Response: attendance
## Df Sum Sq Mean Sq F value Pr(>F)
## month 6 870448549 145074758 6.8281 2.09e-06 ***
## dayofweek 6 630539342 105089890 4.9462 0.0001222 ***
## bobblehead 1 22291687 22291687 1.0492 0.3073767
## Residuals 147 3123249950 21246598
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cat("\n","Estimated Effect of Bobblehead Promotion on Attendance: ",
round(my.model.fit$coefficients[length(my.model.fit$coefficients)],
digits = 0),"\n",sep="")
##
## Estimated Effect of Bobblehead Promotion on Attendance: 1969
# Giveaways
# Train and Test regimen for model validation
set.seed(1234) # set seed for repeatability of training-and-test split
training_test <- c(rep(1,length=trunc((2/3)*nrow(royals))),
rep(2,length=(nrow(royals) - trunc((2/3)*nrow(royals)))))
royals$training_test <- sample(training_test) # random permutation
royals$training_test <- factor(royals$training_test,
levels=c(1,2), labels=c("TRAIN","TEST"))
royals.train <- subset(royals, training_test == "TRAIN")
print(str(royals.train)) # check training data frame
## 'data.frame': 107 obs. of 23 variables:
## $ season : int 2018 2018 2018 2018 2018 2018 2018 2018 2018 2018 ...
## $ game : int 1 2 9 22 23 24 25 26 31 33 ...
## $ dayofweek : Factor w/ 7 levels "Mon","Tue","Wed",..: 4 6 2 3 4 5 6 6 4 6 ...
## $ month : Factor w/ 7 levels "March","April",..: 1 1 2 2 2 2 2 2 3 3 ...
## $ day : int 29 31 10 25 26 27 28 28 3 5 ...
## $ opp : chr "CHW" "CHW" "SEA" "MIL" ...
## $ attendance : int 36517 17564 14850 13389 18315 15395 16971 16070 28866 20708 ...
## $ temp : int 45 45 60 50 67 75 64 67 76 83 ...
## $ skies : chr "Cloudy" "Clear" "Clear" "Cloudy" ...
## $ day_night : chr "Day" "Night" "Night" "Night" ...
## $ opptype : Factor w/ 3 levels "Interleague",..: 2 2 3 1 2 2 2 2 2 2 ...
## $ interlg : chr "NO" "NO" "NO" "YES" ...
## $ div : chr "YES" "YES" "NO" "NO" ...
## $ amlg : chr "NO" "NO" "YES" "NO" ...
## $ wins : int 0 0 3 5 5 5 5 6 9 10 ...
## $ losses : int 1 2 6 17 18 19 20 20 22 23 ...
## $ winpct : num 0 0 0.333 0.227 0.217 0.208 0.2 0.231 0.29 0.303 ...
## $ giveaway : chr "NO" "NO" "NO" "NO" ...
## $ giveaway_50th: chr "NO" "NO" "NO" "NO" ...
## $ fireworks : chr "NO" "NO" "NO" "NO" ...
## $ bobblehead : chr "NO" "NO" "NO" "NO" ...
## $ theme : chr "NO" "NO" "NO" "NO" ...
## $ training_test: Factor w/ 2 levels "TRAIN","TEST": 1 1 1 1 1 1 1 1 1 1 ...
## NULL
royals.test <- subset(royals, training_test == "TEST")
print(str(royals.test)) # check test data frame
## 'data.frame': 54 obs. of 23 variables:
## $ season : int 2018 2018 2018 2018 2018 2018 2018 2018 2018 2018 ...
## $ game : int 8 10 11 12 13 21 27 32 41 44 ...
## $ dayofweek : Factor w/ 7 levels "Mon","Tue","Wed",..: 1 3 4 5 6 2 7 5 1 5 ...
## $ month : Factor w/ 7 levels "March","April",..: 2 2 2 2 2 2 2 3 3 3 ...
## $ day : int 9 11 12 13 14 24 29 4 14 18 ...
## $ opp : chr "SEA" "SEA" "LAA" "LAA" ...
## $ attendance : int 12324 14314 14714 15011 15876 16555 23892 24648 14174 26433 ...
## $ temp : int 42 71 79 69 42 73 69 77 82 84 ...
## $ skies : chr "Cloudy" "Cloudy" "Clear" "Cloudy" ...
## $ day_night : chr "Night" "Day" "Night" "Night" ...
## $ opptype : Factor w/ 3 levels "Interleague",..: 3 3 3 3 3 1 2 2 3 3 ...
## $ interlg : chr "NO" "NO" "NO" "NO" ...
## $ div : chr "NO" "NO" "NO" "NO" ...
## $ amlg : chr "YES" "YES" "YES" "YES" ...
## $ wins : int 3 3 3 3 3 5 7 10 13 14 ...
## $ losses : int 5 7 8 9 10 16 20 22 28 30 ...
## $ winpct : num 0.375 0.3 0.273 0.25 0.231 0.238 0.259 0.313 0.317 0.318 ...
## $ giveaway : chr "NO" "NO" "YES" "NO" ...
## $ giveaway_50th: chr "NO" "NO" "NO" "NO" ...
## $ fireworks : chr "NO" "NO" "NO" "NO" ...
## $ bobblehead : chr "NO" "NO" "NO" "NO" ...
## $ theme : chr "NO" "NO" "NO" "NO" ...
## $ training_test: Factor w/ 2 levels "TRAIN","TEST": 2 2 2 2 2 2 2 2 2 2 ...
## NULL
# Specify model with giveaway entered last
give.model <- {attendance ~ month + dayofweek + giveaway}
# fit the model to the training set
train.model.fit <- lm(give.model, data = royals.train)
# summary of model fit to the training set
print(summary(train.model.fit))
##
## Call:
## lm(formula = give.model, data = royals.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11945.4 -2982.2 -416.8 2832.3 13911.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 21899.2 2908.3 7.530 3.21e-11 ***
## monthApril -9746.4 2708.6 -3.598 0.000516 ***
## monthMay -3246.8 2733.9 -1.188 0.238017
## monthJune -1713.4 2700.0 -0.635 0.527249
## monthJuly -3179.0 2664.8 -1.193 0.235913
## monthAug -3472.1 2676.8 -1.297 0.197804
## monthSept -6627.7 2759.1 -2.402 0.018290 *
## dayofweekTue 561.0 2044.9 0.274 0.784426
## dayofweekWed -1017.7 1895.4 -0.537 0.592574
## dayofweekThur 2266.9 1949.3 1.163 0.247838
## dayofweekFri 2751.0 2048.2 1.343 0.182503
## dayofweekSat 3579.2 1880.1 1.904 0.060035 .
## dayofweekSun 1222.5 1919.9 0.637 0.525833
## giveawayYES 907.9 1190.1 0.763 0.447457
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4647 on 93 degrees of freedom
## Multiple R-squared: 0.3409, Adjusted R-squared: 0.2488
## F-statistic: 3.7 on 13 and 93 DF, p-value: 9.7e-05
# training set predictions from the model fit to the training set
royals.train$predict_attendance <- predict(train.model.fit)
# test set predictions from the model fit to the training set
royals.test$predict_attendance <- predict(train.model.fit,
newdata = royals.test)
# compute the proportion of response variance
# accounted for when predicting out-of-sample
cat("\n","Proportion of Test Set Variance Accounted for: ",
round((with(royals.test,cor(attendance,predict_attendance)^2)),
digits=3),"\n",sep="")
##
## Proportion of Test Set Variance Accounted for: 0.259
# merge the training and test sets for plotting
royals.plotting.frame <- rbind(royals.train,royals.test)
# Figure 10 Generate predictive modeling visual - Giveaway
group.labels <- c("No Giveaway","Giveaway")
group.symbols <- c(21,24)
group.colors <- c("black","black")
group.fill <- c("black","skyblue")
xyplot(predict_attendance/1000 ~ attendance/1000 | training_test,
data = royals.plotting.frame, groups = giveaway,
cex = 1.25, pch = group.symbols, col = group.colors, fill = group.fill,
layout = c(2, 1), xlim = c(5,40), ylim = c(5,40),
aspect=1, type = c("p","g"),
panel=function(x,y, ...)
{panel.xyplot(x,y,...)
panel.segments(8,8,39,39,col="black",cex=2)
},
strip=function(...) strip.default(..., style=1),
xlab = "Actual Attendance (thousands)",
ylab = "Predicted Attendance (thousands)",
main = "Regression Model, Giveaways and Attendance",
key = list(space = "right",
text = list(rev(group.labels),col = rev(group.colors)),
points = list(pch = rev(group.symbols),
col = rev(group.colors),
fill = rev(group.fill))))

# Giveaways
# use the full data set to obtain an estimate of the increase in
# attendance due to giveaways, controlling for other factors
give.model.fit <- lm(give.model, data = royals) # use all available data
print(summary(give.model.fit))
##
## Call:
## lm(formula = give.model, data = royals)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10439.8 -2946.9 -59.2 2154.9 15281.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 18686.8 2477.5 7.543 4.41e-12 ***
## monthApril -6271.3 2311.1 -2.714 0.007453 **
## monthMay -177.5 2322.8 -0.076 0.939197
## monthJune 364.4 2329.2 0.156 0.875884
## monthJuly -342.6 2319.0 -0.148 0.882741
## monthAug -827.6 2320.3 -0.357 0.721855
## monthSept -3650.9 2319.5 -1.574 0.117633
## dayofweekTue 478.3 1598.0 0.299 0.765115
## dayofweekWed 335.7 1511.5 0.222 0.824560
## dayofweekThur 2678.7 1642.8 1.631 0.105125
## dayofweekFri 4453.0 1560.5 2.853 0.004949 **
## dayofweekSat 5286.0 1507.1 3.507 0.000601 ***
## dayofweekSun 2594.5 1534.5 1.691 0.092998 .
## giveawayYES 615.3 969.2 0.635 0.526503
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4619 on 147 degrees of freedom
## Multiple R-squared: 0.3249, Adjusted R-squared: 0.2652
## F-statistic: 5.442 on 13 and 147 DF, p-value: 5.647e-08
# tests statistical significance of the giveaway
# type I anova computes sums of squares for sequential tests
print(anova(give.model.fit))
## Analysis of Variance Table
##
## Response: attendance
## Df Sum Sq Mean Sq F value Pr(>F)
## month 6 870448549 145074758 6.7983 2.227e-06 ***
## dayofweek 6 630539342 105089890 4.9246 0.0001281 ***
## giveaway 1 8601065 8601065 0.4031 0.5265031
## Residuals 147 3136940572 21339732
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cat("\n","Estimated Effect of Giveaways on Attendance: ",
round(give.model.fit$coefficients[length(give.model.fit$coefficients)],
digits = 0),"\n",sep="")
##
## Estimated Effect of Giveaways on Attendance: 615
# Giveaway 50th
# Train and Test regimen for model validation
set.seed(1234) # set seed for repeatability of training-and-test split
training_test <- c(rep(1,length=trunc((2/3)*nrow(royals))),
rep(2,length=(nrow(royals) - trunc((2/3)*nrow(royals)))))
royals$training_test <- sample(training_test) # random permutation
royals$training_test <- factor(royals$training_test,
levels=c(1,2), labels=c("TRAIN","TEST"))
royals.train <- subset(royals, training_test == "TRAIN")
print(str(royals.train)) # check training data frame
## 'data.frame': 107 obs. of 23 variables:
## $ season : int 2018 2018 2018 2018 2018 2018 2018 2018 2018 2018 ...
## $ game : int 1 2 9 22 23 24 25 26 31 33 ...
## $ dayofweek : Factor w/ 7 levels "Mon","Tue","Wed",..: 4 6 2 3 4 5 6 6 4 6 ...
## $ month : Factor w/ 7 levels "March","April",..: 1 1 2 2 2 2 2 2 3 3 ...
## $ day : int 29 31 10 25 26 27 28 28 3 5 ...
## $ opp : chr "CHW" "CHW" "SEA" "MIL" ...
## $ attendance : int 36517 17564 14850 13389 18315 15395 16971 16070 28866 20708 ...
## $ temp : int 45 45 60 50 67 75 64 67 76 83 ...
## $ skies : chr "Cloudy" "Clear" "Clear" "Cloudy" ...
## $ day_night : chr "Day" "Night" "Night" "Night" ...
## $ opptype : Factor w/ 3 levels "Interleague",..: 2 2 3 1 2 2 2 2 2 2 ...
## $ interlg : chr "NO" "NO" "NO" "YES" ...
## $ div : chr "YES" "YES" "NO" "NO" ...
## $ amlg : chr "NO" "NO" "YES" "NO" ...
## $ wins : int 0 0 3 5 5 5 5 6 9 10 ...
## $ losses : int 1 2 6 17 18 19 20 20 22 23 ...
## $ winpct : num 0 0 0.333 0.227 0.217 0.208 0.2 0.231 0.29 0.303 ...
## $ giveaway : chr "NO" "NO" "NO" "NO" ...
## $ giveaway_50th: chr "NO" "NO" "NO" "NO" ...
## $ fireworks : chr "NO" "NO" "NO" "NO" ...
## $ bobblehead : chr "NO" "NO" "NO" "NO" ...
## $ theme : chr "NO" "NO" "NO" "NO" ...
## $ training_test: Factor w/ 2 levels "TRAIN","TEST": 1 1 1 1 1 1 1 1 1 1 ...
## NULL
royals.test <- subset(royals, training_test == "TEST")
print(str(royals.test)) # check test data frame
## 'data.frame': 54 obs. of 23 variables:
## $ season : int 2018 2018 2018 2018 2018 2018 2018 2018 2018 2018 ...
## $ game : int 8 10 11 12 13 21 27 32 41 44 ...
## $ dayofweek : Factor w/ 7 levels "Mon","Tue","Wed",..: 1 3 4 5 6 2 7 5 1 5 ...
## $ month : Factor w/ 7 levels "March","April",..: 2 2 2 2 2 2 2 3 3 3 ...
## $ day : int 9 11 12 13 14 24 29 4 14 18 ...
## $ opp : chr "SEA" "SEA" "LAA" "LAA" ...
## $ attendance : int 12324 14314 14714 15011 15876 16555 23892 24648 14174 26433 ...
## $ temp : int 42 71 79 69 42 73 69 77 82 84 ...
## $ skies : chr "Cloudy" "Cloudy" "Clear" "Cloudy" ...
## $ day_night : chr "Night" "Day" "Night" "Night" ...
## $ opptype : Factor w/ 3 levels "Interleague",..: 3 3 3 3 3 1 2 2 3 3 ...
## $ interlg : chr "NO" "NO" "NO" "NO" ...
## $ div : chr "NO" "NO" "NO" "NO" ...
## $ amlg : chr "YES" "YES" "YES" "YES" ...
## $ wins : int 3 3 3 3 3 5 7 10 13 14 ...
## $ losses : int 5 7 8 9 10 16 20 22 28 30 ...
## $ winpct : num 0.375 0.3 0.273 0.25 0.231 0.238 0.259 0.313 0.317 0.318 ...
## $ giveaway : chr "NO" "NO" "YES" "NO" ...
## $ giveaway_50th: chr "NO" "NO" "NO" "NO" ...
## $ fireworks : chr "NO" "NO" "NO" "NO" ...
## $ bobblehead : chr "NO" "NO" "NO" "NO" ...
## $ theme : chr "NO" "NO" "NO" "NO" ...
## $ training_test: Factor w/ 2 levels "TRAIN","TEST": 2 2 2 2 2 2 2 2 2 2 ...
## NULL
# Specify model with 50th giveaway entered last
give50.model <- {attendance ~ month + dayofweek + giveaway_50th}
# fit the model to the training set
train.model.fit <- lm(give50.model, data = royals.train)
# summary of model fit to the training set
print(summary(train.model.fit))
##
## Call:
## lm(formula = give50.model, data = royals.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11972.7 -3183.6 -441.7 2609.9 14012.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 21679 2893 7.494 3.81e-11 ***
## monthApril -9618 2708 -3.552 0.000602 ***
## monthMay -3015 2712 -1.112 0.269041
## monthJune -1530 2680 -0.571 0.569282
## monthJuly -2858 2636 -1.084 0.281111
## monthAug -3352 2672 -1.254 0.212834
## monthSept -6240 2730 -2.286 0.024533 *
## dayofweekTue 1101 1915 0.575 0.566637
## dayofweekWed -1031 1898 -0.543 0.588311
## dayofweekThur 2460 1935 1.271 0.206793
## dayofweekFri 3156 1972 1.601 0.112799
## dayofweekSat 3827 1838 2.082 0.040089 *
## dayofweekSun 1079 1974 0.547 0.585976
## giveaway_50thYES 1540 2599 0.593 0.554886
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4653 on 93 degrees of freedom
## Multiple R-squared: 0.3393, Adjusted R-squared: 0.2469
## F-statistic: 3.673 on 13 and 93 DF, p-value: 0.0001062
# training set predictions from the model fit to the training set
royals.train$predict_attendance <- predict(train.model.fit)
# test set predictions from the model fit to the training set
royals.test$predict_attendance <- predict(train.model.fit,
newdata = royals.test)
# compute the proportion of response variance
# accounted for when predicting out-of-sample
cat("\n","Proportion of Test Set Variance Accounted for: ",
round((with(royals.test,cor(attendance,predict_attendance)^2)),
digits=3),"\n",sep="")
##
## Proportion of Test Set Variance Accounted for: 0.274
# merge the training and test sets for plotting
royals.plotting.frame <- rbind(royals.train,royals.test)
# Figure 11 Generate predictive modeling visual - 50th Giveaway
group.labels <- c("No 50th Giveaway","50th Anniversary Giveaway ")
group.symbols <- c(21,24)
group.colors <- c("black","black")
group.fill <- c("black","goldenrod1")
xyplot(predict_attendance/1000 ~ attendance/1000 | training_test,
data = royals.plotting.frame, groups = giveaway_50th, cex = 1.25,
pch = group.symbols, col = group.colors, fill = group.fill,
layout = c(2, 1), xlim = c(5,40), ylim = c(5,40),
aspect=1, type = c("p","g"),
panel=function(x,y, ...)
{panel.xyplot(x,y,...)
panel.segments(8,8,39,39,col="black",cex=2)
},
strip=function(...) strip.default(..., style=1),
xlab = "Actual Attendance (thousands)",
ylab = "Predicted Attendance (thousands)",
main = "Regression Model, 50th Giveaways and Attendance",
key = list(space = "right",
text = list(rev(group.labels),col = rev(group.colors)),
points = list(pch = rev(group.symbols),
col = rev(group.colors),
fill = rev(group.fill))))

# 50th Giveaway
# use the full data set to obtain an estimate of the increase in
# attendance due to 50th giveaways, controlling for other factors
give50.model.fit <- lm(give50.model, data = royals) # use all available data
print(summary(give50.model.fit))
##
## Call:
## lm(formula = give50.model, data = royals)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10311.1 -3077.7 -44.7 1837.3 14905.3
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 18777.6 2461.6 7.628 2.75e-12 ***
## monthApril -6372.3 2300.1 -2.770 0.006321 **
## monthMay -217.6 2305.4 -0.094 0.924935
## monthJune 255.3 2311.7 0.110 0.912203
## monthJuly -344.0 2290.7 -0.150 0.880844
## monthAug -1030.2 2315.8 -0.445 0.657080
## monthSept -3798.0 2305.2 -1.648 0.101579
## dayofweekTue 796.1 1516.1 0.525 0.600310
## dayofweekWed 343.4 1504.7 0.228 0.819812
## dayofweekThur 2834.1 1612.5 1.758 0.080893 .
## dayofweekFri 4674.3 1519.6 3.076 0.002503 **
## dayofweekSat 5066.5 1507.1 3.362 0.000987 ***
## dayofweekSun 2268.8 1553.5 1.460 0.146311
## giveaway_50thYES 2146.3 1628.4 1.318 0.189528
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4599 on 147 degrees of freedom
## Multiple R-squared: 0.3309, Adjusted R-squared: 0.2718
## F-statistic: 5.593 on 13 and 147 DF, p-value: 3.208e-08
# tests statistical significance of the 50th giveaway promotion
# type I anova computes sums of squares for sequential tests
print(anova(give50.model.fit))
## Analysis of Variance Table
##
## Response: attendance
## Df Sum Sq Mean Sq F value Pr(>F)
## month 6 870448549 145074758 6.8599 1.953e-06 ***
## dayofweek 6 630539342 105089890 4.9692 0.0001162 ***
## giveaway_50th 1 36741588 36741588 1.7373 0.1895277
## Residuals 147 3108800049 21148300
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cat("\n","Estimated Effect of 50th Giveaway on Attendance: ",
round(give50.model.fit$coefficients[length(give50.model.fit$coefficients)],
digits = 0),"\n",sep="")
##
## Estimated Effect of 50th Giveaway on Attendance: 2146
# Themes
# Train and Test regimen for model validation
set.seed(1234) # set seed for repeatability of training-and-test split
training_test <- c(rep(1,length=trunc((2/3)*nrow(royals))),
rep(2,length=(nrow(royals) - trunc((2/3)*nrow(royals)))))
royals$training_test <- sample(training_test) # random permutation
royals$training_test <- factor(royals$training_test,
levels=c(1,2), labels=c("TRAIN","TEST"))
royals.train <- subset(royals, training_test == "TRAIN")
print(str(royals.train)) # check training data frame
## 'data.frame': 107 obs. of 23 variables:
## $ season : int 2018 2018 2018 2018 2018 2018 2018 2018 2018 2018 ...
## $ game : int 1 2 9 22 23 24 25 26 31 33 ...
## $ dayofweek : Factor w/ 7 levels "Mon","Tue","Wed",..: 4 6 2 3 4 5 6 6 4 6 ...
## $ month : Factor w/ 7 levels "March","April",..: 1 1 2 2 2 2 2 2 3 3 ...
## $ day : int 29 31 10 25 26 27 28 28 3 5 ...
## $ opp : chr "CHW" "CHW" "SEA" "MIL" ...
## $ attendance : int 36517 17564 14850 13389 18315 15395 16971 16070 28866 20708 ...
## $ temp : int 45 45 60 50 67 75 64 67 76 83 ...
## $ skies : chr "Cloudy" "Clear" "Clear" "Cloudy" ...
## $ day_night : chr "Day" "Night" "Night" "Night" ...
## $ opptype : Factor w/ 3 levels "Interleague",..: 2 2 3 1 2 2 2 2 2 2 ...
## $ interlg : chr "NO" "NO" "NO" "YES" ...
## $ div : chr "YES" "YES" "NO" "NO" ...
## $ amlg : chr "NO" "NO" "YES" "NO" ...
## $ wins : int 0 0 3 5 5 5 5 6 9 10 ...
## $ losses : int 1 2 6 17 18 19 20 20 22 23 ...
## $ winpct : num 0 0 0.333 0.227 0.217 0.208 0.2 0.231 0.29 0.303 ...
## $ giveaway : chr "NO" "NO" "NO" "NO" ...
## $ giveaway_50th: chr "NO" "NO" "NO" "NO" ...
## $ fireworks : chr "NO" "NO" "NO" "NO" ...
## $ bobblehead : chr "NO" "NO" "NO" "NO" ...
## $ theme : chr "NO" "NO" "NO" "NO" ...
## $ training_test: Factor w/ 2 levels "TRAIN","TEST": 1 1 1 1 1 1 1 1 1 1 ...
## NULL
royals.test <- subset(royals, training_test == "TEST")
print(str(royals.test)) # check test data frame
## 'data.frame': 54 obs. of 23 variables:
## $ season : int 2018 2018 2018 2018 2018 2018 2018 2018 2018 2018 ...
## $ game : int 8 10 11 12 13 21 27 32 41 44 ...
## $ dayofweek : Factor w/ 7 levels "Mon","Tue","Wed",..: 1 3 4 5 6 2 7 5 1 5 ...
## $ month : Factor w/ 7 levels "March","April",..: 2 2 2 2 2 2 2 3 3 3 ...
## $ day : int 9 11 12 13 14 24 29 4 14 18 ...
## $ opp : chr "SEA" "SEA" "LAA" "LAA" ...
## $ attendance : int 12324 14314 14714 15011 15876 16555 23892 24648 14174 26433 ...
## $ temp : int 42 71 79 69 42 73 69 77 82 84 ...
## $ skies : chr "Cloudy" "Cloudy" "Clear" "Cloudy" ...
## $ day_night : chr "Night" "Day" "Night" "Night" ...
## $ opptype : Factor w/ 3 levels "Interleague",..: 3 3 3 3 3 1 2 2 3 3 ...
## $ interlg : chr "NO" "NO" "NO" "NO" ...
## $ div : chr "NO" "NO" "NO" "NO" ...
## $ amlg : chr "YES" "YES" "YES" "YES" ...
## $ wins : int 3 3 3 3 3 5 7 10 13 14 ...
## $ losses : int 5 7 8 9 10 16 20 22 28 30 ...
## $ winpct : num 0.375 0.3 0.273 0.25 0.231 0.238 0.259 0.313 0.317 0.318 ...
## $ giveaway : chr "NO" "NO" "YES" "NO" ...
## $ giveaway_50th: chr "NO" "NO" "NO" "NO" ...
## $ fireworks : chr "NO" "NO" "NO" "NO" ...
## $ bobblehead : chr "NO" "NO" "NO" "NO" ...
## $ theme : chr "NO" "NO" "NO" "NO" ...
## $ training_test: Factor w/ 2 levels "TRAIN","TEST": 2 2 2 2 2 2 2 2 2 2 ...
## NULL
# Specify model with 50th giveaway entered last
theme.model <- {attendance ~ month + dayofweek + theme}
# fit the model to the training set
train.model.fit <- lm(theme.model, data = royals.train)
# summary of model fit to the training set
print(summary(train.model.fit))
##
## Call:
## lm(formula = theme.model, data = royals.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11581.6 -3056.4 -238.1 2709.2 14014.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22200.3 2898.1 7.660 1.73e-11 ***
## monthApril -9948.3 2696.1 -3.690 0.000378 ***
## monthMay -3312.3 2700.7 -1.226 0.223128
## monthJune -1883.0 2671.5 -0.705 0.482665
## monthJuly -3512.4 2657.7 -1.322 0.189538
## monthAug -3876.2 2682.6 -1.445 0.151842
## monthSept -6884.8 2740.4 -2.512 0.013718 *
## dayofweekTue 1087.7 1899.6 0.573 0.568298
## dayofweekWed -1068.8 1882.8 -0.568 0.571611
## dayofweekThur 2329.7 1922.1 1.212 0.228567
## dayofweekFri 2327.5 2053.1 1.134 0.259869
## dayofweekSat 2914.3 1958.0 1.488 0.140029
## dayofweekSun 413.4 2022.5 0.204 0.838496
## themeYES 1833.5 1343.0 1.365 0.175478
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4615 on 93 degrees of freedom
## Multiple R-squared: 0.3498, Adjusted R-squared: 0.2589
## F-statistic: 3.849 on 13 and 93 DF, p-value: 5.88e-05
# training set predictions from the model fit to the training set
royals.train$predict_attendance <- predict(train.model.fit)
# test set predictions from the model fit to the training set
royals.test$predict_attendance <- predict(train.model.fit,
newdata = royals.test)
# compute the proportion of response variance
# accounted for when predicting out-of-sample
cat("\n","Proportion of Test Set Variance Accounted for: ",
round((with(royals.test,cor(attendance,predict_attendance)^2)),
digits=3),"\n",sep="")
##
## Proportion of Test Set Variance Accounted for: 0.22
# merge the training and test sets for plotting
royals.plotting.frame <- rbind(royals.train,royals.test)
# Figure 12 Generate predictive modeling visual - Theme
group.labels <- c("No Theme","Theme Night")
group.symbols <- c(21,24)
group.colors <- c("black","black")
group.fill <- c("black","cyan")
xyplot(predict_attendance/1000 ~ attendance/1000 | training_test,
data = royals.plotting.frame, groups = theme, cex = 1.25,
pch = group.symbols, col = group.colors, fill = group.fill,
layout = c(2, 1), xlim = c(5,40), ylim = c(5,40),
aspect=1, type = c("p","g"),
panel=function(x,y, ...)
{panel.xyplot(x,y,...)
panel.segments(8,8,39,39,col="black",cex=2)
},
strip=function(...) strip.default(..., style=1),
xlab = "Actual Attendance (thousands)",
ylab = "Predicted Attendance (thousands)",
main = "Regression Model, Theme Nights and Attendance",
key = list(space = "top",
text = list(rev(group.labels),col = rev(group.colors)),
points = list(pch = rev(group.symbols),
col = rev(group.colors),
fill = rev(group.fill))))

# Theme Nights
# use the full data set to obtain an estimate of the increase in
# attendance due to theme nights, controlling for other factors
theme.model.fit <- lm(theme.model, data = royals) # use all available data
print(summary(theme.model.fit))
##
## Call:
## lm(formula = theme.model, data = royals)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10361.5 -3083.0 -66.9 1965.6 15402.4
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 18673.9 2476.5 7.540 4.47e-12 ***
## monthApril -6243.8 2308.8 -2.704 0.007652 **
## monthMay -135.9 2317.9 -0.059 0.953320
## monthJune 345.2 2334.8 0.148 0.882664
## monthJuly -301.0 2311.8 -0.130 0.896579
## monthAug -869.9 2326.8 -0.374 0.709052
## monthSept -3624.2 2315.8 -1.565 0.119741
## dayofweekTue 763.3 1523.5 0.501 0.617086
## dayofweekWed 263.4 1516.2 0.174 0.862331
## dayofweekThur 2839.3 1620.0 1.753 0.081742 .
## dayofweekFri 4352.9 1606.1 2.710 0.007522 **
## dayofweekSat 5220.6 1530.7 3.411 0.000837 ***
## dayofweekSun 2468.6 1569.8 1.573 0.117984
## themeYES 617.9 1007.3 0.613 0.540575
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4620 on 147 degrees of freedom
## Multiple R-squared: 0.3248, Adjusted R-squared: 0.265
## F-statistic: 5.439 on 13 and 147 DF, p-value: 5.711e-08
# tests statistical significance of the theme night promotion
# type I anova computes sums of squares for sequential tests
print(anova(theme.model.fit))
## Analysis of Variance Table
##
## Response: attendance
## Df Sum Sq Mean Sq F value Pr(>F)
## month 6 870448549 145074758 6.7971 2.233e-06 ***
## dayofweek 6 630539342 105089890 4.9237 0.0001284 ***
## theme 1 8030141 8030141 0.3762 0.5405749
## Residuals 147 3137511495 21343616
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cat("\n","Estimated Effect of Theme Nights on Attendance: ",
round(theme.model.fit$coefficients[length(theme.model.fit$coefficients)],
digits = 0),"\n",sep="")
##
## Estimated Effect of Theme Nights on Attendance: 618