# Faith, Brian MSDS 457 Marketing & Revenue Analytics
library (ggplot2)
library(lattice)
library(plyr)
library(dplyr) 
## Warning: package 'dplyr' was built under R version 4.1.1
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
mlbattend <-read.csv("C:/Users/Brian Faith/Desktop/Northwestern/MSDS 401 Q2/R Data/mlbattend.csv")
str(mlbattend)
## 'data.frame':    21 obs. of  7 variables:
##  $ year       : int  1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 ...
##  $ time_pnine : int  173 177 174 172 166 167 166 169 171 170 ...
##  $ runs_pg    : num  10.17 10.28 9.55 9.24 9.46 ...
##  $ pitches_pa : num  3.74 3.76 3.73 3.74 3.74 3.77 3.74 3.76 3.77 3.81 ...
##  $ pitchers_pg: num  3.56 3.54 3.63 3.63 3.67 3.76 3.71 3.85 3.97 3.92 ...
##  $ attendance : int  70139380 71358907 72581101 67944389 67630052 73022972 74915268 76043902 79484718 78624315 ...
##  $ attend_pg  : int  28887 29377 29881 28006 27831 30075 30816 31306 32696 32382 ...
summary(mlbattend)
##       year        time_pnine       runs_pg         pitches_pa   
##  Min.   :1999   Min.   :166.0   Min.   : 8.130   Min.   :3.730  
##  1st Qu.:2004   1st Qu.:170.0   1st Qu.: 8.770   1st Qu.:3.760  
##  Median :2009   Median :173.0   Median : 9.240   Median :3.820  
##  Mean   :2009   Mean   :174.2   Mean   : 9.196   Mean   :3.808  
##  3rd Qu.:2014   3rd Qu.:178.0   3rd Qu.: 9.590   3rd Qu.:3.830  
##  Max.   :2019   Max.   :185.0   Max.   :10.280   Max.   :3.930  
##   pitchers_pg      attendance         attend_pg    
##  Min.   :3.540   Min.   :67630052   Min.   :27831  
##  1st Qu.:3.710   1st Qu.:71358907   1st Qu.:29377  
##  Median :3.920   Median :73159044   Median :30131  
##  Mean   :3.908   Mean   :72953538   Mean   :30030  
##  3rd Qu.:3.990   3rd Qu.:74027037   3rd Qu.:30451  
##  Max.   :4.410   Max.   :79484718   Max.   :32696
teamyoy <-read.csv("C:/Users/Brian Faith/Desktop/Northwestern/MSDS 401 Q2/R Data/teamyoy.csv")

royals <-read.csv("C:/Users/Brian Faith/Desktop/Northwestern/MSDS 401 Q2/R Data/royals.csv")
str(royals)
## 'data.frame':    161 obs. of  22 variables:
##  $ season       : int  2018 2018 2018 2018 2018 2018 2018 2018 2018 2018 ...
##  $ game         : int  1 2 8 9 10 11 12 13 21 22 ...
##  $ dayofweek    : chr  "Thursday" "Saturday" "Monday" "Tuesday" ...
##  $ month        : chr  "March" "March" "April" "April" ...
##  $ day          : int  29 31 9 10 11 12 13 14 24 25 ...
##  $ opp          : chr  "CHW" "CHW" "SEA" "SEA" ...
##  $ attendance   : int  36517 17564 12324 14850 14314 14714 15011 15876 16555 13389 ...
##  $ temp         : int  45 45 42 60 71 79 69 42 73 50 ...
##  $ skies        : chr  "Cloudy" "Clear" "Cloudy" "Clear" ...
##  $ day_night    : chr  "Day" "Night" "Night" "Night" ...
##  $ opptype      : chr  "DIV" "DIV" "AL" "AL" ...
##  $ interlg      : chr  "NO" "NO" "NO" "NO" ...
##  $ div          : chr  "YES" "YES" "NO" "NO" ...
##  $ amlg         : chr  "NO" "NO" "YES" "YES" ...
##  $ wins         : int  0 0 3 3 3 3 3 3 5 5 ...
##  $ losses       : int  1 2 5 6 7 8 9 10 16 17 ...
##  $ winpct       : num  0 0 0.375 0.333 0.3 0.273 0.25 0.231 0.238 0.227 ...
##  $ giveaway     : chr  "NO" "NO" "NO" "NO" ...
##  $ giveaway_50th: chr  "NO" "NO" "NO" "NO" ...
##  $ fireworks    : chr  "NO" "NO" "NO" "NO" ...
##  $ bobblehead   : chr  "NO" "NO" "NO" "NO" ...
##  $ theme        : chr  "NO" "NO" "NO" "NO" ...
# Table 1 Attendance Correlation Summary Table
cor(mlbattend)
##                    year time_pnine    runs_pg  pitches_pa pitchers_pg
## year         1.00000000  0.6833377 -0.6190586  0.94587944  0.95416273
## time_pnine   0.68333773  1.0000000 -0.2189719  0.74784246  0.66500291
## runs_pg     -0.61905859 -0.2189719  1.0000000 -0.43491475 -0.41122903
## pitches_pa   0.94587944  0.7478425 -0.4349147  1.00000000  0.92865790
## pitchers_pg  0.95416273  0.6650029 -0.4112290  0.92865790  1.00000000
## attendance   0.05526261 -0.2322568 -0.1781325 -0.06681834  0.05746695
## attend_pg    0.05128909 -0.2341930 -0.1748932 -0.06991366  0.05384182
##              attendance   attend_pg
## year         0.05526261  0.05128909
## time_pnine  -0.23225681 -0.23419297
## runs_pg     -0.17813250 -0.17489322
## pitches_pa  -0.06681834 -0.06991366
## pitchers_pg  0.05746695  0.05384182
## attendance   1.00000000  0.99992992
## attend_pg    0.99992992  1.00000000
# Figure 1 Game Duration by Year Scatterplot
plot(mlbattend$year, mlbattend$time_pnine, las = 1, xlab = " ", 
     ylab = "Time Per 9 Innings (mins)", main = "Game Duration by Year",
     col = "goldenrod1", pch = 20,  cex = 3, xaxp =c(1999,2019,5))
lines(lowess(mlbattend$year, mlbattend$time_pnine), col = "red", lty = 2)

# Figure 2 Attendance Year over Year by Team
barplot(teamyoy$yoy_diff/1000 ~ teamyoy$team,
        xlab = "Attendance Change (thousands)", ylab = " ",
        main = "2018-2019 Attendance Change",horiz = TRUE,
        col = ifelse((teamyoy$yoy_diff/1000)<0,"red3","seagreen"), xlim = c(-600, 600),las = 1,
        cex.names = .7)

# Max and Min YoY Change Teams 
with(teamyoy, teamyoy$team[which.max(teamyoy$yoy_diff)])
## [1] "PHI"
max(teamyoy$yoy_diff)
## [1] 569297
with(teamyoy, teamyoy$team[which.min(teamyoy$yoy_diff)])
## [1] "TOR"
min(teamyoy$yoy_diff)
## [1] -575137
# Environmental Analysis 

royals$dayofweek <- with(data=royals,
                              ifelse ((dayofweek == "Monday"),1,
                              ifelse ((dayofweek == "Tuesday"),2,
                              ifelse ((dayofweek == "Wednesday"),3,
                              ifelse ((dayofweek == "Thursday"),4,
                              ifelse ((dayofweek == "Friday"),5,
                              ifelse ((dayofweek == "Saturday"),6,7)))))))
royals$dayofweek <- factor(royals$dayofweek, levels=1:7,
                    labels=c("Mon", "Tue", "Wed", "Thur", "Fri", "Sat", "Sun"))

# Figure 3 Box Plot of Royals Attendance by Day of Week
with(data=royals,plot(dayofweek, (attendance/1000),
                       xlab = " ", ylab = "Attendance (thousands)",
                      main = "Royals Attendance by Day of Week 2018-2019",
                       col = "dodgerblue", las = 1))

royals$month <- with(data=royals,
                              ifelse ((month == "March"),3,
                              ifelse ((month == "April"),4,
                              ifelse ((month == "May"),5,
                              ifelse ((month == "June"),6,
                              ifelse ((month == "July"),7,
                              ifelse ((month == "August"),8,9)))))))
royals$month <- factor(royals$month, levels=3:9,
              labels = c("March","April", "May", "June", "July", "Aug", "Sept"))

# Figure 4 Box plot of Attendance by Month
with(data=royals,plot(month,(attendance/1000), 
                      xlab = " ",ylab = "Attendance (thousands)", 
                      main = "Royals Attendance by Month 2018-2019",
                      col = "goldenrod1", las = 1))

by(royals[, 7], royals[,'month'], median)
## royals[, "month"]: March
## [1] 17564
## ------------------------------------------------------------ 
## royals[, "month"]: April
## [1] 14782
## ------------------------------------------------------------ 
## royals[, "month"]: May
## [1] 20620.5
## ------------------------------------------------------------ 
## royals[, "month"]: June
## [1] 20889
## ------------------------------------------------------------ 
## royals[, "month"]: July
## [1] 18379
## ------------------------------------------------------------ 
## royals[, "month"]: Aug
## [1] 18385
## ------------------------------------------------------------ 
## royals[, "month"]: Sept
## [1] 17875
cor(royals$attendance, royals$temp)
## [1] 0.2445549
# Figure 5 Weather, Fireworks, and Attendance Scatter Plot Lattice
group.labels <- c("No Fireworks","Fireworks")
group.symbols <- c(21,24)
group.colors <- c("black","black")
group.fill <- c("black","red")
xyplot(attendance/1000 ~ temp | skies + day_night,
       data = royals, groups = fireworks, pch = group.symbols,
       aspect = 1, cex = 1.5, col = group.colors, fill = group.fill,
       layout = c(2, 2), type = c("p","g"),
       strip=strip.custom(strip.levels=TRUE,strip.names=FALSE, style=1),
       xlab = "Temperature (Degrees Fahrenheit)",
       ylab = "Attendance (thousands)",
       main = "Weather, Fireworks, and Attendance",
       key = list(space = "right",
                  text = list(rev(group.labels),col = rev(group.colors)),
                  points = list(pch = rev(group.symbols), col = rev(group.colors),
                                fill = rev(group.fill))))

# Figure 6 Attendance by Visiting Team
group.labels <- c("Day","Night")
group.symbols <- c(1,8)
group.symbols.size <- c(1.25,1)
bwplot(opp ~ attendance/1000, data = royals, groups = day_night,
       xlab = "Attendance (thousands)", ylab = " ",
       main = "Attendance by Opponent and Game Time, 2018-2019",
       panel = function(x, y, groups, subscripts, ...)
           
       {panel.grid(h = (length(levels(royals$opp)) - 1), v = -1)
         panel.stripplot(x, y, groups = groups, subscripts = subscripts,
                         cex = group.symbols.size, pch = group.symbols, 
                         col = "darkblue")},
       key = list(space = "right",
                  text = list(group.labels,col = "black"),
                  points = list(pch = group.symbols, cex = group.symbols.size,
                                col = "darkblue")))

# Figure 7 Attendance by Visiting Team by Opponent Type Category
royals$opptype <- with(data=royals,
                       ifelse ((opptype == "INT"),1,
                               ifelse ((opptype == "DIV"),2,3)))
royals$opptype <- factor(royals$opptype, levels=1:3,
    labels = c("Interleague","Division", "American League"))
group.labels <- c("Day","Night")

group.symbols <- c(1,8)
group.symbols.size <- c(1.25,1)
bwplot(opptype ~ attendance/1000, data = royals, groups = day_night,
       xlab = "Attendance (thousands)", ylab = " ", 
       main = "Attendance by Opponent Type",
       panel = function(x, y, groups, subscripts, ...)
       {panel.grid(h = (length(levels(royals$opptype)) - 1), v = -1)
         panel.stripplot(x, y, groups = groups, subscripts = subscripts,
        cex = group.symbols.size, pch = group.symbols, col = "darkblue")},
       key = list(space = "right",
                  text = list(group.labels,col = "black"),
                  points = list(pch = group.symbols, cex = group.symbols.size,
                                col = "darkblue")))

# Promotional Analysis 

# Summary tables for promotional variables by day of week
with(royals, table(bobblehead,dayofweek))
##           dayofweek
## bobblehead Mon Tue Wed Thur Fri Sat Sun
##        NO   15  24  25   19  23  22  25
##        YES   0   0   0    0   1   7   0
with(royals, table(giveaway,dayofweek))
##         dayofweek
## giveaway Mon Tue Wed Thur Fri Sat Sun
##      NO   15  12  25   14  16  22  21
##      YES   0  12   0    5   8   7   4
with(royals, table(giveaway_50th, dayofweek))
##              dayofweek
## giveaway_50th Mon Tue Wed Thur Fri Sat Sun
##           NO   15  24  25   19  24  24  20
##           YES   0   0   0    0   0   5   5
with(royals, table(theme,dayofweek))
##      dayofweek
## theme Mon Tue Wed Thur Fri Sat Sun
##   NO   15  23  22   19  12  19  16
##   YES   0   1   3    0  12  10   9
# Bobbleheads
# Train and Test regimen for model validation
set.seed(1234) # set seed for repeatability of training-and-test split
training_test <- c(rep(1,length=trunc((2/3)*nrow(royals))),
                   rep(2,length=(nrow(royals) - trunc((2/3)*nrow(royals)))))
royals$training_test <- sample(training_test) # random permutation
royals$training_test <- factor(royals$training_test,
                                levels=c(1,2), labels=c("TRAIN","TEST"))

royals.train <- subset(royals, training_test == "TRAIN")
print(str(royals.train)) # check training data frame
## 'data.frame':    107 obs. of  23 variables:
##  $ season       : int  2018 2018 2018 2018 2018 2018 2018 2018 2018 2018 ...
##  $ game         : int  1 2 9 22 23 24 25 26 31 33 ...
##  $ dayofweek    : Factor w/ 7 levels "Mon","Tue","Wed",..: 4 6 2 3 4 5 6 6 4 6 ...
##  $ month        : Factor w/ 7 levels "March","April",..: 1 1 2 2 2 2 2 2 3 3 ...
##  $ day          : int  29 31 10 25 26 27 28 28 3 5 ...
##  $ opp          : chr  "CHW" "CHW" "SEA" "MIL" ...
##  $ attendance   : int  36517 17564 14850 13389 18315 15395 16971 16070 28866 20708 ...
##  $ temp         : int  45 45 60 50 67 75 64 67 76 83 ...
##  $ skies        : chr  "Cloudy" "Clear" "Clear" "Cloudy" ...
##  $ day_night    : chr  "Day" "Night" "Night" "Night" ...
##  $ opptype      : Factor w/ 3 levels "Interleague",..: 2 2 3 1 2 2 2 2 2 2 ...
##  $ interlg      : chr  "NO" "NO" "NO" "YES" ...
##  $ div          : chr  "YES" "YES" "NO" "NO" ...
##  $ amlg         : chr  "NO" "NO" "YES" "NO" ...
##  $ wins         : int  0 0 3 5 5 5 5 6 9 10 ...
##  $ losses       : int  1 2 6 17 18 19 20 20 22 23 ...
##  $ winpct       : num  0 0 0.333 0.227 0.217 0.208 0.2 0.231 0.29 0.303 ...
##  $ giveaway     : chr  "NO" "NO" "NO" "NO" ...
##  $ giveaway_50th: chr  "NO" "NO" "NO" "NO" ...
##  $ fireworks    : chr  "NO" "NO" "NO" "NO" ...
##  $ bobblehead   : chr  "NO" "NO" "NO" "NO" ...
##  $ theme        : chr  "NO" "NO" "NO" "NO" ...
##  $ training_test: Factor w/ 2 levels "TRAIN","TEST": 1 1 1 1 1 1 1 1 1 1 ...
## NULL
royals.test <- subset(royals, training_test == "TEST")
print(str(royals.test)) # check test data frame
## 'data.frame':    54 obs. of  23 variables:
##  $ season       : int  2018 2018 2018 2018 2018 2018 2018 2018 2018 2018 ...
##  $ game         : int  8 10 11 12 13 21 27 32 41 44 ...
##  $ dayofweek    : Factor w/ 7 levels "Mon","Tue","Wed",..: 1 3 4 5 6 2 7 5 1 5 ...
##  $ month        : Factor w/ 7 levels "March","April",..: 2 2 2 2 2 2 2 3 3 3 ...
##  $ day          : int  9 11 12 13 14 24 29 4 14 18 ...
##  $ opp          : chr  "SEA" "SEA" "LAA" "LAA" ...
##  $ attendance   : int  12324 14314 14714 15011 15876 16555 23892 24648 14174 26433 ...
##  $ temp         : int  42 71 79 69 42 73 69 77 82 84 ...
##  $ skies        : chr  "Cloudy" "Cloudy" "Clear" "Cloudy" ...
##  $ day_night    : chr  "Night" "Day" "Night" "Night" ...
##  $ opptype      : Factor w/ 3 levels "Interleague",..: 3 3 3 3 3 1 2 2 3 3 ...
##  $ interlg      : chr  "NO" "NO" "NO" "NO" ...
##  $ div          : chr  "NO" "NO" "NO" "NO" ...
##  $ amlg         : chr  "YES" "YES" "YES" "YES" ...
##  $ wins         : int  3 3 3 3 3 5 7 10 13 14 ...
##  $ losses       : int  5 7 8 9 10 16 20 22 28 30 ...
##  $ winpct       : num  0.375 0.3 0.273 0.25 0.231 0.238 0.259 0.313 0.317 0.318 ...
##  $ giveaway     : chr  "NO" "NO" "YES" "NO" ...
##  $ giveaway_50th: chr  "NO" "NO" "NO" "NO" ...
##  $ fireworks    : chr  "NO" "NO" "NO" "NO" ...
##  $ bobblehead   : chr  "NO" "NO" "NO" "NO" ...
##  $ theme        : chr  "NO" "NO" "NO" "NO" ...
##  $ training_test: Factor w/ 2 levels "TRAIN","TEST": 2 2 2 2 2 2 2 2 2 2 ...
## NULL
# Specify model with bobblehead entered last
my.model <- {attendance ~ month + dayofweek + bobblehead}
# fit the model to the training set
train.model.fit <- lm(my.model, data = royals.train)
# summary of model fit to the training set
print(summary(train.model.fit))
## 
## Call:
## lm(formula = my.model, data = royals.train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11708.2  -3102.7   -346.2   2634.8  14071.6 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      21999       2890   7.611 2.19e-11 ***
## monthApril       -9785       2695  -3.631 0.000462 ***
## monthMay         -3377       2717  -1.243 0.216968    
## monthJune        -1754       2670  -0.657 0.512941    
## monthJuly        -3237       2639  -1.227 0.223040    
## monthAug         -3731       2679  -1.393 0.166996    
## monthSept        -6552       2721  -2.408 0.017997 *  
## dayofweekTue      1115       1904   0.586 0.559512    
## dayofweekWed     -1029       1887  -0.546 0.586657    
## dayofweekThur     2405       1924   1.249 0.214630    
## dayofweekFri      3155       1960   1.610 0.110774    
## dayofweekSat      3242       1904   1.703 0.091880 .  
## dayofweekSun      1365       1901   0.718 0.474722    
## bobbleheadYES     3161       2624   1.205 0.231430    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4626 on 93 degrees of freedom
## Multiple R-squared:  0.347,  Adjusted R-squared:  0.2557 
## F-statistic: 3.801 on 13 and 93 DF,  p-value: 6.907e-05
# training set predictions from the model fit to the training set
royals.train$predict_attendance <- predict(train.model.fit)
# test set predictions from the model fit to the training set
royals.test$predict_attendance <- predict(train.model.fit,
                                       newdata = royals.test)

# compute the proportion of response variance
# accounted for when predicting out-of-sample
cat("\n","Proportion of Test Set Variance Accounted for: ",
    round((with(royals.test,cor(attendance,predict_attendance)^2)),
          digits=3),"\n",sep="")
## 
## Proportion of Test Set Variance Accounted for: 0.257
# merge the training and test sets for plotting
royals.plotting.frame <- rbind(royals.train,royals.test)

# Figure 9 Generate predictive modeling visual - Bobbleheads
group.labels <- c("No Bobbleheads","Bobbleheads")
group.symbols <- c(21,24)
group.colors <- c("black","black")
group.fill <- c("black","dodgerblue")
xyplot(predict_attendance/1000 ~ attendance/1000 | training_test,
       data = royals.plotting.frame, groups = bobblehead, cex = 1.25,
       pch = group.symbols, col = group.colors, fill = group.fill,
       layout = c(2, 1), xlim = c(5,40), ylim = c(5,40),
       aspect=1, type = c("p","g"),
       panel=function(x,y, ...)
       {panel.xyplot(x,y,...)
         panel.segments(8,8,39,39,col="black",cex=2)
       },
       strip=function(...) strip.default(..., style=1),
       xlab = "Actual Attendance (thousands)",
       ylab = "Predicted Attendance (thousands)",
       main = "Regression Model, Bobbleheads and Attendance",
       key = list(space = "right",
                  text = list(rev(group.labels),col = rev(group.colors)),
                  points = list(pch = rev(group.symbols),
                                col = rev(group.colors),
                                fill = rev(group.fill))))

# Bobblehead
# use the full data set to obtain an estimate of the increase in
# attendance due to bobbleheads, controlling for other factors
my.model.fit <- lm(my.model, data = royals)  # use all available data
print(summary(my.model.fit))
## 
## Call:
## lm(formula = my.model, data = royals)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10164.8  -3020.2   -183.8   2210.9  15718.5 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    18745.1     2469.0   7.592 3.35e-12 ***
## monthApril     -6262.2     2302.0  -2.720  0.00731 ** 
## monthMay        -451.2     2342.5  -0.193  0.84753    
## monthJune        354.8     2314.1   0.153  0.87835    
## monthJuly       -327.1     2298.2  -0.142  0.88701    
## monthAug        -989.3     2323.9  -0.426  0.67095    
## monthSept      -3562.5     2298.9  -1.550  0.12338    
## dayofweekTue     781.4     1519.6   0.514  0.60787    
## dayofweekWed     339.9     1508.2   0.225  0.82200    
## dayofweekThur   2826.3     1616.4   1.749  0.08246 .  
## dayofweekFri    4568.4     1525.7   2.994  0.00323 ** 
## dayofweekSat    4952.7     1558.9   3.177  0.00181 ** 
## dayofweekSun    2674.7     1523.6   1.756  0.08125 .  
## bobbleheadYES   1969.5     1922.8   1.024  0.30738    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4609 on 147 degrees of freedom
## Multiple R-squared:  0.3278, Adjusted R-squared:  0.2684 
## F-statistic: 5.515 on 13 and 147 DF,  p-value: 4.293e-08
# tests statistical significance of the bobblehead promotion
# type I anova computes sums of squares for sequential tests
print(anova(my.model.fit))
## Analysis of Variance Table
## 
## Response: attendance
##             Df     Sum Sq   Mean Sq F value    Pr(>F)    
## month        6  870448549 145074758  6.8281  2.09e-06 ***
## dayofweek    6  630539342 105089890  4.9462 0.0001222 ***
## bobblehead   1   22291687  22291687  1.0492 0.3073767    
## Residuals  147 3123249950  21246598                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cat("\n","Estimated Effect of Bobblehead Promotion on Attendance: ",
    round(my.model.fit$coefficients[length(my.model.fit$coefficients)],
          digits = 0),"\n",sep="")
## 
## Estimated Effect of Bobblehead Promotion on Attendance: 1969
# Giveaways
# Train and Test regimen for model validation
set.seed(1234) # set seed for repeatability of training-and-test split
training_test <- c(rep(1,length=trunc((2/3)*nrow(royals))),
                   rep(2,length=(nrow(royals) - trunc((2/3)*nrow(royals)))))
royals$training_test <- sample(training_test) # random permutation
royals$training_test <- factor(royals$training_test,
                               levels=c(1,2), labels=c("TRAIN","TEST"))

royals.train <- subset(royals, training_test == "TRAIN")
print(str(royals.train)) # check training data frame
## 'data.frame':    107 obs. of  23 variables:
##  $ season       : int  2018 2018 2018 2018 2018 2018 2018 2018 2018 2018 ...
##  $ game         : int  1 2 9 22 23 24 25 26 31 33 ...
##  $ dayofweek    : Factor w/ 7 levels "Mon","Tue","Wed",..: 4 6 2 3 4 5 6 6 4 6 ...
##  $ month        : Factor w/ 7 levels "March","April",..: 1 1 2 2 2 2 2 2 3 3 ...
##  $ day          : int  29 31 10 25 26 27 28 28 3 5 ...
##  $ opp          : chr  "CHW" "CHW" "SEA" "MIL" ...
##  $ attendance   : int  36517 17564 14850 13389 18315 15395 16971 16070 28866 20708 ...
##  $ temp         : int  45 45 60 50 67 75 64 67 76 83 ...
##  $ skies        : chr  "Cloudy" "Clear" "Clear" "Cloudy" ...
##  $ day_night    : chr  "Day" "Night" "Night" "Night" ...
##  $ opptype      : Factor w/ 3 levels "Interleague",..: 2 2 3 1 2 2 2 2 2 2 ...
##  $ interlg      : chr  "NO" "NO" "NO" "YES" ...
##  $ div          : chr  "YES" "YES" "NO" "NO" ...
##  $ amlg         : chr  "NO" "NO" "YES" "NO" ...
##  $ wins         : int  0 0 3 5 5 5 5 6 9 10 ...
##  $ losses       : int  1 2 6 17 18 19 20 20 22 23 ...
##  $ winpct       : num  0 0 0.333 0.227 0.217 0.208 0.2 0.231 0.29 0.303 ...
##  $ giveaway     : chr  "NO" "NO" "NO" "NO" ...
##  $ giveaway_50th: chr  "NO" "NO" "NO" "NO" ...
##  $ fireworks    : chr  "NO" "NO" "NO" "NO" ...
##  $ bobblehead   : chr  "NO" "NO" "NO" "NO" ...
##  $ theme        : chr  "NO" "NO" "NO" "NO" ...
##  $ training_test: Factor w/ 2 levels "TRAIN","TEST": 1 1 1 1 1 1 1 1 1 1 ...
## NULL
royals.test <- subset(royals, training_test == "TEST")
print(str(royals.test)) # check test data frame
## 'data.frame':    54 obs. of  23 variables:
##  $ season       : int  2018 2018 2018 2018 2018 2018 2018 2018 2018 2018 ...
##  $ game         : int  8 10 11 12 13 21 27 32 41 44 ...
##  $ dayofweek    : Factor w/ 7 levels "Mon","Tue","Wed",..: 1 3 4 5 6 2 7 5 1 5 ...
##  $ month        : Factor w/ 7 levels "March","April",..: 2 2 2 2 2 2 2 3 3 3 ...
##  $ day          : int  9 11 12 13 14 24 29 4 14 18 ...
##  $ opp          : chr  "SEA" "SEA" "LAA" "LAA" ...
##  $ attendance   : int  12324 14314 14714 15011 15876 16555 23892 24648 14174 26433 ...
##  $ temp         : int  42 71 79 69 42 73 69 77 82 84 ...
##  $ skies        : chr  "Cloudy" "Cloudy" "Clear" "Cloudy" ...
##  $ day_night    : chr  "Night" "Day" "Night" "Night" ...
##  $ opptype      : Factor w/ 3 levels "Interleague",..: 3 3 3 3 3 1 2 2 3 3 ...
##  $ interlg      : chr  "NO" "NO" "NO" "NO" ...
##  $ div          : chr  "NO" "NO" "NO" "NO" ...
##  $ amlg         : chr  "YES" "YES" "YES" "YES" ...
##  $ wins         : int  3 3 3 3 3 5 7 10 13 14 ...
##  $ losses       : int  5 7 8 9 10 16 20 22 28 30 ...
##  $ winpct       : num  0.375 0.3 0.273 0.25 0.231 0.238 0.259 0.313 0.317 0.318 ...
##  $ giveaway     : chr  "NO" "NO" "YES" "NO" ...
##  $ giveaway_50th: chr  "NO" "NO" "NO" "NO" ...
##  $ fireworks    : chr  "NO" "NO" "NO" "NO" ...
##  $ bobblehead   : chr  "NO" "NO" "NO" "NO" ...
##  $ theme        : chr  "NO" "NO" "NO" "NO" ...
##  $ training_test: Factor w/ 2 levels "TRAIN","TEST": 2 2 2 2 2 2 2 2 2 2 ...
## NULL
# Specify model with giveaway entered last
give.model <- {attendance ~ month + dayofweek + giveaway}
# fit the model to the training set
train.model.fit <- lm(give.model, data = royals.train)
# summary of model fit to the training set
print(summary(train.model.fit))
## 
## Call:
## lm(formula = give.model, data = royals.train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11945.4  -2982.2   -416.8   2832.3  13911.9 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    21899.2     2908.3   7.530 3.21e-11 ***
## monthApril     -9746.4     2708.6  -3.598 0.000516 ***
## monthMay       -3246.8     2733.9  -1.188 0.238017    
## monthJune      -1713.4     2700.0  -0.635 0.527249    
## monthJuly      -3179.0     2664.8  -1.193 0.235913    
## monthAug       -3472.1     2676.8  -1.297 0.197804    
## monthSept      -6627.7     2759.1  -2.402 0.018290 *  
## dayofweekTue     561.0     2044.9   0.274 0.784426    
## dayofweekWed   -1017.7     1895.4  -0.537 0.592574    
## dayofweekThur   2266.9     1949.3   1.163 0.247838    
## dayofweekFri    2751.0     2048.2   1.343 0.182503    
## dayofweekSat    3579.2     1880.1   1.904 0.060035 .  
## dayofweekSun    1222.5     1919.9   0.637 0.525833    
## giveawayYES      907.9     1190.1   0.763 0.447457    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4647 on 93 degrees of freedom
## Multiple R-squared:  0.3409, Adjusted R-squared:  0.2488 
## F-statistic:   3.7 on 13 and 93 DF,  p-value: 9.7e-05
# training set predictions from the model fit to the training set
royals.train$predict_attendance <- predict(train.model.fit)
# test set predictions from the model fit to the training set
royals.test$predict_attendance <- predict(train.model.fit,
                                          newdata = royals.test)

# compute the proportion of response variance
# accounted for when predicting out-of-sample
cat("\n","Proportion of Test Set Variance Accounted for: ",
    round((with(royals.test,cor(attendance,predict_attendance)^2)),
          digits=3),"\n",sep="")
## 
## Proportion of Test Set Variance Accounted for: 0.259
# merge the training and test sets for plotting
royals.plotting.frame <- rbind(royals.train,royals.test)


# Figure 10 Generate predictive modeling visual - Giveaway
group.labels <- c("No Giveaway","Giveaway")
group.symbols <- c(21,24)
group.colors <- c("black","black")
group.fill <- c("black","skyblue")
xyplot(predict_attendance/1000 ~ attendance/1000 | training_test,
       data = royals.plotting.frame, groups = giveaway, 
       cex = 1.25, pch = group.symbols, col = group.colors, fill = group.fill,
       layout = c(2, 1), xlim = c(5,40), ylim = c(5,40),
       aspect=1, type = c("p","g"),
       panel=function(x,y, ...)
       {panel.xyplot(x,y,...)
           panel.segments(8,8,39,39,col="black",cex=2)
       },
       strip=function(...) strip.default(..., style=1),
       xlab = "Actual Attendance (thousands)",
       ylab = "Predicted Attendance (thousands)",
       main = "Regression Model, Giveaways and Attendance",
       key = list(space = "right",
                  text = list(rev(group.labels),col = rev(group.colors)),
                  points = list(pch = rev(group.symbols),
                                col = rev(group.colors),
                                fill = rev(group.fill))))

# Giveaways
# use the full data set to obtain an estimate of the increase in
# attendance due to giveaways, controlling for other factors
give.model.fit <- lm(give.model, data = royals)  # use all available data
print(summary(give.model.fit))
## 
## Call:
## lm(formula = give.model, data = royals)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10439.8  -2946.9    -59.2   2154.9  15281.8 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    18686.8     2477.5   7.543 4.41e-12 ***
## monthApril     -6271.3     2311.1  -2.714 0.007453 ** 
## monthMay        -177.5     2322.8  -0.076 0.939197    
## monthJune        364.4     2329.2   0.156 0.875884    
## monthJuly       -342.6     2319.0  -0.148 0.882741    
## monthAug        -827.6     2320.3  -0.357 0.721855    
## monthSept      -3650.9     2319.5  -1.574 0.117633    
## dayofweekTue     478.3     1598.0   0.299 0.765115    
## dayofweekWed     335.7     1511.5   0.222 0.824560    
## dayofweekThur   2678.7     1642.8   1.631 0.105125    
## dayofweekFri    4453.0     1560.5   2.853 0.004949 ** 
## dayofweekSat    5286.0     1507.1   3.507 0.000601 ***
## dayofweekSun    2594.5     1534.5   1.691 0.092998 .  
## giveawayYES      615.3      969.2   0.635 0.526503    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4619 on 147 degrees of freedom
## Multiple R-squared:  0.3249, Adjusted R-squared:  0.2652 
## F-statistic: 5.442 on 13 and 147 DF,  p-value: 5.647e-08
# tests statistical significance of the giveaway
# type I anova computes sums of squares for sequential tests
print(anova(give.model.fit))
## Analysis of Variance Table
## 
## Response: attendance
##            Df     Sum Sq   Mean Sq F value    Pr(>F)    
## month       6  870448549 145074758  6.7983 2.227e-06 ***
## dayofweek   6  630539342 105089890  4.9246 0.0001281 ***
## giveaway    1    8601065   8601065  0.4031 0.5265031    
## Residuals 147 3136940572  21339732                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cat("\n","Estimated Effect of Giveaways on Attendance: ",
    round(give.model.fit$coefficients[length(give.model.fit$coefficients)],
          digits = 0),"\n",sep="")
## 
## Estimated Effect of Giveaways on Attendance: 615
# Giveaway 50th 
# Train and Test regimen for model validation
set.seed(1234) # set seed for repeatability of training-and-test split
training_test <- c(rep(1,length=trunc((2/3)*nrow(royals))),
                   rep(2,length=(nrow(royals) - trunc((2/3)*nrow(royals)))))
royals$training_test <- sample(training_test) # random permutation
royals$training_test <- factor(royals$training_test,
                               levels=c(1,2), labels=c("TRAIN","TEST"))

royals.train <- subset(royals, training_test == "TRAIN")
print(str(royals.train)) # check training data frame
## 'data.frame':    107 obs. of  23 variables:
##  $ season       : int  2018 2018 2018 2018 2018 2018 2018 2018 2018 2018 ...
##  $ game         : int  1 2 9 22 23 24 25 26 31 33 ...
##  $ dayofweek    : Factor w/ 7 levels "Mon","Tue","Wed",..: 4 6 2 3 4 5 6 6 4 6 ...
##  $ month        : Factor w/ 7 levels "March","April",..: 1 1 2 2 2 2 2 2 3 3 ...
##  $ day          : int  29 31 10 25 26 27 28 28 3 5 ...
##  $ opp          : chr  "CHW" "CHW" "SEA" "MIL" ...
##  $ attendance   : int  36517 17564 14850 13389 18315 15395 16971 16070 28866 20708 ...
##  $ temp         : int  45 45 60 50 67 75 64 67 76 83 ...
##  $ skies        : chr  "Cloudy" "Clear" "Clear" "Cloudy" ...
##  $ day_night    : chr  "Day" "Night" "Night" "Night" ...
##  $ opptype      : Factor w/ 3 levels "Interleague",..: 2 2 3 1 2 2 2 2 2 2 ...
##  $ interlg      : chr  "NO" "NO" "NO" "YES" ...
##  $ div          : chr  "YES" "YES" "NO" "NO" ...
##  $ amlg         : chr  "NO" "NO" "YES" "NO" ...
##  $ wins         : int  0 0 3 5 5 5 5 6 9 10 ...
##  $ losses       : int  1 2 6 17 18 19 20 20 22 23 ...
##  $ winpct       : num  0 0 0.333 0.227 0.217 0.208 0.2 0.231 0.29 0.303 ...
##  $ giveaway     : chr  "NO" "NO" "NO" "NO" ...
##  $ giveaway_50th: chr  "NO" "NO" "NO" "NO" ...
##  $ fireworks    : chr  "NO" "NO" "NO" "NO" ...
##  $ bobblehead   : chr  "NO" "NO" "NO" "NO" ...
##  $ theme        : chr  "NO" "NO" "NO" "NO" ...
##  $ training_test: Factor w/ 2 levels "TRAIN","TEST": 1 1 1 1 1 1 1 1 1 1 ...
## NULL
royals.test <- subset(royals, training_test == "TEST")
print(str(royals.test)) # check test data frame
## 'data.frame':    54 obs. of  23 variables:
##  $ season       : int  2018 2018 2018 2018 2018 2018 2018 2018 2018 2018 ...
##  $ game         : int  8 10 11 12 13 21 27 32 41 44 ...
##  $ dayofweek    : Factor w/ 7 levels "Mon","Tue","Wed",..: 1 3 4 5 6 2 7 5 1 5 ...
##  $ month        : Factor w/ 7 levels "March","April",..: 2 2 2 2 2 2 2 3 3 3 ...
##  $ day          : int  9 11 12 13 14 24 29 4 14 18 ...
##  $ opp          : chr  "SEA" "SEA" "LAA" "LAA" ...
##  $ attendance   : int  12324 14314 14714 15011 15876 16555 23892 24648 14174 26433 ...
##  $ temp         : int  42 71 79 69 42 73 69 77 82 84 ...
##  $ skies        : chr  "Cloudy" "Cloudy" "Clear" "Cloudy" ...
##  $ day_night    : chr  "Night" "Day" "Night" "Night" ...
##  $ opptype      : Factor w/ 3 levels "Interleague",..: 3 3 3 3 3 1 2 2 3 3 ...
##  $ interlg      : chr  "NO" "NO" "NO" "NO" ...
##  $ div          : chr  "NO" "NO" "NO" "NO" ...
##  $ amlg         : chr  "YES" "YES" "YES" "YES" ...
##  $ wins         : int  3 3 3 3 3 5 7 10 13 14 ...
##  $ losses       : int  5 7 8 9 10 16 20 22 28 30 ...
##  $ winpct       : num  0.375 0.3 0.273 0.25 0.231 0.238 0.259 0.313 0.317 0.318 ...
##  $ giveaway     : chr  "NO" "NO" "YES" "NO" ...
##  $ giveaway_50th: chr  "NO" "NO" "NO" "NO" ...
##  $ fireworks    : chr  "NO" "NO" "NO" "NO" ...
##  $ bobblehead   : chr  "NO" "NO" "NO" "NO" ...
##  $ theme        : chr  "NO" "NO" "NO" "NO" ...
##  $ training_test: Factor w/ 2 levels "TRAIN","TEST": 2 2 2 2 2 2 2 2 2 2 ...
## NULL
# Specify model with 50th giveaway entered last
give50.model <- {attendance ~ month + dayofweek + giveaway_50th}
# fit the model to the training set
train.model.fit <- lm(give50.model, data = royals.train)
# summary of model fit to the training set
print(summary(train.model.fit))
## 
## Call:
## lm(formula = give50.model, data = royals.train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11972.7  -3183.6   -441.7   2609.9  14012.1 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         21679       2893   7.494 3.81e-11 ***
## monthApril          -9618       2708  -3.552 0.000602 ***
## monthMay            -3015       2712  -1.112 0.269041    
## monthJune           -1530       2680  -0.571 0.569282    
## monthJuly           -2858       2636  -1.084 0.281111    
## monthAug            -3352       2672  -1.254 0.212834    
## monthSept           -6240       2730  -2.286 0.024533 *  
## dayofweekTue         1101       1915   0.575 0.566637    
## dayofweekWed        -1031       1898  -0.543 0.588311    
## dayofweekThur        2460       1935   1.271 0.206793    
## dayofweekFri         3156       1972   1.601 0.112799    
## dayofweekSat         3827       1838   2.082 0.040089 *  
## dayofweekSun         1079       1974   0.547 0.585976    
## giveaway_50thYES     1540       2599   0.593 0.554886    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4653 on 93 degrees of freedom
## Multiple R-squared:  0.3393, Adjusted R-squared:  0.2469 
## F-statistic: 3.673 on 13 and 93 DF,  p-value: 0.0001062
# training set predictions from the model fit to the training set
royals.train$predict_attendance <- predict(train.model.fit)
# test set predictions from the model fit to the training set
royals.test$predict_attendance <- predict(train.model.fit,
                                          newdata = royals.test)

# compute the proportion of response variance
# accounted for when predicting out-of-sample
cat("\n","Proportion of Test Set Variance Accounted for: ",
    round((with(royals.test,cor(attendance,predict_attendance)^2)),
          digits=3),"\n",sep="")
## 
## Proportion of Test Set Variance Accounted for: 0.274
# merge the training and test sets for plotting
royals.plotting.frame <- rbind(royals.train,royals.test)

# Figure 11 Generate predictive modeling visual - 50th Giveaway
group.labels <- c("No 50th Giveaway","50th Anniversary Giveaway ")
group.symbols <- c(21,24)
group.colors <- c("black","black")
group.fill <- c("black","goldenrod1")
xyplot(predict_attendance/1000 ~ attendance/1000 | training_test,
       data = royals.plotting.frame, groups = giveaway_50th, cex = 1.25,
       pch = group.symbols, col = group.colors, fill = group.fill,
       layout = c(2, 1), xlim = c(5,40), ylim = c(5,40),
       aspect=1, type = c("p","g"),
       panel=function(x,y, ...)
       {panel.xyplot(x,y,...)
         panel.segments(8,8,39,39,col="black",cex=2)
       },
       strip=function(...) strip.default(..., style=1),
       xlab = "Actual Attendance (thousands)",
       ylab = "Predicted Attendance (thousands)",
       main = "Regression Model, 50th Giveaways and Attendance",
       key = list(space = "right",
                  text = list(rev(group.labels),col = rev(group.colors)),
                  points = list(pch = rev(group.symbols),
                                col = rev(group.colors),
                                fill = rev(group.fill))))

# 50th Giveaway
# use the full data set to obtain an estimate of the increase in
# attendance due to 50th giveaways, controlling for other factors
give50.model.fit <- lm(give50.model, data = royals)  # use all available data
print(summary(give50.model.fit))
## 
## Call:
## lm(formula = give50.model, data = royals)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10311.1  -3077.7    -44.7   1837.3  14905.3 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       18777.6     2461.6   7.628 2.75e-12 ***
## monthApril        -6372.3     2300.1  -2.770 0.006321 ** 
## monthMay           -217.6     2305.4  -0.094 0.924935    
## monthJune           255.3     2311.7   0.110 0.912203    
## monthJuly          -344.0     2290.7  -0.150 0.880844    
## monthAug          -1030.2     2315.8  -0.445 0.657080    
## monthSept         -3798.0     2305.2  -1.648 0.101579    
## dayofweekTue        796.1     1516.1   0.525 0.600310    
## dayofweekWed        343.4     1504.7   0.228 0.819812    
## dayofweekThur      2834.1     1612.5   1.758 0.080893 .  
## dayofweekFri       4674.3     1519.6   3.076 0.002503 ** 
## dayofweekSat       5066.5     1507.1   3.362 0.000987 ***
## dayofweekSun       2268.8     1553.5   1.460 0.146311    
## giveaway_50thYES   2146.3     1628.4   1.318 0.189528    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4599 on 147 degrees of freedom
## Multiple R-squared:  0.3309, Adjusted R-squared:  0.2718 
## F-statistic: 5.593 on 13 and 147 DF,  p-value: 3.208e-08
# tests statistical significance of the 50th giveaway promotion
# type I anova computes sums of squares for sequential tests
print(anova(give50.model.fit))
## Analysis of Variance Table
## 
## Response: attendance
##                Df     Sum Sq   Mean Sq F value    Pr(>F)    
## month           6  870448549 145074758  6.8599 1.953e-06 ***
## dayofweek       6  630539342 105089890  4.9692 0.0001162 ***
## giveaway_50th   1   36741588  36741588  1.7373 0.1895277    
## Residuals     147 3108800049  21148300                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cat("\n","Estimated Effect of 50th Giveaway on Attendance: ",
    round(give50.model.fit$coefficients[length(give50.model.fit$coefficients)],
          digits = 0),"\n",sep="")
## 
## Estimated Effect of 50th Giveaway on Attendance: 2146
# Themes
# Train and Test regimen for model validation
set.seed(1234) # set seed for repeatability of training-and-test split
training_test <- c(rep(1,length=trunc((2/3)*nrow(royals))),
                   rep(2,length=(nrow(royals) - trunc((2/3)*nrow(royals)))))
royals$training_test <- sample(training_test) # random permutation
royals$training_test <- factor(royals$training_test,
                               levels=c(1,2), labels=c("TRAIN","TEST"))

royals.train <- subset(royals, training_test == "TRAIN")
print(str(royals.train)) # check training data frame
## 'data.frame':    107 obs. of  23 variables:
##  $ season       : int  2018 2018 2018 2018 2018 2018 2018 2018 2018 2018 ...
##  $ game         : int  1 2 9 22 23 24 25 26 31 33 ...
##  $ dayofweek    : Factor w/ 7 levels "Mon","Tue","Wed",..: 4 6 2 3 4 5 6 6 4 6 ...
##  $ month        : Factor w/ 7 levels "March","April",..: 1 1 2 2 2 2 2 2 3 3 ...
##  $ day          : int  29 31 10 25 26 27 28 28 3 5 ...
##  $ opp          : chr  "CHW" "CHW" "SEA" "MIL" ...
##  $ attendance   : int  36517 17564 14850 13389 18315 15395 16971 16070 28866 20708 ...
##  $ temp         : int  45 45 60 50 67 75 64 67 76 83 ...
##  $ skies        : chr  "Cloudy" "Clear" "Clear" "Cloudy" ...
##  $ day_night    : chr  "Day" "Night" "Night" "Night" ...
##  $ opptype      : Factor w/ 3 levels "Interleague",..: 2 2 3 1 2 2 2 2 2 2 ...
##  $ interlg      : chr  "NO" "NO" "NO" "YES" ...
##  $ div          : chr  "YES" "YES" "NO" "NO" ...
##  $ amlg         : chr  "NO" "NO" "YES" "NO" ...
##  $ wins         : int  0 0 3 5 5 5 5 6 9 10 ...
##  $ losses       : int  1 2 6 17 18 19 20 20 22 23 ...
##  $ winpct       : num  0 0 0.333 0.227 0.217 0.208 0.2 0.231 0.29 0.303 ...
##  $ giveaway     : chr  "NO" "NO" "NO" "NO" ...
##  $ giveaway_50th: chr  "NO" "NO" "NO" "NO" ...
##  $ fireworks    : chr  "NO" "NO" "NO" "NO" ...
##  $ bobblehead   : chr  "NO" "NO" "NO" "NO" ...
##  $ theme        : chr  "NO" "NO" "NO" "NO" ...
##  $ training_test: Factor w/ 2 levels "TRAIN","TEST": 1 1 1 1 1 1 1 1 1 1 ...
## NULL
royals.test <- subset(royals, training_test == "TEST")
print(str(royals.test)) # check test data frame
## 'data.frame':    54 obs. of  23 variables:
##  $ season       : int  2018 2018 2018 2018 2018 2018 2018 2018 2018 2018 ...
##  $ game         : int  8 10 11 12 13 21 27 32 41 44 ...
##  $ dayofweek    : Factor w/ 7 levels "Mon","Tue","Wed",..: 1 3 4 5 6 2 7 5 1 5 ...
##  $ month        : Factor w/ 7 levels "March","April",..: 2 2 2 2 2 2 2 3 3 3 ...
##  $ day          : int  9 11 12 13 14 24 29 4 14 18 ...
##  $ opp          : chr  "SEA" "SEA" "LAA" "LAA" ...
##  $ attendance   : int  12324 14314 14714 15011 15876 16555 23892 24648 14174 26433 ...
##  $ temp         : int  42 71 79 69 42 73 69 77 82 84 ...
##  $ skies        : chr  "Cloudy" "Cloudy" "Clear" "Cloudy" ...
##  $ day_night    : chr  "Night" "Day" "Night" "Night" ...
##  $ opptype      : Factor w/ 3 levels "Interleague",..: 3 3 3 3 3 1 2 2 3 3 ...
##  $ interlg      : chr  "NO" "NO" "NO" "NO" ...
##  $ div          : chr  "NO" "NO" "NO" "NO" ...
##  $ amlg         : chr  "YES" "YES" "YES" "YES" ...
##  $ wins         : int  3 3 3 3 3 5 7 10 13 14 ...
##  $ losses       : int  5 7 8 9 10 16 20 22 28 30 ...
##  $ winpct       : num  0.375 0.3 0.273 0.25 0.231 0.238 0.259 0.313 0.317 0.318 ...
##  $ giveaway     : chr  "NO" "NO" "YES" "NO" ...
##  $ giveaway_50th: chr  "NO" "NO" "NO" "NO" ...
##  $ fireworks    : chr  "NO" "NO" "NO" "NO" ...
##  $ bobblehead   : chr  "NO" "NO" "NO" "NO" ...
##  $ theme        : chr  "NO" "NO" "NO" "NO" ...
##  $ training_test: Factor w/ 2 levels "TRAIN","TEST": 2 2 2 2 2 2 2 2 2 2 ...
## NULL
# Specify model with 50th giveaway entered last
theme.model <- {attendance ~ month + dayofweek + theme}
# fit the model to the training set
train.model.fit <- lm(theme.model, data = royals.train)
# summary of model fit to the training set
print(summary(train.model.fit))
## 
## Call:
## lm(formula = theme.model, data = royals.train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11581.6  -3056.4   -238.1   2709.2  14014.9 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    22200.3     2898.1   7.660 1.73e-11 ***
## monthApril     -9948.3     2696.1  -3.690 0.000378 ***
## monthMay       -3312.3     2700.7  -1.226 0.223128    
## monthJune      -1883.0     2671.5  -0.705 0.482665    
## monthJuly      -3512.4     2657.7  -1.322 0.189538    
## monthAug       -3876.2     2682.6  -1.445 0.151842    
## monthSept      -6884.8     2740.4  -2.512 0.013718 *  
## dayofweekTue    1087.7     1899.6   0.573 0.568298    
## dayofweekWed   -1068.8     1882.8  -0.568 0.571611    
## dayofweekThur   2329.7     1922.1   1.212 0.228567    
## dayofweekFri    2327.5     2053.1   1.134 0.259869    
## dayofweekSat    2914.3     1958.0   1.488 0.140029    
## dayofweekSun     413.4     2022.5   0.204 0.838496    
## themeYES        1833.5     1343.0   1.365 0.175478    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4615 on 93 degrees of freedom
## Multiple R-squared:  0.3498, Adjusted R-squared:  0.2589 
## F-statistic: 3.849 on 13 and 93 DF,  p-value: 5.88e-05
# training set predictions from the model fit to the training set
royals.train$predict_attendance <- predict(train.model.fit)
# test set predictions from the model fit to the training set
royals.test$predict_attendance <- predict(train.model.fit,
                                          newdata = royals.test)

# compute the proportion of response variance
# accounted for when predicting out-of-sample
cat("\n","Proportion of Test Set Variance Accounted for: ",
    round((with(royals.test,cor(attendance,predict_attendance)^2)),
          digits=3),"\n",sep="")
## 
## Proportion of Test Set Variance Accounted for: 0.22
# merge the training and test sets for plotting
royals.plotting.frame <- rbind(royals.train,royals.test)

# Figure 12 Generate predictive modeling visual - Theme
group.labels <- c("No Theme","Theme Night")
group.symbols <- c(21,24)
group.colors <- c("black","black")
group.fill <- c("black","cyan")
xyplot(predict_attendance/1000 ~ attendance/1000 | training_test,
       data = royals.plotting.frame, groups = theme, cex = 1.25,
       pch = group.symbols, col = group.colors, fill = group.fill,
       layout = c(2, 1), xlim = c(5,40), ylim = c(5,40),
       aspect=1, type = c("p","g"),
       panel=function(x,y, ...)
       {panel.xyplot(x,y,...)
         panel.segments(8,8,39,39,col="black",cex=2)
       },
       strip=function(...) strip.default(..., style=1),
       xlab = "Actual Attendance (thousands)",
       ylab = "Predicted Attendance (thousands)",
       main = "Regression Model, Theme Nights and Attendance",
       key = list(space = "top",
                  text = list(rev(group.labels),col = rev(group.colors)),
                  points = list(pch = rev(group.symbols),
                                col = rev(group.colors),
                                fill = rev(group.fill))))

# Theme Nights
# use the full data set to obtain an estimate of the increase in
# attendance due to theme nights, controlling for other factors
theme.model.fit <- lm(theme.model, data = royals)  # use all available data
print(summary(theme.model.fit))
## 
## Call:
## lm(formula = theme.model, data = royals)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10361.5  -3083.0    -66.9   1965.6  15402.4 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    18673.9     2476.5   7.540 4.47e-12 ***
## monthApril     -6243.8     2308.8  -2.704 0.007652 ** 
## monthMay        -135.9     2317.9  -0.059 0.953320    
## monthJune        345.2     2334.8   0.148 0.882664    
## monthJuly       -301.0     2311.8  -0.130 0.896579    
## monthAug        -869.9     2326.8  -0.374 0.709052    
## monthSept      -3624.2     2315.8  -1.565 0.119741    
## dayofweekTue     763.3     1523.5   0.501 0.617086    
## dayofweekWed     263.4     1516.2   0.174 0.862331    
## dayofweekThur   2839.3     1620.0   1.753 0.081742 .  
## dayofweekFri    4352.9     1606.1   2.710 0.007522 ** 
## dayofweekSat    5220.6     1530.7   3.411 0.000837 ***
## dayofweekSun    2468.6     1569.8   1.573 0.117984    
## themeYES         617.9     1007.3   0.613 0.540575    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4620 on 147 degrees of freedom
## Multiple R-squared:  0.3248, Adjusted R-squared:  0.265 
## F-statistic: 5.439 on 13 and 147 DF,  p-value: 5.711e-08
# tests statistical significance of the theme night promotion
# type I anova computes sums of squares for sequential tests
print(anova(theme.model.fit))
## Analysis of Variance Table
## 
## Response: attendance
##            Df     Sum Sq   Mean Sq F value    Pr(>F)    
## month       6  870448549 145074758  6.7971 2.233e-06 ***
## dayofweek   6  630539342 105089890  4.9237 0.0001284 ***
## theme       1    8030141   8030141  0.3762 0.5405749    
## Residuals 147 3137511495  21343616                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cat("\n","Estimated Effect of Theme Nights on Attendance: ",
    round(theme.model.fit$coefficients[length(theme.model.fit$coefficients)],
          digits = 0),"\n",sep="")
## 
## Estimated Effect of Theme Nights on Attendance: 618