#install.packages(c("randomForest","lattice","tree"),dependencies = TRUE)
library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
library(lattice)
library(tree)
dodgers <- read.csv("~/Datasets/dodgers.csv")
attach(dodgers)
names(dodgers)
## [1] "month" "day" "attend" "day_of_week" "opponent"
## [6] "temp" "skies" "day_night" "cap" "shirt"
## [11] "fireworks" "bobblehead"
summary(dodgers)
## month day attend day_of_week opponent
## APR:12 Min. : 1.00 Min. :24312 Friday :13 Giants : 9
## AUG:15 1st Qu.: 8.00 1st Qu.:34493 Monday :12 Padres : 9
## JUL:12 Median :15.00 Median :40284 Saturday :13 Rockies : 9
## JUN: 9 Mean :16.14 Mean :41040 Sunday :13 Snakes : 9
## MAY:18 3rd Qu.:25.00 3rd Qu.:46588 Thursday : 5 Cardinals: 7
## OCT: 3 Max. :31.00 Max. :56000 Tuesday :13 Brewers : 4
## SEP:12 Wednesday:12 (Other) :34
## temp skies day_night cap shirt fireworks
## Min. :54.00 Clear :62 Day :15 NO :79 NO :78 NO :67
## 1st Qu.:67.00 Cloudy:19 Night:66 YES: 2 YES: 3 YES:14
## Median :73.00
## Mean :73.15
## 3rd Qu.:79.00
## Max. :95.00
##
## bobblehead
## NO :70
## YES:11
##
##
##
##
##
plot(dodgers$attend/1000~dodgers$temp,las=1,xlab = "Temperature",
ylab = "Attendance(thousands)",col="blue",pch=19,
main = "Temperature by Attendance(thousands)")
Let’s create an ordered month variable and add it to our dataset.
# define an ordered month variable ################
# for plots and data summaries
dodgers$ordered_month <- with(data=dodgers,
ifelse ((month == "APR"),4,
ifelse ((month == "MAY"),5,
ifelse ((month == "JUN"),6,
ifelse ((month == "JUL"),7,
ifelse ((month == "AUG"),8,
ifelse ((month == "SEP"),9,10)))))))
dodgers$ordered_month <- factor(dodgers$ordered_month, levels=4:10,
labels = c("April", "May", "June", "July", "Aug", "Sept", "Oct"))
Attendance by Temp ordered by Month
plot(dodgers$attend~dodgers$ordered_month,las=1,xlab = "Month",
ylab = "Temp",col="blue",pch=19,
main = "Month by Temp")
Let’s create an ordered day-of-week variable as well. We will use this later on in our data analysis.
# define an ordered day-of-week variable ##############
# for plots and data summaries
dodgers$ordered_day_of_week <- with(data=dodgers,
ifelse ((day_of_week == "Monday"),1,
ifelse ((day_of_week == "Tuesday"),2,
ifelse ((day_of_week == "Wednesday"),3,
ifelse ((day_of_week == "Thursday"),4,
ifelse ((day_of_week == "Friday"),5,
ifelse ((day_of_week == "Saturday"),6,7)))))))
dodgers$ordered_day_of_week <- factor(dodgers$ordered_day_of_week, levels=1:7,
labels=c("Mon", "Tue", "Wed", "Thur", "Fri", "Sat", "Sun"))
Let’s create a few tables to summarize our data and give ourselves a better overview. These tables will show us what month’s the giveaways took place in and how many of each item was given away.
##Caps
with(dodgers, table(dodgers$cap,ordered_month))
## ordered_month
## April May June July Aug Sept Oct
## NO 12 18 9 11 14 12 3
## YES 0 0 0 1 1 0 0
##T-shirts
with(dodgers, table(dodgers$shirt,ordered_month))
## ordered_month
## April May June July Aug Sept Oct
## NO 11 18 8 12 15 11 3
## YES 1 0 1 0 0 1 0
##Fireworks
with(dodgers, table(dodgers$fireworks,ordered_month)) ##Fireworks on Friday
## ordered_month
## April May June July Aug Sept Oct
## NO 10 15 7 10 12 10 3
## YES 2 3 2 2 3 2 0
Let’s create a lattice plot next. The lattice plot allows us to group variables and look at several variables at once.
# exploratory data analysis displaying many variables ########
# looking at attendance and conditioning on day/night
# the skies and whether or not fireworks are displayed
library(lattice) # used for plotting
# let us prepare a graphical summary of the dodgers data
group.labels <- c("No Fireworks","Fireworks")
group.symbols <- c(21,24)
group.colors <- c("black","black")
group.fill <- c("black","red")
xyplot(attend/1000 ~ temp | skies + day_night,
data = dodgers, groups = fireworks, pch = group.symbols,
aspect = 1, cex = 1.5, col = group.colors, fill = group.fill,
layout = c(2, 2), type = c("p","g"),
strip=strip.custom(strip.levels=TRUE,strip.names=FALSE, style=1),
xlab = "Temperature (Degrees Fahrenheit)",
ylab = "Attendance (thousands)",
key = list(space = "top",
text = list(rev(group.labels),col = rev(group.colors)),
points = list(pch = rev(group.symbols), col = rev(group.colors),
fill = rev(group.fill))))
Lattice plot for cap giveaways:
##Cap Giveaways #######
group.labels <- c("No Cap","Cap")
group.symbols <- c(21,24)
group.colors <- c("blue","blue")
group.fill <- c("green","yellow")
xyplot(attend/1000 ~ temp | skies + day_night,
data = dodgers, groups = cap, pch = group.symbols,
aspect = 1, cex = 1.5, col = group.colors, fill = group.fill,
layout = c(2, 2), type = c("p","g"),
strip=strip.custom(strip.levels=TRUE,strip.names=FALSE, style=1),
xlab = "Temperature (Degrees Fahrenheit)",
ylab = "Attendance (thousands)",
key = list(space = "top",
text = list(rev(group.labels),col = rev(group.colors)),
points = list(pch = rev(group.symbols), col = rev(group.colors),
fill = rev(group.fill))))
Let’s create a testing set and training set to help in analyzing our data.
##Training Set employ training-and-test regimen for model validation ############
set.seed(1234) # set seed for repeatability of training-and-test split
training_test <- c(rep(1,length=trunc((2/3)*nrow(dodgers))),
rep(2,length=(nrow(dodgers) - trunc((2/3)*nrow(dodgers)))))
dodgers$training_test <- sample(training_test) # random permutation
dodgers$training_test <- factor(dodgers$training_test,
levels=c(1,2), labels=c("TRAIN","TEST"))
dodgers.train <- subset(dodgers, training_test == "TRAIN")
print(str(dodgers.train)) # check training data frame
## 'data.frame': 54 obs. of 15 variables:
## $ month : Factor w/ 7 levels "APR","AUG","JUL",..: 1 1 1 1 1 1 1 5 5 5 ...
## $ day : int 10 11 12 23 24 27 29 7 12 14 ...
## $ attend : int 56000 29729 28328 26376 44014 44807 48753 43713 33735 24312 ...
## $ day_of_week : Factor w/ 7 levels "Friday","Monday",..: 6 7 5 2 6 1 4 2 3 2 ...
## $ opponent : Factor w/ 17 levels "Angels","Astros",..: 13 13 13 3 3 10 10 7 15 16 ...
## $ temp : int 67 58 57 60 63 66 74 67 65 67 ...
## $ skies : Factor w/ 2 levels "Clear ","Cloudy": 1 2 2 2 2 1 1 1 1 1 ...
## $ day_night : Factor w/ 2 levels "Day","Night": 1 2 2 2 2 2 1 2 2 2 ...
## $ cap : Factor w/ 2 levels "NO","YES": 1 1 1 1 1 1 1 1 1 1 ...
## $ shirt : Factor w/ 2 levels "NO","YES": 1 1 1 1 1 1 2 1 1 1 ...
## $ fireworks : Factor w/ 2 levels "NO","YES": 1 1 1 1 1 2 1 1 1 1 ...
## $ bobblehead : Factor w/ 2 levels "NO","YES": 1 1 1 1 1 1 1 1 1 1 ...
## $ ordered_month : Factor w/ 7 levels "April","May",..: 1 1 1 1 1 1 1 2 2 2 ...
## $ ordered_day_of_week: Factor w/ 7 levels "Mon","Tue","Wed",..: 2 3 4 1 2 5 7 1 6 1 ...
## $ training_test : Factor w/ 2 levels "TRAIN","TEST": 1 1 1 1 1 1 1 1 1 1 ...
## NULL
dodgers.test <- subset(dodgers, training_test == "TEST")
print(str(dodgers.test)) # check test data frame
## 'data.frame': 27 obs. of 15 variables:
## $ month : Factor w/ 7 levels "APR","AUG","JUL",..: 1 1 1 1 1 5 5 5 5 5 ...
## $ day : int 13 14 15 25 28 8 9 11 13 18 ...
## $ attend : int 31601 46549 38359 26345 54242 32799 33993 35591 49124 40906 ...
## $ day_of_week : Factor w/ 7 levels "Friday","Monday",..: 1 3 4 7 3 6 7 1 4 1 ...
## $ opponent : Factor w/ 17 levels "Angels","Astros",..: 11 11 11 3 10 7 7 15 15 5 ...
## $ temp : int 54 57 65 64 71 75 71 65 70 64 ...
## $ skies : Factor w/ 2 levels "Clear ","Cloudy": 2 2 1 2 1 1 1 1 1 1 ...
## $ day_night : Factor w/ 2 levels "Day","Night": 2 2 1 2 2 2 2 2 1 2 ...
## $ cap : Factor w/ 2 levels "NO","YES": 1 1 1 1 1 1 1 1 1 1 ...
## $ shirt : Factor w/ 2 levels "NO","YES": 1 1 1 1 1 1 1 1 1 1 ...
## $ fireworks : Factor w/ 2 levels "NO","YES": 2 1 1 1 1 1 1 2 1 2 ...
## $ bobblehead : Factor w/ 2 levels "NO","YES": 1 1 1 1 2 1 1 1 1 1 ...
## $ ordered_month : Factor w/ 7 levels "April","May",..: 1 1 1 1 1 2 2 2 2 2 ...
## $ ordered_day_of_week: Factor w/ 7 levels "Mon","Tue","Wed",..: 5 6 7 3 6 2 3 5 7 5 ...
## $ training_test : Factor w/ 2 levels "TRAIN","TEST": 2 2 2 2 2 2 2 2 2 2 ...
## NULL
Let’s create a linear model from our dataset. We will then see what, if any, effect bobblehead’s have on attendance.
##Booblehead Linear Models ###################################################
# specify a simple model with bobblehead entered last
my.model <- {attend ~ ordered_month + ordered_day_of_week + bobblehead}
# fit the model to the training set
train.model.fit <- lm(my.model, data = dodgers.train)
# summary of model fit to the training set
print(summary(train.model.fit))
##
## Call:
## lm(formula = my.model, data = dodgers.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11168.1 -3302.9 -101.9 3105.6 13556.2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 35537.0 3301.4 10.764 2.21e-13 ***
## ordered_monthMay -3945.3 3507.2 -1.125 0.2673
## ordered_monthJune 7162.4 3876.3 1.848 0.0720 .
## ordered_monthJuly 1080.0 3524.6 0.306 0.7609
## ordered_monthAug 1496.0 3350.7 0.446 0.6577
## ordered_monthSept 284.9 3417.2 0.083 0.9340
## ordered_monthOct -2039.0 4546.1 -0.449 0.6562
## ordered_day_of_weekTue 8435.1 3482.1 2.422 0.0200 *
## ordered_day_of_weekWed 1181.9 3125.2 0.378 0.7073
## ordered_day_of_weekThur 2753.8 4700.3 0.586 0.5612
## ordered_day_of_weekFri 3962.3 3164.0 1.252 0.2177
## ordered_day_of_weekSat 3261.3 3293.1 0.990 0.3280
## ordered_day_of_weekSun 4479.6 3284.5 1.364 0.1802
## bobbleheadYES 9604.5 3607.0 2.663 0.0111 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6423 on 40 degrees of freedom
## Multiple R-squared: 0.5402, Adjusted R-squared: 0.3907
## F-statistic: 3.614 on 13 and 40 DF, p-value: 0.0008601
# training set predictions from the model fit to the training set
dodgers.train$predict_attend <- predict(train.model.fit)
# test set predictions from the model fit to the training set
dodgers.test$predict_attend <- predict(train.model.fit,
newdata = dodgers.test)
# compute the proportion of response variance
# accounted for when predicting out-of-sample
cat("\n","Proportion of Test Set Variance Accounted for: ",
round((with(dodgers.test,cor(attend,predict_attend)^2)),
digits=3),"\n",sep="")
##
## Proportion of Test Set Variance Accounted for: 0.453
Let’s plot our data using the following code.
# merge the training and test sets for plotting
dodgers.plotting.frame <- rbind(dodgers.train,dodgers.test)
# generate predictive modeling visual for management
group.labels <- c("No Bobbleheads","Bobbleheads")
group.symbols <- c(21,24)
group.colors <- c("black","black")
group.fill <- c("black","red")
xyplot(predict_attend/1000 ~ attend/1000 | training_test,
data = dodgers.plotting.frame, groups = bobblehead, cex = 2,
pch = group.symbols, col = group.colors, fill = group.fill,
layout = c(2, 1), xlim = c(20,65), ylim = c(20,65),
aspect=1, type = c("p","g"),
panel=function(x,y, ...)
{panel.xyplot(x,y,...)
panel.segments(25,25,60,60,col="black",cex=2)
},
strip=function(...) strip.default(..., style=1),
xlab = "Actual Attendance (thousands)",
ylab = "Predicted Attendance (thousands)",
key = list(space = "top",
text = list(rev(group.labels),col = rev(group.colors)),
points = list(pch = rev(group.symbols),
col = rev(group.colors),
fill = rev(group.fill))))
# use the full data set to obtain an estimate of the increase in
# attendance due to bobbleheads, controlling for other factors
my.model.fit <- lm(my.model, data = dodgers) # use all available data
print(summary(my.model.fit))
##
## Call:
## lm(formula = my.model, data = dodgers)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10786.5 -3628.1 -516.1 2230.2 14351.0
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 33909.16 2521.81 13.446 < 2e-16 ***
## ordered_monthMay -2385.62 2291.22 -1.041 0.30152
## ordered_monthJune 7163.23 2732.72 2.621 0.01083 *
## ordered_monthJuly 2849.83 2578.60 1.105 0.27303
## ordered_monthAug 2377.92 2402.91 0.990 0.32593
## ordered_monthSept 29.03 2521.25 0.012 0.99085
## ordered_monthOct -662.67 4046.45 -0.164 0.87041
## ordered_day_of_weekTue 7911.49 2702.21 2.928 0.00466 **
## ordered_day_of_weekWed 2460.02 2514.03 0.979 0.33134
## ordered_day_of_weekThur 775.36 3486.15 0.222 0.82467
## ordered_day_of_weekFri 4883.82 2504.65 1.950 0.05537 .
## ordered_day_of_weekSat 6372.06 2552.08 2.497 0.01500 *
## ordered_day_of_weekSun 6724.00 2506.72 2.682 0.00920 **
## bobbleheadYES 10714.90 2419.52 4.429 3.59e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6120 on 67 degrees of freedom
## Multiple R-squared: 0.5444, Adjusted R-squared: 0.456
## F-statistic: 6.158 on 13 and 67 DF, p-value: 2.083e-07
# tests statistical significance of the bobblehead promotion
# type I anova computes sums of squares for sequential tests
print(anova(my.model.fit))
## Analysis of Variance Table
##
## Response: attend
## Df Sum Sq Mean Sq F value Pr(>F)
## ordered_month 6 948958117 158159686 4.2225 0.001158 **
## ordered_day_of_week 6 1314813030 219135505 5.8504 6.002e-05 ***
## bobblehead 1 734587177 734587177 19.6118 3.590e-05 ***
## Residuals 67 2509574563 37456337
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
What is the estimated effect that bobbleheads have on attendance?
cat("\n","Estimated Effect of Bobblehead Promotion on Attendance: ",
round(my.model.fit$coefficients[length(my.model.fit$coefficients)],
digits = 0),"\n",sep="")
##
## Estimated Effect of Bobblehead Promotion on Attendance: 10715
Let’s preprocess our data to run a random forest on our dataset.
library(tree)
dodgersfortree<-dodgers[,-16]
names(dodgers)
## [1] "month" "day" "attend"
## [4] "day_of_week" "opponent" "temp"
## [7] "skies" "day_night" "cap"
## [10] "shirt" "fireworks" "bobblehead"
## [13] "ordered_month" "ordered_day_of_week" "training_test"
library(randomForest)
attach(dodgersfortree)
## Randomforest Secondary Method ###################################################
par(mfrow=c(1,1))
set.seed(4543)
data(dodgersfortree)
dodgers.rf <- randomForest(attend ~ ., data=dodgersfortree, ntree=1000, keep.forest=FALSE,importance=TRUE)
varImpPlot(dodgers.rf)
Let’s plot our random forest and create a visualization for people to see.
dodgerstreetwo<-tree(attend~.,data = dodgersfortree)
plot(dodgerstreetwo)
text(dodgerstreetwo,pretty = 0)