## NFL WINNING TEAM PREDICTION
## DATA FROM 2003 THROUGH 2013
## Training set: 90% of the games from 2003 to 2012
## Test set: 10% of the games from 2003 to 2012
## Validation set: all NFL games from 2013
setwd("~/Documents/NYDSA/DATA SET/NFL")
#DATA
nfldata <- read.csv("nflresults2.csv")
## Remove tied matches
nfldata1 <- subset(nfldata, nfldata[ ,37] > -1)
## Remove games played at neutral venues and convert Site to a factor
nfldata2 <- subset(nfldata1, nfldata1[ ,34] != 'N')
nfldata2$Site <- as.factor(nfldata2$Site)
## Recode Result from 0/1 to 1/2
nfldata2$Result <- nfldata2$Result+1
## Compute the difference of each performance variable between Team A (L5_ columns) and its opponent (OL5_ columns)
nfldata2$DL5_ScoreOff <- nfldata2$L5_ScoreOff - nfldata2$OL5_ScoreOff
nfldata2$DL5_FirstDownOff <- nfldata2$L5_FirstDownOff - nfldata2$OL5_FirstDownOff
nfldata2$DL5_ThirdDownPctOff <- nfldata2$L5_ThirdDownPctOff - nfldata2$OL5_ThirdDownPctOff
nfldata2$DL5_RushAttOff <- nfldata2$L5_RushAttOff - nfldata2$OL5_RushAttOff
nfldata2$DL5_RushYdsOff <- nfldata2$L5_RushYdsOff - nfldata2$OL5_RushYdsOff
nfldata2$DL5_PassAttOff <- nfldata2$L5_PassAttOff - nfldata2$OL5_PassAttOff
nfldata2$DL5_PassCompOff <- nfldata2$L5_PassCompOff - nfldata2$OL5_PassCompOff
nfldata2$DL5_PassYdsOff <- nfldata2$L5_PassYdsOff - nfldata2$OL5_PassYdsOff
nfldata2$DL5_PassIntOff <- nfldata2$L5_PassIntOff - nfldata2$OL5_PassIntOff
nfldata2$DL5_FumblesOff <- nfldata2$L5_FumblesOff - nfldata2$OL5_FumblesOff
nfldata2$DL5_SackYdsOff <- nfldata2$L5_SackYdsOff - nfldata2$OL5_SackYdsOff
nfldata2$DL5_PenYdsOff <- nfldata2$L5_PenYdsOff - nfldata2$OL5_PenYdsOff
nfldata2$DL5_TimePossOff <- nfldata2$L5_TimePossOff - nfldata2$OL5_TimePossOff
nfldata2$DL5_PuntAvgOff <- nfldata2$L5_PuntAvgOff - nfldata2$OL5_PuntAvgOff
nfldata2$DL5_ScoreDef <- nfldata2$L5_ScoreDef - nfldata2$OL5_ScoreDef
nfldata2$DL5_FirstDownDef <- nfldata2$L5_FirstDownDef - nfldata2$OL5_FirstDownDef
nfldata2$DL5_ThirdDownPctDef <- nfldata2$L5_ThirdDownPctDef - nfldata2$OL5_ThirdDownPctDef
nfldata2$DL5_RushAttDef <- nfldata2$L5_RushAttDef - nfldata2$OL5_RushAttDef
nfldata2$DL5_RushYdsDef <- nfldata2$L5_RushYdsDef - nfldata2$OL5_RushYdsDef
nfldata2$DL5_PassAttDef <- nfldata2$L5_PassAttDef - nfldata2$OL5_PassAttDef
nfldata2$DL5_PassCompDef <- nfldata2$L5_PassCompDef - nfldata2$OL5_PassCompDef
nfldata2$DL5_PassYdsDef <- nfldata2$L5_PassYdsDef - nfldata2$OL5_PassYdsDef
nfldata2$DL5_PassIntDef <- nfldata2$L5_PassIntDef - nfldata2$OL5_PassIntDef
nfldata2$DL5_FumblesDef <- nfldata2$L5_FumblesDef - nfldata2$OL5_FumblesDef
nfldata2$DL5_SackYdsDef <- nfldata2$L5_SackYdsDef - nfldata2$OL5_SackYdsDef
nfldata2$DL5_PenYdsDef <- nfldata2$L5_PenYdsDef - nfldata2$OL5_PenYdsDef
nfldata2$DL5_TimePossDef <- nfldata2$L5_TimePossDef - nfldata2$OL5_TimePossDef
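## The 27 assignments above can equivalently be written as a loop over the stat
## names (a sketch; it reproduces the same DL5_ columns in the same order):
stat.names <- c("ScoreOff", "FirstDownOff", "ThirdDownPctOff", "RushAttOff",
                "RushYdsOff", "PassAttOff", "PassCompOff", "PassYdsOff",
                "PassIntOff", "FumblesOff", "SackYdsOff", "PenYdsOff",
                "TimePossOff", "PuntAvgOff", "ScoreDef", "FirstDownDef",
                "ThirdDownPctDef", "RushAttDef", "RushYdsDef", "PassAttDef",
                "PassCompDef", "PassYdsDef", "PassIntDef", "FumblesDef",
                "SackYdsDef", "PenYdsDef", "TimePossDef")
for (s in stat.names) {
  nfldata2[[paste0("DL5_", s)]] <-
    nfldata2[[paste0("L5_", s)]] - nfldata2[[paste0("OL5_", s)]]
}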
nfldata2$ResultPoints <- nfldata2$ScoreOff - nfldata2$ScoreDef
## Hold out the 2013 season as the validation set
nfldata3 <- subset(nfldata2, nfldata2[,1]!='2013');
nflval <- subset(nfldata2, nfldata2[,1]=='2013');
## Create a test set and training set
set.seed(20141020)
n = dim(nfldata3)[1]
flag <- sort(sample(1:n, 385));
nfltrain <- nfldata3[-flag,];
nfltest <- nfldata3[flag,];
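## Quick sanity check on the split sizes (the test set is roughly 10% of the
## 2003-2012 games):
c(train = nrow(nfltrain), test = nrow(nfltest), validation = nrow(nflval))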
# MULTIPLE LINEAR REGRESSION (Model 1)
lm1 <- lm(Result ~ Site +Totalline + Line +L5_ScoreOff +L5_FirstDownOff +L5_ThirdDownPctOff +L5_RushAttOff +L5_RushYdsOff + L5_PassAttOff +L5_PassCompOff +L5_PassYdsOff +L5_PassIntOff +L5_FumblesOff +L5_SackYdsOff + L5_PenYdsOff +L5_TimePossOff +L5_PuntAvgOff +L5_ScoreDef +L5_FirstDownDef +L5_ThirdDownPctDef +
L5_RushAttDef +L5_RushYdsDef +L5_PassAttDef +L5_PassCompDef +L5_PassYdsDef +L5_PassIntDef +L5_FumblesDef +
L5_SackYdsDef +L5_PenYdsDef +L5_TimePossDef +OL5_ScoreOff +OL5_FirstDownOff +OL5_ThirdDownPctOff +
OL5_RushAttOff +OL5_RushYdsOff +OL5_PassAttOff +OL5_PassCompOff +OL5_PassYdsOff +OL5_PassIntOff +OL5_FumblesOff +
OL5_SackYdsOff +OL5_PenYdsOff +OL5_TimePossOff +OL5_PuntAvgOff +OL5_ScoreDef +OL5_FirstDownDef +
OL5_ThirdDownPctDef +OL5_RushAttDef +OL5_RushYdsDef +OL5_PassAttDef +OL5_PassCompDef +OL5_PassYdsDef +
OL5_PassIntDef +OL5_FumblesDef +OL5_SackYdsDef +OL5_PenYdsDef +OL5_TimePossDef, data=nfltrain)
# TRANSFORMATION
library(MASS)
## Warning: package 'MASS' was built under R version 3.1.2
boxcox(lm1)

# **Log transformation is recommended**
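## The recommendation can also be read off numerically (a sketch): a Box-Cox
## lambda close to 0 corresponds to the log transformation.
bc <- boxcox(lm1, plotit = FALSE)
bc$x[which.max(bc$y)]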
# SCATTERPLOT MATRIX / PAIRS VIEW
library(psych)
## Warning: package 'psych' was built under R version 3.1.2
pairs.panels(nfltrain[,sample(6:37, 10)])

# CORRELATION PLOT
library(corrplot)
M <- as.matrix(round(cor(nfltrain[,c(35:36,38:64)]),1))
corrplot(M, method = "square")
### MULTIPLE LINEAR REGRESSION w/ TRANSFORMATION (Model-A)
modelA <- lm( log(Result)~ Site +Line + DL5_RushYdsOff + DL5_PassCompOff + DL5_PassIntOff + DL5_FumblesOff + DL5_SackYdsOff + DL5_PuntAvgOff +DL5_PassCompDef+ DL5_PassIntDef + DL5_SackYdsDef, data = nfltrain);
summary(modelA)
##
## Call:
## lm(formula = log(Result) ~ Site + Line + DL5_RushYdsOff + DL5_PassCompOff +
## DL5_PassIntOff + DL5_FumblesOff + DL5_SackYdsOff + DL5_PuntAvgOff +
## DL5_PassCompDef + DL5_PassIntDef + DL5_SackYdsDef, data = nfltrain)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.68575 -0.27610 0.00229 0.27709 0.68563
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.409e-01 8.241e-03 41.367 <2e-16 ***
## SiteV 1.145e-02 1.255e-02 0.913 0.3615
## Line 2.101e-02 1.192e-03 17.623 <2e-16 ***
## DL5_RushYdsOff -1.079e-05 1.652e-04 -0.065 0.9479
## DL5_PassCompOff -1.062e-03 1.326e-03 -0.801 0.4230
## DL5_PassIntOff 1.021e-02 8.640e-03 1.182 0.2375
## DL5_FumblesOff -2.319e-02 1.034e-02 -2.243 0.0250 *
## DL5_SackYdsOff -1.184e-03 6.552e-04 -1.807 0.0709 .
## DL5_PuntAvgOff -2.104e-03 1.179e-03 -1.785 0.0744 .
## DL5_PassCompDef 2.238e-03 1.293e-03 1.730 0.0837 .
## DL5_PassIntDef -1.874e-02 8.479e-03 -2.210 0.0272 *
## DL5_SackYdsDef -5.659e-04 7.474e-04 -0.757 0.4490
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3178 on 3449 degrees of freedom
## Multiple R-squared: 0.162, Adjusted R-squared: 0.1593
## F-statistic: 60.62 on 11 and 3449 DF, p-value: < 2.2e-16
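## Classification cutoff: 0.344 is roughly the centre of the fitted values
## (log(1) = 0 codes a loss, log(2) ~ 0.693 a win, so the midpoint is ~0.347).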
pred1 <- ifelse(fitted(modelA) < 0.344,1,2)
table(pred1, nfltrain$Result)
##
## pred1 1 2
## 1 1160 569
## 2 572 1160
pred2 <- predict(modelA, newdata = nfltest)
pred2 <- ifelse(pred2 < 0.344,1,2)
table(pred2, nfltest$Result)
##
## pred2 1 2
## 1 128 57
## 2 59 141
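## Out-of-sample accuracy of Model A (a sketch; from the table above this is
## (128 + 141) / 385, about 0.70):
mean(pred2 == nfltest$Result)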
#________________________________#
### SUBSET MODEL w/ k=6 predictors (MODEL-B)
library(leaps)
fat1.leaps <- regsubsets(log(Result) ~ Site +Line + DL5_RushYdsOff + DL5_PassCompOff + DL5_PassIntOff + DL5_FumblesOff + DL5_SackYdsOff + DL5_PuntAvgOff +DL5_PassCompDef+ DL5_PassIntDef + DL5_SackYdsDef, data= nfltrain, nbest= 100, really.big= TRUE);
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax,
## force.in = force.in, : 1 linear dependencies found
## Reordering variables and trying again:
fat1.models <- summary(fat1.leaps)$which;
fat1.models.size <- as.numeric(attr(fat1.models, "dimnames")[[1]]);
fat1.models.rss <- summary(fat1.leaps)$rss;
op <- which(fat1.models.size == 6);
flag <- op[which.min(fat1.models.rss[op])];
modelBform <- fat1.models[flag,];
Bform <- names(modelBform)[modelBform][-1];
Bform <- paste(Bform, collapse = " + ");
Bform <- paste("log(Result) ~", Bform);
modelB <- lm(as.formula(Bform), data= nfltrain);
summary(fitted(modelB))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.1698 0.2454 0.3437 0.3463 0.4477 0.8630
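## 0.3437 is the median of the fitted values shown above; it is used below as
## the classification cutoff.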
pred1 <- ifelse(fitted(modelB) < 0.3437,1,2)
table(pred1, nfltrain$Result)
##
## pred1 1 2
## 1 1160 571
## 2 572 1158
pred2 <- predict(modelB, newdata = nfltest)
pred2 <- ifelse(pred2 < 0.3437,1,2)
table(pred2, nfltest$Result)
##
## pred2 1 2
## 1 128 54
## 2 59 144
## STEPWISE SELECTION VIA step() (Model-C)
modelC <- step(modelA, trace=FALSE);
summary(modelC)
##
## Call:
## lm(formula = log(Result) ~ Line + DL5_FumblesOff + DL5_SackYdsOff +
## DL5_PuntAvgOff + DL5_PassCompDef + DL5_PassIntDef, data = nfltrain)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.67957 -0.27771 0.00259 0.27766 0.68396
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.3466239 0.0054010 64.178 <2e-16 ***
## Line 0.0199520 0.0008922 22.362 <2e-16 ***
## DL5_FumblesOff -0.0241045 0.0102934 -2.342 0.0193 *
## DL5_SackYdsOff -0.0012215 0.0006358 -1.921 0.0548 .
## DL5_PuntAvgOff -0.0020263 0.0011678 -1.735 0.0828 .
## DL5_PassCompDef 0.0020980 0.0012801 1.639 0.1013
## DL5_PassIntDef -0.0178045 0.0081918 -2.173 0.0298 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3177 on 3454 degrees of freedom
## Multiple R-squared: 0.1613, Adjusted R-squared: 0.1599
## F-statistic: 110.7 on 6 and 3454 DF, p-value: < 2.2e-16
pred1 <- ifelse(fitted(modelC) < 0.3437,1,2)
table(pred1, nfltrain$Result)
##
## pred1 1 2
## 1 1160 571
## 2 572 1158
pred2 <- predict(modelC, newdata = nfltest)
pred2 <- ifelse(pred2 < 0.3437,1,2)
table(pred2, nfltest$Result)
##
## pred2 1 2
## 1 128 54
## 2 59 144
### RIDGE REGRESSION (MODEL-D)
### The optimal lambda depends on the search range and step size of the lambda grid
library(MASS);
library(lattice);
## Warning: package 'lattice' was built under R version 3.1.2

library(leaps);
library(lars);
## Loaded lars 1.2
##
##
## Attaching package: 'lars'
##
## The following object is masked from 'package:psych':
##
## error.bars
nfltrain$Site <- as.numeric(nfltrain$Site) -1
nfltest$Site <- as.numeric(nfltest$Site) -1
nflval$Site <- as.numeric(nflval$Site) -1
model.ridgeb <- lm.ridge( Result ~ Line+Totalline+Site+ DL5_ScoreOff + DL5_FirstDownOff + DL5_ThirdDownPctOff + DL5_RushAttOff + DL5_RushYdsOff + DL5_PassAttOff + DL5_PassCompOff + DL5_PassYdsOff + DL5_PassIntOff + DL5_FumblesOff + DL5_SackYdsOff + DL5_PenYdsOff + DL5_TimePossOff + DL5_PuntAvgOff + DL5_ScoreDef + DL5_FirstDownDef + DL5_ThirdDownPctDef + DL5_RushAttDef + DL5_RushYdsDef + DL5_PassAttDef + DL5_PassCompDef + DL5_PassYdsDef + DL5_PassIntDef + DL5_FumblesDef + DL5_SackYdsDef + DL5_PenYdsDef + DL5_TimePossDef, data = nfltrain, lambda= seq(0,10,0.001));
ridge1 <- model.ridgeb;
lambdaopt <- which.min(ridge1$GCV);
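## The GCV-optimal penalty on the lambda grid (a sketch):
ridge1$lambda[lambdaopt]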
### Ridge coefficients on the original (raw) data scale
modelD.coef <- coef(ridge1)[lambdaopt,];
## Training and test confusion matrices for Model D: Ridge Regression
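## lm.ridge() fits on centred and scaled predictors, so to score new data the
## standardized coefficients are divided by the column scales and an intercept
## is rebuilt from the predictor means (xm) and the mean response (ym).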
rig1coef <- ridge1$coef[,lambdaopt];
rig1intercepts <- ridge1$ym - sum(ridge1$xm * (rig1coef / ridge1$scales));
predD <- scale(nfltrain[,names(ridge1$scales)],center = F, scale = ridge1$scales)%*% rig1coef + rig1intercepts;
pred1 <- ifelse(predD < 1.5,1,2)
table(pred1, nfltrain$Result)
##
## pred1 1 2
## 1 1161 574
## 2 571 1155
predE <- scale(nfltest[,names(ridge1$scales)],center = F, scale = ridge1$scales)%*% rig1coef + rig1intercepts;
pred2 <- ifelse(predE < 1.5,1,2)
table(pred2, nfltest$Result)
##
## pred2 1 2
## 1 130 55
## 2 57 143
### LASSO (MODEL-E)
modelE <- lars(as.matrix(nfltrain[,c(34:36,93:119)]), nfltrain[,37], type= "lasso", trace= FALSE);
Cp1 <- summary(modelE)$Cp;
index1 <- which.min(Cp1);
lasso.lambda <- modelE$lambda[index1]
round(coef(modelE)[index1,],4)
## Site Line Totalline
## 0.0000 0.0286 0.0000
## DL5_ScoreOff DL5_FirstDownOff DL5_ThirdDownPctOff
## 0.0020 0.0000 0.0002
## DL5_RushAttOff DL5_RushYdsOff DL5_PassAttOff
## 0.0012 0.0000 0.0000
## DL5_PassCompOff DL5_PassYdsOff DL5_PassIntOff
## 0.0000 0.0000 0.0055
## DL5_FumblesOff DL5_SackYdsOff DL5_PenYdsOff
## -0.0311 -0.0010 0.0000
## DL5_TimePossOff DL5_PuntAvgOff DL5_ScoreDef
## 0.0000 -0.0024 0.0000
## DL5_FirstDownDef DL5_ThirdDownPctDef DL5_RushAttDef
## 0.0000 0.0000 0.0044
## DL5_RushYdsDef DL5_PassAttDef DL5_PassCompDef
## 0.0000 0.0031 0.0000
## DL5_PassYdsDef DL5_PassIntDef DL5_FumblesDef
## 0.0000 -0.0192 -0.0390
## DL5_SackYdsDef DL5_PenYdsDef DL5_TimePossDef
## 0.0000 0.0000 0.0017
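## An alternative to the Cp criterion is K-fold cross-validation over the
## LASSO path (a sketch; cv.lars() reports CV error over a grid of |beta|
## fractions rather than lambda values):
set.seed(20141020)
cv1 <- cv.lars(as.matrix(nfltrain[,c(34:36,93:119)]), nfltrain[,37],
               K = 10, type = "lasso", mode = "fraction", plot.it = FALSE)
cv1$index[which.min(cv1$cv)]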
predF <- predict(modelE, as.matrix(nfltrain[,c(34:36,93:119)]), s=lasso.lambda, type="fit", mode="lambda");
pred1 <- ifelse(predF$fit < 1.5,1,2)
table(pred1, nfltrain$Result)
##
## pred1 1 2
## 1 1176 560
## 2 556 1169
predG <- predict(modelE, as.matrix(nfltest[,c(34:36,93:119)]), s=lasso.lambda, type="fit", mode="lambda");  # same predictor columns as used to fit modelE
pred2 <- ifelse(predG$fit < 1.5,1,2)
table(pred2, nfltest$Result)
##
## pred2 1 2
## 1 114 89
## 2 73 109
# KNN (MODEL-F)
## Datasets for KNN
knfltrain <- nfltrain[,c(37, 34:36,38:64, 66:92)]
knfltest <- nfltest[,c(37, 34:36,38:64, 66:92)]
knflval <- nflval[,c(37, 34:36,38:64, 66:92)]
## Note: Site was already converted to numeric 0/1 above, so these lines only
## shift it to -1/0; a constant shift does not affect KNN distances.
knfltest$Site <- as.numeric(knfltest$Site) -1
knfltrain$Site <- as.numeric(knfltrain$Site) -1
knflval$Site <- as.numeric(knflval$Site) -1
knfltest <- cbind(knfltest[,1:2],scale(knfltest[,c(-1,-2)]))
knfltrain <- cbind(knfltrain[,1:2],scale(knfltrain[,c(-1,-2)]))
knflval <- cbind(knflval[,1:2],scale(knflval[,c(-1,-2)]))
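## Note: scale() above standardizes each data set with its own means and SDs.
## A common alternative (a sketch; knn.cols and knfltest2 are illustrative
## names, not used elsewhere) reuses the training-set centres and scales:
knn.cols <- c(37, 34:36, 38:64, 66:92)
sc.train <- scale(nfltrain[, knn.cols][, -(1:2)])
knfltest2 <- cbind(nfltest[, knn.cols][, 1:2],
                   scale(nfltest[, knn.cols][, -(1:2)],
                         center = attr(sc.train, "scaled:center"),
                         scale  = attr(sc.train, "scaled:scale")))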
## Top variables for KNN (ranked by univariate AUC)
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
##
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
select.top.N.ROC <- function(response, predictors, data, n) {
  n <- min(n, length(predictors))
  aucs <- sapply(predictors, function(predictor) {
    auc(data[[response]], data[[predictor]])
  })
  return(predictors[order(aucs, decreasing = TRUE)][1:n])
}
top.variables <- select.top.N.ROC("Result", colnames(knfltest[,-1]), knfltest, 3)
top.variables
## [1] "Line" "L5_ScoreOff" "L5_FumblesOff"
## Override the ROC-selected variables with a hand-picked set
top.variables <- c("Line", "L5_ScoreOff", "OL5_ScoreOff", "Site")
## KNN (MODEL-F)
library(class)
## Warning: package 'class' was built under R version 3.1.2
mod.train <- knn( knfltrain[,top.variables], knfltrain[,top.variables], k = 71, cl = knfltrain[,1]);
mod.test <- knn( knfltrain[,top.variables], knfltest[,top.variables], k = 71, cl= knfltrain[,1]);
mod.val <- knn( knfltrain[,top.variables], knflval[,top.variables], k = 71, cl= knfltrain[,1]);
misclass.train <- mean( mod.train != knfltrain[,1]);
misclass.test <- mean( mod.test != knfltest[,1]);
misclass.val <- mean( mod.val != knflval[,1]);
## KNN (MODEL-F) continued
ks <- c(10,50,100,150) # different neighbor setting
nks <- length(ks)
misclass.train <- numeric(length= nks)
misclass.test <- numeric(length= nks)
for (i in 1:nks ){
mod.train <- knn( knfltrain[,2:58], knfltrain[,2:58], k = ks[i], cl = knfltrain[,1]);
mod.test <- knn( knfltrain[,2:58], knfltest[,2:58], k = ks[i], cl= knfltrain[,1]);
misclass.train[i] <- mean( mod.train != knfltrain[,1]);
misclass.test[i] <- mean( mod.test != knfltest[,1]);
}
cbind(ks, misclass.train, misclass.test)
## ks misclass.train misclass.test
## [1,] 10 0.3056920 0.3792208
## [2,] 50 0.3403641 0.3584416
## [3,] 100 0.3533661 0.3428571
## [4,] 150 0.3539439 0.3480519
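## On this grid the test misclassification rate is lowest at k = 100 (~0.343).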
## GLM (MODEL-G)
# Keep Result, Site, the betting lines, and a subset of the DL5_ difference columns from nfltrain
nfltrain1 <- nfltrain[, c('Result', 'Site' ,'Line','Totalline' ,'DL5_ScoreOff' , 'DL5_FirstDownOff' , 'DL5_ThirdDownPctOff' , 'DL5_RushAttOff' , 'DL5_PassAttOff' , 'DL5_PassCompOff' , 'DL5_PassIntOff' ,'DL5_FumblesOff' , 'DL5_SackYdsOff' , 'DL5_TimePossOff' , 'DL5_PuntAvgOff' , 'DL5_ScoreDef' , 'DL5_ThirdDownPctDef' , 'DL5_RushAttDef' , 'DL5_RushYdsDef' , 'DL5_PassAttDef' , 'DL5_PassCompDef')]
glm1 <- glm(Result - 1 ~ ., family = "binomial", data = nfltrain1)
nothing <- glm(Result - 1 ~ 1, family = "binomial", data = nfltrain1)  # null model for the stepwise search
fullmod <- glm(Result - 1 ~ ., family = "binomial", data = nfltrain1)
bothways <- step(nothing, list(lower = formula(nothing), upper = formula(fullmod)),
                 direction = "both", trace = 0)
pred2 <- ifelse(fitted(bothways) < 0.5,0,1)
table(pred2, nfltrain$Result)
##
## pred2 1 2
## 0 1157 572
## 1 575 1157
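## Training accuracy of the stepwise model (a sketch; pred2 is coded 0/1 while
## Result is coded 1/2, hence the +1). From the table above this is
## (1157 + 1157) / 3461, about 0.67.
mean((pred2 + 1) == nfltrain$Result)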