## NFL WINNING TEAM PREDICTION
## DATA FROM 2003 thru 2013
## Training set 90% of data points between 2003 and 2012
## Test set 10% of data points between 2003 and 2012
## Validation set NFL data from 2013
setwd("~/Documents/NYDSA/DATA SET/NFL")
#DATA
    nfldata <- read.csv("nflresults2.csv")
## Remove tied matches
    nfldata1 <- subset(nfldata, nfldata[ ,37] > -1)
## Remove matches at neutral venues and factorize venues
    nfldata2 <- subset(nfldata1, !nfldata1[ ,34]== 'N') 
    nfldata2$Site <- as.factor(nfldata2$Site)
## change results from factor of 0/1 to 1/2     
    nfldata2$Result <- nfldata2$Result+1
## Take the difference of variables (performance matrix) of Team A and Team B
nfldata2$DL5_ScoreOff <- nfldata2$L5_ScoreOff - nfldata2$OL5_ScoreOff
nfldata2$DL5_FirstDownOff <- nfldata2$L5_FirstDownOff - nfldata2$OL5_FirstDownOff
nfldata2$DL5_ThirdDownPctOff <- nfldata2$L5_ThirdDownPctOff - nfldata2$OL5_ThirdDownPctOff
nfldata2$DL5_RushAttOff <- nfldata2$L5_RushAttOff - nfldata2$OL5_RushAttOff
nfldata2$DL5_RushYdsOff <- nfldata2$L5_RushYdsOff - nfldata2$OL5_RushYdsOff
nfldata2$DL5_PassAttOff <- nfldata2$L5_PassAttOff - nfldata2$OL5_PassAttOff
nfldata2$DL5_PassCompOff <- nfldata2$L5_PassCompOff - nfldata2$OL5_PassCompOff
nfldata2$DL5_PassYdsOff <- nfldata2$L5_PassYdsOff - nfldata2$OL5_PassYdsOff
nfldata2$DL5_PassIntOff <- nfldata2$L5_PassIntOff - nfldata2$OL5_PassIntOff
nfldata2$DL5_FumblesOff <- nfldata2$L5_FumblesOff - nfldata2$OL5_FumblesOff
nfldata2$DL5_SackYdsOff <- nfldata2$L5_SackYdsOff - nfldata2$OL5_SackYdsOff
nfldata2$DL5_PenYdsOff <- nfldata2$L5_PenYdsOff - nfldata2$OL5_PenYdsOff
nfldata2$DL5_TimePossOff <- nfldata2$L5_TimePossOff - nfldata2$OL5_TimePossOff
nfldata2$DL5_PuntAvgOff <- nfldata2$L5_PuntAvgOff - nfldata2$OL5_PuntAvgOff
nfldata2$DL5_ScoreDef <- nfldata2$L5_ScoreDef - nfldata2$OL5_ScoreDef
nfldata2$DL5_FirstDownDef <- nfldata2$L5_FirstDownDef - nfldata2$OL5_FirstDownDef
nfldata2$DL5_ThirdDownPctDef <- nfldata2$L5_ThirdDownPctDef - nfldata2$OL5_ThirdDownPctDef
nfldata2$DL5_RushAttDef <- nfldata2$L5_RushAttDef - nfldata2$OL5_RushAttDef
nfldata2$DL5_RushYdsDef <- nfldata2$L5_RushYdsDef - nfldata2$OL5_RushYdsDef
nfldata2$DL5_PassAttDef <- nfldata2$L5_PassAttDef - nfldata2$OL5_PassAttDef
nfldata2$DL5_PassCompDef <- nfldata2$L5_PassCompDef - nfldata2$OL5_PassCompDef
nfldata2$DL5_PassYdsDef <- nfldata2$L5_PassYdsDef - nfldata2$OL5_PassYdsDef
nfldata2$DL5_PassIntDef <- nfldata2$L5_PassIntDef - nfldata2$OL5_PassIntDef
nfldata2$DL5_FumblesDef <- nfldata2$L5_FumblesDef - nfldata2$OL5_FumblesDef
nfldata2$DL5_SackYdsDef <- nfldata2$L5_SackYdsDef - nfldata2$OL5_SackYdsDef
nfldata2$DL5_PenYdsDef <- nfldata2$L5_PenYdsDef - nfldata2$OL5_PenYdsDef
nfldata2$DL5_TimePossDef <- nfldata2$L5_TimePossDef - nfldata2$OL5_TimePossDef
nfldata2$ResultPoints <- nfldata2$ScoreOff - nfldata2$ScoreDef

## Remove 2013 and keep it for validation
    nfldata3 <- subset(nfldata2, nfldata2[,1]!='2013');
    nflval <- subset(nfldata2, nfldata2[,1]=='2013');

## Create a test set and training set
    set.seed(20141020)
    n = dim(nfldata3)[1]
    flag <- sort(sample(1:n, 385));
    nfltrain <- nfldata3[-flag,];
    nfltest <- nfldata3[flag,];

# MULTIPLE LINEAR REGRESSION (Model 1)
lm1 <- lm(Result ~ Site +Totalline + Line +L5_ScoreOff    +L5_FirstDownOff    +L5_ThirdDownPctOff   +L5_RushAttOff  +L5_RushYdsOff  + L5_PassAttOff +L5_PassCompOff +L5_PassYdsOff  +L5_PassIntOff  +L5_FumblesOff  +L5_SackYdsOff  + L5_PenYdsOff  +L5_TimePossOff +L5_PuntAvgOff  +L5_ScoreDef    +L5_FirstDownDef    +L5_ThirdDownPctDef +
 L5_RushAttDef  +L5_RushYdsDef  +L5_PassAttDef  +L5_PassCompDef +L5_PassYdsDef  +L5_PassIntDef  +L5_FumblesDef  +
              L5_SackYdsDef +L5_PenYdsDef   +L5_TimePossDef     +OL5_ScoreOff   +OL5_FirstDownOff   +OL5_ThirdDownPctOff    +
              OL5_RushAttOff    +OL5_RushYdsOff +OL5_PassAttOff +OL5_PassCompOff    +OL5_PassYdsOff +OL5_PassIntOff +OL5_FumblesOff +
              OL5_SackYdsOff    +OL5_PenYdsOff  +OL5_TimePossOff    +OL5_PuntAvgOff +OL5_ScoreDef   +OL5_FirstDownDef   +
              OL5_ThirdDownPctDef   +OL5_RushAttDef +OL5_RushYdsDef +OL5_PassAttDef +OL5_PassCompDef    +OL5_PassYdsDef +
              OL5_PassIntDef    +OL5_FumblesDef +OL5_SackYdsDef +OL5_PenYdsDef      +OL5_TimePossDef, data=nfltrain)

# TRANSFORMATION 
library(MASS)   
## Warning: package 'MASS' was built under R version 3.1.2
    boxcox(lm1) 

    # **Log transformation is recommended**


# SCATTER PLOT / PAIR VIEW
    library(psych)
## Warning: package 'psych' was built under R version 3.1.2
    pairs.panels(nfltrain[,sample(6:37, 10)])

# SCATTER PLOT
    library(corrplot)
    M <- as.matrix(round(cor(nfltrain[,c(35:36,38:64)]),1))
    corrplot(M, method = "square")


###MULTIPLE LINEAR REGRESSION w/ TRANSFORMATION (Model-A)
modelA <- lm( log(Result)~ Site +Line    + DL5_RushYdsOff    + DL5_PassCompOff  + DL5_PassIntOff    + DL5_FumblesOff    + DL5_SackYdsOff + DL5_PuntAvgOff   +DL5_PassCompDef+ DL5_PassIntDef    + DL5_SackYdsDef, data = nfltrain); 
summary(modelA)
## 
## Call:
## lm(formula = log(Result) ~ Site + Line + DL5_RushYdsOff + DL5_PassCompOff + 
##     DL5_PassIntOff + DL5_FumblesOff + DL5_SackYdsOff + DL5_PuntAvgOff + 
##     DL5_PassCompDef + DL5_PassIntDef + DL5_SackYdsDef, data = nfltrain)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.68575 -0.27610  0.00229  0.27709  0.68563 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      3.409e-01  8.241e-03  41.367   <2e-16 ***
## SiteV            1.145e-02  1.255e-02   0.913   0.3615    
## Line             2.101e-02  1.192e-03  17.623   <2e-16 ***
## DL5_RushYdsOff  -1.079e-05  1.652e-04  -0.065   0.9479    
## DL5_PassCompOff -1.062e-03  1.326e-03  -0.801   0.4230    
## DL5_PassIntOff   1.021e-02  8.640e-03   1.182   0.2375    
## DL5_FumblesOff  -2.319e-02  1.034e-02  -2.243   0.0250 *  
## DL5_SackYdsOff  -1.184e-03  6.552e-04  -1.807   0.0709 .  
## DL5_PuntAvgOff  -2.104e-03  1.179e-03  -1.785   0.0744 .  
## DL5_PassCompDef  2.238e-03  1.293e-03   1.730   0.0837 .  
## DL5_PassIntDef  -1.874e-02  8.479e-03  -2.210   0.0272 *  
## DL5_SackYdsDef  -5.659e-04  7.474e-04  -0.757   0.4490    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3178 on 3449 degrees of freedom
## Multiple R-squared:  0.162,  Adjusted R-squared:  0.1593 
## F-statistic: 60.62 on 11 and 3449 DF,  p-value: < 2.2e-16
pred1 <- ifelse(fitted(modelA) < 0.344,1,2)
table(pred1, nfltrain$Result)
##      
## pred1    1    2
##     1 1160  569
##     2  572 1160
pred2 <- predict(modelA, newdata = nfltest)
pred2 <- ifelse(pred2 < 0.344,1,2)
table(pred2, nfltest$Result)
##      
## pred2   1   2
##     1 128  57
##     2  59 141
#________________________________#



###SUBSET MODEL w/ k=6 predictors (MODEL-B)
library(leaps)
fat1.leaps <- regsubsets(log(Result) ~ Site +Line    + DL5_RushYdsOff    + DL5_PassCompOff                          + DL5_PassIntOff    + DL5_FumblesOff    + DL5_SackYdsOff + DL5_PuntAvgOff   +DL5_PassCompDef+ DL5_PassIntDef    + DL5_SackYdsDef, data= nfltrain, nbest= 100, really.big= TRUE);
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax,
## force.in = force.in, : 1 linear dependencies found
## Reordering variables and trying again:
fat1.models <- summary(fat1.leaps)$which;
fat1.models.size <- as.numeric(attr(fat1.models, "dimnames")[[1]]);
fat1.models.rss <- summary(fat1.leaps)$rss;
op <- which(fat1.models.size == 6); 
flag <- op[which.min(fat1.models.rss[op])]; 
modelBform <- fat1.models[flag,];
Bform <- names(modelBform)[modelBform][-1];
Bform <- paste(Bform, collapse = " + ");
Bform <- paste("log(Result) ~", Bform);
modelB <- lm(as.formula(Bform), data= nfltrain);

summary(fitted(modelB))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -0.1698  0.2454  0.3437  0.3463  0.4477  0.8630
pred1 <- ifelse(fitted(modelB) < 0.3437,1,2)
table(pred1, nfltrain$Result)
##      
## pred1    1    2
##     1 1160  571
##     2  572 1158
pred2 <- predict(modelB, newdata = nfltest)
pred2 <- ifelse(pred2 < 0.3437,1,2)
table(pred2, nfltest$Result)
##      
## pred2   1   2
##     1 128  54
##     2  59 144
## STEP FUNCTION (Model-C)
modelC <- step(modelA, trace=FALSE);
summary(modelC)
## 
## Call:
## lm(formula = log(Result) ~ Line + DL5_FumblesOff + DL5_SackYdsOff + 
##     DL5_PuntAvgOff + DL5_PassCompDef + DL5_PassIntDef, data = nfltrain)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.67957 -0.27771  0.00259  0.27766  0.68396 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      0.3466239  0.0054010  64.178   <2e-16 ***
## Line             0.0199520  0.0008922  22.362   <2e-16 ***
## DL5_FumblesOff  -0.0241045  0.0102934  -2.342   0.0193 *  
## DL5_SackYdsOff  -0.0012215  0.0006358  -1.921   0.0548 .  
## DL5_PuntAvgOff  -0.0020263  0.0011678  -1.735   0.0828 .  
## DL5_PassCompDef  0.0020980  0.0012801   1.639   0.1013    
## DL5_PassIntDef  -0.0178045  0.0081918  -2.173   0.0298 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3177 on 3454 degrees of freedom
## Multiple R-squared:  0.1613, Adjusted R-squared:  0.1599 
## F-statistic: 110.7 on 6 and 3454 DF,  p-value: < 2.2e-16
pred1 <- ifelse(fitted(modelC) < 0.3437,1,2)
table(pred1, nfltrain$Result)
##      
## pred1    1    2
##     1 1160  571
##     2  572 1158
pred2 <- predict(modelC, newdata = nfltest)
pred2 <- ifelse(pred2 < 0.3437,1,2)
table(pred2, nfltest$Result)
##      
## pred2   1   2
##     1 128  54
##     2  59 144
###RIDGE REGRESSION (MODEL-D)
### the lambda valude depends on your search regression and step size  
library(MASS);
library(lattice);
## Warning: package 'lattice' was built under R version 3.1.2

library(leaps);
library(lars);
## Loaded lars 1.2
## 
## 
## Attaching package: 'lars'
## 
## The following object is masked from 'package:psych':
## 
##     error.bars
nfltrain$Site <- as.numeric(nfltrain$Site) -1
nfltest$Site <- as.numeric(nfltest$Site) -1
nflval$Site <- as.numeric(nflval$Site) -1


model.ridgeb <- lm.ridge( Result ~ Line+Totalline+Site+ DL5_ScoreOff    + DL5_FirstDownOff  + DL5_ThirdDownPctOff   + DL5_RushAttOff    + DL5_RushYdsOff    + DL5_PassAttOff    + DL5_PassCompOff   + DL5_PassYdsOff    + DL5_PassIntOff    + DL5_FumblesOff    + DL5_SackYdsOff    + DL5_PenYdsOff + DL5_TimePossOff   + DL5_PuntAvgOff    + DL5_ScoreDef  + DL5_FirstDownDef  + DL5_ThirdDownPctDef   + DL5_RushAttDef    + DL5_RushYdsDef    + DL5_PassAttDef    + DL5_PassCompDef   + DL5_PassYdsDef    + DL5_PassIntDef    + DL5_FumblesDef    + DL5_SackYdsDef    + DL5_PenYdsDef + DL5_TimePossDef, data = nfltrain, lambda= seq(0,10,0.001));

ridge1 <- model.ridgeb; 

lambdaopt <- which.min(ridge1$GCV); 

### The coefficient of ridge on the original, raw data scale
modelD.coef <- coef(ridge1)[lambdaopt,];

## Testing error of Model D: Ridge Regression
rig1coef <- ridge1$coef[,lambdaopt];
rig1intercepts <- ridge1$ym - sum(ridge1$xm * (rig1coef / ridge1$scales)); 

predD <- scale(nfltrain[,names(ridge1$scales)],center = F, scale = ridge1$scales)%*%  rig1coef + rig1intercepts;
pred1 <- ifelse(predD < 1.5,1,2)
table(pred1, nfltrain$Result)
##      
## pred1    1    2
##     1 1161  574
##     2  571 1155
predE <- scale(nfltest[,names(ridge1$scales)],center = F, scale = ridge1$scales)%*%  rig1coef + rig1intercepts;
pred2 <- ifelse(predE < 1.5,1,2)
table(pred2, nfltest$Result)
##      
## pred2   1   2
##     1 130  55
##     2  57 143
### LASSO (MODEL-E)
modelE <- lars(as.matrix(nfltrain[,c(34:36,93:119)]), nfltrain[,37], type= "lasso", trace= FALSE);
Cp1  <- summary(modelE)$Cp;
index1 <- which.min(Cp1);
lasso.lambda <- modelE$lambda[index1]
round(coef(modelE)[index1,],4)
##                Site                Line           Totalline 
##              0.0000              0.0286              0.0000 
##        DL5_ScoreOff    DL5_FirstDownOff DL5_ThirdDownPctOff 
##              0.0020              0.0000              0.0002 
##      DL5_RushAttOff      DL5_RushYdsOff      DL5_PassAttOff 
##              0.0012              0.0000              0.0000 
##     DL5_PassCompOff      DL5_PassYdsOff      DL5_PassIntOff 
##              0.0000              0.0000              0.0055 
##      DL5_FumblesOff      DL5_SackYdsOff       DL5_PenYdsOff 
##             -0.0311             -0.0010              0.0000 
##     DL5_TimePossOff      DL5_PuntAvgOff        DL5_ScoreDef 
##              0.0000             -0.0024              0.0000 
##    DL5_FirstDownDef DL5_ThirdDownPctDef      DL5_RushAttDef 
##              0.0000              0.0000              0.0044 
##      DL5_RushYdsDef      DL5_PassAttDef     DL5_PassCompDef 
##              0.0000              0.0031              0.0000 
##      DL5_PassYdsDef      DL5_PassIntDef      DL5_FumblesDef 
##              0.0000             -0.0192             -0.0390 
##      DL5_SackYdsDef       DL5_PenYdsDef     DL5_TimePossDef 
##              0.0000              0.0000              0.0017
predF <- predict(modelE, as.matrix(nfltrain[,c(34:36,93:119)]), s=lasso.lambda, type="fit", mode="lambda");
pred1 <- ifelse(predF$fit < 1.5,1,2)
table(pred1, nfltrain$Result)
##      
## pred1    1    2
##     1 1176  560
##     2  556 1169
predG <- predict(modelE, as.matrix(nfltest[,c(34:36,94:120)]), s=lasso.lambda, type="fit", mode="lambda");
pred2 <- ifelse(predG$fit < 1.5,1,2)
table(pred2, nfltest$Result)
##      
## pred2   1   2
##     1 114  89
##     2  73 109
# KNN (MODEL-F)
## Datasets for KNN
knfltrain <- nfltrain[,c(37, 34:36,38:64, 66:92)]
knfltest <- nfltest[,c(37, 34:36,38:64, 66:92)]
knflval <- nflval[,c(37, 34:36,38:64, 66:92)]


knfltest$Site <- as.numeric(knfltest$Site) -1
knfltrain$Site <- as.numeric(knfltrain$Site) -1
knflval$Site <- as.numeric(knflval$Site) -1


knfltest <- cbind(knfltest[,1:2],scale(knfltest[,c(-1,-2)]))
knfltrain <- cbind(knfltrain[,1:2],scale(knfltrain[,c(-1,-2)]))
knflval <- cbind(knflval[,1:2],scale(knflval[,c(-1,-2)]))

##Top Vairables for KNN
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## 
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
select.top.N.ROC <- function(response, predictors, data, n) {
    n <- min(n, length(predictors))
    aucs <- sapply(predictors, function(predictor) {
        auc(data[[response]], data[[predictor]])
    })
    return(predictors[order(aucs, decreasing=TRUE)][1:n])
}

top.variables <- select.top.N.ROC("Result", colnames(knfltest[,-1]), knfltest, 3)
top.variables
## [1] "Line"          "L5_ScoreOff"   "L5_FumblesOff"
top.variables[4] = "Site"
top.variables[1] = "Line"
top.variables[2] = "L5_ScoreOff"
top.variables[3] = "OL5_ScoreOff"

## KNN (MODEL-F)
library(class)
## Warning: package 'class' was built under R version 3.1.2
mod.train <- knn( knfltrain[,top.variables], knfltrain[,top.variables], k = 71, cl = knfltrain[,1]);
mod.test <- knn( knfltrain[,top.variables], knfltest[,top.variables], k = 71, cl= knfltrain[,1]);
mod.val <- knn( knfltrain[,top.variables], knflval[,top.variables], k = 71, cl= knfltrain[,1]);

misclass.train <- mean( mod.train != knfltrain[,1]);
misclass.test <- mean( mod.test != knfltest[,1]);
misclass.val <- mean( mod.val != knflval[,1]);


##View(knflval)

## KNN (MODEL-F) continued
ks <- c(10,50,100,150)  # different neighbor setting
nks <- length(ks)
misclass.train <-  numeric(length= nks)
misclass.test <- numeric(length= nks)

for (i in 1:nks ){
    mod.train <- knn( knfltrain[,2:58], knfltrain[,2:58], k = ks[i], cl = knfltrain[,1]);
    mod.test <- knn( knfltrain[,2:58], knfltest[,2:58], k = ks[i], cl= knfltrain[,1]);
    misclass.train[i] <- mean( mod.train != knfltrain[,1]);
    misclass.test[i] <- mean( mod.test != knfltest[,1]);
}

cbind(ks, misclass.train, misclass.test)
##       ks misclass.train misclass.test
## [1,]  10      0.3056920     0.3792208
## [2,]  50      0.3403641     0.3584416
## [3,] 100      0.3533661     0.3428571
## [4,] 150      0.3539439     0.3480519
##GLM (MODEL-G)

#only DL columns from NFLtrain
nfltrain1 <- nfltrain[, c('Result', 'Site' ,'Line','Totalline' ,'DL5_ScoreOff' , 'DL5_FirstDownOff' ,  'DL5_ThirdDownPctOff' , 'DL5_RushAttOff' , 'DL5_PassAttOff' , 'DL5_PassCompOff' , 'DL5_PassIntOff' ,'DL5_FumblesOff' , 'DL5_SackYdsOff' , 'DL5_TimePossOff' ,   'DL5_PuntAvgOff' , 'DL5_ScoreDef' , 'DL5_ThirdDownPctDef' , 'DL5_RushAttDef' , 'DL5_RushYdsDef' , 'DL5_PassAttDef' , 'DL5_PassCompDef')]

glm1 <- glm(formula = Result-1 ~ ., family = "binomial", data = nfltrain1)

nothing <- glm(Result-1 ~ 1,data=nfltrain1)
fullmod = glm(Result-1 ~ .,family = "binomial", data= nfltrain1)

bothways = step(nothing, list(lower=formula(nothing),upper=formula(fullmod)),direction="both",trace=0)
pred2 <- ifelse(fitted(bothways) < 0.5,0,1)
table(pred2, nfltrain$Result)
##      
## pred2    1    2
##     0 1157  572
##     1  575 1157