1 INTRODUCTION

The NFL (National Football League) has launched the fourth annual Big Data Bowl powered by Amazon Web Services, Inc. (AWS), a crowd-sourcing competition designed to engage and empower the football analytics community to drive innovation, create new insights and make the game more exciting for fans (for more see https://nflcommunications.com/Pages/NFL-Announces-Fourth-Annual-Big-Data-Bowl-Powered-by-AWS.aspx).

We use publicly available data (https://www.kaggle.com/etsc9287/nfl-2018-eda-and-feature-engineering) for demonstration purposes only, applying a logistic regression model.

For an earlier analysis of the Super Bowl LI (2016) game, see https://rpubs.com/alex-lev/247556

2 TOTAL YARDS RUN

This metric is crucial for estimating a team's overall activity during the season. The question is how well this activity translates into wins versus losses, i.e. a positive win-loss differential.

To this end we create a new binary variable, win_los ∈ {0, 1}, by the simple rule win_los = if_else(wins > losses, 1, 0).

library(tidyverse)
library(broom)
library(extrafont)
library(DT)
library(lmtest)
library(pscl)
#source data for analysis
#See NFL Big Data Bowl 2018 Analysis by Ethan Schacht
#https://www.kaggle.com/etsc9287/nfl-2018-eda-and-feature-engineering
yards_standings <- readRDS(file="yards_standings")
#appending the new variable win_los 
yards_standings_log <- yards_standings %>% 
  mutate(win_los = if_else(wins > losses, 1, 0)) %>% 
  select(total_yards, win_los, team)
glimpse(yards_standings_log)
## Rows: 32
## Columns: 3
## $ total_yards <dbl> 3413, 3379, 3083, 3050, 3004, 2971, 2906, 2901, 2879, 2866…
## $ win_los     <dbl> 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1…
## $ team        <chr> "SEA", "SF", "LA", "BAL", "NE", "IND", "DEN", "DAL", "CAR"…
datatable(yards_standings_log)
#logistic regression model
fit.glm <- glm(win_los~total_yards,data = yards_standings_log,family = binomial(link = "logit"))
fit.glm
## 
## Call:  glm(formula = win_los ~ total_yards, family = binomial(link = "logit"), 
##     data = yards_standings_log)
## 
## Coefficients:
## (Intercept)  total_yards  
##   -8.940121     0.003377  
## 
## Degrees of Freedom: 31 Total (i.e. Null);  30 Residual
## Null Deviance:       44.24 
## Residual Deviance: 34.59     AIC: 38.59
summary(fit.glm)
## 
## Call:
## glm(formula = win_los ~ total_yards, family = binomial(link = "logit"), 
##     data = yards_standings_log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.5633  -0.7874  -0.4714   0.8525   1.8705  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)   
## (Intercept) -8.940121   3.360193  -2.661  0.00780 **
## total_yards  0.003377   0.001275   2.649  0.00807 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 44.236  on 31  degrees of freedom
## Residual deviance: 34.592  on 30  degrees of freedom
## AIC: 38.592
## 
## Number of Fisher Scoring iterations: 4
anova(fit.glm,test="Chisq")
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: win_los
## 
## Terms added sequentially (first to last)
## 
## 
##             Df Deviance Resid. Df Resid. Dev Pr(>Chi)   
## NULL                           31     44.236            
## total_yards  1   9.6444        30     34.592 0.001899 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
glance(fit.glm)
## # A tibble: 1 × 8
##   null.deviance df.null logLik   AIC   BIC deviance df.residual  nobs
##           <dbl>   <int>  <dbl> <dbl> <dbl>    <dbl>       <int> <int>
## 1          44.2      31  -17.3  38.6  41.5     34.6          30    32
yards_standings_log$fitted.values <- fit.glm$fitted.values
pred <- predict(fit.glm, newdata = yards_standings_log,
                type = "link", se.fit = TRUE)
yards_standings_log$fit <- pred$fit
yards_standings_log$se.fit <- pred$se.fit
# CI for fitted values
yards_standings_log <- within(yards_standings_log, {
  fit.lower = exp(fit - 1.96 * se.fit) / (1 + exp(fit - 1.96 * se.fit))
  fit.upper = exp(fit + 1.96 * se.fit) / (1 + exp(fit + 1.96 * se.fit))
})
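The inverse-logit transform above can be written more compactly with base R's plogis(), which computes exactly 1/(1 + exp(-x)); a minimal equivalent sketch:

# equivalent CI bounds on the probability scale via plogis()
yards_standings_log$fit.lower <- plogis(pred$fit - 1.96 * pred$se.fit)
yards_standings_log$fit.upper <- plogis(pred$fit + 1.96 * pred$se.fit)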

p <- ggplot(yards_standings_log, aes(x = total_yards, y = win_los))
# predicted curve and point-wise 95% CI
p <- p + geom_ribbon(aes(x = total_yards, ymin = fit.lower, ymax = fit.upper), alpha = 0.2)
p <- p + geom_line(aes(x = total_yards, y = fitted.values), color = "red")
# fitted values
p <- p + geom_point(aes(y = fitted.values), color = "red", size=2)
# observed values
p <- p + geom_point(size=2)
p <- p + labs(title = paste("Observed and predicted probability of case when wins outnumber losses")) + xlab("Total yards") + ylab("Probability") + theme_bw() 
print(p)

summary(yards_standings_log$total_yards)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1967    2262    2595    2603    2884    3413
hist(yards_standings_log$total_yards,col="red",main = "Histogram of Yards",xlab = "Total yards run")

3 BALL POSSESSIONS TIME

Ball possession time is another crucial metric of team activity; we apply the same logistic regression analysis as above.

poss_standings <- readRDS(file="poss_standings")
posses_log <- poss_standings %>% 
  mutate(win_los = if_else(wins > losses, 1, 0)) %>% 
  select(total_poss_mins, win_los, team)
datatable(posses_log)
glimpse(posses_log)
## Rows: 32
## Columns: 3
## $ total_poss_mins <dbl> 809.70, 809.43, 798.60, 778.90, 761.45, 761.20, 744.57…
## $ win_los         <dbl> 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, …
## $ team            <chr> "BAL", "NE", "SEA", "PHI", "SF", "NO", "DAL", "OAK", "…
#logistic regression model
fit.glm.2 <- glm(win_los~total_poss_mins,data = posses_log,family = binomial(link = "logit"))
fit.glm.2
## 
## Call:  glm(formula = win_los ~ total_poss_mins, family = binomial(link = "logit"), 
##     data = posses_log)
## 
## Coefficients:
##     (Intercept)  total_poss_mins  
##       -15.37641          0.02168  
## 
## Degrees of Freedom: 31 Total (i.e. Null);  30 Residual
## Null Deviance:       44.24 
## Residual Deviance: 36.18     AIC: 40.18
summary(fit.glm.2)
## 
## Call:
## glm(formula = win_los ~ total_poss_mins, family = binomial(link = "logit"), 
##     data = posses_log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8503  -0.9322  -0.4194   0.9477   1.7866  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)  
## (Intercept)     -15.376410   6.333475  -2.428   0.0152 *
## total_poss_mins   0.021683   0.008989   2.412   0.0159 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 44.236  on 31  degrees of freedom
## Residual deviance: 36.182  on 30  degrees of freedom
## AIC: 40.182
## 
## Number of Fisher Scoring iterations: 4
anova(fit.glm.2,test = "Chisq")
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: win_los
## 
## Terms added sequentially (first to last)
## 
## 
##                 Df Deviance Resid. Df Resid. Dev Pr(>Chi)   
## NULL                               31     44.236            
## total_poss_mins  1   8.0541        30     36.182  0.00454 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
glance(fit.glm.2)
## # A tibble: 1 × 8
##   null.deviance df.null logLik   AIC   BIC deviance df.residual  nobs
##           <dbl>   <int>  <dbl> <dbl> <dbl>    <dbl>       <int> <int>
## 1          44.2      31  -18.1  40.2  43.1     36.2          30    32
posses_log$fitted.values <- fit.glm.2$fitted.values
pred <- predict(fit.glm.2, newdata = posses_log,
                type = "link", se.fit = TRUE)
posses_log$fit <- pred$fit
posses_log$se.fit <- pred$se.fit
# CI for fitted values
posses_log <- within(posses_log, {
  fit.lower = exp(fit - 1.96 * se.fit) / (1 + exp(fit - 1.96 * se.fit))
  fit.upper = exp(fit + 1.96 * se.fit) / (1 + exp(fit + 1.96 * se.fit))
})

p <- ggplot(posses_log, aes(x = total_poss_mins, y = win_los))
# predicted curve and point-wise 95% CI
p <- p + geom_ribbon(aes(x = total_poss_mins, ymin = fit.lower, ymax = fit.upper), alpha = 0.2)
p <- p + geom_line(aes(x = total_poss_mins, y = fitted.values), color = "blue")
# fitted values
p <- p + geom_point(aes(y = fitted.values), color = "red", size=2)
# observed values
p <- p + geom_point(size=2)
p <- p + labs(title = paste("Observed and predicted probability of case when wins outnumber losses")) + xlab("Total ball possessions, minutes") + ylab("Probability") + theme_bw() 
print(p)

summary(posses_log$total_poss_mins)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   593.7   663.8   694.4   702.5   736.1   809.7
hist(posses_log$total_poss_mins,col="blue",main = "Histogram of ball possessions",xlab = "Total ball possessions, minutes")

4 TOTAL YARDS RUN AND TOTAL BALL POSSESSIONS

What if both metrics (yards and possessions) are used in a combined logistic regression model: does that improve anything?

yards_posses_log <- inner_join(yards_standings_log, posses_log, by = "team") %>% 
  mutate(win_los = win_los.x) %>% 
  select(team, total_yards, total_poss_mins, win_los)
yards_posses_log
## # A tibble: 32 × 4
##    team  total_yards total_poss_mins win_los
##    <chr>       <dbl>           <dbl>   <dbl>
##  1 SEA          3413            799.       1
##  2 SF           3379            761.       1
##  3 LA           3083            693.       1
##  4 BAL          3050            810.       1
##  5 NE           3004            809.       1
##  6 IND          2971            709.       1
##  7 DEN          2906            722.       0
##  8 DAL          2901            745.       1
##  9 CAR          2879            675.       0
## 10 CLE          2866            724.       0
## # … with 22 more rows
fit.glm.3 <- glm(win_los~total_poss_mins+total_yards,data = yards_posses_log,family = binomial(link = "logit"))
fit.glm.3
## 
## Call:  glm(formula = win_los ~ total_poss_mins + total_yards, family = binomial(link = "logit"), 
##     data = yards_posses_log)
## 
## Coefficients:
##     (Intercept)  total_poss_mins      total_yards  
##      -12.685075         0.008804         0.002437  
## 
## Degrees of Freedom: 31 Total (i.e. Null);  29 Residual
## Null Deviance:       44.24 
## Residual Deviance: 34.07     AIC: 40.07
summary(fit.glm.3)
## 
## Call:
## glm(formula = win_los ~ total_poss_mins + total_yards, family = binomial(link = "logit"), 
##     data = yards_posses_log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.5097  -0.7685  -0.4190   0.8284   1.8426  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)  
## (Intercept)     -12.685075   6.454712  -1.965   0.0494 *
## total_poss_mins   0.008804   0.012383   0.711   0.4771  
## total_yards       0.002437   0.001770   1.377   0.1685  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 44.236  on 31  degrees of freedom
## Residual deviance: 34.075  on 29  degrees of freedom
## AIC: 40.075
## 
## Number of Fisher Scoring iterations: 4
glance(fit.glm.3)
## # A tibble: 1 × 8
##   null.deviance df.null logLik   AIC   BIC deviance df.residual  nobs
##           <dbl>   <int>  <dbl> <dbl> <dbl>    <dbl>       <int> <int>
## 1          44.2      31  -17.0  40.1  44.5     34.1          29    32
anova(fit.glm.3,test = "Chisq")
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: win_los
## 
## Terms added sequentially (first to last)
## 
## 
##                 Df Deviance Resid. Df Resid. Dev Pr(>Chi)   
## NULL                               31     44.236            
## total_poss_mins  1   8.0541        30     36.182  0.00454 **
## total_yards      1   2.1077        29     34.075  0.14656   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
lrtest(fit.glm.3,fit.glm,fit.glm.2)
## Likelihood ratio test
## 
## Model 1: win_los ~ total_poss_mins + total_yards
## Model 2: win_los ~ total_yards
## Model 3: win_los ~ total_poss_mins
##   #Df  LogLik Df  Chisq Pr(>Chisq)    
## 1   3 -17.037                         
## 2   2 -17.296 -1 0.5174     0.4719    
## 3   2 -18.091  0 1.5903     <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Unfortunately, the combined model adds nothing: the likelihood ratio test shows that including total_poss_mins alongside total_yards does not significantly improve the fit (p = 0.47), likely because the two predictors are correlated. (The last comparison in the lrtest output, between two non-nested models with the same number of parameters and 0 df, is not a meaningful test.)
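To check the collinearity explanation, we can look at the correlation between the two predictors directly:

# how strongly are total yards and possession time related?
cor(yards_posses_log$total_yards, yards_posses_log$total_poss_mins)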

Still, we can predict the probability that wins outnumber losses for a team with 3000 total yards and 750 minutes of possession:

z <- exp(predict.glm(fit.glm,newdata = data.frame(total_yards=3000)))
z/(1+z)
##         1 
## 0.7668008
z2 <- exp(predict.glm(fit.glm.2,newdata = data.frame(total_poss_mins=750)))
z2/(1+z2)
##         1 
## 0.7080744
z3 <- exp(predict.glm(fit.glm.3,newdata = data.frame(total_yards=3000,total_poss_mins=750)))
z3/(1+z3)
##         1 
## 0.7734707
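Note that the manual z/(1+z) transform is equivalent to asking predict() for the response scale directly, e.g. for the first model:

# same probability, computed on the response scale
predict(fit.glm, newdata = data.frame(total_yards = 3000), type = "response")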

5 SPLITTING DATA SET AND TRAINING MODEL

MisClassError <- c()

N <- 3000
set.seed(12345)
for (i in 1:N) {
  # random 70/30 split of the 32 teams
  train <- sample_frac(yards_standings_log, 0.7, replace = FALSE)
  test <- yards_standings_log %>% setdiff(train)
  # note: the full-data model fit.glm is reused on every split; for a stricter
  # out-of-sample estimate it should be refit on each training sample, e.g.
  # fit.glm <- glm(win_los ~ total_yards, data = train, family = binomial(link = "logit"))
  res_test <- predict.glm(fit.glm, test, type = "response", se.fit = FALSE)
  # classify with a 0.5 probability cutoff and record the misclassification rate
  fitted.results <- ifelse(res_test > 0.5, 1, 0)
  MisClassError[i] <- round(mean(fitted.results != test$win_los), 3)
}

MCE <- 1 - MisClassError # note: MCE here holds per-split accuracy (1 - misclassification error)
summary(MCE)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.3000  0.6000  0.7000  0.7158  0.8000  1.0000
print(paste('Mean Classification Accuracy =',round(mean(MCE),3)))
## [1] "Mean Classification Accuracy = 0.716"
print(paste('95% Confidence interval for mean accuracy'))
## [1] "95% Confidence interval for mean accuracy"
print(paste('Accuracy <=',round(mean(MCE)+2*sd(MCE)/sqrt(N),3)))
## [1] "Accuracy <= 0.72"
print(paste('Accuracy >=',round(mean(MCE)-2*sd(MCE)/sqrt(N),3)))
## [1] "Accuracy >= 0.711"
ggplot(as.data.frame(MCE), aes(x = MCE)) + 
  geom_histogram(aes(y = after_stat(density)), bins = 8, colour = "black", fill = "green") +
  theme_bw() + 
  labs(title = "True Classification Proportion", x = "Proportion", y = "Density") +
  theme(plot.title = element_text(hjust = .5))
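As noted in the loop comments above, fit.glm was trained on all 32 teams, so the accuracy estimate is somewhat optimistic. A stricter variant refits the model on each training split before predicting on the held-out teams; a sketch of the loop body:

# refit on the training split for a genuine out-of-sample estimate
fit.train <- glm(win_los ~ total_yards, data = train,
                 family = binomial(link = "logit"))
res_test <- predict(fit.train, test, type = "response")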

summary(fit.glm)
## 
## Call:
## glm(formula = win_los ~ total_yards, family = binomial(link = "logit"), 
##     data = yards_standings_log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.5633  -0.7874  -0.4714   0.8525   1.8705  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)   
## (Intercept) -8.940121   3.360193  -2.661  0.00780 **
## total_yards  0.003377   0.001275   2.649  0.00807 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 44.236  on 31  degrees of freedom
## Residual deviance: 34.592  on 30  degrees of freedom
## AIC: 38.592
## 
## Number of Fisher Scoring iterations: 4
pR2(fit.glm)
## fitting null model for pseudo-r2
##         llh     llhNull          G2    McFadden        r2ML        r2CU 
## -17.2959824 -22.1181690   9.6443732   0.2180192   0.2602083   0.3473980
anova(fit.glm, test="Chisq")
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: win_los
## 
## Terms added sequentially (first to last)
## 
## 
##             Df Deviance Resid. Df Resid. Dev Pr(>Chi)   
## NULL                           31     44.236            
## total_yards  1   9.6444        30     34.592 0.001899 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

6 INTERPRETING LOGISTIC REGRESSION MODEL

For the mathematics behind logistic regression, see https://en.wikipedia.org/wiki/Logistic_regression.

\[\log{\frac{p}{1-p}}={b_0 + b_1x}\] or \[p=\frac{1}{1+e^{-(b_0+b_1x)}}\]
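For example, plugging the fit.glm coefficients from Section 2 into the second form for a team with 3000 total yards gives

\[p=\frac{1}{1+e^{-(-8.9401+0.003377\cdot3000)}}=\frac{1}{1+e^{-1.191}}\approx 0.767,\]

which matches the prediction obtained in Section 4.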

As we can see, in the single-predictor models both total_poss_mins and total_yards are statistically significant: the low p-values suggest a strong association of ball possession time and yards run with the probability that a team's wins outnumber its losses. So both metrics (possessions and yards) are relevant indicators of a team's capabilities.
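The coefficients are easier to read on the odds scale: since exp(0.003377 × 100) ≈ 1.40, each additional 100 total yards multiplies the odds of a winning record by roughly 1.4. A minimal sketch for fit.glm:

# odds ratios for the fitted coefficients
exp(coef(fit.glm))
# odds multiplier per additional 100 total yards
exp(100 * coef(fit.glm)[["total_yards"]])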

The difference between the null deviance and the residual deviance shows how our model is doing against the null model (a model with only the intercept). The wider this gap, the better.
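For fit.glm this gap can be computed directly, and it matches the deviance drop of 9.644 on 1 degree of freedom reported in the anova table above:

# deviance explained by total_yards and its chi-squared p-value
fit.glm$null.deviance - fit.glm$deviance
pchisq(fit.glm$null.deviance - fit.glm$deviance, df = 1, lower.tail = FALSE)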

7 SUPPORT VECTOR MACHINE

An alternative approach to classifying the win-loss records of NFL teams over the season is the support vector machine (SVM) method. See https://en.wikibooks.org/wiki/Data_Mining_Algorithms_In_R/Classification/SVM.

library(e1071)

model.svm<-svm(win_los~total_poss_mins+total_yards,data = yards_posses_log,
               kernel="linear",type="C-classification",scale=T,probability=T) 
pred_svm<-predict(model.svm, yards_posses_log, probability=T) 
cbind(attr(pred_svm,which = "probabilities"),yards_posses_log)
##            1         0 team total_yards total_poss_mins win_los
## 1  0.8577219 0.1422781  SEA        3413          798.60       1
## 2  0.8218595 0.1781405   SF        3379          761.45       1
## 3  0.6526304 0.3473696   LA        3083          692.77       1
## 4  0.7738268 0.2261732  BAL        3050          809.70       1
## 5  0.7593010 0.2406990   NE        3004          809.43       1
## 6  0.6288017 0.3711983  IND        2971          708.88       1
## 7  0.6199111 0.3800889  DEN        2906          722.18       0
## 8  0.6470814 0.3529186  DAL        2901          744.57       1
## 9  0.5439047 0.4560953  CAR        2879          674.75       0
## 10 0.6061263 0.3938737  CLE        2866          724.17       0
## 11 0.6139427 0.3860573  MIN        2865          730.33       1
## 12 0.6525640 0.3474360   NO        2861          761.20       1
## 13 0.5969285 0.4030715  OAK        2803          736.77       0
## 14 0.5857945 0.4142055  LAC        2787          733.50       1
## 15 0.4749340 0.5250660  TEN        2634          701.10       0
## 16 0.5719342 0.4280658  PHI        2607          778.90       0
## 17 0.5017222 0.4982778  HOU        2583          735.93       1
## 18 0.3316208 0.6683792   KC        2466          645.98       1
## 19 0.3932347 0.6067653  DET        2458          696.02       0
## 20 0.3451749 0.6548251  BUF        2438          665.40       1
## 21 0.3170904 0.6829096  NYG        2413          650.52       0
## 22 0.3190805 0.6809195   GB        2325          679.30       0
## 23 0.2706461 0.7293539  JAX        2283          650.70       0
## 24 0.3130217 0.6869783  CHI        2267          692.20       1
## 25 0.2595780 0.7404220  WAS        2249          651.07       0
## 26 0.2850570 0.7149430  NYJ        2209          686.32       0
## 27 0.2220870 0.7779130  CIN        2207          627.45       0
## 28 0.2546933 0.7453067  PIT        2186          665.95       1
## 29 0.2246225 0.7753775  ATL        2113          659.05       0
## 30 0.1561541 0.8438459  ARI        2066          593.73       0
## 31 0.1618936 0.8381064  MIA        2057          604.15       0
## 32 0.2105381 0.7894619   TB        1967          689.35       0
table(pred_svm,yards_posses_log$win_los)#confusion matrix
##         
## pred_svm  0  1
##        0 12  4
##        1  5 11
mean(pred_svm == yards_posses_log$win_los) #classification accuracy
## [1] 0.71875
plot(model.svm, yards_posses_log, total_poss_mins ~ total_yards, col = c("blue", "red"))
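The cost parameter of the linear SVM was left at its default above; a possible next step is to tune it with e1071's built-in cross-validated grid search (a sketch, with an illustrative cost grid):

# cross-validated grid search over the SVM cost parameter
set.seed(12345)
tuned <- tune.svm(factor(win_los) ~ total_poss_mins + total_yards,
                  data = yards_posses_log,
                  kernel = "linear", cost = 2^(-2:4))
summary(tuned)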