The NFL (National Football League) has launched the fourth annual Big Data Bowl powered by Amazon Web Services, Inc. (AWS), a crowdsourcing competition designed to engage and empower the football analytics community, drive innovation, create new insights, and make the game more exciting for fans (see https://nflcommunications.com/Pages/NFL-Announces-Fourth-Annual-Big-Data-Bowl-Powered-by-AWS.aspx).
For demonstration purposes we use open-source data (https://www.kaggle.com/etsc9287/nfl-2018-eda-and-feature-engineering) and fit a logistic regression model to it.
For an earlier analysis of the Super Bowl LI (2016) game, see https://rpubs.com/alex-lev/247556.
Total yards run is a crucial metric of a team's activity over the whole season. The open question is how much of that activity translates into wins rather than losses, i.e. a positive win-loss difference.
To answer it we create a new binary variable, win_los ∈ {0, 1}, by the simple rule win_los = if_else(wins > losses, 1, 0).
library(tidyverse)
library(broom)
library(extrafont)
library(DT)
library(lmtest)
library(pscl)
#source data for analysis
#See NFL Big Data Bowl 2018 Analysis by Ethan Schacht
#https://www.kaggle.com/etsc9287/nfl-2018-eda-and-feature-engineering
yards_standings <- readRDS(file="yards_standings")
#appending the new variable win_los
yards_standings_log <- yards_standings%>%
mutate(win_los=if_else(wins>losses,1,0)) %>%
select_at(vars(total_yards,win_los,team))
glimpse(yards_standings_log)
## Rows: 32
## Columns: 3
## $ total_yards <dbl> 3413, 3379, 3083, 3050, 3004, 2971, 2906, 2901, 2879, 2866…
## $ win_los <dbl> 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1…
## $ team <chr> "SEA", "SF", "LA", "BAL", "NE", "IND", "DEN", "DAL", "CAR"…
datatable(yards_standings_log)
#logistic regression model
fit.glm <- glm(win_los~total_yards,data = yards_standings_log,family = binomial(link = "logit"))
fit.glm
##
## Call: glm(formula = win_los ~ total_yards, family = binomial(link = "logit"),
## data = yards_standings_log)
##
## Coefficients:
## (Intercept) total_yards
## -8.940121 0.003377
##
## Degrees of Freedom: 31 Total (i.e. Null); 30 Residual
## Null Deviance: 44.24
## Residual Deviance: 34.59 AIC: 38.59
summary(fit.glm)
##
## Call:
## glm(formula = win_los ~ total_yards, family = binomial(link = "logit"),
## data = yards_standings_log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5633 -0.7874 -0.4714 0.8525 1.8705
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.940121 3.360193 -2.661 0.00780 **
## total_yards 0.003377 0.001275 2.649 0.00807 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 44.236 on 31 degrees of freedom
## Residual deviance: 34.592 on 30 degrees of freedom
## AIC: 38.592
##
## Number of Fisher Scoring iterations: 4
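On the odds scale the slope is easier to interpret. Exponentiating the coefficient (a quick check, not part of the original output) gives the multiplicative change in the odds of a winning season:
# each additional 100 total yards multiplies the odds of wins > losses
# by exp(100 * 0.003377), i.e. roughly 1.4
exp(coef(fit.glm)["total_yards"] * 100)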
anova(fit.glm, test = "Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: win_los
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 31 44.236
## total_yards 1 9.6444 30 34.592 0.001899 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
glance(fit.glm)
## # A tibble: 1 × 8
## null.deviance df.null logLik AIC BIC deviance df.residual nobs
## <dbl> <int> <dbl> <dbl> <dbl> <dbl> <int> <int>
## 1 44.2 31 -17.3 38.6 41.5 34.6 30 32
yards_standings_log$fitted.values <- fit.glm$fitted.values
pred <- predict.glm(fit.glm,
data.frame(total_yards=yards_standings_log$total_yards,win_los = yards_standings_log$win_los),
type = "link", se.fit = TRUE)
yards_standings_log$fit <- pred$fit
yards_standings_log$se.fit <- pred$se.fit
# CI for fitted values
yards_standings_log <- within(yards_standings_log, {
fit.lower = exp(fit - 1.96 * se.fit) / (1 + exp(fit - 1.96 * se.fit))
fit.upper = exp(fit + 1.96 * se.fit) / (1 + exp(fit + 1.96 * se.fit))
})
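The inverse-logit transform above can be written more compactly with base R's plogis(), which computes exp(x)/(1 + exp(x)) directly; an equivalent sketch:
# same CI bounds via the built-in logistic CDF
yards_standings_log <- within(yards_standings_log, {
  fit.lower = plogis(fit - 1.96 * se.fit)
  fit.upper = plogis(fit + 1.96 * se.fit)
})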
p <- ggplot(yards_standings_log, aes(x = total_yards, y = win_los))
# predicted curve and point-wise 95% CI
p <- p + geom_ribbon(aes(x = total_yards, ymin = fit.lower, ymax = fit.upper), alpha = 0.2)
p <- p + geom_line(aes(x = total_yards, y = fitted.values), color = "red")
# fitted values
p <- p + geom_point(aes(y = fitted.values), color = "red", size=2)
# observed values
p <- p + geom_point(size=2)
p <- p + labs(title = "Observed and predicted probability that wins outnumber losses") + xlab("Total yards") + ylab("Probability") + theme_bw()
print(p)
summary(yards_standings_log$total_yards)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1967 2262 2595 2603 2884 3413
hist(yards_standings_log$total_yards, col = "red", main = "Histogram of Yards", xlab = "Total yards run")
Here is one more crucial metric of team activity, total ball possession time, analyzed with logistic regression in the same way as above.
poss_standings <- readRDS(file="poss_standings")
posses_log <- poss_standings%>%
mutate(win_los=if_else(wins>losses,1,0)) %>%
select_at(vars(total_poss_mins,win_los,team))
datatable(posses_log)
glimpse(posses_log)
## Rows: 32
## Columns: 3
## $ total_poss_mins <dbl> 809.70, 809.43, 798.60, 778.90, 761.45, 761.20, 744.57…
## $ win_los <dbl> 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, …
## $ team <chr> "BAL", "NE", "SEA", "PHI", "SF", "NO", "DAL", "OAK", "…
#logistic regression model
fit.glm.2 <- glm(win_los~total_poss_mins,data = posses_log,family = binomial(link = "logit"))
fit.glm.2
##
## Call: glm(formula = win_los ~ total_poss_mins, family = binomial(link = "logit"),
## data = posses_log)
##
## Coefficients:
## (Intercept) total_poss_mins
## -15.37641 0.02168
##
## Degrees of Freedom: 31 Total (i.e. Null); 30 Residual
## Null Deviance: 44.24
## Residual Deviance: 36.18 AIC: 40.18
summary(fit.glm.2)
##
## Call:
## glm(formula = win_los ~ total_poss_mins, family = binomial(link = "logit"),
## data = posses_log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8503 -0.9322 -0.4194 0.9477 1.7866
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -15.376410 6.333475 -2.428 0.0152 *
## total_poss_mins 0.021683 0.008989 2.412 0.0159 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 44.236 on 31 degrees of freedom
## Residual deviance: 36.182 on 30 degrees of freedom
## AIC: 40.182
##
## Number of Fisher Scoring iterations: 4
anova(fit.glm.2, test = "Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: win_los
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 31 44.236
## total_poss_mins 1 8.0541 30 36.182 0.00454 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
glance(fit.glm.2)
## # A tibble: 1 × 8
## null.deviance df.null logLik AIC BIC deviance df.residual nobs
## <dbl> <int> <dbl> <dbl> <dbl> <dbl> <int> <int>
## 1 44.2 31 -18.1 40.2 43.1 36.2 30 32
posses_log$fitted.values <- fit.glm.2$fitted.values
pred <- predict.glm(fit.glm.2,
data.frame(total_poss_mins=posses_log$total_poss_mins,win_los = posses_log$win_los),
type = "link", se.fit = TRUE)
posses_log$fit <- pred$fit
posses_log$se.fit <- pred$se.fit
# CI for fitted values
posses_log <- within(posses_log, {
fit.lower = exp(fit - 1.96 * se.fit) / (1 + exp(fit - 1.96 * se.fit))
fit.upper = exp(fit + 1.96 * se.fit) / (1 + exp(fit + 1.96 * se.fit))
})
p <- ggplot(posses_log, aes(x = total_poss_mins, y = win_los))
# predicted curve and point-wise 95% CI
p <- p + geom_ribbon(aes(x = total_poss_mins, ymin = fit.lower, ymax = fit.upper), alpha = 0.2)
p <- p + geom_line(aes(x = total_poss_mins, y = fitted.values), color = "blue")
# fitted values
p <- p + geom_point(aes(y = fitted.values), color = "red", size=2)
# observed values
p <- p + geom_point(size=2)
p <- p + labs(title = "Observed and predicted probability that wins outnumber losses") + xlab("Total ball possession, minutes") + ylab("Probability") + theme_bw()
print(p)
summary(posses_log$total_poss_mins)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 593.7 663.8 694.4 702.5 736.1 809.7
hist(posses_log$total_poss_mins, col = "blue", main = "Histogram of ball possessions", xlab = "Total ball possession, minutes")
What if the two metrics (yards and possessions) are combined in a single logistic regression model: does that make sense?
yards_posses_log <- inner_join(yards_standings_log, posses_log, by = "team") %>%
mutate(win_los = win_los.x) %>%
select(team, total_yards, total_poss_mins, win_los)
yards_posses_log
## # A tibble: 32 × 4
## team total_yards total_poss_mins win_los
## <chr> <dbl> <dbl> <dbl>
## 1 SEA 3413 799. 1
## 2 SF 3379 761. 1
## 3 LA 3083 693. 1
## 4 BAL 3050 810. 1
## 5 NE 3004 809. 1
## 6 IND 2971 709. 1
## 7 DEN 2906 722. 0
## 8 DAL 2901 745. 1
## 9 CAR 2879 675. 0
## 10 CLE 2866 724. 0
## # … with 22 more rows
fit.glm.3 <- glm(win_los~total_poss_mins+total_yards,data = yards_posses_log,family = binomial(link = "logit"))
fit.glm.3
##
## Call: glm(formula = win_los ~ total_poss_mins + total_yards, family = binomial(link = "logit"),
## data = yards_posses_log)
##
## Coefficients:
## (Intercept) total_poss_mins total_yards
## -12.685075 0.008804 0.002437
##
## Degrees of Freedom: 31 Total (i.e. Null); 29 Residual
## Null Deviance: 44.24
## Residual Deviance: 34.07 AIC: 40.07
summary(fit.glm.3)
##
## Call:
## glm(formula = win_los ~ total_poss_mins + total_yards, family = binomial(link = "logit"),
## data = yards_posses_log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5097 -0.7685 -0.4190 0.8284 1.8426
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -12.685075 6.454712 -1.965 0.0494 *
## total_poss_mins 0.008804 0.012383 0.711 0.4771
## total_yards 0.002437 0.001770 1.377 0.1685
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 44.236 on 31 degrees of freedom
## Residual deviance: 34.075 on 29 degrees of freedom
## AIC: 40.075
##
## Number of Fisher Scoring iterations: 4
glance(fit.glm.3)
## # A tibble: 1 × 8
## null.deviance df.null logLik AIC BIC deviance df.residual nobs
## <dbl> <int> <dbl> <dbl> <dbl> <dbl> <int> <int>
## 1 44.2 31 -17.0 40.1 44.5 34.1 29 32
anova(fit.glm.3, test = "Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: win_los
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 31 44.236
## total_poss_mins 1 8.0541 30 36.182 0.00454 **
## total_yards 1 2.1077 29 34.075 0.14656
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
lrtest(fit.glm.3, fit.glm, fit.glm.2)
## Likelihood ratio test
##
## Model 1: win_los ~ total_poss_mins + total_yards
## Model 2: win_los ~ total_yards
## Model 3: win_los ~ total_poss_mins
## #Df LogLik Df Chisq Pr(>Chisq)
## 1 3 -17.037
## 2 2 -17.296 -1 0.5174 0.4719
## 3 2 -18.091 0 1.5903 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Unfortunately, the combined model brings no significant improvement over the single-predictor models above (fit.glm, fit.glm.2): neither coefficient remains individually significant, and adding total_yards after total_poss_mins does not reduce the deviance significantly.
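A likely reason is collinearity: teams that gain more yards also tend to hold the ball longer, so the two predictors carry overlapping information. A quick check (a minimal sketch, not part of the original analysis):
# correlation between the two predictors; a high value would explain why
# neither coefficient stays significant once the other is in the model
cor(yards_posses_log$total_yards, yards_posses_log$total_poss_mins)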
Still, we can predict the probability that wins outnumber losses for a team with 3000 total yards and 750 minutes of possession:
z <- exp(predict.glm(fit.glm,newdata = data.frame(total_yards=3000)))
z/(1+z)
## 1
## 0.7668008
z2 <- exp(predict.glm(fit.glm.2,newdata = data.frame(total_poss_mins=750)))
z2/(1+z2)
## 1
## 0.7080744
z3 <- exp(predict.glm(fit.glm.3,newdata = data.frame(total_yards=3000,total_poss_mins=750)))
z3/(1+z3)
## 1
## 0.7734707
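The same probabilities can be obtained in one step by asking predict() for the response scale, which applies the inverse logit internally:
# equivalent one-liners; each returns the corresponding probability above
predict(fit.glm, newdata = data.frame(total_yards = 3000), type = "response")
predict(fit.glm.2, newdata = data.frame(total_poss_mins = 750), type = "response")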
MisClassError <- c()
N <- 3000
set.seed(12345)
for (i in 1:N) {
train <- sample_frac(yards_standings_log, 0.7, replace = FALSE)
test <- yards_standings_log %>% setdiff(train)
#score the model (fitted on all 32 teams above) on the held-out 30%
res_test <- predict.glm(fit.glm, test, type = "response", se.fit = FALSE)
fitted.results <- ifelse(res_test > 0.5, 1, 0)
MisClassError[i] <- round(mean(fitted.results != test$win_los), 3)
}
MCE <- 1 - MisClassError #proportion correctly classified (accuracy)
summary(MCE)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.3000 0.6000 0.7000 0.7158 0.8000 1.0000
print(paste('Mean Classification Accuracy =',round(mean(MCE),3)))
## [1] "Mean Classification Accuracy = 0.716"
print(paste('95% Confidence interval for MCE'))
## [1] "95% Confidence interval for MCE"
print(paste('MCE <=',round(mean(MCE)+2*sd(MCE)/sqrt(N),3)))
## [1] "MCE <= 0.72"
print(paste('MCE >=',round(mean(MCE)-2*sd(MCE)/sqrt(N),3)))
## [1] "MCE >= 0.711"
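The interval above relies on a normal approximation for the mean accuracy. An assumption-lighter alternative is the empirical quantile interval of the resampled accuracies (a sketch, not part of the original output):
# empirical 95% interval of the resampled classification accuracy
quantile(MCE, probs = c(0.025, 0.975))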
ggplot(as.data.frame(1-MisClassError), aes(x=1-MisClassError)) +
geom_histogram(aes(y=..density..), bins = 8,colour="black", fill="green")+theme_bw() + labs(title="True Classification Proportion") + xlab("Proportion") + ylab("Density")+
theme(plot.title = element_text(hjust = .5))
summary(fit.glm)
##
## Call:
## glm(formula = win_los ~ total_yards, family = binomial(link = "logit"),
## data = yards_standings_log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5633 -0.7874 -0.4714 0.8525 1.8705
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.940121 3.360193 -2.661 0.00780 **
## total_yards 0.003377 0.001275 2.649 0.00807 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 44.236 on 31 degrees of freedom
## Residual deviance: 34.592 on 30 degrees of freedom
## AIC: 38.592
##
## Number of Fisher Scoring iterations: 4
pR2(fit.glm)
## fitting null model for pseudo-r2
## llh llhNull G2 McFadden r2ML r2CU
## -17.2959824 -22.1181690 9.6443732 0.2180192 0.2602083 0.3473980
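McFadden's pseudo-R² reported by pR2() can be reproduced by hand as 1 - logLik(model)/logLik(null model); a minimal check:
# 1 - (-17.296)/(-22.118), i.e. about 0.218 as in the pR2() output
1 - as.numeric(logLik(fit.glm))/as.numeric(logLik(update(fit.glm, . ~ 1)))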
anova(fit.glm, test = "Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: win_los
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 31 44.236
## total_yards 1 9.6444 30 34.592 0.001899 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
For the mathematics behind logistic regression, see https://en.wikipedia.org/wiki/Logistic_regression:
\[\log{\frac{p}{1-p}}={b_0 + b_1x}\] or \[p=\frac{1}{1+e^{-(b_0+b_1x)}}\]
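To tie the formula to the fitted model, the earlier prediction at 3000 total yards can be reproduced directly from the coefficients (plogis is the inverse logit):
b <- coef(fit.glm)
plogis(b[[1]] + b[[2]]*3000) #about 0.767, as in the prediction above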
As we can see, both total_poss_mins and total_yards are statistically significant in their respective single-predictor models: the low p-values suggest a strong association of ball possession time and yards run with the probability that a team's wins outnumber its losses. So both metrics (possessions and yards) prove relevant to a team's capabilities.
The difference between the null deviance and the residual deviance shows how our model is doing against the null model (a model with only the intercept). The wider this gap, the better.
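That gap can be tested formally: the drop in deviance is approximately chi-squared distributed, with degrees of freedom equal to the number of parameters added:
# p-value for the 44.236 - 34.592 = 9.644 deviance drop on 1 df;
# about 0.0019, matching Pr(>Chi) in the anova table above
with(fit.glm, pchisq(null.deviance - deviance, df.null - df.residual, lower.tail = FALSE))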
An alternative option for classifying the win-loss records of NFL teams over the season is the support vector machine (SVM) method. See https://en.wikibooks.org/wiki/Data_Mining_Algorithms_In_R/Classification/SVM.
library(e1071)
model.svm<-svm(win_los~total_poss_mins+total_yards,data = yards_posses_log,
kernel="linear",type="C-classification",scale=T,probability=T)
pred_svm<-predict(model.svm, yards_posses_log, probability=T)
cbind(attr(pred_svm, which = "probabilities"), yards_posses_log)
## 1 0 team total_yards total_poss_mins win_los
## 1 0.8577219 0.1422781 SEA 3413 798.60 1
## 2 0.8218595 0.1781405 SF 3379 761.45 1
## 3 0.6526304 0.3473696 LA 3083 692.77 1
## 4 0.7738268 0.2261732 BAL 3050 809.70 1
## 5 0.7593010 0.2406990 NE 3004 809.43 1
## 6 0.6288017 0.3711983 IND 2971 708.88 1
## 7 0.6199111 0.3800889 DEN 2906 722.18 0
## 8 0.6470814 0.3529186 DAL 2901 744.57 1
## 9 0.5439047 0.4560953 CAR 2879 674.75 0
## 10 0.6061263 0.3938737 CLE 2866 724.17 0
## 11 0.6139427 0.3860573 MIN 2865 730.33 1
## 12 0.6525640 0.3474360 NO 2861 761.20 1
## 13 0.5969285 0.4030715 OAK 2803 736.77 0
## 14 0.5857945 0.4142055 LAC 2787 733.50 1
## 15 0.4749340 0.5250660 TEN 2634 701.10 0
## 16 0.5719342 0.4280658 PHI 2607 778.90 0
## 17 0.5017222 0.4982778 HOU 2583 735.93 1
## 18 0.3316208 0.6683792 KC 2466 645.98 1
## 19 0.3932347 0.6067653 DET 2458 696.02 0
## 20 0.3451749 0.6548251 BUF 2438 665.40 1
## 21 0.3170904 0.6829096 NYG 2413 650.52 0
## 22 0.3190805 0.6809195 GB 2325 679.30 0
## 23 0.2706461 0.7293539 JAX 2283 650.70 0
## 24 0.3130217 0.6869783 CHI 2267 692.20 1
## 25 0.2595780 0.7404220 WAS 2249 651.07 0
## 26 0.2850570 0.7149430 NYJ 2209 686.32 0
## 27 0.2220870 0.7779130 CIN 2207 627.45 0
## 28 0.2546933 0.7453067 PIT 2186 665.95 1
## 29 0.2246225 0.7753775 ATL 2113 659.05 0
## 30 0.1561541 0.8438459 ARI 2066 593.73 0
## 31 0.1618936 0.8381064 MIA 2057 604.15 0
## 32 0.2105381 0.7894619 TB 1967 689.35 0
table(pred_svm, yards_posses_log$win_los) #confusion matrix
##
## pred_svm 0 1
## 0 12 4
## 1 5 11
sum(ifelse(pred_svm == yards_posses_log$win_los, 1, 0))/
length(yards_posses_log$team) #classification accuracy
## [1] 0.71875
plot(model.svm, yards_posses_log, total_poss_mins ~ total_yards, col = c("blue", "red"))
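Note that the 0.72 accuracy above is measured on the same data the SVM was trained on and is therefore optimistic. As a sketch (the cost grid below is an arbitrary choice), e1071's tune.svm runs 10-fold cross-validation of the linear SVM:
# cross-validated error over a small grid of cost values
set.seed(12345)
tuned <- tune.svm(factor(win_los) ~ total_poss_mins + total_yards,
                  data = yards_posses_log, kernel = "linear", cost = 2^(-2:4))
summary(tuned)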