In this project, I will be exploring, visualizing, analyzing, and modeling 2019 NFL play by play data that is publicly available online. The goal of this project is to learn something and gain interesting insights that would not have been possible without data. I hope to come to a conclusion at the end that is useful and applicable for a wide variety of people.
Here, I clear my workspace, load some useful packages, and read in the data set.
rm(list = ls())
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.0 ✓ purrr 0.3.4
## ✓ tibble 3.0.1 ✓ dplyr 0.8.5
## ✓ tidyr 1.0.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(nflfastR)
## Loading required package: nflscrapR
## Loading required package: nnet
## Loading required package: magrittr
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
library(ggimage)
pbp = readRDS(url('https://raw.githubusercontent.com/guga31bb/nflfastR-data/master/data/play_by_play_2019.rds'))
Here, we will look at the first 5 rows of the data set to see what we’re working with. Next, we’ll at the strucutre, so that we can get a better understanding of the variables used.
head(pbp)
## # A tibble: 6 x 306
## play_id game_id home_team away_team posteam posteam_type defteam side_of_field
## <dbl> <dbl> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 35 2.02e9 CHI GB GB away CHI CHI
## 2 50 2.02e9 CHI GB GB away CHI GB
## 3 71 2.02e9 CHI GB GB away CHI GB
## 4 95 2.02e9 CHI GB GB away CHI GB
## 5 125 2.02e9 CHI GB GB away CHI GB
## 6 155 2.02e9 CHI GB CHI home GB CHI
## # … with 298 more variables: yardline_100 <dbl>, game_date <date>,
## # quarter_seconds_remaining <dbl>, half_seconds_remaining <dbl>,
## # game_seconds_remaining <dbl>, game_half <chr>, quarter_end <dbl>,
## # drive <dbl>, sp <dbl>, qtr <dbl>, down <dbl>, goal_to_go <dbl>, time <chr>,
## # yrdln <chr>, ydstogo <dbl>, ydsnet <dbl>, desc <chr>, play_type <chr>,
## # yards_gained <dbl>, shotgun <dbl>, no_huddle <dbl>, qb_dropback <dbl>,
## # qb_kneel <dbl>, qb_spike <dbl>, qb_scramble <dbl>, pass_length <chr>,
## # pass_location <chr>, air_yards <dbl>, yards_after_catch <dbl>,
## # run_location <chr>, run_gap <chr>, field_goal_result <chr>,
## # kick_distance <dbl>, extra_point_result <chr>, two_point_conv_result <chr>,
## # home_timeouts_remaining <dbl>, away_timeouts_remaining <dbl>,
## # timeout <dbl>, timeout_team <chr>, td_team <chr>,
## # posteam_timeouts_remaining <dbl>, defteam_timeouts_remaining <dbl>,
## # total_home_score <dbl>, total_away_score <dbl>, posteam_score <dbl>,
## # defteam_score <dbl>, score_differential <dbl>, posteam_score_post <dbl>,
## # defteam_score_post <dbl>, score_differential_post <dbl>,
## # no_score_prob <dbl>, opp_fg_prob <dbl>, opp_safety_prob <dbl>,
## # opp_td_prob <dbl>, fg_prob <dbl>, safety_prob <dbl>, td_prob <dbl>,
## # extra_point_prob <dbl>, two_point_conversion_prob <dbl>, ep <dbl>,
## # epa <dbl>, total_home_epa <dbl>, total_away_epa <dbl>,
## # total_home_rush_epa <dbl>, total_away_rush_epa <dbl>,
## # total_home_pass_epa <dbl>, total_away_pass_epa <dbl>, air_epa <dbl>,
## # yac_epa <dbl>, comp_air_epa <dbl>, comp_yac_epa <dbl>,
## # total_home_comp_air_epa <dbl>, total_away_comp_air_epa <dbl>,
## # total_home_comp_yac_epa <dbl>, total_away_comp_yac_epa <dbl>,
## # total_home_raw_air_epa <dbl>, total_away_raw_air_epa <dbl>,
## # total_home_raw_yac_epa <dbl>, total_away_raw_yac_epa <dbl>, wp <dbl>,
## # def_wp <dbl>, home_wp <dbl>, away_wp <dbl>, wpa <dbl>, home_wp_post <dbl>,
## # away_wp_post <dbl>, total_home_rush_wpa <dbl>, total_away_rush_wpa <dbl>,
## # total_home_pass_wpa <dbl>, total_away_pass_wpa <dbl>, air_wpa <dbl>,
## # yac_wpa <dbl>, comp_air_wpa <dbl>, comp_yac_wpa <dbl>,
## # total_home_comp_air_wpa <dbl>, total_away_comp_air_wpa <dbl>,
## # total_home_comp_yac_wpa <dbl>, total_away_comp_yac_wpa <dbl>,
## # total_home_raw_air_wpa <dbl>, total_away_raw_air_wpa <dbl>, …
str(pbp)
## tibble [47,480 × 306] (S3: tbl_df/tbl/data.frame)
## $ play_id : num [1:47480] 35 50 71 95 125 155 195 216 238 260 ...
## $ game_id : num [1:47480] 2.02e+09 2.02e+09 2.02e+09 2.02e+09 2.02e+09 ...
## $ home_team : chr [1:47480] "CHI" "CHI" "CHI" "CHI" ...
## $ away_team : chr [1:47480] "GB" "GB" "GB" "GB" ...
## $ posteam : chr [1:47480] "GB" "GB" "GB" "GB" ...
## $ posteam_type : chr [1:47480] "away" "away" "away" "away" ...
## $ defteam : chr [1:47480] "CHI" "CHI" "CHI" "CHI" ...
## $ side_of_field : chr [1:47480] "CHI" "GB" "GB" "GB" ...
## $ yardline_100 : num [1:47480] 35 75 75 75 85 57 52 47 47 40 ...
## $ game_date : Date[1:47480], format: "2019-09-05" "2019-09-05" ...
## $ quarter_seconds_remaining : num [1:47480] 900 900 873 825 795 781 753 718 714 672 ...
## $ half_seconds_remaining : num [1:47480] 1800 1800 1773 1725 1695 ...
## $ game_seconds_remaining : num [1:47480] 3600 3600 3573 3525 3495 ...
## $ game_half : chr [1:47480] "Half1" "Half1" "Half1" "Half1" ...
## $ quarter_end : num [1:47480] 0 0 0 0 0 0 0 0 0 0 ...
## $ drive : num [1:47480] 1 1 1 1 1 2 2 2 2 2 ...
## $ sp : num [1:47480] 0 0 0 0 0 0 0 0 0 0 ...
## $ qtr : num [1:47480] 1 1 1 1 1 1 1 1 1 1 ...
## $ down : num [1:47480] NA 1 2 3 4 1 1 2 3 1 ...
## $ goal_to_go : num [1:47480] 0 0 0 0 0 0 0 0 0 0 ...
## $ time : chr [1:47480] "15:00" "15:00" "14:33" "13:45" ...
## $ yrdln : chr [1:47480] "CHI 35" "GB 25" "GB 25" "GB 25" ...
## $ ydstogo : num [1:47480] 0 10 10 10 20 10 10 5 5 10 ...
## $ ydsnet : num [1:47480] -10 -10 -10 -10 -10 12 12 12 12 12 ...
## $ desc : chr [1:47480] "15-E.Pineiro kicks 65 yards from CHI 35 to end zone, Touchback." "(15:00) 33-A.Jones left tackle to GB 25 for no gain (58-R.Smith)." "(14:33) 12-A.Rodgers pass short left to 33-A.Jones to GB 25 for no gain (58-R.Smith)." "(13:45) (Shotgun) 12-A.Rodgers sacked at GB 15 for -10 yards (95-R.Robertson-Harris). Penalty on GB-65-L.Taylor"| __truncated__ ...
## $ play_type : chr [1:47480] "kickoff" "run" "pass" "pass" ...
## $ yards_gained : num [1:47480] 0 0 0 -10 0 0 5 0 7 0 ...
## $ shotgun : num [1:47480] 0 0 0 1 0 1 1 1 1 1 ...
## $ no_huddle : num [1:47480] 0 0 0 0 0 0 0 0 0 0 ...
## $ qb_dropback : num [1:47480] 0 0 1 1 0 0 0 1 1 1 ...
## $ qb_kneel : num [1:47480] 0 0 0 0 0 0 0 0 0 0 ...
## $ qb_spike : num [1:47480] 0 0 0 0 0 0 0 0 0 0 ...
## $ qb_scramble : num [1:47480] 0 0 0 0 0 0 0 0 1 0 ...
## $ pass_length : chr [1:47480] NA NA "short" NA ...
## $ pass_location : chr [1:47480] NA NA "left" NA ...
## $ air_yards : num [1:47480] NA NA -1 NA NA NA NA 2 NA 15 ...
## $ yards_after_catch : num [1:47480] NA NA 1 NA NA NA NA NA NA NA ...
## $ run_location : chr [1:47480] NA "left" NA NA ...
## $ run_gap : chr [1:47480] NA "tackle" NA NA ...
## $ field_goal_result : chr [1:47480] NA NA NA NA ...
## $ kick_distance : num [1:47480] NA NA NA NA 53 NA NA NA NA NA ...
## $ extra_point_result : chr [1:47480] NA NA NA NA ...
## $ two_point_conv_result : chr [1:47480] NA NA NA NA ...
## $ home_timeouts_remaining : num [1:47480] 3 3 3 3 3 3 3 3 3 3 ...
## $ away_timeouts_remaining : num [1:47480] 3 3 3 3 3 3 3 3 3 3 ...
## $ timeout : num [1:47480] 0 0 0 0 0 0 0 0 0 0 ...
## $ timeout_team : chr [1:47480] NA NA NA NA ...
## $ td_team : chr [1:47480] NA NA NA NA ...
## $ posteam_timeouts_remaining : num [1:47480] 3 3 3 3 3 3 3 3 3 3 ...
## $ defteam_timeouts_remaining : num [1:47480] 3 3 3 3 3 3 3 3 3 3 ...
## $ total_home_score : num [1:47480] 0 0 0 0 0 0 0 0 0 0 ...
## $ total_away_score : num [1:47480] 0 0 0 0 0 0 0 0 0 0 ...
## $ posteam_score : num [1:47480] NA 0 0 0 0 0 0 0 0 0 ...
## $ defteam_score : num [1:47480] NA 0 0 0 0 0 0 0 0 0 ...
## $ score_differential : num [1:47480] NA 0 0 0 0 0 0 0 0 0 ...
## $ posteam_score_post : num [1:47480] 0 0 0 0 0 0 0 0 0 0 ...
## $ defteam_score_post : num [1:47480] 0 0 0 0 0 0 0 0 0 0 ...
## $ score_differential_post : num [1:47480] 0 0 0 0 0 0 0 0 0 0 ...
## $ no_score_prob : num [1:47480] 0.00137 0.00137 0.00157 0.00186 0.00211 ...
## $ opp_fg_prob : num [1:47480] 0.163 0.163 0.188 0.226 0.327 ...
## $ opp_safety_prob : num [1:47480] 0.00444 0.00444 0.0057 0.00664 0.01096 ...
## $ opp_td_prob : num [1:47480] 0.254 0.254 0.295 0.35 0.467 ...
## $ fg_prob : num [1:47480] 0.2331 0.2331 0.2131 0.1712 0.0545 ...
## $ safety_prob : num [1:47480] 0.00366 0.00366 0.00398 0.00453 0.00429 ...
## $ td_prob : num [1:47480] 0.341 0.341 0.292 0.24 0.134 ...
## $ extra_point_prob : num [1:47480] 0 0 0 0 0 0 0 0 0 0 ...
## $ two_point_conversion_prob : num [1:47480] 0 0 0 0 0 0 0 0 0 0 ...
## $ ep : num [1:47480] 0.815 0.815 0.0506 -0.9371 -3.1584 ...
## $ epa : num [1:47480] 0 -0.764 -0.988 -2.221 0.715 ...
## $ total_home_epa : num [1:47480] 0 0.764 1.752 3.973 3.259 ...
## $ total_away_epa : num [1:47480] 0 -0.764 -1.752 -3.973 -3.259 ...
## $ total_home_rush_epa : num [1:47480] 0 0.764 0.764 0.764 0.764 ...
## $ total_away_rush_epa : num [1:47480] 0 -0.764 -0.764 -0.764 -0.764 ...
## $ total_home_pass_epa : num [1:47480] 0 0 0.988 3.209 3.209 ...
## $ total_away_pass_epa : num [1:47480] 0 0 -0.988 -3.209 -3.209 ...
## $ air_epa : num [1:47480] NA NA -1.1 NA NA ...
## $ yac_epa : num [1:47480] NA NA 0.107 NA NA ...
## $ comp_air_epa : num [1:47480] 0 0 -1.1 0 0 ...
## $ comp_yac_epa : num [1:47480] 0 0 0.107 0 0 ...
## $ total_home_comp_air_epa : num [1:47480] 0 0 1.1 1.1 1.1 ...
## $ total_away_comp_air_epa : num [1:47480] 0 0 -1.1 -1.1 -1.1 ...
## $ total_home_comp_yac_epa : num [1:47480] 0 0 -0.107 -0.107 -0.107 ...
## $ total_away_comp_yac_epa : num [1:47480] 0 0 0.107 0.107 0.107 ...
## $ total_home_raw_air_epa : num [1:47480] 0 0 1.1 1.1 1.1 ...
## $ total_away_raw_air_epa : num [1:47480] 0 0 -1.1 -1.1 -1.1 ...
## $ total_home_raw_yac_epa : num [1:47480] 0 0 -0.107 -0.107 -0.107 ...
## $ total_away_raw_yac_epa : num [1:47480] 0 0 0.107 0.107 0.107 ...
## $ wp : num [1:47480] NA 0.5 0.479 0.453 0.386 ...
## $ def_wp : num [1:47480] NA 0.5 0.521 0.547 0.614 ...
## $ home_wp : num [1:47480] NA 0.5 0.521 0.547 0.614 ...
## $ away_wp : num [1:47480] NA 0.5 0.479 0.453 0.386 ...
## $ wpa : num [1:47480] NA -0.0207 -0.0261 -0.0669 0.0576 ...
## $ home_wp_post : num [1:47480] NA 0.521 0.547 0.614 0.556 ...
## $ away_wp_post : num [1:47480] NA 0.479 0.453 0.386 0.444 ...
## $ total_home_rush_wpa : num [1:47480] 0 0.0207 0.0207 0.0207 0.0207 ...
## $ total_away_rush_wpa : num [1:47480] 0 -0.0207 -0.0207 -0.0207 -0.0207 ...
## $ total_home_pass_wpa : num [1:47480] 0 0 0.0261 0.093 0.093 ...
## $ total_away_pass_wpa : num [1:47480] 0 0 -0.0261 -0.093 -0.093 ...
## $ air_wpa : num [1:47480] NA NA -0.0316 NA NA ...
## [list output truncated]
Here I will manipulate the data by creating some new columns that I plan on using later in the analysis. First, I convert the play type variable to a factor, so that it is easier to use. I create a column called good_play. In this data set, a win probability function is included, which I will use. I classify a good play as 1 when a team’s win probability increases or stays the same because of a particular play and 0 when the win probability decreases after the play. I create a column that is 1 if the play type is a passing play and 0 if it is anything else. I create a column called yardage that has 3 possible values: long, medium, and short. If there are 8 yards to go until the 1st down marker, then yardage is long. If there’s 4-7 yards to go, then medium yardage. If less than 4 yards to go, then short yardage.
pbp$play_type = as.factor(pbp$play_type)
pbp$good_play = ifelse(pbp$wpa >= 0, 1, 0)
pbp$pass = ifelse(pbp$play_type == "pass", 1, 0)
pbp$yardage = case_when(pbp$ydstogo >= 8 ~ "long",
pbp$ydstogo >= 4 ~ "medium",
TRUE ~"short")
In this, I will create some different types of visualizations to see if I can find anything meaningful or interesting. These graphs can provide to be useful when communicating with those who lack the jargon that goes along with analytics and statistics.
Noteworthy: Quarter 5 means Overtime
## Bar Chart that counts number of plays of different play types
pbp %>%
ggplot(aes(x= play_type)) + geom_bar()
## Graph of average yards gained per play in each quarter
pbp %>% group_by(qtr) %>% summarise(mean_yards_gained = mean(yards_gained, na.rm = TRUE)) %>%
ggplot(aes(x= qtr, y = mean_yards_gained)) + geom_bar(stat = "identity") +
labs(title = "Mean yards gained per play") +
geom_hline(yintercept = mean(pbp$yards_gained, na.rm = TRUE), color = "blue") +
annotate("text", label = "Avg yards gained per play", x = 3.75, y = 4.2)
## Distribution of play types by quarter
pbp %>%
ggplot(aes(x= qtr, fill = play_type)) + geom_bar() + facet_wrap(~play_type)
## Graph of pass vs run plays broken down by down
pbp %>%
filter(play_type == "pass" | play_type == "run") %>%
ggplot(aes(x= down, fill = play_type)) + geom_bar(position="dodge")
## Warning: Removed 116 rows containing non-finite values (stat_count).
## Graph showing number of good plays broken down by pass vs run plays
pbp %>%
filter(play_type == "pass" | play_type == "run") %>%
ggplot(aes(x= play_type, fill = good_play)) + geom_bar(position="dodge") +
labs(title = "Number of good plays by play type")
## Graph of proportion of good plays grouped by down & play type
pbp %>%
filter(play_type == "pass" | play_type == "run", is.na(good_play) == FALSE, down != 5) %>%
group_by(down, play_type) %>%
summarise(p = sum(good_play) / length(good_play)) %>%
ggplot(aes(x = down, y = p, fill = play_type)) +
geom_bar(stat= "identity", position = "dodge") +
labs(title = "Proportion of Good Plays")
## Graph of proportion of good plays grouped by down, play type, and yardage
pbp %>%
filter(play_type == "pass" | play_type == "run", is.na(good_play) == FALSE, down != 5) %>%
group_by(yardage, play_type, down) %>%
summarise(p = sum(good_play) / length(good_play)) %>%
ggplot(aes(x= yardage, y = p, fill = play_type)) +
geom_bar(stat = "identity", position = "dodge") +
facet_grid(~down) +
labs(title = "Percentage of good plays based on down, play type, and yardage",
subtitle = "8+ yards = long, 4-7 yards = medium, < 4 = short")
-Pass play is most common play -Average yards per play is higher in the 1st and 3rd quarter of games -Larger number of pass plays happen in the 2nd and 4th quarters -Number of run plays increases slighlty in the 4th quarter. I believe this is due to the fact that teams that have a lead at the end of games will run the ball to burn the clock -Larger number of fields goals occur in the 2nd quarter compared to the other quarters -More run than pass plays occur on 1st down, but more pass than run plays occur on 2nd, 3rd, and 4th down. I believe this happens because teams that don’t get a decent amount of yards on 1st down are forced to “catch up” and pass plays give the best shot at accomplishing that -There is a significantly more larger number of “good” pass plays than “good” run plays according to my definition of good plays -Higher proportion of good plays on 1st down are pass plays. Pass and run plays have similar proportion of good plays on 2nd down. Higher proportion of good plays on 3rd and 4th down are run plays, which I found to be interesting. -Based on the last visualization, we can see the proportion of good plays based on play type, yardage, and down. This could be very useful in identifying what type of play to call in certain scenarios. -Higher proportion of good plays on 1st down and 10 are pass plays.
After exploring the data a bit, I became curious about whether there is a difference in the proportion of passing plays between the 1st and 2nd half of the game. To do this, I will complete a paired t-test that measures the difference of each team’s 1st and 2nd half proportion.
Null Hypothesis: HO: D = 0; Half has no effect on the proportion of passing plays Alternative Hypothesis: Ha: D != 0; Half does have an effect on the proportion of passing plays
team_quarter_pct_passing_first_half = pbp %>%
filter(qtr == 1 | qtr == 2) %>%
group_by(posteam) %>%
filter(play_type != "no_play", is.na(play_type) != TRUE) %>%
summarise(pct_passing = sum(pass) / length(pass))
team_quarter_pct_passing_second_half = pbp %>%
filter(qtr == 3 | qtr == 4) %>%
group_by(posteam) %>%
filter(play_type != "no_play", is.na(play_type) != TRUE) %>%
summarise(pct_passing = sum(pass) / length(pass))
t.test(team_quarter_pct_passing_first_half$pct_passing, team_quarter_pct_passing_second_half$pct_passing,
paired = TRUE, alternative = "two.sided")
##
## Paired t-test
##
## data: team_quarter_pct_passing_first_half$pct_passing and team_quarter_pct_passing_second_half$pct_passing
## t = 0.038568, df = 31, p-value = 0.9695
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.01793267 0.01862397
## sample estimates:
## mean of the differences
## 0.0003456508
Based on the results of the paired t test, there is no evidence to suggest that the proportion of passing plays changes based on which half it is. The p-value os 0.9695 is extremely high. The mean difference was 0.0003456508, which is very close to 0. The 95% Confidence interval was (-0.01793267, 0.01862397). Based on these, I believe that it’s safe to say that the true mean difference is extremely close to 0.
I plan to attempt to predict if a play is a run or pass play based on different variables in the data set. I believe this could be a very useful tool for a defensive coordinator to utilize. If successful, a coordinator would be better equipped to call defensive audibles if analytics teams were able to quickly make predictions based on certain game scenarios.
## Building train and test data sets
classifier_data_set = pbp %>% filter(play_type == "pass" | play_type == "run", is.na(down) != TRUE)
n = length(classifier_data_set$play_id)
train_rows = sample(n, n/2)
train = classifier_data_set[train_rows,]
test = classifier_data_set[-train_rows,]
## Training the classifier
## pass is 1 if pass play, 0 if not
log_m1 = glm(pass ~ wp + ydstogo + down + game_seconds_remaining, data = train, family = "binomial")
summary(log_m1)
##
## Call:
## glm(formula = pass ~ wp + ydstogo + down + game_seconds_remaining,
## family = "binomial", data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.1794 -1.1085 0.6138 0.9925 2.0481
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.410e+00 8.753e-02 -16.111 <2e-16 ***
## wp -1.451e+00 5.991e-02 -24.223 <2e-16 ***
## ydstogo 1.307e-01 5.506e-03 23.738 <2e-16 ***
## down 8.454e-01 2.512e-02 33.653 <2e-16 ***
## game_seconds_remaining -3.833e-05 1.622e-05 -2.363 0.0181 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 22544 on 16658 degrees of freedom
## Residual deviance: 20345 on 16654 degrees of freedom
## AIC: 20355
##
## Number of Fisher Scoring iterations: 4
log_probs1 = predict(log_m1, newdata = test, type = "response")
log_p1 = ifelse(log_probs1 > 0.5, 1, 0)
c_matrix1 = table(log_p1 == test$pass)
c_matrix1
##
## FALSE TRUE
## 5555 11105
log_m2 = glm(pass ~ wp + ydstogo + down, data = train, family = "binomial")
summary(log_m2)
##
## Call:
## glm(formula = pass ~ wp + ydstogo + down, family = "binomial",
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.1898 -1.1160 0.6149 0.9934 2.0438
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.475871 0.082963 -17.79 <2e-16 ***
## wp -1.457404 0.059760 -24.39 <2e-16 ***
## ydstogo 0.130588 0.005504 23.73 <2e-16 ***
## down 0.846470 0.025112 33.71 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 22544 on 16658 degrees of freedom
## Residual deviance: 20350 on 16655 degrees of freedom
## AIC: 20358
##
## Number of Fisher Scoring iterations: 4
log_probs2 = predict(log_m2, newdata = test, type = "response")
log_p2 = ifelse(log_probs2 > 0.5, 1, 0)
c_matrix2 = table(log_p2 == test$pass)
c_matrix2
##
## FALSE TRUE
## 5542 11118
log_m3 = glm(pass ~ down + ydstogo + td_prob + fg_prob + posteam,
data = train, family = "binomial")
summary(log_m3)
##
## Call:
## glm(formula = pass ~ down + ydstogo + td_prob + fg_prob + posteam,
## family = "binomial", data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.8504 -1.1501 0.6466 1.0285 2.3226
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.171665 0.151287 -7.745 9.59e-15 ***
## down 0.692969 0.028768 24.088 < 2e-16 ***
## ydstogo 0.102853 0.006026 17.067 < 2e-16 ***
## td_prob -1.975575 0.184294 -10.720 < 2e-16 ***
## fg_prob 0.794389 0.218816 3.630 0.000283 ***
## posteamATL 0.387168 0.136329 2.840 0.004512 **
## posteamBAL -0.501561 0.131079 -3.826 0.000130 ***
## posteamBUF -0.300990 0.133956 -2.247 0.024644 *
## posteamCAR 0.247883 0.135569 1.828 0.067480 .
## posteamCHI -0.129506 0.135929 -0.953 0.340718
## posteamCIN 0.191128 0.136645 1.399 0.161896
## posteamCLE -0.031987 0.137336 -0.233 0.815829
## posteamDAL -0.038803 0.133195 -0.291 0.770801
## posteamDEN -0.132857 0.138453 -0.960 0.337264
## posteamDET -0.051230 0.137544 -0.372 0.709551
## posteamGB 0.006255 0.132904 0.047 0.962459
## posteamHOU 0.064134 0.133692 0.480 0.631427
## posteamIND -0.393967 0.138137 -2.852 0.004344 **
## posteamJAX 0.055943 0.137905 0.406 0.684990
## posteamKC 0.090154 0.131891 0.684 0.494258
## posteamLA 0.013295 0.135363 0.098 0.921761
## posteamLAC 0.134025 0.139422 0.961 0.336405
## posteamLV -0.206886 0.139874 -1.479 0.139119
## posteamMIA 0.189357 0.138511 1.367 0.171596
## posteamMIN -0.291013 0.132930 -2.189 0.028581 *
## posteamNE -0.058999 0.132651 -0.445 0.656488
## posteamNO 0.048591 0.135317 0.359 0.719527
## posteamNYG 0.256046 0.139959 1.829 0.067336 .
## posteamNYJ -0.084380 0.138465 -0.609 0.542264
## posteamPHI -0.003338 0.131615 -0.025 0.979766
## posteamPIT -0.128882 0.139780 -0.922 0.356509
## posteamSEA -0.118681 0.131191 -0.905 0.365656
## posteamSF -0.319360 0.131810 -2.423 0.015398 *
## posteamTB 0.066652 0.135748 0.491 0.623430
## posteamTEN -0.348151 0.133615 -2.606 0.009170 **
## posteamWAS -0.120366 0.141054 -0.853 0.393472
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 22544 on 16658 degrees of freedom
## Residual deviance: 20686 on 16623 degrees of freedom
## AIC: 20758
##
## Number of Fisher Scoring iterations: 4
log_probs3 = predict(log_m3, newdata = test, type = "response")
log_p3 = ifelse(log_probs3 > 0.5, 1, 0)
c_matrix3 = table(log_p3 == test$pass)
c_matrix3
##
## FALSE TRUE
## 5790 10870
best_logistic_model_prop = 11095 / (11095 + 5565)
best_logistic_model_prop
## [1] 0.6659664
library("pROC")
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
ROC = roc(test$pass, log_p2)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(ROC, col = "blue")
auc(ROC)
## Area under the curve: 0.643
The ROC curve graphs the true positive rate versus the false positive rate. A random machine learning algorithm would be be the line, y = x. The area under the curve would be 0.5 My machine learning algoritm has an area under the curve of 0.6378. That is not a very strong measure. This means my algorithm does not do a very good job at making predictions.
It would be interesting to see how my algorithm would do if there were other variables used that I do not have access to. I believe it would be beneficial for NFL teams to build similar machine learning algorithms that are trained on a data set that is more specific to the type of players and team they’d be facing. The algorithm might be better at predicting against certain QBs, for example like Patrick Mahomes or Lamar Jackson. Defensive coordinators could find this very useful as they’d be better equipped to make better decisions regarding defensive scheme, substitutions, and plays if they have can make good predictions of the next play type.
first_down = pbp %>% filter(down == 1) %>%
filter(play_type == "pass" | play_type == "run") %>%
mutate(pass = ifelse(play_type == "pass", 1, 0))
second_down = pbp %>% filter(down == 2) %>%
filter(play_type == "pass" | play_type == "run") %>%
mutate(pass = ifelse(play_type == "pass", 1, 0))
third_down = pbp %>% filter(down == 3) %>%
filter(play_type == "pass" | play_type == "run") %>%
mutate(pass = ifelse(play_type == "pass", 1, 0))
fourth_down = pbp %>% filter(down == 4) %>%
filter(play_type == "pass" | play_type == "run" | play_type == "field_goal" | play_type == "punt") %>%
mutate(pass = ifelse(play_type == "pass", 1, 0))
first_down$play_type = as.factor(first_down$play_type)
second_down$play_type = as.factor(second_down$play_type)
third_down$play_type = as.factor(third_down$play_type)
fourth_down$play_type = as.factor(fourth_down$play_type)
first_down_pct_passing = first_down %>%
group_by(posteam) %>%
summarize(pct_passing = sum(pass) / length(pass)) %>%
mutate("down" = 1)
second_down_pct_passing = second_down %>%
group_by(posteam) %>%
summarize(pct_passing = sum(pass)/ length(pass)) %>%
mutate("down" = 2)
third_down_pct_passing = third_down %>%
group_by(posteam) %>%
summarize(pct_passing = sum(pass)/ length(pass)) %>%
mutate("down" = 3)
fourth_down_pct_passing = fourth_down %>%
group_by(posteam) %>%
summarize(pct_passing = sum(pass)/ length(pass)) %>%
mutate("down" = 4)
fourth_down_pct_running = fourth_down %>%
mutate("run" = ifelse(play_type == "run",1,0)) %>%
group_by(posteam) %>%
summarize(pct_running = sum(run)/ length(run)) %>%
mutate("down" = 4)
fourth_down_aggressiveness = merge(fourth_down_pct_passing, fourth_down_pct_running, by = "posteam")
head(fourth_down_aggressiveness)
## posteam pct_passing down.x pct_running down.y
## 1 ARI 0.11475410 4 0.04918033 4
## 2 ATL 0.14678899 4 0.04587156 4
## 3 BAL 0.17171717 4 0.11111111 4
## 4 BUF 0.06766917 4 0.07518797 4
## 5 CAR 0.12698413 4 0.06349206 4
## 6 CHI 0.12598425 4 0.03937008 4
## Getting rid of team logos of old teams
logos = teams_colors_logos %>% select(team_abbr, team_logo_espn) %>%
filter(team_abbr != "LAR", team_abbr != "SD", team_abbr != "STL", team_abbr != "OAK")
## Visualizing fourth down aggressiveness of teams
ggplot(fourth_down_aggressiveness, aes(x= pct_passing, y = pct_running)) +
geom_image(aes(image = logos$team_logo_espn), size = 0.05, asp = 16 / 9) +
geom_hline(yintercept = mean(fourth_down_aggressiveness$pct_running)) +
geom_vline(xintercept = mean(fourth_down_aggressiveness$pct_passing)) +
annotate("text", x=.17, y=.1, label = "Aggressive on 4th down \n w/ pass & run") +
annotate("text", x=.07, y=.1, label = "Aggressive on 4th down \n w/ run") +
annotate("text", x=.05, y=.04, label = "Not Aggressive \n on 4th down \n w/ either pass or run") +
annotate("text", x=.18, y=.03, label = "Aggressive on 4th down \n w/ pass") +
labs(title = "Fourth Down Aggressiveness",
subtitle = "Percent of 4th down plays that were passing or running plays")
x1= merge(first_down_pct_passing, second_down_pct_passing, by = "posteam")
x2= merge(x1, third_down_pct_passing, by = "posteam")
x3= merge(x2, fourth_down_pct_passing, by = "posteam")
## Warning in merge.data.frame(x2, fourth_down_pct_passing, by = "posteam"): column
## names 'pct_passing.x', 'down.x', 'pct_passing.y', 'down.y' are duplicated in the
## result
colnames(x3) = c("Team", "first_down_passing", "Down1", "second_down_passing", "Down2 ",
"third_down_passing", "Down3", "fourth_down_passing", "Down4")
x3 = x3 %>% select(Team, first_down_passing, second_down_passing, third_down_passing, fourth_down_passing)
head(x3)
## Team first_down_passing second_down_passing third_down_passing
## 1 ARI 0.5183486 0.5921450 0.8256410
## 2 ATL 0.5855131 0.7014085 0.8398058
## 3 BAL 0.3894325 0.4792746 0.6237624
## 4 BUF 0.4916667 0.5231214 0.7260870
## 5 CAR 0.5427975 0.6592179 0.8454106
## 6 CHI 0.5186916 0.6023055 0.8139535
## fourth_down_passing
## 1 0.11475410
## 2 0.14678899
## 3 0.17171717
## 4 0.06766917
## 5 0.12698413
## 6 0.12598425
g1= ggplot(data = x3) + geom_histogram(aes(x = first_down_passing), binwidth = .01) +
geom_vline(xintercept = mean(first_down_pct_passing$pct_passing), col = "blue", size = 1) +
scale_x_continuous(limits = c(0,1))
g2= ggplot(data = x3) + geom_histogram(aes(x = second_down_passing), binwidth = .01) +
geom_vline(xintercept = mean(second_down_pct_passing$pct_passing), col = "blue", size = 1) +
scale_x_continuous(limits = c(0,1))
g3= ggplot(data = x3) + geom_histogram(aes(x = third_down_passing), binwidth = .01) +
geom_vline(xintercept = mean(third_down_pct_passing$pct_passing), col = "blue", size = 1)+
scale_x_continuous(limits = c(0,1))
g4= ggplot(data = x3) + geom_histogram(aes(x = fourth_down_passing), binwidth = .01) +
geom_vline(xintercept = mean(fourth_down_pct_passing$pct_passing), col = "blue", size = 1)+
scale_x_continuous(limits = c(0,1))
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
grid.arrange(g1,g2,g3,g4, nrow=2)
## Warning: Removed 2 rows containing missing values (geom_bar).
## Warning: Removed 2 rows containing missing values (geom_bar).
## Warning: Removed 2 rows containing missing values (geom_bar).
## Warning: Removed 2 rows containing missing values (geom_bar).