suppressMessages(library(tidyverse))
suppressMessages(library(tidymodels))
suppressMessages(library(readxl))
attendance <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-02-04/attendance.csv")
standings <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-02-04/standings.csv")
attendance_joined <- attendance %>%
left_join(standings,
by = c("year", "team_name", "team")
)
attendance_joined
## # A tibble: 10,846 x 20
## team team_name year total home away week weekly_attendan~ wins loss
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Ariz~ Cardinals 2000 893926 387475 506451 1 77434 3 13
## 2 Ariz~ Cardinals 2000 893926 387475 506451 2 66009 3 13
## 3 Ariz~ Cardinals 2000 893926 387475 506451 3 NA 3 13
## 4 Ariz~ Cardinals 2000 893926 387475 506451 4 71801 3 13
## 5 Ariz~ Cardinals 2000 893926 387475 506451 5 66985 3 13
## 6 Ariz~ Cardinals 2000 893926 387475 506451 6 44296 3 13
## 7 Ariz~ Cardinals 2000 893926 387475 506451 7 38293 3 13
## 8 Ariz~ Cardinals 2000 893926 387475 506451 8 62981 3 13
## 9 Ariz~ Cardinals 2000 893926 387475 506451 9 35286 3 13
## 10 Ariz~ Cardinals 2000 893926 387475 506451 10 52244 3 13
## # ... with 10,836 more rows, and 10 more variables: points_for <dbl>,
## # points_against <dbl>, points_differential <dbl>, margin_of_victory <dbl>,
## # strength_of_schedule <dbl>, simple_rating <dbl>, offensive_ranking <dbl>,
## # defensive_ranking <dbl>, playoffs <chr>, sb_winner <chr>
theme_set(theme_light())
attendance_joined %>%
filter(!is.na(weekly_attendance)) %>%
ggplot(aes(fct_reorder(team_name, weekly_attendance),
weekly_attendance,
fill = playoffs
)) +
geom_boxplot(outlier.alpha = 0.5) +
coord_flip() +
labs(
fill = NULL, x = NULL,
y = "Weekly NFL game attendance"
)

attendance_joined %>%
distinct(team_name, year, margin_of_victory, playoffs) %>%
ggplot(aes(margin_of_victory, fill = playoffs)) +
geom_histogram(position = "identity", alpha = 0.7) +
labs(
x = "Margin of victory",
y = "Number of teams",
fill = NULL
)

attendance_joined %>%
mutate(week = factor(week)) %>%
ggplot(aes(week, weekly_attendance, fill = week)) +
geom_boxplot(show.legend = FALSE, outlier.alpha = 0.5) +
labs(
x = "Week of NFL season",
y = "Weekly NFL game attendance"
)

attendance_df <- attendance_joined %>%
filter(!is.na(weekly_attendance)) %>%
select(
weekly_attendance, team_name, year, week,
margin_of_victory, strength_of_schedule, playoffs
)
attendance_df
## # A tibble: 10,208 x 7
## weekly_attendance team_name year week margin_of_victory strength_of_schedu~
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 77434 Cardinals 2000 1 -14.6 -0.7
## 2 66009 Cardinals 2000 2 -14.6 -0.7
## 3 71801 Cardinals 2000 4 -14.6 -0.7
## 4 66985 Cardinals 2000 5 -14.6 -0.7
## 5 44296 Cardinals 2000 6 -14.6 -0.7
## 6 38293 Cardinals 2000 7 -14.6 -0.7
## 7 62981 Cardinals 2000 8 -14.6 -0.7
## 8 35286 Cardinals 2000 9 -14.6 -0.7
## 9 52244 Cardinals 2000 10 -14.6 -0.7
## 10 64223 Cardinals 2000 11 -14.6 -0.7
## # ... with 10,198 more rows, and 1 more variable: playoffs <chr>
set.seed(1234)
attendance_split <- attendance_df %>%
initial_split(strata = playoffs)
nfl_train <- training(attendance_split)
nfl_test <- testing(attendance_split)
lm_spec <- linear_reg() %>%
set_engine(engine = "lm")
lm_fit <- lm_spec %>%
fit(weekly_attendance ~ .,
data = nfl_train)
rf_spec <- rand_forest(mode = "regression") %>%
set_engine("ranger")
rf_fit <- rf_spec %>%
fit(weekly_attendance ~ .,
data =nfl_train)
results_train <- lm_fit %>%
predict(new_data = nfl_train) %>%
mutate(
truth = nfl_train$weekly_attendance,
model = "lm"
)%>%
bind_rows(rf_fit %>%
predict(new_data = nfl_train) %>%
mutate(
truth = nfl_train$weekly_attendance,
model = "rf"
))
results_test <- results_test <- lm_fit %>%
predict(new_data = nfl_test) %>%
mutate(
truth = nfl_test$weekly_attendance,
model = "lm"
) %>%
bind_rows(rf_fit %>%
predict(new_data = nfl_test) %>%
mutate(
truth = nfl_test$weekly_attendance,
model = "rf"
))
results_train %>%
group_by(model) %>%
rmse(truth = truth, estimate = .pred)
## # A tibble: 2 x 4
## model .metric .estimator .estimate
## <chr> <chr> <chr> <dbl>
## 1 lm rmse standard 8307.
## 2 rf rmse standard 6106.
results_test %>%
group_by(model) %>%
rmse(truth = truth, estimate = .pred)
## # A tibble: 2 x 4
## model .metric .estimator .estimate
## <chr> <chr> <chr> <dbl>
## 1 lm rmse standard 8351.
## 2 rf rmse standard 8573.
results_test %>%
mutate(train = "testing") %>%
bind_rows(results_train %>%
mutate(train = "training")) %>%
ggplot(aes(truth, .pred, color = model)) +
geom_abline(lty = 2, color = "gray80", size = 1.5) +
geom_point(alpha = 0.5) +
facet_wrap(~train) +
labs(
x = "Truth",
y = "Predicted attendance",
color = "Type of model"
)

set.seed(1234)
nfl_folds <- vfold_cv(nfl_train, strata = playoffs)
workflow <- workflow() %>%
add_model(rf_spec) %>%
add_formula(weekly_attendance ~ .)
rf_res <- fit_resamples(
workflow,
nfl_folds,
control = control_resamples(save_pred = TRUE)
)
rf_res %>% collect_predictions()
## # A tibble: 7,656 x 5
## id .pred .row weekly_attendance .config
## <chr> <dbl> <int> <dbl> <chr>
## 1 Fold01 66867. 28 73018 Preprocessor1_Model1
## 2 Fold01 68662. 44 65546 Preprocessor1_Model1
## 3 Fold01 67531. 57 64208 Preprocessor1_Model1
## 4 Fold01 64676. 62 65569 Preprocessor1_Model1
## 5 Fold01 65974. 65 66944 Preprocessor1_Model1
## 6 Fold01 66380. 67 65553 Preprocessor1_Model1
## 7 Fold01 67740. 73 71957 Preprocessor1_Model1
## 8 Fold01 61808. 83 50289 Preprocessor1_Model1
## 9 Fold01 68118. 89 73018 Preprocessor1_Model1
## 10 Fold01 65882. 95 51262 Preprocessor1_Model1
## # ... with 7,646 more rows
rf_res %>%
unnest(.predictions) %>%
ggplot(aes(weekly_attendance, .pred, color = id)) +
geom_abline(lty = 2, color = "gray80", size = 1.5) +
geom_point(alpha = 0.5) +
labs(
x = "Truth",
y = "Predicted game attendance",
color = NULL
)
