suppressMessages(library(tidyverse))
suppressMessages(library(tidymodels))
suppressMessages(library(readxl))


attendance <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-02-04/attendance.csv")
standings <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-02-04/standings.csv")

attendance_joined <- attendance %>%
  left_join(standings,
    by = c("year", "team_name", "team")
  )

attendance_joined
## # A tibble: 10,846 x 20
##    team  team_name  year  total   home   away  week weekly_attendan~  wins  loss
##    <chr> <chr>     <dbl>  <dbl>  <dbl>  <dbl> <dbl>            <dbl> <dbl> <dbl>
##  1 Ariz~ Cardinals  2000 893926 387475 506451     1            77434     3    13
##  2 Ariz~ Cardinals  2000 893926 387475 506451     2            66009     3    13
##  3 Ariz~ Cardinals  2000 893926 387475 506451     3               NA     3    13
##  4 Ariz~ Cardinals  2000 893926 387475 506451     4            71801     3    13
##  5 Ariz~ Cardinals  2000 893926 387475 506451     5            66985     3    13
##  6 Ariz~ Cardinals  2000 893926 387475 506451     6            44296     3    13
##  7 Ariz~ Cardinals  2000 893926 387475 506451     7            38293     3    13
##  8 Ariz~ Cardinals  2000 893926 387475 506451     8            62981     3    13
##  9 Ariz~ Cardinals  2000 893926 387475 506451     9            35286     3    13
## 10 Ariz~ Cardinals  2000 893926 387475 506451    10            52244     3    13
## # ... with 10,836 more rows, and 10 more variables: points_for <dbl>,
## #   points_against <dbl>, points_differential <dbl>, margin_of_victory <dbl>,
## #   strength_of_schedule <dbl>, simple_rating <dbl>, offensive_ranking <dbl>,
## #   defensive_ranking <dbl>, playoffs <chr>, sb_winner <chr>
theme_set(theme_light())

attendance_joined %>%
  filter(!is.na(weekly_attendance)) %>%
  ggplot(aes(fct_reorder(team_name, weekly_attendance),
    weekly_attendance,
    fill = playoffs
  )) +
  geom_boxplot(outlier.alpha = 0.5) +
  coord_flip() +
  labs(
    fill = NULL, x = NULL,
    y = "Weekly NFL game attendance"
  )

attendance_joined %>%
  distinct(team_name, year, margin_of_victory, playoffs) %>%
  ggplot(aes(margin_of_victory, fill = playoffs)) +
  geom_histogram(position = "identity", alpha = 0.7) +
  labs(
    x = "Margin of victory",
    y = "Number of teams",
    fill = NULL
  )

attendance_joined %>%
  mutate(week = factor(week)) %>%
  ggplot(aes(week, weekly_attendance, fill = week)) +
  geom_boxplot(show.legend = FALSE, outlier.alpha = 0.5) +
  labs(
    x = "Week of NFL season",
    y = "Weekly NFL game attendance"
  )

attendance_df <- attendance_joined %>%
  filter(!is.na(weekly_attendance)) %>%
  select(
    weekly_attendance, team_name, year, week,
    margin_of_victory, strength_of_schedule, playoffs
  )

attendance_df
## # A tibble: 10,208 x 7
##    weekly_attendance team_name  year  week margin_of_victory strength_of_schedu~
##                <dbl> <chr>     <dbl> <dbl>             <dbl>               <dbl>
##  1             77434 Cardinals  2000     1             -14.6                -0.7
##  2             66009 Cardinals  2000     2             -14.6                -0.7
##  3             71801 Cardinals  2000     4             -14.6                -0.7
##  4             66985 Cardinals  2000     5             -14.6                -0.7
##  5             44296 Cardinals  2000     6             -14.6                -0.7
##  6             38293 Cardinals  2000     7             -14.6                -0.7
##  7             62981 Cardinals  2000     8             -14.6                -0.7
##  8             35286 Cardinals  2000     9             -14.6                -0.7
##  9             52244 Cardinals  2000    10             -14.6                -0.7
## 10             64223 Cardinals  2000    11             -14.6                -0.7
## # ... with 10,198 more rows, and 1 more variable: playoffs <chr>
set.seed(1234)
attendance_split <- attendance_df %>%
  initial_split(strata = playoffs)

nfl_train <- training(attendance_split)
nfl_test <- testing(attendance_split)
lm_spec <- linear_reg() %>%
               set_engine(engine = "lm")


lm_fit <- lm_spec %>%
               fit(weekly_attendance ~ .,
                   data = nfl_train)

rf_spec <- rand_forest(mode = "regression") %>%
               set_engine("ranger")

rf_fit <- rf_spec %>%
               fit(weekly_attendance ~ ., 
                   data =nfl_train)
results_train <- lm_fit %>%
    predict(new_data = nfl_train) %>%
    mutate(
        truth = nfl_train$weekly_attendance,
        model = "lm"
    )%>%
    bind_rows(rf_fit %>%
                  predict(new_data = nfl_train) %>%
                  mutate(
                      truth = nfl_train$weekly_attendance,
                      model = "rf"
                  ))


results_test <- results_test <- lm_fit %>%
  predict(new_data = nfl_test) %>%
  mutate(
    truth = nfl_test$weekly_attendance,
    model = "lm"
  ) %>%
  bind_rows(rf_fit %>%
    predict(new_data = nfl_test) %>%
    mutate(
      truth = nfl_test$weekly_attendance,
      model = "rf"
    ))
results_train %>%
  group_by(model) %>%
  rmse(truth = truth, estimate = .pred)
## # A tibble: 2 x 4
##   model .metric .estimator .estimate
##   <chr> <chr>   <chr>          <dbl>
## 1 lm    rmse    standard       8307.
## 2 rf    rmse    standard       6106.
results_test %>%
  group_by(model) %>%
  rmse(truth = truth, estimate = .pred)
## # A tibble: 2 x 4
##   model .metric .estimator .estimate
##   <chr> <chr>   <chr>          <dbl>
## 1 lm    rmse    standard       8351.
## 2 rf    rmse    standard       8573.
results_test %>%
  mutate(train = "testing") %>%
  bind_rows(results_train %>%
    mutate(train = "training")) %>%
  ggplot(aes(truth, .pred, color = model)) +
  geom_abline(lty = 2, color = "gray80", size = 1.5) +
  geom_point(alpha = 0.5) +
  facet_wrap(~train) +
  labs(
    x = "Truth",
    y = "Predicted attendance",
    color = "Type of model"
  )

set.seed(1234)
nfl_folds <- vfold_cv(nfl_train, strata = playoffs)

workflow <-  workflow() %>%
               add_model(rf_spec) %>%
               add_formula(weekly_attendance ~ .)

rf_res <- fit_resamples(
  workflow,
  nfl_folds,
  control = control_resamples(save_pred = TRUE)
)


rf_res %>% collect_predictions()
## # A tibble: 7,656 x 5
##    id      .pred  .row weekly_attendance .config             
##    <chr>   <dbl> <int>             <dbl> <chr>               
##  1 Fold01 66867.    28             73018 Preprocessor1_Model1
##  2 Fold01 68662.    44             65546 Preprocessor1_Model1
##  3 Fold01 67531.    57             64208 Preprocessor1_Model1
##  4 Fold01 64676.    62             65569 Preprocessor1_Model1
##  5 Fold01 65974.    65             66944 Preprocessor1_Model1
##  6 Fold01 66380.    67             65553 Preprocessor1_Model1
##  7 Fold01 67740.    73             71957 Preprocessor1_Model1
##  8 Fold01 61808.    83             50289 Preprocessor1_Model1
##  9 Fold01 68118.    89             73018 Preprocessor1_Model1
## 10 Fold01 65882.    95             51262 Preprocessor1_Model1
## # ... with 7,646 more rows
rf_res %>%
  unnest(.predictions) %>%
  ggplot(aes(weekly_attendance, .pred, color = id)) +
  geom_abline(lty = 2, color = "gray80", size = 1.5) +
  geom_point(alpha = 0.5) +
  labs(
    x = "Truth",
    y = "Predicted game attendance",
    color = NULL
  )