Explore Data
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.2.3
## Warning: package 'ggplot2' was built under R version 4.2.3
## Warning: package 'tibble' was built under R version 4.2.3
## Warning: package 'tidyr' was built under R version 4.2.3
## Warning: package 'purrr' was built under R version 4.2.3
## Warning: package 'dplyr' was built under R version 4.2.3
pumpkins_raw <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-10-19/pumpkins.csv")
pumpkins <-
pumpkins_raw %>%
separate(id, into = c("year", "type")) %>%
mutate(across(c(year, weight_lbs, ott, place), parse_number)) %>%
filter(type == "P") %>%
select(weight_lbs, year, place, ott, gpc_site, country)
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `across(c(year, weight_lbs, ott, place), parse_number)`.
## Caused by warning:
## ! 2327 parsing failures.
## row col expected actual
## 13 -- a number EXH
## 36 -- a number EXH
## 58 -- a number EXH
## 60 -- a number EXH
## 61 -- a number EXH
## ... ... ........ ......
## See problems(...) for more details.
pumpkins
## # A tibble: 15,965 × 6
## weight_lbs year place ott gpc_site country
## <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 2032 2013 1 475 Uesugi Farms Weigh-off United…
## 2 1985 2013 2 453 Safeway World Championship Pumpkin Weig… United…
## 3 1894 2013 3 445 Safeway World Championship Pumpkin Weig… United…
## 4 1874. 2013 4 436 Elk Grove Giant Pumpkin Festival United…
## 5 1813 2013 5 430 The Great Howard Dill Giant Pumpkin Cla… Canada
## 6 1791 2013 6 431 Elk Grove Giant Pumpkin Festival United…
## 7 1784 2013 7 445 Uesugi Farms Weigh-off United…
## 8 1784. 2013 8 434 Stillwater Harvestfest United…
## 9 1780. 2013 9 422 Stillwater Harvestfest United…
## 10 1766. 2013 10 425 Durham Fair Weigh-Off United…
## # ℹ 15,955 more rows
pumpkins %>%
filter(ott > 20, ott < 1e3) %>%
ggplot(aes(ott, weight_lbs, color = place)) +
geom_point(alpha = 0.2, size = 1.1) +
labs(x = "over-the-top inches", y = "weight (lbs)") +
scale_color_viridis_c()

pumpkins %>%
filter(ott > 20, ott < 1e3) %>%
ggplot(aes(ott, weight_lbs)) +
geom_point(alpha = 0.2, size = 1.1, color = "gray60") +
geom_smooth(aes(color = factor(year)),
method = lm, formula = y ~ splines::bs(x, 3),
se = FALSE, linewidth = 1.5, alpha = 0.6
) +
labs(x = "over-the-top inches", y = "weight (lbs)", color = NULL) +
scale_color_viridis_d()

pumpkins %>%
mutate(
country = fct_lump(country, n = 10),
country = fct_reorder(country, weight_lbs)
) %>%
ggplot(aes(country, weight_lbs, color = country)) +
geom_boxplot(outlier.colour = NA) +
geom_jitter(alpha = 0.1, width = 0.15) +
labs(x = NULL, y = "weight (lbs)") +
theme(legend.position = "none")

Build and Fit a Workflow Set
library(tidymodels)
## Warning: package 'tidymodels' was built under R version 4.2.3
## Warning: package 'broom' was built under R version 4.2.3
## Warning: package 'dials' was built under R version 4.2.3
## Warning: package 'infer' was built under R version 4.2.3
## Warning: package 'modeldata' was built under R version 4.2.3
## Warning: package 'parsnip' was built under R version 4.2.3
## Warning: package 'recipes' was built under R version 4.2.3
## Warning: package 'rsample' was built under R version 4.2.3
## Warning: package 'tune' was built under R version 4.2.3
## Warning: package 'workflows' was built under R version 4.2.3
## Warning: package 'workflowsets' was built under R version 4.2.3
## Warning: package 'yardstick' was built under R version 4.2.3
set.seed(123)
pumpkin_split <- pumpkins %>%
filter(ott > 20, ott < 1e3) %>%
initial_split(strata = weight_lbs)
pumpkin_train <- training(pumpkin_split)
pumpkin_test <- testing(pumpkin_split)
set.seed(234)
pumpkin_folds <- vfold_cv(pumpkin_train, strata = weight_lbs)
pumpkin_folds
## # 10-fold cross-validation using stratification
## # A tibble: 10 × 2
## splits id
## <list> <chr>
## 1 <split [8954/996]> Fold01
## 2 <split [8954/996]> Fold02
## 3 <split [8954/996]> Fold03
## 4 <split [8954/996]> Fold04
## 5 <split [8954/996]> Fold05
## 6 <split [8954/996]> Fold06
## 7 <split [8955/995]> Fold07
## 8 <split [8956/994]> Fold08
## 9 <split [8957/993]> Fold09
## 10 <split [8958/992]> Fold10
base_rec <-
recipe(weight_lbs ~ ott + year + country + gpc_site,
data = pumpkin_train
) %>%
step_other(country, gpc_site, threshold = 0.02)
ind_rec <-
base_rec %>%
step_dummy(all_nominal_predictors())
spline_rec <-
ind_rec %>%
step_bs(ott)
rf_spec <-
rand_forest(trees = 1e3) %>%
set_mode("regression") %>%
set_engine("ranger")
mars_spec <-
mars() %>%
set_mode("regression") %>%
set_engine("earth")
lm_spec <- linear_reg()
pumpkin_set <-
workflow_set(
list(base_rec, ind_rec, spline_rec),
list(rf_spec, mars_spec, lm_spec),
cross = FALSE
)
pumpkin_set
## # A workflow set/tibble: 3 × 4
## wflow_id info option result
## <chr> <list> <list> <list>
## 1 recipe_1_rand_forest <tibble [1 × 4]> <opts[0]> <list [0]>
## 2 recipe_2_mars <tibble [1 × 4]> <opts[0]> <list [0]>
## 3 recipe_3_linear_reg <tibble [1 × 4]> <opts[0]> <list [0]>
doParallel::registerDoParallel()
set.seed(2021)
pumpkin_rs <-
workflow_map(
pumpkin_set,
"fit_resamples",
resamples = pumpkin_folds
)
pumpkin_rs
## # A workflow set/tibble: 3 × 4
## wflow_id info option result
## <chr> <list> <list> <list>
## 1 recipe_1_rand_forest <tibble [1 × 4]> <opts[1]> <rsmp[+]>
## 2 recipe_2_mars <tibble [1 × 4]> <opts[1]> <rsmp[+]>
## 3 recipe_3_linear_reg <tibble [1 × 4]> <opts[1]> <rsmp[+]>
Evaluate Workflow Set
autoplot(pumpkin_rs)

collect_metrics(pumpkin_rs)
## # A tibble: 6 × 9
## wflow_id .config preproc model .metric .estimator mean n std_err
## <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <int> <dbl>
## 1 recipe_1_rand_f… Prepro… recipe rand… rmse standard 86.2 10 1.05e+0
## 2 recipe_1_rand_f… Prepro… recipe rand… rsq standard 0.968 10 9.53e-4
## 3 recipe_2_mars Prepro… recipe mars rmse standard 83.8 10 1.92e+0
## 4 recipe_2_mars Prepro… recipe mars rsq standard 0.969 10 1.67e-3
## 5 recipe_3_linear… Prepro… recipe line… rmse standard 82.4 10 2.27e+0
## 6 recipe_3_linear… Prepro… recipe line… rsq standard 0.970 10 1.97e-3
final_fit <-
extract_workflow(pumpkin_rs, "recipe_3_linear_reg") %>%
fit(pumpkin_train)
tidy(final_fit)
## # A tibble: 15 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -9731. 675. -14.4 1.30e- 46
## 2 year 4.89 0.334 14.6 5.03e- 48
## 3 country_Canada 9.29 6.12 1.52 1.29e- 1
## 4 country_Germany -11.5 6.68 -1.71 8.64e- 2
## 5 country_Italy 8.12 7.02 1.16 2.47e- 1
## 6 country_United.States 11.9 5.66 2.11 3.53e- 2
## 7 country_other -10.7 6.33 -1.69 9.13e- 2
## 8 gpc_site_Elk.Grove.Giant.Pumpkin.Fest… -7.81 7.70 -1.01 3.10e- 1
## 9 gpc_site_Ohio.Valley.Giant.Pumpkin.Gr… 21.1 7.80 2.70 6.89e- 3
## 10 gpc_site_Stillwater.Harvestfest 11.6 7.87 1.48 1.40e- 1
## 11 gpc_site_Wiegemeisterschaft.Berlin.Br… 1.51 8.07 0.187 8.51e- 1
## 12 gpc_site_other 1.41 5.60 0.251 8.02e- 1
## 13 ott_bs_1 -345. 36.3 -9.50 2.49e- 21
## 14 ott_bs_2 450. 11.9 37.9 2.75e-293
## 15 ott_bs_3 2585. 25.6 101. 0