library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
pumpkins_raw <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-10-19/pumpkins.csv")
## Rows: 28065 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (14): id, place, weight_lbs, grower_name, city, state_prov, country, gpc...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
pumpkins <-
pumpkins_raw %>%
separate(id, into = c("year", "type")) %>%
filter(type == "P") %>%
mutate(across(c(year, weight_lbs, ott, place), parse_number)) %>%
select(weight_lbs, ott, year, place, gpc_site, country)
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `across(c(year, weight_lbs, ott, place), parse_number)`.
## Caused by warning:
## ! 1359 parsing failures.
## row col expected actual
## 39 -- a number EXH
## 105 -- a number EXH
## 209 -- a number EXH
## 217 -- a number EXH
## 223 -- a number EXH
## ... ... ........ ......
## See problems(...) for more details.
pumpkins %>%
filter(ott < 1e3, ott > 10) %>%
ggplot(aes(ott, weight_lbs, color = place)) +
geom_point(alpha = 0.1, size = 1.1) +
labs(x = "over-the-top-inches", y = "weight (lbs)") +
scale_color_viridis_c()
pumpkins %>%
filter(ott < 1e3, ott > 20) %>%
ggplot(aes(ott, weight_lbs,)) +
geom_point(alpha = 0.1, size = 1.1, color = "gray60") +
geom_smooth(aes(color = factor(year)),
method = "lm",
formula = y ~ splines::bs(x, 3),
se = FALSE, linewidth = 1.5, alpha = 0.6) +
labs(x = "over-the-top-inches",
y = "weight (lbs)", color = NULL) +
scale_color_viridis_d()
pumpkins %>%
mutate(country = fct_lump(country, n = 10),
country = fct_reorder(country, weight_lbs)) %>%
ggplot(aes(country, weight_lbs, color = country)) +
geom_boxplot(outlier.color = NA) +
geom_jitter(alpha = 0.1, width = 0.15) +
labs(x = NULL, y = "weight (lbs)") +
theme(legend.position = "none")
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom 1.0.5 ✔ rsample 1.2.0
## ✔ dials 1.2.0 ✔ tune 1.1.2
## ✔ infer 1.0.5 ✔ workflows 1.1.3
## ✔ modeldata 1.2.0 ✔ workflowsets 1.0.1
## ✔ parsnip 1.1.1 ✔ yardstick 1.2.0
## ✔ recipes 1.0.8
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Learn how to get started at https://www.tidymodels.org/start/
set.seed(123)
pumpkin_split <-
pumpkins %>%
filter(ott < 1e3, ott > 20) %>%
initial_split(strata = weight_lbs)
pumpkin_train <- training(pumpkin_split)
pumpkin_test <- testing(pumpkin_split)
set.seed(234)
pumpkin_folds <- vfold_cv(pumpkin_train, strata = weight_lbs)
pumpkin_folds
## # 10-fold cross-validation using stratification
## # A tibble: 10 × 2
## splits id
## <list> <chr>
## 1 <split [8954/996]> Fold01
## 2 <split [8954/996]> Fold02
## 3 <split [8954/996]> Fold03
## 4 <split [8954/996]> Fold04
## 5 <split [8954/996]> Fold05
## 6 <split [8954/996]> Fold06
## 7 <split [8955/995]> Fold07
## 8 <split [8956/994]> Fold08
## 9 <split [8957/993]> Fold09
## 10 <split [8958/992]> Fold10
base_rec <-
recipe(weight_lbs ~ ott + year + country + gpc_site,
data = pumpkin_train) %>%
step_other(country, gpc_site, threshold = 0.02)
ind_rec <-
base_rec %>%
step_dummy(all_nominal_predictors())
spline_rec <-
ind_rec %>%
step_bs(ott)
rf_spec <-
rand_forest(trees = 1e3) %>%
set_mode("regression") %>%
set_engine("ranger")
mars_spec <-
mars() %>%
set_mode("regression") %>%
set_engine("earth")
lm_spec <- linear_reg()
pumpkin_set <-
workflow_set(
list(base_rec, ind_rec, spline_rec),
list(rf_spec, mars_spec, lm_spec),
cross = FALSE
)
pumpkin_set
## # A workflow set/tibble: 3 × 4
## wflow_id info option result
## <chr> <list> <list> <list>
## 1 recipe_1_rand_forest <tibble [1 × 4]> <opts[0]> <list [0]>
## 2 recipe_2_mars <tibble [1 × 4]> <opts[0]> <list [0]>
## 3 recipe_3_linear_reg <tibble [1 × 4]> <opts[0]> <list [0]>
doParallel::registerDoParallel()
set.seed(2021)
pumpkin_rs <-
workflow_map(
pumpkin_set,
"fit_resamples",
resamples = pumpkin_folds
)
pumpkin_rs
## # A workflow set/tibble: 3 × 4
## wflow_id info option result
## <chr> <list> <list> <list>
## 1 recipe_1_rand_forest <tibble [1 × 4]> <opts[1]> <rsmp[+]>
## 2 recipe_2_mars <tibble [1 × 4]> <opts[1]> <rsmp[+]>
## 3 recipe_3_linear_reg <tibble [1 × 4]> <opts[1]> <rsmp[+]>
autoplot(pumpkin_rs)
collect_metrics(pumpkin_rs)
## # A tibble: 6 × 9
## wflow_id .config preproc model .metric .estimator mean n std_err
## <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <int> <dbl>
## 1 recipe_1_rand_f… Prepro… recipe rand… rmse standard 86.1 10 1.10e+0
## 2 recipe_1_rand_f… Prepro… recipe rand… rsq standard 0.969 10 9.97e-4
## 3 recipe_2_mars Prepro… recipe mars rmse standard 83.8 10 1.92e+0
## 4 recipe_2_mars Prepro… recipe mars rsq standard 0.969 10 1.67e-3
## 5 recipe_3_linear… Prepro… recipe line… rmse standard 82.4 10 2.27e+0
## 6 recipe_3_linear… Prepro… recipe line… rsq standard 0.970 10 1.97e-3
final_fit <-
extract_workflow(pumpkin_rs, "recipe_3_linear_reg") %>%
fit(pumpkin_train)
tidy(final_fit) %>%
arrange(-abs(estimate))
## # A tibble: 15 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -9731. 675. -14.4 1.30e- 46
## 2 ott_bs_3 2585. 25.6 101. 0
## 3 ott_bs_2 450. 11.9 37.9 2.75e-293
## 4 ott_bs_1 -345. 36.3 -9.50 2.49e- 21
## 5 gpc_site_Ohio.Valley.Giant.Pumpkin.Gr… 21.1 7.80 2.70 6.89e- 3
## 6 country_United.States 11.9 5.66 2.11 3.53e- 2
## 7 gpc_site_Stillwater.Harvestfest 11.6 7.87 1.48 1.40e- 1
## 8 country_Germany -11.5 6.68 -1.71 8.64e- 2
## 9 country_other -10.7 6.33 -1.69 9.13e- 2
## 10 country_Canada 9.29 6.12 1.52 1.29e- 1
## 11 country_Italy 8.12 7.02 1.16 2.47e- 1
## 12 gpc_site_Elk.Grove.Giant.Pumpkin.Fest… -7.81 7.70 -1.01 3.10e- 1
## 13 year 4.89 0.334 14.6 5.03e- 48
## 14 gpc_site_Wiegemeisterschaft.Berlin.Br… 1.51 8.07 0.187 8.51e- 1
## 15 gpc_site_other 1.41 5.60 0.251 8.02e- 1
#2. Data Exploration and Transformation: - The newly transformed data includes separating the id category into two subset categories, transforming numerous character data into numeric data, filtering to giant pumpkins only and selecting only key factors to help us answer the question.
- The three major pre-processing tools that are used in this file are step_other (which pools infrequently occurring data/values into their own category), step_dummy (which will create a set of binary dummy variables from a factor variable), and step_bs which (will create a specification of a recipe step that will create new columns that are basis expansions of variables using B-splines).
#4. Model Evaluation: - We used autoplot to compare the Linear regression model against the mars model and the random forest model. The findings show that all 3 options performed relatively similar, with linear_regression model seeming to do slightly better.