library(tidyverse)
pumpkins_raw <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-10-19/pumpkins.csv")
## Rows: 28065 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (14): id, place, weight_lbs, grower_name, city, state_prov, country, gpc...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
pumpkins <-
pumpkins_raw %>%
separate(id, into = c("year", "type")) %>%
mutate(across(c(year, weight_lbs, ott, place), parse_number)) %>%
filter(type == "P") %>%
select(weight_lbs, year, place, ott, gpc_site, country)
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `across(c(year, weight_lbs, ott, place), parse_number)`.
## Caused by warning:
## ! 2327 parsing failures.
## row col expected actual
## 13 -- a number EXH
## 36 -- a number EXH
## 58 -- a number EXH
## 60 -- a number EXH
## 61 -- a number EXH
## ... ... ........ ......
## See problems(...) for more details.
pumpkins
## # A tibble: 15,965 × 6
## weight_lbs year place ott gpc_site country
## <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 2032 2013 1 475 Uesugi Farms Weigh-off United…
## 2 1985 2013 2 453 Safeway World Championship Pumpkin Weig… United…
## 3 1894 2013 3 445 Safeway World Championship Pumpkin Weig… United…
## 4 1874. 2013 4 436 Elk Grove Giant Pumpkin Festival United…
## 5 1813 2013 5 430 The Great Howard Dill Giant Pumpkin Cla… Canada
## 6 1791 2013 6 431 Elk Grove Giant Pumpkin Festival United…
## 7 1784 2013 7 445 Uesugi Farms Weigh-off United…
## 8 1784. 2013 8 434 Stillwater Harvestfest United…
## 9 1780. 2013 9 422 Stillwater Harvestfest United…
## 10 1766. 2013 10 425 Durham Fair Weigh-Off United…
## # ℹ 15,955 more rows
pumpkins %>%
filter(ott > 20, ott < 1e3) %>%
ggplot(aes(ott, weight_lbs, color = place)) +
geom_point(alpha = 0.2, size = 1.1) +
labs(x = "over-the-top inches", y = "weight (lbs)") +
scale_color_viridis_c()
pumpkins %>%
filter(ott > 20, ott < 1e3) %>%
ggplot(aes(ott, weight_lbs)) +
geom_point(alpha = 0.2, size = 1.1, color = "gray60") +
geom_smooth(aes(color = factor(year)),
method = lm, formula = y ~ splines::bs(x, 3),
se = FALSE, size = 1.5, alpha = 0.6
) +
labs(x = "over-the-top inches", y = "weight (lbs)", color = NULL) +
scale_color_viridis_d()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
pumpkins %>%
mutate(
country = fct_lump(country, n = 10),
country = fct_reorder(country, weight_lbs)
) %>%
ggplot(aes(country, weight_lbs, color = country)) +
geom_boxplot(outlier.colour = NA) +
geom_jitter(alpha = 0.1, width = 0.15) +
labs(x = NULL, y = "weight (lbs)") +
theme(legend.position = "none")
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom 1.0.5 ✔ rsample 1.2.0
## ✔ dials 1.2.0 ✔ tune 1.1.2
## ✔ infer 1.0.4 ✔ workflows 1.1.3
## ✔ modeldata 1.2.0 ✔ workflowsets 1.0.1
## ✔ parsnip 1.1.1 ✔ yardstick 1.2.0
## ✔ recipes 1.0.8
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Learn how to get started at https://www.tidymodels.org/start/
set.seed(123)
pumpkin_split <- pumpkins %>%
filter(ott > 20, ott < 1e3) %>%
initial_split(strata = weight_lbs)
pumpkin_train <- training(pumpkin_split)
pumpkin_test <- testing(pumpkin_split)
set.seed(234)
pumpkin_folds <- vfold_cv(pumpkin_train, strata = weight_lbs)
pumpkin_folds
## # 10-fold cross-validation using stratification
## # A tibble: 10 × 2
## splits id
## <list> <chr>
## 1 <split [8954/996]> Fold01
## 2 <split [8954/996]> Fold02
## 3 <split [8954/996]> Fold03
## 4 <split [8954/996]> Fold04
## 5 <split [8954/996]> Fold05
## 6 <split [8954/996]> Fold06
## 7 <split [8955/995]> Fold07
## 8 <split [8956/994]> Fold08
## 9 <split [8957/993]> Fold09
## 10 <split [8958/992]> Fold10
base_rec <-
recipe(weight_lbs ~ ott + year + country + gpc_site,
data = pumpkin_train
) %>%
step_other(country, gpc_site, threshold = 0.02)
ind_rec <-
base_rec %>%
step_dummy(all_nominal_predictors())
spline_rec <-
ind_rec %>%
step_bs(ott)
rf_spec <-
rand_forest(trees = 1e3) %>%
set_mode("regression") %>%
set_engine("ranger")
mars_spec <-
mars() %>%
set_mode("regression") %>%
set_engine("earth")
lm_spec <- linear_reg()
pumpkin_set <-
workflow_set(
list(base_rec, ind_rec, spline_rec),
list(rf_spec, mars_spec, lm_spec),
cross = FALSE
)
pumpkin_set
## # A workflow set/tibble: 3 × 4
## wflow_id info option result
## <chr> <list> <list> <list>
## 1 recipe_1_rand_forest <tibble [1 × 4]> <opts[0]> <list [0]>
## 2 recipe_2_mars <tibble [1 × 4]> <opts[0]> <list [0]>
## 3 recipe_3_linear_reg <tibble [1 × 4]> <opts[0]> <list [0]>
doParallel::registerDoParallel()
set.seed(2021)
pumpkin_rs <-
workflow_map(
pumpkin_set,
"fit_resamples",
resamples = pumpkin_folds
)
pumpkin_rs
## # A workflow set/tibble: 3 × 4
## wflow_id info option result
## <chr> <list> <list> <list>
## 1 recipe_1_rand_forest <tibble [1 × 4]> <opts[1]> <rsmp[+]>
## 2 recipe_2_mars <tibble [1 × 4]> <opts[1]> <rsmp[+]>
## 3 recipe_3_linear_reg <tibble [1 × 4]> <opts[1]> <rsmp[+]>
autoplot(pumpkin_rs)
collect_metrics(pumpkin_rs)
## # A tibble: 6 × 9
## wflow_id .config preproc model .metric .estimator mean n std_err
## <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <int> <dbl>
## 1 recipe_1_rand_f… Prepro… recipe rand… rmse standard 86.1 10 1.10e+0
## 2 recipe_1_rand_f… Prepro… recipe rand… rsq standard 0.969 10 9.97e-4
## 3 recipe_2_mars Prepro… recipe mars rmse standard 83.8 10 1.92e+0
## 4 recipe_2_mars Prepro… recipe mars rsq standard 0.969 10 1.67e-3
## 5 recipe_3_linear… Prepro… recipe line… rmse standard 82.4 10 2.27e+0
## 6 recipe_3_linear… Prepro… recipe line… rsq standard 0.970 10 1.97e-3
final_fit <-
extract_workflow(pumpkin_rs, "recipe_3_linear_reg") %>%
fit(pumpkin_train)
tidy(final_fit) %>%
arrange(-abs(estimate))
## # A tibble: 15 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -9731. 675. -14.4 1.30e- 46
## 2 ott_bs_3 2585. 25.6 101. 0
## 3 ott_bs_2 450. 11.9 37.9 2.75e-293
## 4 ott_bs_1 -345. 36.3 -9.50 2.49e- 21
## 5 gpc_site_Ohio.Valley.Giant.Pumpkin.Gr… 21.1 7.80 2.70 6.89e- 3
## 6 country_United.States 11.9 5.66 2.11 3.53e- 2
## 7 gpc_site_Stillwater.Harvestfest 11.6 7.87 1.48 1.40e- 1
## 8 country_Germany -11.5 6.68 -1.71 8.64e- 2
## 9 country_other -10.7 6.33 -1.69 9.13e- 2
## 10 country_Canada 9.29 6.12 1.52 1.29e- 1
## 11 country_Italy 8.12 7.02 1.16 2.47e- 1
## 12 gpc_site_Elk.Grove.Giant.Pumpkin.Fest… -7.81 7.70 -1.01 3.10e- 1
## 13 year 4.89 0.334 14.6 5.03e- 48
## 14 gpc_site_Wiegemeisterschaft.Berlin.Br… 1.51 8.07 0.187 8.51e- 1
## 15 gpc_site_other 1.41 5.60 0.251 8.02e- 1
The data consists of 28065 observations and 14 variables. In our analysis we looked at a few variables such as the “ott”, the year and the country the pumpkin came from.
The primary dependent variable is weight_lbs, indicating the pumpkin’s weight in pounds. Among the predictors, ott (over-the-top inches) likely represents the pumpkin’s circumference, and is particularly emphasized for its relationship with weight. The year variable, extracted from the dataset’s ID, pinpoints each entry’s associated year. place likely designates a pumpkin’s rank or position in a competition. The data also incorporates categorical variables like gpc_site, reflecting various competition sites, and country, denoting the pumpkin’s origin. Jellvik’s study delves into visualizing these relationships, eventually leading to the creation and evaluation of multiple predictive models, including a random forest, MARS, and linear regression.
From the raw dataset, the id was split into year and type, with a focus retained on pumpkins of type “P”. Selected variables like weight_lbs, year, ott, gpc_site, and country were parsed into numeric formats or adjusted for better modeling relevance. The ott range was confined between 20 and 1e3, likely to address outliers. Additionally, the country category was modified to club less frequent entries together and reorder based on pumpkin weight. This refining process, including additional preprocessing like dummy variable creation, ensured the data was optimized for both visual and statistical interpretation.
In my pumpkin size analysis, I split the raw data’s id into year and type, retaining only type “P” pumpkins. I then refined variables such as weight_lbs, year, ott, gpc_site, and country. To manage outliers, I set constraints on ott. The country variable was adjusted to group less frequent entries, and I created dummy variables from nominal predictors to ease modeling.
In the analysis, the machine learning models used are Random Forest (rand_forest), Multivariate Adaptive Regression Splines (MARS or mars), and Linear Regression (linear_reg).
The metrics used for model evaluation are the Root Mean Squared Error (RMSE) and the R-squared (rsq). The RMSE measures the average magnitude of errors between predicted and observed values, while the R-squared metric indicates the proportion of variance in the dependent variable that’s predicted by the independent variables in the model.
From the analysis of the pumpkin dataset, three models were employed: Linear Regression, Random Forest, and MARS. Among them, the Linear Regression model stood out as the top performer, registering an RMSE of 82.4 and an impressive R-squared of 0.970. In comparison, both the Random Forest and MARS models closely followed, but the Linear Regression held a slight edge. The consistently high R-squared values across all models highlight the significant predictive potential in our dataset, suggesting that the chosen features effectively capture the patterns related to pumpkin sizes.