Explore data

library(tidyverse)

pumpkins_raw <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-10-19/pumpkins.csv")
## Rows: 28065 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (14): id, place, weight_lbs, grower_name, city, state_prov, country, gpc...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
pumpkins <-
  pumpkins_raw %>%
  separate(id, into = c("year", "type")) %>%
  mutate(across(c(year, weight_lbs, ott, place), parse_number)) %>%
  filter(type == "P") %>%
  select(weight_lbs, year, place, ott, gpc_site, country)
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `across(c(year, weight_lbs, ott, place), parse_number)`.
## Caused by warning:
## ! 2327 parsing failures.
## row col expected actual
##  13  -- a number    EXH
##  36  -- a number    EXH
##  58  -- a number    EXH
##  60  -- a number    EXH
##  61  -- a number    EXH
## ... ... ........ ......
## See problems(...) for more details.
pumpkins
## # A tibble: 15,965 × 6
##    weight_lbs  year place   ott gpc_site                                 country
##         <dbl> <dbl> <dbl> <dbl> <chr>                                    <chr>  
##  1      2032   2013     1   475 Uesugi Farms Weigh-off                   United…
##  2      1985   2013     2   453 Safeway World Championship Pumpkin Weig… United…
##  3      1894   2013     3   445 Safeway World Championship Pumpkin Weig… United…
##  4      1874.  2013     4   436 Elk Grove Giant Pumpkin Festival         United…
##  5      1813   2013     5   430 The Great Howard Dill Giant Pumpkin Cla… Canada 
##  6      1791   2013     6   431 Elk Grove Giant Pumpkin Festival         United…
##  7      1784   2013     7   445 Uesugi Farms Weigh-off                   United…
##  8      1784.  2013     8   434 Stillwater Harvestfest                   United…
##  9      1780.  2013     9   422 Stillwater Harvestfest                   United…
## 10      1766.  2013    10   425 Durham Fair Weigh-Off                    United…
## # ℹ 15,955 more rows
pumpkins %>%
  filter(ott > 20, ott < 1e3) %>%
  ggplot(aes(ott, weight_lbs, color = place)) +
  geom_point(alpha = 0.2, size = 1.1) +
  labs(x = "over-the-top inches", y = "weight (lbs)") +
  scale_color_viridis_c()

pumpkins %>%
  filter(ott > 20, ott < 1e3) %>%
  ggplot(aes(ott, weight_lbs)) +
  geom_point(alpha = 0.2, size = 1.1, color = "gray60") +
  geom_smooth(aes(color = factor(year)),
    method = lm, formula = y ~ splines::bs(x, 3),
    se = FALSE, size = 1.5, alpha = 0.6
  ) +
  labs(x = "over-the-top inches", y = "weight (lbs)", color = NULL) +
  scale_color_viridis_d()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

pumpkins %>%
  mutate(
    country = fct_lump(country, n = 10),
    country = fct_reorder(country, weight_lbs)
  ) %>%
  ggplot(aes(country, weight_lbs, color = country)) +
  geom_boxplot(outlier.colour = NA) +
  geom_jitter(alpha = 0.1, width = 0.15) +
  labs(x = NULL, y = "weight (lbs)") +
  theme(legend.position = "none")

Build and fit a workflow set

library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom        1.0.5     ✔ rsample      1.2.0
## ✔ dials        1.2.0     ✔ tune         1.1.2
## ✔ infer        1.0.4     ✔ workflows    1.1.3
## ✔ modeldata    1.2.0     ✔ workflowsets 1.0.1
## ✔ parsnip      1.1.1     ✔ yardstick    1.2.0
## ✔ recipes      1.0.8
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Learn how to get started at https://www.tidymodels.org/start/
set.seed(123)
pumpkin_split <- pumpkins %>%
  filter(ott > 20, ott < 1e3) %>%
  initial_split(strata = weight_lbs)

pumpkin_train <- training(pumpkin_split)
pumpkin_test <- testing(pumpkin_split)

set.seed(234)
pumpkin_folds <- vfold_cv(pumpkin_train, strata = weight_lbs)
pumpkin_folds
## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits             id    
##    <list>             <chr> 
##  1 <split [8954/996]> Fold01
##  2 <split [8954/996]> Fold02
##  3 <split [8954/996]> Fold03
##  4 <split [8954/996]> Fold04
##  5 <split [8954/996]> Fold05
##  6 <split [8954/996]> Fold06
##  7 <split [8955/995]> Fold07
##  8 <split [8956/994]> Fold08
##  9 <split [8957/993]> Fold09
## 10 <split [8958/992]> Fold10
base_rec <-
  recipe(weight_lbs ~ ott + year + country + gpc_site,
    data = pumpkin_train
  ) %>%
  step_other(country, gpc_site, threshold = 0.02)

ind_rec <-
  base_rec %>%
  step_dummy(all_nominal_predictors())

spline_rec <-
  ind_rec %>%
  step_bs(ott)
rf_spec <-
  rand_forest(trees = 1e3) %>%
  set_mode("regression") %>%
  set_engine("ranger")

mars_spec <-
  mars() %>%
  set_mode("regression") %>%
  set_engine("earth")

lm_spec <- linear_reg()
pumpkin_set <-
  workflow_set(
    list(base_rec, ind_rec, spline_rec),
    list(rf_spec, mars_spec, lm_spec),
    cross = FALSE
  )

pumpkin_set
## # A workflow set/tibble: 3 × 4
##   wflow_id             info             option    result    
##   <chr>                <list>           <list>    <list>    
## 1 recipe_1_rand_forest <tibble [1 × 4]> <opts[0]> <list [0]>
## 2 recipe_2_mars        <tibble [1 × 4]> <opts[0]> <list [0]>
## 3 recipe_3_linear_reg  <tibble [1 × 4]> <opts[0]> <list [0]>
doParallel::registerDoParallel()
set.seed(2021)

pumpkin_rs <-
  workflow_map(
    pumpkin_set,
    "fit_resamples",
    resamples = pumpkin_folds
  )

pumpkin_rs
## # A workflow set/tibble: 3 × 4
##   wflow_id             info             option    result   
##   <chr>                <list>           <list>    <list>   
## 1 recipe_1_rand_forest <tibble [1 × 4]> <opts[1]> <rsmp[+]>
## 2 recipe_2_mars        <tibble [1 × 4]> <opts[1]> <rsmp[+]>
## 3 recipe_3_linear_reg  <tibble [1 × 4]> <opts[1]> <rsmp[+]>

Evaluate workflow set

autoplot(pumpkin_rs)

collect_metrics(pumpkin_rs)
## # A tibble: 6 × 9
##   wflow_id         .config preproc model .metric .estimator   mean     n std_err
##   <chr>            <chr>   <chr>   <chr> <chr>   <chr>       <dbl> <int>   <dbl>
## 1 recipe_1_rand_f… Prepro… recipe  rand… rmse    standard   86.1      10 1.10e+0
## 2 recipe_1_rand_f… Prepro… recipe  rand… rsq     standard    0.969    10 9.97e-4
## 3 recipe_2_mars    Prepro… recipe  mars  rmse    standard   83.8      10 1.92e+0
## 4 recipe_2_mars    Prepro… recipe  mars  rsq     standard    0.969    10 1.67e-3
## 5 recipe_3_linear… Prepro… recipe  line… rmse    standard   82.4      10 2.27e+0
## 6 recipe_3_linear… Prepro… recipe  line… rsq     standard    0.970    10 1.97e-3
final_fit <-
  extract_workflow(pumpkin_rs, "recipe_3_linear_reg") %>%
  fit(pumpkin_train)
tidy(final_fit) %>%
  arrange(-abs(estimate))
## # A tibble: 15 × 5
##    term                                   estimate std.error statistic   p.value
##    <chr>                                     <dbl>     <dbl>     <dbl>     <dbl>
##  1 (Intercept)                            -9731.     675.      -14.4   1.30e- 46
##  2 ott_bs_3                                2585.      25.6     101.    0        
##  3 ott_bs_2                                 450.      11.9      37.9   2.75e-293
##  4 ott_bs_1                                -345.      36.3      -9.50  2.49e- 21
##  5 gpc_site_Ohio.Valley.Giant.Pumpkin.Gr…    21.1      7.80      2.70  6.89e-  3
##  6 country_United.States                     11.9      5.66      2.11  3.53e-  2
##  7 gpc_site_Stillwater.Harvestfest           11.6      7.87      1.48  1.40e-  1
##  8 country_Germany                          -11.5      6.68     -1.71  8.64e-  2
##  9 country_other                            -10.7      6.33     -1.69  9.13e-  2
## 10 country_Canada                             9.29     6.12      1.52  1.29e-  1
## 11 country_Italy                              8.12     7.02      1.16  2.47e-  1
## 12 gpc_site_Elk.Grove.Giant.Pumpkin.Fest…    -7.81     7.70     -1.01  3.10e-  1
## 13 year                                       4.89     0.334    14.6   5.03e- 48
## 14 gpc_site_Wiegemeisterschaft.Berlin.Br…     1.51     8.07      0.187 8.51e-  1
## 15 gpc_site_other                             1.41     5.60      0.251 8.02e-  1
  1. The research question we are trying to address is if we are able to look at characteristics of a pumpkin and using worksets predict the size of a pumpkin.

The data consists of 28065 observations and 14 variables. In our analysis we looked at a few variables such as the “ott”, the year and the country the pumpkin came from.

The primary dependent variable is weight_lbs, indicating the pumpkin’s weight in pounds. Among the predictors, ott (over-the-top inches) likely represents the pumpkin’s circumference, and is particularly emphasized for its relationship with weight. The year variable, extracted from the dataset’s ID, pinpoints each entry’s associated year. place likely designates a pumpkin’s rank or position in a competition. The data also incorporates categorical variables like gpc_site, reflecting various competition sites, and country, denoting the pumpkin’s origin. Jellvik’s study delves into visualizing these relationships, eventually leading to the creation and evaluation of multiple predictive models, including a random forest, MARS, and linear regression.

  1. From the raw dataset, the id was split into year and type, with a focus retained on pumpkins of type “P”. Selected variables like weight_lbs, year, ott, gpc_site, and country were parsed into numeric formats or adjusted for better modeling relevance. The ott range was confined between 20 and 1e3, likely to address outliers. Additionally, the country category was modified to club less frequent entries together and reorder based on pumpkin weight. This refining process, including additional preprocessing like dummy variable creation, ensured the data was optimized for both visual and statistical interpretation.

  2. In my pumpkin size analysis, I split the raw data’s id into year and type, retaining only type “P” pumpkins. I then refined variables such as weight_lbs, year, ott, gpc_site, and country. To manage outliers, I set constraints on ott. The country variable was adjusted to group less frequent entries, and I created dummy variables from nominal predictors to ease modeling.

In the analysis, the machine learning models used are Random Forest (rand_forest), Multivariate Adaptive Regression Splines (MARS or mars), and Linear Regression (linear_reg).

  1. The metrics used for model evaluation are the Root Mean Squared Error (RMSE) and the R-squared (rsq). The RMSE measures the average magnitude of errors between predicted and observed values, while the R-squared metric indicates the proportion of variance in the dependent variable that’s predicted by the independent variables in the model.

  2. From the analysis of the pumpkin dataset, three models were employed: Linear Regression, Random Forest, and MARS. Among them, the Linear Regression model stood out as the top performer, registering an RMSE of 82.4 and an impressive R-squared of 0.970. In comparison, both the Random Forest and MARS models closely followed, but the Linear Regression held a slight edge. The consistently high R-squared values across all models highlight the significant predictive potential in our dataset, suggesting that the chosen features effectively capture the patterns related to pumpkin sizes.