Explore Data

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

pumpkins_raw <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-10-19/pumpkins.csv")

## Rows: 28065 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (14): id, place, weight_lbs, grower_name, city, state_prov, country, gpc...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

pumpkins <- 
    pumpkins_raw %>% 
    separate(id, into = c("year", "type")) %>%
    filter(type == "P") %>%
    mutate(across(c(year, weight_lbs, ott, place), parse_number)) %>%
    select(weight_lbs, ott, year, place, gpc_site, country)

## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `across(c(year, weight_lbs, ott, place), parse_number)`.
## Caused by warning:
## ! 1359 parsing failures.
## row col expected actual
##  39  -- a number    EXH
## 105  -- a number    EXH
## 209  -- a number    EXH
## 217  -- a number    EXH
## 223  -- a number    EXH
## ... ... ........ ......
## See problems(...) for more details.

pumpkins %>%
    filter(ott < 1e3, ott > 10) %>%
    ggplot(aes(ott, weight_lbs, color = place)) +
    geom_point(alpha = 0.1, size = 1.1) +
    labs(x = "over-the-top-inches", y = "weight (lbs)") +
    scale_color_viridis_c()

pumpkins %>%
    filter(ott < 1e3, ott > 20) %>%
    ggplot(aes(ott, weight_lbs,)) +
    geom_point(alpha = 0.1, size = 1.1, color = "gray60") +
    geom_smooth(aes(color = factor(year)),
                method = "lm", 
                formula = y ~ splines::bs(x, 3),
                se = FALSE, linewidth = 1.5, alpha = 0.6) +
    labs(x = "over-the-top-inches", 
         y = "weight (lbs)", color = NULL) +
    scale_color_viridis_d()

pumpkins %>%
    mutate(country = fct_lump(country, n = 10),
           country = fct_reorder(country, weight_lbs)) %>%
    ggplot(aes(country, weight_lbs, color = country)) +
    geom_boxplot(outlier.color = NA) +
    geom_jitter(alpha = 0.1, width = 0.15) +
    labs(x = NULL, y = "weight (lbs)") +
    theme(legend.position = "none")

Build and fit a workflow set

library(tidymodels)

## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──

## ✔ broom        1.0.5     ✔ rsample      1.2.0
## ✔ dials        1.2.0     ✔ tune         1.1.2
## ✔ infer        1.0.5     ✔ workflows    1.1.3
## ✔ modeldata    1.2.0     ✔ workflowsets 1.0.1
## ✔ parsnip      1.1.1     ✔ yardstick    1.2.0
## ✔ recipes      1.0.8

## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Learn how to get started at https://www.tidymodels.org/start/

set.seed(123)
pumpkin_split <- 
    pumpkins %>% 
    filter(ott < 1e3, ott > 20) %>%
    initial_split(strata = weight_lbs)

pumpkin_train <- training(pumpkin_split)
pumpkin_test <- testing(pumpkin_split)

set.seed(234)
pumpkin_folds <- vfold_cv(pumpkin_train, strata = weight_lbs)
pumpkin_folds

## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits             id    
##    <list>             <chr> 
##  1 <split [8954/996]> Fold01
##  2 <split [8954/996]> Fold02
##  3 <split [8954/996]> Fold03
##  4 <split [8954/996]> Fold04
##  5 <split [8954/996]> Fold05
##  6 <split [8954/996]> Fold06
##  7 <split [8955/995]> Fold07
##  8 <split [8956/994]> Fold08
##  9 <split [8957/993]> Fold09
## 10 <split [8958/992]> Fold10

base_rec <- 
    recipe(weight_lbs ~ ott + year + country + gpc_site,
           data = pumpkin_train) %>%
  step_other(country, gpc_site, threshold = 0.02)

ind_rec <-
  base_rec %>%
  step_dummy(all_nominal_predictors())

spline_rec <-
  ind_rec %>%
  step_bs(ott)

rf_spec <-
  rand_forest(trees = 1e3) %>%
  set_mode("regression") %>%
  set_engine("ranger")

mars_spec <-
  mars() %>%
  set_mode("regression") %>%
  set_engine("earth")

lm_spec <- linear_reg()

pumpkin_set <-
  workflow_set(
    list(base_rec, ind_rec, spline_rec),
    list(rf_spec, mars_spec, lm_spec),
    cross = FALSE
  )

pumpkin_set

## # A workflow set/tibble: 3 × 4
##   wflow_id             info             option    result    
##   <chr>                <list>           <list>    <list>    
## 1 recipe_1_rand_forest <tibble [1 × 4]> <opts[0]> <list [0]>
## 2 recipe_2_mars        <tibble [1 × 4]> <opts[0]> <list [0]>
## 3 recipe_3_linear_reg  <tibble [1 × 4]> <opts[0]> <list [0]>

doParallel::registerDoParallel()
set.seed(2021)

pumpkin_rs <-
  workflow_map(
    pumpkin_set,
    "fit_resamples",
    resamples = pumpkin_folds
  )

pumpkin_rs

## # A workflow set/tibble: 3 × 4
##   wflow_id             info             option    result   
##   <chr>                <list>           <list>    <list>   
## 1 recipe_1_rand_forest <tibble [1 × 4]> <opts[1]> <rsmp[+]>
## 2 recipe_2_mars        <tibble [1 × 4]> <opts[1]> <rsmp[+]>
## 3 recipe_3_linear_reg  <tibble [1 × 4]> <opts[1]> <rsmp[+]>

Evaluate Workflow Set

autoplot(pumpkin_rs)

collect_metrics(pumpkin_rs)

## # A tibble: 6 × 9
##   wflow_id         .config preproc model .metric .estimator   mean     n std_err
##   <chr>            <chr>   <chr>   <chr> <chr>   <chr>       <dbl> <int>   <dbl>
## 1 recipe_1_rand_f… Prepro… recipe  rand… rmse    standard   86.1      10 1.10e+0
## 2 recipe_1_rand_f… Prepro… recipe  rand… rsq     standard    0.969    10 9.97e-4
## 3 recipe_2_mars    Prepro… recipe  mars  rmse    standard   83.8      10 1.92e+0
## 4 recipe_2_mars    Prepro… recipe  mars  rsq     standard    0.969    10 1.67e-3
## 5 recipe_3_linear… Prepro… recipe  line… rmse    standard   82.4      10 2.27e+0
## 6 recipe_3_linear… Prepro… recipe  line… rsq     standard    0.970    10 1.97e-3

final_fit <-
  extract_workflow(pumpkin_rs, "recipe_3_linear_reg") %>%
  fit(pumpkin_train)

tidy(final_fit) %>%
  arrange(-abs(estimate))

## # A tibble: 15 × 5
##    term                                   estimate std.error statistic   p.value
##    <chr>                                     <dbl>     <dbl>     <dbl>     <dbl>
##  1 (Intercept)                            -9731.     675.      -14.4   1.30e- 46
##  2 ott_bs_3                                2585.      25.6     101.    0        
##  3 ott_bs_2                                 450.      11.9      37.9   2.75e-293
##  4 ott_bs_1                                -345.      36.3      -9.50  2.49e- 21
##  5 gpc_site_Ohio.Valley.Giant.Pumpkin.Gr…    21.1      7.80      2.70  6.89e-  3
##  6 country_United.States                     11.9      5.66      2.11  3.53e-  2
##  7 gpc_site_Stillwater.Harvestfest           11.6      7.87      1.48  1.40e-  1
##  8 country_Germany                          -11.5      6.68     -1.71  8.64e-  2
##  9 country_other                            -10.7      6.33     -1.69  9.13e-  2
## 10 country_Canada                             9.29     6.12      1.52  1.29e-  1
## 11 country_Italy                              8.12     7.02      1.16  2.47e-  1
## 12 gpc_site_Elk.Grove.Giant.Pumpkin.Fest…    -7.81     7.70     -1.01  3.10e-  1
## 13 year                                       4.89     0.334    14.6   5.03e- 48
## 14 gpc_site_Wiegemeisterschaft.Berlin.Br…     1.51     8.07      0.187 8.51e-  1
## 15 gpc_site_other                             1.41     5.60      0.251 8.02e-  1

1. What is the research question?

The question we are trying to answer is whether or not we can predict the weight of giant pumpkins from other characteristics measured during competitions. # Describe the data briefly
The data is numerous measurements and information regarding giant pumpkins during a competition. # What are the characteristics of the key variables used in the analysis?
The primary variables we are looking at are the weight of the pumpkin, the ott measurement, the year the pumpkin was grown and place of origin.

#2. Data Exploration and Transformation: - The newly transformed data includes separating the id category into two subset categories, transforming numerous character data into numeric data, filtering to giant pumpkins only and selecting only key factors to help us answer the question.

3. Data Preparation and Modeling:

- The three major pre-processing tools that are used in this file are step_other (which pools infrequently occurring data/values into their own category), step_dummy (which will create a set of binary dummy variables from a factor variable), and step_bs which (will create a specification of a recipe step that will create new columns that are basis expansions of variables using B-splines).

#4. Model Evaluation: - We used autoplot to compare the Linear regression model against the mars model and the random forest model. The findings show that all 3 options performed relatively similar, with linear_regression model seeming to do slightly better.

5. Conclusion:

The key findings are that our spline terms are the most telling of predictive weight, followed by a few sites and countries as well as year making an appearance in the importance list.

Untitled

Colin Tracy

2023-10-03