Explore Data

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
pumpkins_raw <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-10-19/pumpkins.csv")
## Rows: 28065 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (14): id, place, weight_lbs, grower_name, city, state_prov, country, gpc...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
pumpkins <- 
    pumpkins_raw %>% 
    separate(id, into = c("year", "type")) %>%
    filter(type == "P") %>%
    mutate(across(c(year, weight_lbs, ott, place), parse_number)) %>%
    select(weight_lbs, ott, year, place, gpc_site, country)
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `across(c(year, weight_lbs, ott, place), parse_number)`.
## Caused by warning:
## ! 1359 parsing failures.
## row col expected actual
##  39  -- a number    EXH
## 105  -- a number    EXH
## 209  -- a number    EXH
## 217  -- a number    EXH
## 223  -- a number    EXH
## ... ... ........ ......
## See problems(...) for more details.
pumpkins %>%
    filter(ott < 1e3, ott > 10) %>%
    ggplot(aes(ott, weight_lbs, color = place)) +
    geom_point(alpha = 0.1, size = 1.1) +
    labs(x = "over-the-top-inches", y = "weight (lbs)") +
    scale_color_viridis_c()

pumpkins %>%
    filter(ott < 1e3, ott > 20) %>%
    ggplot(aes(ott, weight_lbs,)) +
    geom_point(alpha = 0.1, size = 1.1, color = "gray60") +
    geom_smooth(aes(color = factor(year)),
                method = "lm", 
                formula = y ~ splines::bs(x, 3),
                se = FALSE, linewidth = 1.5, alpha = 0.6) +
    labs(x = "over-the-top-inches", 
         y = "weight (lbs)", color = NULL) +
    scale_color_viridis_d()

pumpkins %>%
    mutate(country = fct_lump(country, n = 10),
           country = fct_reorder(country, weight_lbs)) %>%
    ggplot(aes(country, weight_lbs, color = country)) +
    geom_boxplot(outlier.color = NA) +
    geom_jitter(alpha = 0.1, width = 0.15) +
    labs(x = NULL, y = "weight (lbs)") +
    theme(legend.position = "none")

Build and fit a workflow set

library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom        1.0.5     ✔ rsample      1.2.0
## ✔ dials        1.2.0     ✔ tune         1.1.2
## ✔ infer        1.0.5     ✔ workflows    1.1.3
## ✔ modeldata    1.2.0     ✔ workflowsets 1.0.1
## ✔ parsnip      1.1.1     ✔ yardstick    1.2.0
## ✔ recipes      1.0.8
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Learn how to get started at https://www.tidymodels.org/start/
set.seed(123)
pumpkin_split <- 
    pumpkins %>% 
    filter(ott < 1e3, ott > 20) %>%
    initial_split(strata = weight_lbs)

pumpkin_train <- training(pumpkin_split)
pumpkin_test <- testing(pumpkin_split)

set.seed(234)
pumpkin_folds <- vfold_cv(pumpkin_train, strata = weight_lbs)
pumpkin_folds
## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits             id    
##    <list>             <chr> 
##  1 <split [8954/996]> Fold01
##  2 <split [8954/996]> Fold02
##  3 <split [8954/996]> Fold03
##  4 <split [8954/996]> Fold04
##  5 <split [8954/996]> Fold05
##  6 <split [8954/996]> Fold06
##  7 <split [8955/995]> Fold07
##  8 <split [8956/994]> Fold08
##  9 <split [8957/993]> Fold09
## 10 <split [8958/992]> Fold10
base_rec <- 
    recipe(weight_lbs ~ ott + year + country + gpc_site,
           data = pumpkin_train) %>%
  step_other(country, gpc_site, threshold = 0.02)

ind_rec <-
  base_rec %>%
  step_dummy(all_nominal_predictors())

spline_rec <-
  ind_rec %>%
  step_bs(ott)
rf_spec <-
  rand_forest(trees = 1e3) %>%
  set_mode("regression") %>%
  set_engine("ranger")

mars_spec <-
  mars() %>%
  set_mode("regression") %>%
  set_engine("earth")

lm_spec <- linear_reg()
pumpkin_set <-
  workflow_set(
    list(base_rec, ind_rec, spline_rec),
    list(rf_spec, mars_spec, lm_spec),
    cross = FALSE
  )

pumpkin_set
## # A workflow set/tibble: 3 × 4
##   wflow_id             info             option    result    
##   <chr>                <list>           <list>    <list>    
## 1 recipe_1_rand_forest <tibble [1 × 4]> <opts[0]> <list [0]>
## 2 recipe_2_mars        <tibble [1 × 4]> <opts[0]> <list [0]>
## 3 recipe_3_linear_reg  <tibble [1 × 4]> <opts[0]> <list [0]>
doParallel::registerDoParallel()
set.seed(2021)

pumpkin_rs <-
  workflow_map(
    pumpkin_set,
    "fit_resamples",
    resamples = pumpkin_folds
  )

pumpkin_rs
## # A workflow set/tibble: 3 × 4
##   wflow_id             info             option    result   
##   <chr>                <list>           <list>    <list>   
## 1 recipe_1_rand_forest <tibble [1 × 4]> <opts[1]> <rsmp[+]>
## 2 recipe_2_mars        <tibble [1 × 4]> <opts[1]> <rsmp[+]>
## 3 recipe_3_linear_reg  <tibble [1 × 4]> <opts[1]> <rsmp[+]>

Evaluate Workflow Set

autoplot(pumpkin_rs)

collect_metrics(pumpkin_rs)
## # A tibble: 6 × 9
##   wflow_id         .config preproc model .metric .estimator   mean     n std_err
##   <chr>            <chr>   <chr>   <chr> <chr>   <chr>       <dbl> <int>   <dbl>
## 1 recipe_1_rand_f… Prepro… recipe  rand… rmse    standard   86.1      10 1.10e+0
## 2 recipe_1_rand_f… Prepro… recipe  rand… rsq     standard    0.969    10 9.97e-4
## 3 recipe_2_mars    Prepro… recipe  mars  rmse    standard   83.8      10 1.92e+0
## 4 recipe_2_mars    Prepro… recipe  mars  rsq     standard    0.969    10 1.67e-3
## 5 recipe_3_linear… Prepro… recipe  line… rmse    standard   82.4      10 2.27e+0
## 6 recipe_3_linear… Prepro… recipe  line… rsq     standard    0.970    10 1.97e-3
final_fit <-
  extract_workflow(pumpkin_rs, "recipe_3_linear_reg") %>%
  fit(pumpkin_train)
tidy(final_fit) %>%
  arrange(-abs(estimate))
## # A tibble: 15 × 5
##    term                                   estimate std.error statistic   p.value
##    <chr>                                     <dbl>     <dbl>     <dbl>     <dbl>
##  1 (Intercept)                            -9731.     675.      -14.4   1.30e- 46
##  2 ott_bs_3                                2585.      25.6     101.    0        
##  3 ott_bs_2                                 450.      11.9      37.9   2.75e-293
##  4 ott_bs_1                                -345.      36.3      -9.50  2.49e- 21
##  5 gpc_site_Ohio.Valley.Giant.Pumpkin.Gr…    21.1      7.80      2.70  6.89e-  3
##  6 country_United.States                     11.9      5.66      2.11  3.53e-  2
##  7 gpc_site_Stillwater.Harvestfest           11.6      7.87      1.48  1.40e-  1
##  8 country_Germany                          -11.5      6.68     -1.71  8.64e-  2
##  9 country_other                            -10.7      6.33     -1.69  9.13e-  2
## 10 country_Canada                             9.29     6.12      1.52  1.29e-  1
## 11 country_Italy                              8.12     7.02      1.16  2.47e-  1
## 12 gpc_site_Elk.Grove.Giant.Pumpkin.Fest…    -7.81     7.70     -1.01  3.10e-  1
## 13 year                                       4.89     0.334    14.6   5.03e- 48
## 14 gpc_site_Wiegemeisterschaft.Berlin.Br…     1.51     8.07      0.187 8.51e-  1
## 15 gpc_site_other                             1.41     5.60      0.251 8.02e-  1

1. What is the research question?

#2. Data Exploration and Transformation: - The newly transformed data includes separating the id category into two subset categories, transforming numerous character data into numeric data, filtering to giant pumpkins only and selecting only key factors to help us answer the question.

3. Data Preparation and Modeling:

- The three major pre-processing tools that are used in this file are step_other (which pools infrequently occurring data/values into their own category), step_dummy (which will create a set of binary dummy variables from a factor variable), and step_bs which (will create a specification of a recipe step that will create new columns that are basis expansions of variables using B-splines). 

#4. Model Evaluation: - We used autoplot to compare the Linear regression model against the mars model and the random forest model. The findings show that all 3 options performed relatively similar, with linear_regression model seeming to do slightly better.

5. Conclusion: