library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
pumpkins_raw <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-10-19/pumpkins.csv")
## Rows: 28065 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (14): id, place, weight_lbs, grower_name, city, state_prov, country, gpc...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
pumpkins <-
  pumpkins_raw %>%
  separate(id, into = c("year", "type")) %>%
  mutate(across(c(year, weight_lbs, ott, place), parse_number)) %>%
  filter(type == "P") %>%
  select(weight_lbs, year, place, ott, gpc_site, country)
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `across(c(year, weight_lbs, ott, place), parse_number)`.
## Caused by warning:
## ! 2327 parsing failures.
## row col expected actual
##  13  -- a number    EXH
##  36  -- a number    EXH
##  58  -- a number    EXH
##  60  -- a number    EXH
##  61  -- a number    EXH
## ... ... ........ ......
## See problems(...) for more details.
pumpkins
## # A tibble: 15,965 × 6
##    weight_lbs  year place   ott gpc_site                                 country
##         <dbl> <dbl> <dbl> <dbl> <chr>                                    <chr>  
##  1      2032   2013     1   475 Uesugi Farms Weigh-off                   United…
##  2      1985   2013     2   453 Safeway World Championship Pumpkin Weig… United…
##  3      1894   2013     3   445 Safeway World Championship Pumpkin Weig… United…
##  4      1874.  2013     4   436 Elk Grove Giant Pumpkin Festival         United…
##  5      1813   2013     5   430 The Great Howard Dill Giant Pumpkin Cla… Canada 
##  6      1791   2013     6   431 Elk Grove Giant Pumpkin Festival         United…
##  7      1784   2013     7   445 Uesugi Farms Weigh-off                   United…
##  8      1784.  2013     8   434 Stillwater Harvestfest                   United…
##  9      1780.  2013     9   422 Stillwater Harvestfest                   United…
## 10      1766.  2013    10   425 Durham Fair Weigh-Off                    United…
## # ℹ 15,955 more rows
pumpkins %>%
  filter(ott > 20, ott < 1e3) %>%
  ggplot(aes(ott, weight_lbs, color = place)) +
  geom_point(alpha = 0.2, size = 1.1) +
  labs(x = "over-the-top inches", y = "weight (lbs)") +
  scale_color_viridis_c()

pumpkins %>%
  filter(ott > 20, ott < 1e3) %>%
  ggplot(aes(ott, weight_lbs)) +
  geom_point(alpha = 0.2, size = 1.1, color = "gray60") +
  geom_smooth(aes(color = factor(year)),
    method = lm, formula = y ~ splines::bs(x, 3),
    se = FALSE, size = 1.5, alpha = 0.6
  ) +
  labs(x = "over-the-top inches", y = "weight (lbs)", color = NULL) +
  scale_color_viridis_d()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

pumpkins %>%
  mutate(
    country = fct_lump(country, n = 10),
    country = fct_reorder(country, weight_lbs)
  ) %>%
  ggplot(aes(country, weight_lbs, color = country)) +
  geom_boxplot(outlier.colour = NA) +
  geom_jitter(alpha = 0.1, width = 0.15) +
  labs(x = NULL, y = "weight (lbs)") +
  theme(legend.position = "none")

Build and fit a workflow set

library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom        1.0.5     ✔ rsample      1.2.0
## ✔ dials        1.2.0     ✔ tune         1.1.2
## ✔ infer        1.0.4     ✔ workflows    1.1.3
## ✔ modeldata    1.2.0     ✔ workflowsets 1.0.1
## ✔ parsnip      1.1.1     ✔ yardstick    1.2.0
## ✔ recipes      1.0.8
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Dig deeper into tidy modeling with R at https://www.tmwr.org
set.seed(123)
pumpkin_split <-
    pumpkins %>%
    filter(ott < 1e3, ott > 20) %>%
    initial_split(strata = weight_lbs)

pumpkin_train <- training(pumpkin_split)
pumpkin_test <- testing(pumpkin_split)

set.seed(234)
pumpkin_folds <- vfold_cv(pumpkin_train, strata = weight_lbs)
pumpkin_folds
## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits             id    
##    <list>             <chr> 
##  1 <split [8954/996]> Fold01
##  2 <split [8954/996]> Fold02
##  3 <split [8954/996]> Fold03
##  4 <split [8954/996]> Fold04
##  5 <split [8954/996]> Fold05
##  6 <split [8954/996]> Fold06
##  7 <split [8955/995]> Fold07
##  8 <split [8956/994]> Fold08
##  9 <split [8957/993]> Fold09
## 10 <split [8958/992]> Fold10
base_rec <- 
    recipe(weight_lbs ~ ott + year + country + gpc_site, data = pumpkin_train) %>%
    step_other(country, gpc_site, threshold = 0.02)

ind_rec <-
    base_rec %>%
    step_dummy(all_nominal_predictors())

spline_rec <-
    ind_rec %>%
    step_bs(ott)
rf_spec <-
    rand_forest(trees = 1e3) %>%
    set_mode("regression") %>%
    set_engine("ranger")

mars_spec <-
    mars() %>%
    set_mode("regression") %>%
    set_engine("earth")

lm_spec <- linear_reg()
pumpkin_set <-
  workflow_set(
    list(base_rec, ind_rec, spline_rec),
    list(rf_spec, mars_spec, lm_spec),
    cross = FALSE
  )

pumpkin_set
## # A workflow set/tibble: 3 × 4
##   wflow_id             info             option    result    
##   <chr>                <list>           <list>    <list>    
## 1 recipe_1_rand_forest <tibble [1 × 4]> <opts[0]> <list [0]>
## 2 recipe_2_mars        <tibble [1 × 4]> <opts[0]> <list [0]>
## 3 recipe_3_linear_reg  <tibble [1 × 4]> <opts[0]> <list [0]>
doParallel::registerDoParallel()
set.seed(2021)

pumpkin_rs <-
    workflow_map(
        pumpkin_set,
        "fit_resamples",
        resamples = pumpkin_folds
    )

pumpkin_rs
## # A workflow set/tibble: 3 × 4
##   wflow_id             info             option    result   
##   <chr>                <list>           <list>    <list>   
## 1 recipe_1_rand_forest <tibble [1 × 4]> <opts[1]> <rsmp[+]>
## 2 recipe_2_mars        <tibble [1 × 4]> <opts[1]> <rsmp[+]>
## 3 recipe_3_linear_reg  <tibble [1 × 4]> <opts[1]> <rsmp[+]>

Evaluate workflow set

autoplot(pumpkin_rs)

collect_metrics(pumpkin_rs)
## # A tibble: 6 × 9
##   wflow_id         .config preproc model .metric .estimator   mean     n std_err
##   <chr>            <chr>   <chr>   <chr> <chr>   <chr>       <dbl> <int>   <dbl>
## 1 recipe_1_rand_f… Prepro… recipe  rand… rmse    standard   86.1      10 1.10e+0
## 2 recipe_1_rand_f… Prepro… recipe  rand… rsq     standard    0.969    10 9.97e-4
## 3 recipe_2_mars    Prepro… recipe  mars  rmse    standard   83.8      10 1.92e+0
## 4 recipe_2_mars    Prepro… recipe  mars  rsq     standard    0.969    10 1.67e-3
## 5 recipe_3_linear… Prepro… recipe  line… rmse    standard   82.4      10 2.27e+0
## 6 recipe_3_linear… Prepro… recipe  line… rsq     standard    0.970    10 1.97e-3
final_fit<-
extract_workflow(pumpkin_rs, "recipe_3_linear_reg") %>%
    fit(pumpkin_train)

tidy(final_fit) %>%
    arrange(-abs(estimate))
## # A tibble: 15 × 5
##    term                                   estimate std.error statistic   p.value
##    <chr>                                     <dbl>     <dbl>     <dbl>     <dbl>
##  1 (Intercept)                            -9731.     675.      -14.4   1.30e- 46
##  2 ott_bs_3                                2585.      25.6     101.    0        
##  3 ott_bs_2                                 450.      11.9      37.9   2.75e-293
##  4 ott_bs_1                                -345.      36.3      -9.50  2.49e- 21
##  5 gpc_site_Ohio.Valley.Giant.Pumpkin.Gr…    21.1      7.80      2.70  6.89e-  3
##  6 country_United.States                     11.9      5.66      2.11  3.53e-  2
##  7 gpc_site_Stillwater.Harvestfest           11.6      7.87      1.48  1.40e-  1
##  8 country_Germany                          -11.5      6.68     -1.71  8.64e-  2
##  9 country_other                            -10.7      6.33     -1.69  9.13e-  2
## 10 country_Canada                             9.29     6.12      1.52  1.29e-  1
## 11 country_Italy                              8.12     7.02      1.16  2.47e-  1
## 12 gpc_site_Elk.Grove.Giant.Pumpkin.Fest…    -7.81     7.70     -1.01  3.10e-  1
## 13 year                                       4.89     0.334    14.6   5.03e- 48
## 14 gpc_site_Wiegemeisterschaft.Berlin.Br…     1.51     8.07      0.187 8.51e-  1
## 15 gpc_site_other                             1.41     5.60      0.251 8.02e-  1
  1. Question and Data:
    • What is the research question? Clearly state the research question you aim to address using the new dataset. The research question is: Can we predict pumpkin weights from a pumpkin growing competition data set using other characteristics from the pumpkins?

    • Describe the data briefly: Provide an overview of the new dataset, highlighting its key characteristics and dimensions. The data set is all about pumpkins. The data set has 15,965 observations of 6 variables. The variables include weight in pounds, the year, ott (over-the-top inches) and the country.

    • What are the characteristics of the key variables used in the analysis? Describe the primary variables of interest in the dataset and their characteristics. The primary variables of interest in the data set are year, weight, ott and country. Year, ott, and weight are all numerical data. The year row tells us what year the pumpkin was grown. The ott row tells us how many inches the pumpkin measures across on the top. The weight tells how much the pumpkin weighed in pounds, which is what we are trying to predict using the other characteristics. Country is character data that describes where the pumpkin was grown.

  2. Data Exploration and Transformation:
    • Describe the differences between the original data and the data transformed for modeling. Why? Explain any preprocessing or transformations performed on the new dataset compared to the original data. Discuss why these changes were necessary or beneficial. The original data was 15,965 observations with 6 variables. The data set was split into training and testing data. The training data has only 9,950 observations of 6 variables, and the testing data only has 3,319 observations of 6 variables.
  3. Data Preparation and Modeling:
    • What are the names of data preparation steps mentioned in the video? List and describe any data preparation steps or techniques mentioned in the CA video that you applied to the new dataset. We prepared the data set by using the separate(id, into = c(“year”, “type”)) to split the “id” column into “year” and “type” columns. We used mutate(across(c(year, weight_lbs, ott, place), parse_number)) to change the data into numeric data types so that all variables are used in modeling. We used filter(type == “P”) to make sure all the data was relevant to obtain the goal of predicting the weight of pumpkins. We used step_bs to the “ott” which looked for possible non-linear relationships between “ott” and pumpkin weights. We used fct_lump and fct_reorder to simplify and reorder the country variable based off pumpkin weight. This makes the variable easier to interpret for the machine learning model. We used inital_split and vfold_cv to stratify the data based on the weight_lbs variable so that the training and test data were balanced when split.

    • What is the name of the machine learning model(s) used in the analysis? Specify the machine learning model(s) you employed for your analysis and briefly explain their relevance to the research question. The machine learning models we used were rand_forest(trees = 1e3) from the ranger engine. Random forest is good with regression tasks. We used this to predict the weight of a giant pumpkin based on various characteristics. Random forest is good to use to find complex relationships, such as the relationships between ott, year, country, gpc_site, and pumpkin weight. We also used mars() from the earth engine. Mars() is a regression technique that uses linear splines to model complex relationships in the data set. It is a good function to use to find nonlinear relationships patterns in relationships between the predictor variables like year, ott, and country, and the target variable which is pumpkin weight. We also used linear_reg from the lm engine. Linear_reg is a good model for linear regression, and helps establish a baseline for linear relationships between predictor variables like year, country, and ott, and the target variable which is pumpkin weight.

  4. Model Evaluation:
    • What metrics are used in the model evaluation? Detail the evaluation metrics you used to assess the performance of your machine learning model(s) on the new dataset. Discuss the significance of these metrics in the context of your research question. The metrics used to evaluate the model were RMSE and R-squared.RMSE measures the average standard deviation of predicted values from actual values. It is often used for regression tasks, which is why it was helpful to use in the pumpkin weight prediction. A lower RMSE indicates that the predictions are closer to the actual weights. R-squared measures the proportion of variance in target variables that are explained by the model. In this case, the target variable is the pumpkin weights. An R-squared value close to 1 indicates that the model explains a high percentage of the variance which is good in regression tasks.
  5. Conclusion:
    • What are the major findings? Summarize the key findings and insights obtained from your analysis of the new dataset. The analysis shows there is a strong relationship between the over-the-top inches of a pumpkin and the pumpkin weight. The larger the over-the-top inches value, the heavier the pumpkin. The analysis shows that the spline terms are important features in predicting the pumpkin weights. The analysis also shows there is a small relationship between increased pumpkin weights over the years.