library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
pumpkins_raw <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-10-19/pumpkins.csv")
## Rows: 28065 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (14): id, place, weight_lbs, grower_name, city, state_prov, country, gpc...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
pumpkins <-
pumpkins_raw %>%
separate(id, into = c("year", "type")) %>%
mutate(across(c(year, weight_lbs, ott, place), parse_number)) %>%
filter(type == "P") %>%
select(weight_lbs, year, place, ott, gpc_site, country)
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `across(c(year, weight_lbs, ott, place), parse_number)`.
## Caused by warning:
## ! 2327 parsing failures.
## row col expected actual
## 13 -- a number EXH
## 36 -- a number EXH
## 58 -- a number EXH
## 60 -- a number EXH
## 61 -- a number EXH
## ... ... ........ ......
## See problems(...) for more details.
pumpkins
## # A tibble: 15,965 × 6
## weight_lbs year place ott gpc_site country
## <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 2032 2013 1 475 Uesugi Farms Weigh-off United…
## 2 1985 2013 2 453 Safeway World Championship Pumpkin Weig… United…
## 3 1894 2013 3 445 Safeway World Championship Pumpkin Weig… United…
## 4 1874. 2013 4 436 Elk Grove Giant Pumpkin Festival United…
## 5 1813 2013 5 430 The Great Howard Dill Giant Pumpkin Cla… Canada
## 6 1791 2013 6 431 Elk Grove Giant Pumpkin Festival United…
## 7 1784 2013 7 445 Uesugi Farms Weigh-off United…
## 8 1784. 2013 8 434 Stillwater Harvestfest United…
## 9 1780. 2013 9 422 Stillwater Harvestfest United…
## 10 1766. 2013 10 425 Durham Fair Weigh-Off United…
## # ℹ 15,955 more rows
pumpkins %>%
filter(ott > 20, ott < 1e3) %>%
ggplot(aes(ott, weight_lbs, color = place)) +
geom_point(alpha = 0.2, size = 1.1) +
labs(x = "over-the-top inches", y = "weight (lbs)") +
scale_color_viridis_c()
pumpkins %>%
filter(ott > 20, ott < 1e3) %>%
ggplot(aes(ott, weight_lbs)) +
geom_point(alpha = 0.2, size = 1.1, color = "gray60") +
geom_smooth(aes(color = factor(year)),
method = lm, formula = y ~ splines::bs(x, 3),
se = FALSE, size = 1.5, alpha = 0.6
) +
labs(x = "over-the-top inches", y = "weight (lbs)", color = NULL) +
scale_color_viridis_d()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
pumpkins %>%
mutate(
country = fct_lump(country, n = 10),
country = fct_reorder(country, weight_lbs)
) %>%
ggplot(aes(country, weight_lbs, color = country)) +
geom_boxplot(outlier.colour = NA) +
geom_jitter(alpha = 0.1, width = 0.15) +
labs(x = NULL, y = "weight (lbs)") +
theme(legend.position = "none")
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom 1.0.5 ✔ rsample 1.2.0
## ✔ dials 1.2.0 ✔ tune 1.1.2
## ✔ infer 1.0.4 ✔ workflows 1.1.3
## ✔ modeldata 1.2.0 ✔ workflowsets 1.0.1
## ✔ parsnip 1.1.1 ✔ yardstick 1.2.0
## ✔ recipes 1.0.8
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Dig deeper into tidy modeling with R at https://www.tmwr.org
set.seed(123)
pumpkin_split <-
pumpkins %>%
filter(ott < 1e3, ott > 20) %>%
initial_split(strata = weight_lbs)
pumpkin_train <- training(pumpkin_split)
pumpkin_test <- testing(pumpkin_split)
set.seed(234)
pumpkin_folds <- vfold_cv(pumpkin_train, strata = weight_lbs)
pumpkin_folds
## # 10-fold cross-validation using stratification
## # A tibble: 10 × 2
## splits id
## <list> <chr>
## 1 <split [8954/996]> Fold01
## 2 <split [8954/996]> Fold02
## 3 <split [8954/996]> Fold03
## 4 <split [8954/996]> Fold04
## 5 <split [8954/996]> Fold05
## 6 <split [8954/996]> Fold06
## 7 <split [8955/995]> Fold07
## 8 <split [8956/994]> Fold08
## 9 <split [8957/993]> Fold09
## 10 <split [8958/992]> Fold10
base_rec <-
recipe(weight_lbs ~ ott + year + country + gpc_site, data = pumpkin_train) %>%
step_other(country, gpc_site, threshold = 0.02)
ind_rec <-
base_rec %>%
step_dummy(all_nominal_predictors())
spline_rec <-
ind_rec %>%
step_bs(ott)
rf_spec <-
rand_forest(trees = 1e3) %>%
set_mode("regression") %>%
set_engine("ranger")
mars_spec <-
mars() %>%
set_mode("regression") %>%
set_engine("earth")
lm_spec <- linear_reg()
pumpkin_set <-
workflow_set(
list(base_rec, ind_rec, spline_rec),
list(rf_spec, mars_spec, lm_spec),
cross = FALSE
)
pumpkin_set
## # A workflow set/tibble: 3 × 4
## wflow_id info option result
## <chr> <list> <list> <list>
## 1 recipe_1_rand_forest <tibble [1 × 4]> <opts[0]> <list [0]>
## 2 recipe_2_mars <tibble [1 × 4]> <opts[0]> <list [0]>
## 3 recipe_3_linear_reg <tibble [1 × 4]> <opts[0]> <list [0]>
doParallel::registerDoParallel()
set.seed(2021)
pumpkin_rs <-
workflow_map(
pumpkin_set,
"fit_resamples",
resamples = pumpkin_folds
)
pumpkin_rs
## # A workflow set/tibble: 3 × 4
## wflow_id info option result
## <chr> <list> <list> <list>
## 1 recipe_1_rand_forest <tibble [1 × 4]> <opts[1]> <rsmp[+]>
## 2 recipe_2_mars <tibble [1 × 4]> <opts[1]> <rsmp[+]>
## 3 recipe_3_linear_reg <tibble [1 × 4]> <opts[1]> <rsmp[+]>
autoplot(pumpkin_rs)
collect_metrics(pumpkin_rs)
## # A tibble: 6 × 9
## wflow_id .config preproc model .metric .estimator mean n std_err
## <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <int> <dbl>
## 1 recipe_1_rand_f… Prepro… recipe rand… rmse standard 86.1 10 1.10e+0
## 2 recipe_1_rand_f… Prepro… recipe rand… rsq standard 0.969 10 9.97e-4
## 3 recipe_2_mars Prepro… recipe mars rmse standard 83.8 10 1.92e+0
## 4 recipe_2_mars Prepro… recipe mars rsq standard 0.969 10 1.67e-3
## 5 recipe_3_linear… Prepro… recipe line… rmse standard 82.4 10 2.27e+0
## 6 recipe_3_linear… Prepro… recipe line… rsq standard 0.970 10 1.97e-3
final_fit<-
extract_workflow(pumpkin_rs, "recipe_3_linear_reg") %>%
fit(pumpkin_train)
tidy(final_fit) %>%
arrange(-abs(estimate))
## # A tibble: 15 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -9731. 675. -14.4 1.30e- 46
## 2 ott_bs_3 2585. 25.6 101. 0
## 3 ott_bs_2 450. 11.9 37.9 2.75e-293
## 4 ott_bs_1 -345. 36.3 -9.50 2.49e- 21
## 5 gpc_site_Ohio.Valley.Giant.Pumpkin.Gr… 21.1 7.80 2.70 6.89e- 3
## 6 country_United.States 11.9 5.66 2.11 3.53e- 2
## 7 gpc_site_Stillwater.Harvestfest 11.6 7.87 1.48 1.40e- 1
## 8 country_Germany -11.5 6.68 -1.71 8.64e- 2
## 9 country_other -10.7 6.33 -1.69 9.13e- 2
## 10 country_Canada 9.29 6.12 1.52 1.29e- 1
## 11 country_Italy 8.12 7.02 1.16 2.47e- 1
## 12 gpc_site_Elk.Grove.Giant.Pumpkin.Fest… -7.81 7.70 -1.01 3.10e- 1
## 13 year 4.89 0.334 14.6 5.03e- 48
## 14 gpc_site_Wiegemeisterschaft.Berlin.Br… 1.51 8.07 0.187 8.51e- 1
## 15 gpc_site_other 1.41 5.60 0.251 8.02e- 1
What is the research question? Clearly state the research question you aim to address using the new dataset. The research question is: Can we predict pumpkin weights from a pumpkin growing competition data set using other characteristics from the pumpkins?
Describe the data briefly: Provide an overview of the new dataset, highlighting its key characteristics and dimensions. The data set is all about pumpkins. The data set has 15,965 observations of 6 variables. The variables include weight in pounds, the year, ott (over-the-top inches) and the country.
What are the characteristics of the key variables used in the analysis? Describe the primary variables of interest in the dataset and their characteristics. The primary variables of interest in the data set are year, weight, ott and country. Year, ott, and weight are all numerical data. The year row tells us what year the pumpkin was grown. The ott row tells us how many inches the pumpkin measures across on the top. The weight tells how much the pumpkin weighed in pounds, which is what we are trying to predict using the other characteristics. Country is character data that describes where the pumpkin was grown.
What are the names of data preparation steps mentioned in the video? List and describe any data preparation steps or techniques mentioned in the CA video that you applied to the new dataset. We prepared the data set by using the separate(id, into = c(“year”, “type”)) to split the “id” column into “year” and “type” columns. We used mutate(across(c(year, weight_lbs, ott, place), parse_number)) to change the data into numeric data types so that all variables are used in modeling. We used filter(type == “P”) to make sure all the data was relevant to obtain the goal of predicting the weight of pumpkins. We used step_bs to the “ott” which looked for possible non-linear relationships between “ott” and pumpkin weights. We used fct_lump and fct_reorder to simplify and reorder the country variable based off pumpkin weight. This makes the variable easier to interpret for the machine learning model. We used inital_split and vfold_cv to stratify the data based on the weight_lbs variable so that the training and test data were balanced when split.
What is the name of the machine learning model(s) used in the analysis? Specify the machine learning model(s) you employed for your analysis and briefly explain their relevance to the research question. The machine learning models we used were rand_forest(trees = 1e3) from the ranger engine. Random forest is good with regression tasks. We used this to predict the weight of a giant pumpkin based on various characteristics. Random forest is good to use to find complex relationships, such as the relationships between ott, year, country, gpc_site, and pumpkin weight. We also used mars() from the earth engine. Mars() is a regression technique that uses linear splines to model complex relationships in the data set. It is a good function to use to find nonlinear relationships patterns in relationships between the predictor variables like year, ott, and country, and the target variable which is pumpkin weight. We also used linear_reg from the lm engine. Linear_reg is a good model for linear regression, and helps establish a baseline for linear relationships between predictor variables like year, country, and ott, and the target variable which is pumpkin weight.