Tidymodels Regression

Harold Nelson

2024-11-04

Setup

Load tidyverse and tidymodels Load the dataframe OAW2409.

Solution

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom        1.0.5     ✔ rsample      1.2.1
## ✔ dials        1.3.0     ✔ tune         1.2.1
## ✔ infer        1.0.7     ✔ workflows    1.1.4
## ✔ modeldata    1.4.0     ✔ workflowsets 1.1.0
## ✔ parsnip      1.2.1     ✔ yardstick    1.3.1
## ✔ recipes      1.1.0     
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Dig deeper into tidy modeling with R at https://www.tmwr.org
load("OAW2409.Rdata")

Split the data

Put 70% of the data in training. Set the strata to TMAX. Create OAW_training and OAW_test.

Solution

set.seed(123)
OAW_split <- initial_split(OAW2409, 
                  prop = .7, 
                  strata = TMAX)

OAW_training <- OAW_split %>%
 training()

OAW_test <- OAW_split %>%
 testing()

Verify the Split

Make sure that there are about 70% of the data in train and about 30% in test.

Also Examine the distributions of TMAX in the training and test dataframes. They should be similar since we specified that the strata would be TMAX.

Solution

.7 * nrow(OAW2409)
## [1] 21300.3
nrow(OAW_training)
## [1] 21298
print(" ")
## [1] " "
.3 * nrow(OAW2409)
## [1] 9128.7
nrow(OAW_test)
## [1] 9131
print(" ")
## [1] " "
summary(OAW_training$TMAX)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   18.00   50.00   59.00   60.65   71.00  110.00
print(" ")
## [1] " "
summary(OAW_test$TMAX)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    21.0    50.0    59.0    60.6    71.0   104.0

Create a linear regression model.

Call it linear_model.

Solution

# Initialize a linear regression object, linear_model
linear_model <- linear_reg() %>% 
  # Set the model engine
  set_engine("lm") %>% 
  # Set the model mode
  set_mode("regression")

Alternatives

lm1 = linear_reg(mode = "regression", engine = "lm")
lm1
## Linear Regression Model Specification (regression)
## 
## Computational engine: lm
lm2 = linear_reg()
lm2
## Linear Regression Model Specification (regression)
## 
## Computational engine: lm

Train

Fit a model, lm_fit, to predict TMAX using TMIN as the only predictor. Use the training data only.

Solution

lm_fit <- linear_model %>% 
  fit(TMAX ~ TMIN,
      data = OAW_training)

Examine the Model

Print lm_fit to view model information and use the tidy() function.

Solution

lm_fit
## parsnip model object
## 
## 
## Call:
## stats::lm(formula = TMAX ~ TMIN, data = data)
## 
## Coefficients:
## (Intercept)         TMIN  
##       19.57         1.03
tidy(lm_fit)
## # A tibble: 2 × 5
##   term        estimate std.error statistic p.value
##   <chr>          <dbl>     <dbl>     <dbl>   <dbl>
## 1 (Intercept)    19.6    0.301        65.1       0
## 2 TMIN            1.03   0.00734     140.        0

Predictions

Create TMAX_predictions for the test data using the predict() function on lm_fit. Use head() and str() to understand the file.

Solution

TMAX_predictions <- predict(lm_fit,
                        new_data = OAW_test)
head(TMAX_predictions)
## # A tibble: 6 × 1
##   .pred
##   <dbl>
## 1  59.8
## 2  67.0
## 3  63.9
## 4  69.0
## 5  60.8
## 6  60.8
str(TMAX_predictions)
## tibble [9,131 × 1] (S3: tbl_df/tbl/data.frame)
##  $ .pred: num [1:9131] 59.8 67 63.9 69 60.8 ...

Compare

Use summary() to compare the distributions of the actual and predicted values.

Solution

summary(TMAX_predictions)
##      .pred      
##  Min.   :16.48  
##  1st Qu.:53.57  
##  Median :60.79  
##  Mean   :60.69  
##  3rd Qu.:68.00  
##  Max.   :86.55
print(" ")
## [1] " "
summary(OAW_test$TMAX)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    21.0    50.0    59.0    60.6    71.0   104.0

Build the Results

Create a dataframe TMAX_results containing TMAX, TMIN and .pred.

Solution

TMAX_results <- OAW_test %>% 
  select(TMAX,TMIN) %>% 
  bind_cols(TMAX_predictions)

Evaluate the Model

Use the rmse() and rsq() functions.

Solution

TMAX_results %>% 
  rmse(TMAX, .pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        9.86
# Calculate the R squared metric
TMAX_results %>% 
  rsq(TMAX, .pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rsq     standard       0.479

Create an R squared plot of model performance

Solution

ggplot(TMAX_results, aes(x = TMAX, y = .pred)) +
  geom_point(alpha = 0.1,size = .1) + 
  geom_abline(color = 'blue', linetype = 2) +
  coord_obs_pred() +
  labs(x = 'Actual TMAX', y = 'Predicted TMAX')

Recreate

Recreate this model using last_fit(). Print the values of RSQ and RMSE. Prefix the fit and df objects with base_.

Solution

base_fit <- linear_model %>% 
  last_fit(TMAX ~ TMIN, split = OAW_split)

base_df <- base_fit %>% 
  collect_predictions()

base_df |>
  rmse(TMAX,.pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        9.86
base_df |>
  rsq(TMAX,.pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rsq     standard       0.479

An Alternative Model

Copy the code from the previous chunk and create an mo_ model, where the only predictor is the factor mo.

How do the performance metrics compare?

Solution

mo_fit <- linear_model %>% 
  last_fit(TMAX ~  mo, split = OAW_split)

mo_df <- mo_fit %>% 
  collect_predictions()

mo_df |>
  rmse(TMAX,.pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        13.5
mo_df |>
  rsq(TMAX,.pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rsq     standard      0.0197

Coefficients

The following procedure gives a table with the coefficients of the model. You can’t directly tidy a fit object.

Solution

mo_fit |> extract_workflow() |> tidy() 
## # A tibble: 2 × 5
##   term        estimate std.error statistic   p.value
##   <chr>          <dbl>     <dbl>     <dbl>     <dbl>
## 1 (Intercept)   56.8      0.200      285.  0        
## 2 mo             0.590    0.0271      21.8 7.33e-104

A More Complex Model

Run a model with both mo and TMIN. Compare the RMSE value with the previous model

Solution

mo_fit <- linear_model %>% 
  last_fit(TMAX ~  mo + TMIN, split = OAW_split)

mo_df <- mo_fit %>% 
  collect_predictions()

mo_df |>
  rmse(TMAX,.pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        9.86
mo_df |>
  rsq(TMAX,.pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rsq     standard       0.479
mo_fit |> extract_workflow() |> tidy()
## # A tibble: 3 × 5
##   term        estimate std.error statistic p.value
##   <chr>          <dbl>     <dbl>     <dbl>   <dbl>
## 1 (Intercept)  19.4      0.309       62.8   0     
## 2 mo            0.0423   0.0202       2.10  0.0361
## 3 TMIN          1.03     0.00749    137.    0

Categorical mo

The previous model considered mo as a quantitative variable. If we make it categorical, we allow December and January to be similar.

Solution

mo_cat_fit <- linear_model %>% 
  last_fit(TMAX ~  factor(mo) + TMIN, split = OAW_split)

mo_cat_df <- mo_cat_fit %>% 
  collect_predictions()

mo_cat_df |>
  rmse(TMAX,.pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        6.95
mo_cat_df |>
  rsq(TMAX,.pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rsq     standard       0.741
mo_cat_fit |> extract_workflow() |> tidy()
## # A tibble: 13 × 5
##    term           estimate std.error statistic   p.value
##    <chr>             <dbl>     <dbl>     <dbl>     <dbl>
##  1 (Intercept)      35.4     0.290      122.   0        
##  2 `factor(mo)2`     3.79    0.240       15.8  8.74e- 56
##  3 `factor(mo)3`     7.81    0.234       33.3  2.16e-237
##  4 `factor(mo)4`    12.6     0.241       52.4  0        
##  5 `factor(mo)5`    18.0     0.247       72.9  0        
##  6 `factor(mo)6`    21.5     0.260       82.7  0        
##  7 `factor(mo)7`    27.3     0.270      101.   0        
##  8 `factor(mo)8`    27.3     0.270      101.   0        
##  9 `factor(mo)9`    22.6     0.259       87.2  0        
## 10 `factor(mo)10`   13.1     0.244       53.6  0        
## 11 `factor(mo)11`    4.47    0.240       18.6  6.65e- 77
## 12 `factor(mo)12`   -0.272   0.236       -1.15 2.49e-  1
## 13 TMIN              0.299   0.00744     40.2  0