Tidymodels Regression

Harold Nelson

2024-04-06

Setup

Load tidyverse and tidymodels Load the dataframe OAW2309.

Solution

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom        1.0.5      ✔ rsample      1.2.1 
## ✔ dials        1.2.1      ✔ tune         1.2.0 
## ✔ infer        1.0.7      ✔ workflows    1.1.4 
## ✔ modeldata    1.3.0      ✔ workflowsets 1.1.0 
## ✔ parsnip      1.2.1      ✔ yardstick    1.3.1 
## ✔ recipes      1.0.10     
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Use tidymodels_prefer() to resolve common conflicts.
load("OAW2309.Rdata")

Split the data

Put 70% of the data in training. Set the strata to TMAX. Create OAW_training and OAW_test.

Solution

set.seed(123)
OAW_split <- initial_split(OAW2309, 
                  prop = .7, 
                  strata = TMAX)

OAW_training <- OAW_split %>%
 training()

OAW_test <- OAW_split %>%
 testing()

Verify the Split

Make sure that there are about 70% of the data in train and about 30% in test.

Also Examine the distributions of TMAX in the training and test dataframes. They should be similar since we specified that the strata would be TMAX.

Solution

.7 * nrow(OAW2309)
## [1] 21052.5
nrow(OAW_training)
## [1] 21050
print(" ")
## [1] " "
.3 * nrow(OAW2309)
## [1] 9022.5
nrow(OAW_test)
## [1] 9025
print(" ")
## [1] " "
summary(OAW_training$TMAX)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   18.00   50.00   59.00   60.61   71.00  105.00
print(" ")
## [1] " "
summary(OAW_test$TMAX)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   19.00   50.00   59.00   60.62   71.00  110.00

Create a linear regression model.

Call it linear_model.

Solution

# Initialize a linear regression object, linear_model
linear_model <- linear_reg() %>% 
  # Set the model engine
  set_engine("lm") %>% 
  # Set the model mode
  set_mode("regression")

Alternatives

lm1 = linear_reg(mode = "regression", engine = "lm")
lm1
## Linear Regression Model Specification (regression)
## 
## Computational engine: lm
lm2 = linear_reg()
lm2
## Linear Regression Model Specification (regression)
## 
## Computational engine: lm

Train

Fit a model, lm_fit, to predict TMAX using TMIN as the only predictor. Use the training data only.

Solution

lm_fit <- linear_model %>% 
  fit(TMAX ~ TMIN,
      data = OAW_training)

Examine the Model

Print lm_fit to view model information and use the tidy() function.

Solution

lm_fit
## parsnip model object
## 
## 
## Call:
## stats::lm(formula = TMAX ~ TMIN, data = data)
## 
## Coefficients:
## (Intercept)         TMIN  
##      19.809        1.024
tidy(lm_fit)
## # A tibble: 2 × 5
##   term        estimate std.error statistic p.value
##   <chr>          <dbl>     <dbl>     <dbl>   <dbl>
## 1 (Intercept)    19.8    0.301        65.9       0
## 2 TMIN            1.02   0.00735     139.        0

Predictions

Create TMAX_predictions for the test data using the predict() function on lm_fit. Use head() and str() to understand the file.

Solution

TMAX_predictions <- predict(lm_fit,
                        new_data = OAW_test)
head(TMAX_predictions)
## # A tibble: 6 × 1
##   .pred
##   <dbl>
## 1  59.8
## 2  66.9
## 3  63.9
## 4  69.0
## 5  60.8
## 6  60.8
str(TMAX_predictions)
## tibble [9,025 × 1] (S3: tbl_df/tbl/data.frame)
##  $ .pred: num [1:9025] 59.8 66.9 63.9 69 60.8 ...

Compare

Use summary() to compare the distributions of the actual and predicted values.

Solution

summary(TMAX_predictions)
##      .pred      
##  Min.   :16.74  
##  1st Qu.:53.62  
##  Median :60.79  
##  Mean   :60.71  
##  3rd Qu.:67.96  
##  Max.   :86.40
print(" ")
## [1] " "
summary(OAW_test$TMAX)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   19.00   50.00   59.00   60.62   71.00  110.00

Build the Results

Create a dataframe TMAX_results containing TMAX, TMIN and .pred.

Solution

TMAX_results <- OAW_test %>% 
  select(TMAX,TMIN) %>% 
  bind_cols(TMAX_predictions)

Evaluate the Model

Use the rmse() and rsq() functions.

Solution

TMAX_results %>% 
  rmse(TMAX, .pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        9.89
# Calculate the R squared metric
TMAX_results %>% 
  rsq(TMAX, .pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rsq     standard       0.481

Create an R squared plot of model performance

Solution

ggplot(TMAX_results, aes(x = TMAX, y = .pred)) +
  geom_point(alpha = 0.1,size = .1) + 
  geom_abline(color = 'blue', linetype = 2) +
  coord_obs_pred() +
  labs(x = 'Actual TMAX', y = 'Predicted TMAX')

Recreate

Recreate this model using last_fit(). Print the values of RSQ and RMSE. Prefix the fit and df objects with base_.

Solution

base_fit <- linear_model %>% 
  last_fit(TMAX ~ TMIN, split = OAW_split)

base_df <- base_fit %>% 
  collect_predictions()

base_df |>
  rmse(TMAX,.pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        9.89
base_df |>
  rsq(TMAX,.pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rsq     standard       0.481

An Alternative Model

Copy the code from the previous chunk and create an mo_ model, where the only predictor is the factor mo.

How do the performance metrics compare?

Solution

mo_fit <- linear_model %>% 
  last_fit(TMAX ~  mo, split = OAW_split)

mo_df <- mo_fit %>% 
  collect_predictions()

mo_df |>
  rmse(TMAX,.pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        7.20
mo_df |>
  rsq(TMAX,.pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rsq     standard       0.726

Coefficients

The following procedure gives a table with the coefficients of the model. You can’t directly tidy a fit object.

Solution

mo_fit |> extract_workflow() |> tidy() 
## # A tibble: 12 × 5
##    term        estimate std.error statistic   p.value
##    <chr>          <dbl>     <dbl>     <dbl>     <dbl>
##  1 (Intercept)  44.9        0.173   259.    0        
##  2 mo2           4.02       0.252    16.0   4.75e- 57
##  3 mo3           8.60       0.244    35.3   6.42e-265
##  4 mo4          14.1        0.247    57.1   0        
##  5 mo5          21.3        0.245    86.8   0        
##  6 mo6          26.2        0.246   107.    0        
##  7 mo7          32.5        0.246   132.    0        
##  8 mo8          32.7        0.244   134.    0        
##  9 mo9          26.8        0.247   109.    0        
## 10 mo10         15.6        0.245    63.5   0        
## 11 mo11          5.68       0.246    23.0   6.12e-116
## 12 mo12          0.0435     0.246     0.177 8.59e-  1

A More Complex Model

Run a model with both mo and TMIN. Compare the RMSE value with the previous model

Solution

mo_fit <- linear_model %>% 
  last_fit(TMAX ~  mo + TMIN, split = OAW_split)

mo_df <- mo_fit %>% 
  collect_predictions()

mo_df |>
  rmse(TMAX,.pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        6.95
mo_df |>
  rsq(TMAX,.pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rsq     standard       0.744
mo_fit |> extract_workflow() |> tidy()
## # A tibble: 13 × 5
##    term        estimate std.error statistic   p.value
##    <chr>          <dbl>     <dbl>     <dbl>     <dbl>
##  1 (Intercept)   35.4     0.289      123.   0        
##  2 mo2            3.77    0.243       15.5  3.35e- 54
##  3 mo3            7.95    0.236       33.8  2.16e-243
##  4 mo4           12.6     0.241       52.3  0        
##  5 mo5           18.2     0.249       73.1  0        
##  6 mo6           21.7     0.262       82.7  0        
##  7 mo7           27.1     0.272       99.7  0        
##  8 mo8           27.3     0.271      101.   0        
##  9 mo9           22.6     0.260       87.0  0        
## 10 mo10          13.1     0.245       53.4  0        
## 11 mo11           4.60    0.239       19.2  1.30e- 81
## 12 mo12          -0.270   0.237       -1.14 2.54e-  1
## 13 TMIN           0.298   0.00744     40.1  0