Tidymodels Regression

Harold Nelson

2023-11-07

Setup

Load tidyverse and tidymodels Load the dataframe OAW2309.

Solution

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.1     ✔ purrr   1.0.1
## ✔ tibble  3.2.1     ✔ dplyr   1.1.1
## ✔ tidyr   1.3.0     ✔ stringr 1.5.0
## ✔ readr   2.1.2     ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.0.0 ──
## ✔ broom        1.0.1     ✔ rsample      1.1.1
## ✔ dials        1.2.0     ✔ tune         1.1.1
## ✔ infer        1.0.3     ✔ workflows    1.1.3
## ✔ modeldata    1.1.0     ✔ workflowsets 1.0.1
## ✔ parsnip      1.1.0     ✔ yardstick    1.2.0
## ✔ recipes      1.0.5     
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Search for functions across packages at https://www.tidymodels.org/find/
load("OAW2309.Rdata")

Split the data

Put 70% of the data in training. Set the strata to TMAX. Create OAW_training and OAW_test.

Solution

set.seed(123)
OAW_split <- initial_split(OAW2309, 
                  prop = .7, 
                  strata = TMAX)

OAW_training <- OAW_split %>%
 training()

OAW_test <- OAW_split %>%
 testing()

Verify the Split

Make sure that there are about 3 times as many observations in train as in test.

Examine the distributions of TMAX in the training and test dataframes.

Solution

nrow(OAW_training)
## [1] 21050
nrow(OAW_test)
## [1] 9025
summary(OAW_training$TMAX)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   18.00   50.00   59.00   60.61   71.00  105.00
summary(OAW_test$TMAX)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   19.00   50.00   59.00   60.62   71.00  110.00

Create a linear regression model.

Call it linear_model.

Solution

# Initialize a linear regression object, linear_model
linear_model <- linear_reg() %>% 
  # Set the model engine
  set_engine("lm") %>% 
  # Set the model mode
  set_mode("regression")

Train

Fit a model, lm_fit, to predict TMAX using TMIN as the only predictor. Use the training data only.

Solution

lm_fit <- linear_model %>% 
  fit(TMAX ~ TMIN,
      data = OAW_training)

Examine the Model

Print lm_fit to view model information and use the tidy() function.

Solution

lm_fit
## parsnip model object
## 
## 
## Call:
## stats::lm(formula = TMAX ~ TMIN, data = data)
## 
## Coefficients:
## (Intercept)         TMIN  
##      19.809        1.024
tidy(lm_fit)
## # A tibble: 2 × 5
##   term        estimate std.error statistic p.value
##   <chr>          <dbl>     <dbl>     <dbl>   <dbl>
## 1 (Intercept)    19.8    0.301        65.9       0
## 2 TMIN            1.02   0.00735     139.        0

Predictions

Create TMAX_predictions using the predict() function on lm_fit. Use head() and str() to understand the file.

Solution

TMAX_predictions <- predict(lm_fit,
                        new_data = OAW_test)
head(TMAX_predictions)
## # A tibble: 6 × 1
##   .pred
##   <dbl>
## 1  59.8
## 2  66.9
## 3  63.9
## 4  69.0
## 5  60.8
## 6  60.8
str(TMAX_predictions)
## tibble [9,025 × 1] (S3: tbl_df/tbl/data.frame)
##  $ .pred: num [1:9025] 59.8 66.9 63.9 69 60.8 ...

Compare

Use summary() to compare the distributions of the actual and predicted values.

Solution

summary(TMAX_predictions)
##      .pred      
##  Min.   :16.74  
##  1st Qu.:53.62  
##  Median :60.79  
##  Mean   :60.71  
##  3rd Qu.:67.96  
##  Max.   :86.40
summary(OAW_test$TMAX)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   19.00   50.00   59.00   60.62   71.00  110.00

Build the Results

Create a dataframe TMAX_results containing TMAX, TMIN and .pred.

Solution

TMAX_results <- OAW_test %>% 
  select(TMAX,TMIN) %>% 
  bind_cols(TMAX_predictions)

Evaluate the Model

Use the rmse() and rsq() functions.

Solution

TMAX_results %>% 
  rmse(TMAX, .pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        9.89
# Calculate the R squared metric
TMAX_results %>% 
  rsq(TMAX, .pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rsq     standard       0.481

Create an R squared plot of model performance

Solution

ggplot(TMAX_results, aes(x = TMAX, y = .pred)) +
  geom_point(alpha = 0.1,size = .1) + 
  geom_abline(color = 'blue', linetype = 2) +
  coord_obs_pred() +
  labs(x = 'Actual TMAX', y = 'Predicted TMAX')

## Recreate

Recreate this model using last_fit(). Print the values of RSQ and RMSE. Prefix the fit and df objects with base_.

Solution

base_fit <- linear_model %>% 
  last_fit(TMAX ~ TMIN, split = OAW_split)

base_df <- base_fit %>% 
  collect_predictions()

base_df |>
  rmse(TMAX,.pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        9.89
base_df |>
  rsq(TMAX,.pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rsq     standard       0.481

An Alternative Model

Copy the code from the previous chunk and create an mo_ model, where the only predictor is the factor mo.

How do the performance metrics compare?

mo_fit <- linear_model %>% 
  last_fit(TMAX ~  mo, split = OAW_split)

mo_df <- mo_fit %>% 
  collect_predictions()

mo_df |>
  rmse(TMAX,.pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        7.20
mo_df |>
  rsq(TMAX,.pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rsq     standard       0.726

Coefficients

The following procedure gives a table with the coefficients of the model. You can’t directly tidy a fit object.

mo_fit |> extract_workflow() |> tidy()
## # A tibble: 12 × 5
##    term        estimate std.error statistic   p.value
##    <chr>          <dbl>     <dbl>     <dbl>     <dbl>
##  1 (Intercept)  44.9        0.173   259.    0        
##  2 mo2           4.02       0.252    16.0   4.75e- 57
##  3 mo3           8.60       0.244    35.3   6.42e-265
##  4 mo4          14.1        0.247    57.1   0        
##  5 mo5          21.3        0.245    86.8   0        
##  6 mo6          26.2        0.246   107.    0        
##  7 mo7          32.5        0.246   132.    0        
##  8 mo8          32.7        0.244   134.    0        
##  9 mo9          26.8        0.247   109.    0        
## 10 mo10         15.6        0.245    63.5   0        
## 11 mo11          5.68       0.246    23.0   6.12e-116
## 12 mo12          0.0435     0.246     0.177 8.59e-  1