Setup

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom        1.0.7      ✔ rsample      1.2.1 
## ✔ dials        1.3.0      ✔ tune         1.2.1 
## ✔ infer        1.0.7      ✔ workflows    1.1.4 
## ✔ modeldata    1.4.0      ✔ workflowsets 1.1.0 
## ✔ parsnip      1.2.1      ✔ yardstick    1.3.1 
## ✔ recipes      1.0.10     
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Use tidymodels_prefer() to resolve common conflicts.

Load OAW2409 dataframe

load("OAW2409.Rdata")

Split the data

OAW_split <- initial_split(OAW2409,
                           prop=.7,
                           strata= TMAX)
OAW_training <- OAW_split %>%
  training()

OAW_test <- OAW_split %>%
  testing()

Verify the split

.7 * nrow(OAW2409)
## [1] 21300.3
nrow(OAW_training)
## [1] 21298
print(" ")
## [1] " "
.3 * nrow(OAW2409)
## [1] 9128.7
nrow(OAW_test)
## [1] 9131
print(" ")
## [1] " "
summary(OAW_training$TMAX)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   18.00   50.00   59.00   60.63   71.00  105.00
print(" ")
## [1] " "
summary(OAW_test$TMAX)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   19.00   50.00   59.00   60.65   71.00  110.00

Create Linear Regression Model

linear_model <- linear_reg() %>% 
  set_engine("lm") %>% 
  set_mode("regression")

Train the linear regression model

lm_fit <- linear_model %>% 
  fit(TMAX ~ TMIN,
      data = OAW_training)

Examine the Model

lm_fit
## parsnip model object
## 
## 
## Call:
## stats::lm(formula = TMAX ~ TMIN, data = data)
## 
## Coefficients:
## (Intercept)         TMIN  
##      19.800        1.025
tidy(lm_fit)
## # A tibble: 2 × 5
##   term        estimate std.error statistic p.value
##   <chr>          <dbl>     <dbl>     <dbl>   <dbl>
## 1 (Intercept)    19.8    0.300        66.0       0
## 2 TMIN            1.02   0.00733     140.        0

Cast Predictions

TMAX_predictions <- predict(lm_fit,
                        new_data = OAW_test)
head(TMAX_predictions)
## # A tibble: 6 × 1
##   .pred
##   <dbl>
## 1  68.0
## 2  64.9
## 3  59.8
## 4  71.0
## 5  66.9
## 6  63.9
str(TMAX_predictions)
## tibble [9,131 × 1] (S3: tbl_df/tbl/data.frame)
##  $ .pred: num [1:9131] 68 64.9 59.8 71 66.9 ...

Compare predictions to actual values

summary(TMAX_predictions)
##      .pred      
##  Min.   :16.73  
##  1st Qu.:53.62  
##  Median :60.79  
##  Mean   :60.75  
##  3rd Qu.:67.96  
##  Max.   :86.41
print(" ")
## [1] " "
summary(OAW_test$TMAX)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   19.00   50.00   59.00   60.65   71.00  110.00

Build the results

TMAX_results <- OAW_test %>% 
  select(TMAX,TMIN) %>% 
  bind_cols(TMAX_predictions)

Evaluate the model

TMAX_results %>% 
  rmse(TMAX, .pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        9.88
TMAX_results %>% 
  rsq(TMAX, .pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rsq     standard       0.484

Create R-squared plot of model performance

ggplot(TMAX_results, aes(x = TMAX, y = .pred)) +
  geom_point(alpha = 0.1,size = .1) + 
  geom_abline(color = 'blue', linetype = 2) +
  coord_obs_pred() +
  labs(x = 'Actual TMAX', y = 'Predicted TMAX')

Recreate this with last_fit()

base_fit <- linear_model %>% 
  last_fit(TMAX ~ TMIN, split = OAW_split)

base_df <- base_fit %>% 
  collect_predictions()

base_df |>
  rmse(TMAX,.pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        9.88
base_df |>
  rsq(TMAX,.pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rsq     standard       0.484

Alternative Model with only mo as a predictor

mo_fit <- linear_model %>% 
  last_fit(TMAX ~  mo, split = OAW_split)

mo_df <- mo_fit %>% 
  collect_predictions()

mo_df |>
  rmse(TMAX,.pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        13.6
mo_df |>
  rsq(TMAX,.pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rsq     standard      0.0247

Coefficients

mo_fit %>% 
  extract_workflow() %>%
  tidy() 
## # A tibble: 2 × 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)   57.0      0.199      286.  0       
## 2 mo             0.559    0.0271      20.7 4.82e-94

More Complex Model with both mo and TMIN as predictors.

mo_fit <- linear_model %>% 
  last_fit(TMAX ~  mo + TMIN, split = OAW_split)

mo_df <- mo_fit %>% 
  collect_predictions()

mo_df |>
  rmse(TMAX,.pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        9.88
mo_df |>
  rsq(TMAX,.pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rsq     standard       0.484
mo_fit |> extract_workflow() |> tidy()
## # A tibble: 3 × 5
##   term        estimate std.error statistic p.value
##   <chr>          <dbl>     <dbl>     <dbl>   <dbl>
## 1 (Intercept)  19.7      0.309       63.6   0     
## 2 mo            0.0394   0.0201       1.96  0.0499
## 3 TMIN          1.02     0.00747    137.    0

Categorical mo

mo_cat_fit <- linear_model %>% 
  last_fit(TMAX ~  factor(mo) + TMIN, split = OAW_split)

mo_cat_df <- mo_cat_fit %>% 
  collect_predictions()

mo_cat_df %>%
  rmse(TMAX,.pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        7.03
mo_cat_df %>%
  rsq(TMAX,.pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rsq     standard       0.739
mo_cat_fit %>%
  extract_workflow() %>%
  tidy()
## # A tibble: 13 × 5
##    term           estimate std.error statistic   p.value
##    <chr>             <dbl>     <dbl>     <dbl>     <dbl>
##  1 (Intercept)      35.6     0.287      124.   0        
##  2 `factor(mo)2`     3.73    0.239       15.6  1.59e- 54
##  3 `factor(mo)3`     7.88    0.234       33.7  3.11e-242
##  4 `factor(mo)4`    12.6     0.238       52.9  0        
##  5 `factor(mo)5`    18.1     0.244       74.1  0        
##  6 `factor(mo)6`    21.6     0.258       83.5  0        
##  7 `factor(mo)7`    27.3     0.267      102.   0        
##  8 `factor(mo)8`    27.1     0.269      101.   0        
##  9 `factor(mo)9`    22.6     0.256       88.3  0        
## 10 `factor(mo)10`   13.0     0.241       54.0  0        
## 11 `factor(mo)11`    4.40    0.237       18.6  1.58e- 76
## 12 `factor(mo)12`   -0.395   0.234       -1.69 9.12e-  2
## 13 TMIN              0.297   0.00738     40.3  0