A6

Setup

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(tidymodels)

## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom        1.0.7      ✔ rsample      1.2.1 
## ✔ dials        1.3.0      ✔ tune         1.2.1 
## ✔ infer        1.0.7      ✔ workflows    1.1.4 
## ✔ modeldata    1.4.0      ✔ workflowsets 1.1.0 
## ✔ parsnip      1.2.1      ✔ yardstick    1.3.1 
## ✔ recipes      1.0.10     
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Use tidymodels_prefer() to resolve common conflicts.

Load OAW2409 dataframe

load("OAW2409.Rdata")

Split the data

OAW_split <- initial_split(OAW2409,
                           prop=.7,
                           strata= TMAX)
OAW_training <- OAW_split %>%
  training()

OAW_test <- OAW_split %>%
  testing()

Verify the split

.7 * nrow(OAW2409)

## [1] 21300.3

nrow(OAW_training)

## [1] 21298

print(" ")

## [1] " "

.3 * nrow(OAW2409)

## [1] 9128.7

nrow(OAW_test)

## [1] 9131

print(" ")

## [1] " "

summary(OAW_training$TMAX)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   18.00   50.00   59.00   60.63   71.00  105.00

print(" ")

## [1] " "

summary(OAW_test$TMAX)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   19.00   50.00   59.00   60.65   71.00  110.00

Create Linear Regression Model

linear_model <- linear_reg() %>% 
  set_engine("lm") %>% 
  set_mode("regression")

Train the linear regression model

lm_fit <- linear_model %>% 
  fit(TMAX ~ TMIN,
      data = OAW_training)

Examine the Model

lm_fit

## parsnip model object
## 
## 
## Call:
## stats::lm(formula = TMAX ~ TMIN, data = data)
## 
## Coefficients:
## (Intercept)         TMIN  
##      19.800        1.025

tidy(lm_fit)

## # A tibble: 2 × 5
##   term        estimate std.error statistic p.value
##   <chr>          <dbl>     <dbl>     <dbl>   <dbl>
## 1 (Intercept)    19.8    0.300        66.0       0
## 2 TMIN            1.02   0.00733     140.        0

Cast Predictions

TMAX_predictions <- predict(lm_fit,
                        new_data = OAW_test)
head(TMAX_predictions)

## # A tibble: 6 × 1
##   .pred
##   <dbl>
## 1  68.0
## 2  64.9
## 3  59.8
## 4  71.0
## 5  66.9
## 6  63.9

str(TMAX_predictions)

## tibble [9,131 × 1] (S3: tbl_df/tbl/data.frame)
##  $ .pred: num [1:9131] 68 64.9 59.8 71 66.9 ...

Compare predictions to actual values

summary(TMAX_predictions)

##      .pred      
##  Min.   :16.73  
##  1st Qu.:53.62  
##  Median :60.79  
##  Mean   :60.75  
##  3rd Qu.:67.96  
##  Max.   :86.41

print(" ")

## [1] " "

summary(OAW_test$TMAX)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   19.00   50.00   59.00   60.65   71.00  110.00

Build the results

TMAX_results <- OAW_test %>% 
  select(TMAX,TMIN) %>% 
  bind_cols(TMAX_predictions)

Evaluate the model

TMAX_results %>% 
  rmse(TMAX, .pred)

## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        9.88

TMAX_results %>% 
  rsq(TMAX, .pred)

## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rsq     standard       0.484

Create R-squared plot of model performance

ggplot(TMAX_results, aes(x = TMAX, y = .pred)) +
  geom_point(alpha = 0.1,size = .1) + 
  geom_abline(color = 'blue', linetype = 2) +
  coord_obs_pred() +
  labs(x = 'Actual TMAX', y = 'Predicted TMAX')

Recreate this with last_fit()

base_fit <- linear_model %>% 
  last_fit(TMAX ~ TMIN, split = OAW_split)

base_df <- base_fit %>% 
  collect_predictions()

base_df |>
  rmse(TMAX,.pred)

## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        9.88

base_df |>
  rsq(TMAX,.pred)

## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rsq     standard       0.484

Alternative Model with only mo as a predictor

mo_fit <- linear_model %>% 
  last_fit(TMAX ~  mo, split = OAW_split)

mo_df <- mo_fit %>% 
  collect_predictions()

mo_df |>
  rmse(TMAX,.pred)

## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        13.6

mo_df |>
  rsq(TMAX,.pred)

## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rsq     standard      0.0247

Coefficients

mo_fit %>% 
  extract_workflow() %>%
  tidy()

## # A tibble: 2 × 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)   57.0      0.199      286.  0       
## 2 mo             0.559    0.0271      20.7 4.82e-94

More Complex Model with both mo and TMIN as predictors.

mo_fit <- linear_model %>% 
  last_fit(TMAX ~  mo + TMIN, split = OAW_split)

mo_df <- mo_fit %>% 
  collect_predictions()

mo_df |>
  rmse(TMAX,.pred)

## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        9.88

mo_df |>
  rsq(TMAX,.pred)

## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rsq     standard       0.484

mo_fit |> extract_workflow() |> tidy()

## # A tibble: 3 × 5
##   term        estimate std.error statistic p.value
##   <chr>          <dbl>     <dbl>     <dbl>   <dbl>
## 1 (Intercept)  19.7      0.309       63.6   0     
## 2 mo            0.0394   0.0201       1.96  0.0499
## 3 TMIN          1.02     0.00747    137.    0

Categorical mo

mo_cat_fit <- linear_model %>% 
  last_fit(TMAX ~  factor(mo) + TMIN, split = OAW_split)

mo_cat_df <- mo_cat_fit %>% 
  collect_predictions()

mo_cat_df %>%
  rmse(TMAX,.pred)

## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        7.03

mo_cat_df %>%
  rsq(TMAX,.pred)

## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rsq     standard       0.739

mo_cat_fit %>%
  extract_workflow() %>%
  tidy()

## # A tibble: 13 × 5
##    term           estimate std.error statistic   p.value
##    <chr>             <dbl>     <dbl>     <dbl>     <dbl>
##  1 (Intercept)      35.6     0.287      124.   0        
##  2 `factor(mo)2`     3.73    0.239       15.6  1.59e- 54
##  3 `factor(mo)3`     7.88    0.234       33.7  3.11e-242
##  4 `factor(mo)4`    12.6     0.238       52.9  0        
##  5 `factor(mo)5`    18.1     0.244       74.1  0        
##  6 `factor(mo)6`    21.6     0.258       83.5  0        
##  7 `factor(mo)7`    27.3     0.267      102.   0        
##  8 `factor(mo)8`    27.1     0.269      101.   0        
##  9 `factor(mo)9`    22.6     0.256       88.3  0        
## 10 `factor(mo)10`   13.0     0.241       54.0  0        
## 11 `factor(mo)11`    4.40    0.237       18.6  1.58e- 76
## 12 `factor(mo)12`   -0.395   0.234       -1.69 9.12e-  2
## 13 TMIN              0.297   0.00738     40.3  0

A6

Hayden Jones

2024-11-12

Setup

Load OAW2409 dataframe

Split the data

Verify the split

Create Linear Regression Model

Train the linear regression model

Examine the Model

Cast Predictions

Compare predictions to actual values

Build the results

Evaluate the model

Create R-squared plot of model performance

Recreate this with last_fit()

Alternative Model with only mo as a predictor

Coefficients

More Complex Model with both mo and TMIN as predictors.

Categorical mo