Setup
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom 1.0.7 ✔ rsample 1.2.1
## ✔ dials 1.3.0 ✔ tune 1.2.1
## ✔ infer 1.0.7 ✔ workflows 1.1.4
## ✔ modeldata 1.4.0 ✔ workflowsets 1.1.0
## ✔ parsnip 1.2.1 ✔ yardstick 1.3.1
## ✔ recipes 1.0.10
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Use tidymodels_prefer() to resolve common conflicts.
Load OAW2409 dataframe
load("OAW2409.Rdata")
Split the data
OAW_split <- initial_split(OAW2409,
prop=.7,
strata= TMAX)
OAW_training <- OAW_split %>%
training()
OAW_test <- OAW_split %>%
testing()
Verify the split
.7 * nrow(OAW2409)
## [1] 21300.3
nrow(OAW_training)
## [1] 21298
print(" ")
## [1] " "
.3 * nrow(OAW2409)
## [1] 9128.7
nrow(OAW_test)
## [1] 9131
print(" ")
## [1] " "
summary(OAW_training$TMAX)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 18.00 50.00 59.00 60.63 71.00 105.00
print(" ")
## [1] " "
summary(OAW_test$TMAX)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 19.00 50.00 59.00 60.65 71.00 110.00
Create Linear Regression Model
linear_model <- linear_reg() %>%
set_engine("lm") %>%
set_mode("regression")
Train the linear regression model
lm_fit <- linear_model %>%
fit(TMAX ~ TMIN,
data = OAW_training)
Examine the Model
lm_fit
## parsnip model object
##
##
## Call:
## stats::lm(formula = TMAX ~ TMIN, data = data)
##
## Coefficients:
## (Intercept) TMIN
## 19.800 1.025
tidy(lm_fit)
## # A tibble: 2 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 19.8 0.300 66.0 0
## 2 TMIN 1.02 0.00733 140. 0
Cast Predictions
TMAX_predictions <- predict(lm_fit,
new_data = OAW_test)
head(TMAX_predictions)
## # A tibble: 6 × 1
## .pred
## <dbl>
## 1 68.0
## 2 64.9
## 3 59.8
## 4 71.0
## 5 66.9
## 6 63.9
str(TMAX_predictions)
## tibble [9,131 × 1] (S3: tbl_df/tbl/data.frame)
## $ .pred: num [1:9131] 68 64.9 59.8 71 66.9 ...
Compare predictions to actual values
summary(TMAX_predictions)
## .pred
## Min. :16.73
## 1st Qu.:53.62
## Median :60.79
## Mean :60.75
## 3rd Qu.:67.96
## Max. :86.41
print(" ")
## [1] " "
summary(OAW_test$TMAX)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 19.00 50.00 59.00 60.65 71.00 110.00
Build the results
TMAX_results <- OAW_test %>%
select(TMAX,TMIN) %>%
bind_cols(TMAX_predictions)
Evaluate the model
TMAX_results %>%
rmse(TMAX, .pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 9.88
TMAX_results %>%
rsq(TMAX, .pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rsq standard 0.484
Recreate this with last_fit()
base_fit <- linear_model %>%
last_fit(TMAX ~ TMIN, split = OAW_split)
base_df <- base_fit %>%
collect_predictions()
base_df |>
rmse(TMAX,.pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 9.88
base_df |>
rsq(TMAX,.pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rsq standard 0.484
Alternative Model with only mo as a predictor
mo_fit <- linear_model %>%
last_fit(TMAX ~ mo, split = OAW_split)
mo_df <- mo_fit %>%
collect_predictions()
mo_df |>
rmse(TMAX,.pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 13.6
mo_df |>
rsq(TMAX,.pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rsq standard 0.0247
Coefficients
mo_fit %>%
extract_workflow() %>%
tidy()
## # A tibble: 2 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 57.0 0.199 286. 0
## 2 mo 0.559 0.0271 20.7 4.82e-94
More Complex Model with both mo and TMIN as predictors.
mo_fit <- linear_model %>%
last_fit(TMAX ~ mo + TMIN, split = OAW_split)
mo_df <- mo_fit %>%
collect_predictions()
mo_df |>
rmse(TMAX,.pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 9.88
mo_df |>
rsq(TMAX,.pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rsq standard 0.484
mo_fit |> extract_workflow() |> tidy()
## # A tibble: 3 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 19.7 0.309 63.6 0
## 2 mo 0.0394 0.0201 1.96 0.0499
## 3 TMIN 1.02 0.00747 137. 0
Categorical mo
mo_cat_fit <- linear_model %>%
last_fit(TMAX ~ factor(mo) + TMIN, split = OAW_split)
mo_cat_df <- mo_cat_fit %>%
collect_predictions()
mo_cat_df %>%
rmse(TMAX,.pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 7.03
mo_cat_df %>%
rsq(TMAX,.pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rsq standard 0.739
mo_cat_fit %>%
extract_workflow() %>%
tidy()
## # A tibble: 13 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 35.6 0.287 124. 0
## 2 `factor(mo)2` 3.73 0.239 15.6 1.59e- 54
## 3 `factor(mo)3` 7.88 0.234 33.7 3.11e-242
## 4 `factor(mo)4` 12.6 0.238 52.9 0
## 5 `factor(mo)5` 18.1 0.244 74.1 0
## 6 `factor(mo)6` 21.6 0.258 83.5 0
## 7 `factor(mo)7` 27.3 0.267 102. 0
## 8 `factor(mo)8` 27.1 0.269 101. 0
## 9 `factor(mo)9` 22.6 0.256 88.3 0
## 10 `factor(mo)10` 13.0 0.241 54.0 0
## 11 `factor(mo)11` 4.40 0.237 18.6 1.58e- 76
## 12 `factor(mo)12` -0.395 0.234 -1.69 9.12e- 2
## 13 TMIN 0.297 0.00738 40.3 0