---
title: "Project 1 - Rowan O'Hara"
output:
flexdashboard::flex_dashboard:
orientation: columns
vertical_layout: fill
theme: lumen
logo: ../img/pine-beetle-logo.png
source_code: embed
---
```{r setup, include=FALSE}
library(tidyverse)
library(flexdashboard)
library(readxl)
library(tidyverse)
library(broom)
library(car)
library(ggfortify)
library(tidymodels)
library(vip)
library(performance)
library(glmnet)
library(GGally)
library(ggpmisc)
library(caret)
data_beetles <- read_excel("../data/Data_1993.xlsx")
```
Introduction
================================================================================
Column {data-width=600, .tabset .tabset-fade}
--------------------------------------------------------------------------------
### Infestation
```{r dead_map}
ggplot(data = data_beetles,aes(x = Easting, y = Northing)) +
geom_point(aes(color =factor(Response), alpha =factor(Response))) +
scale_alpha_discrete(range =c(0.5, 1), guide= "none") +
scale_color_manual("",values=c("#778A35", "#FF5C5C"),labels =c("Alive", "JPB-attacked")) +
theme_minimal(base_size = 14) +
xlab("Easting Coordinate") +
ylab("Northing Coordinate") +
theme(legend.position =c(0,1), legend.text=element_text(size=15))
```
### Correlation Matrix
```{r correlation_matrix}
ggpairs(data_beetles[, c("DeadDist", "TreeDiam", "SDI_20th")],
diag = list(continuous = "barDiag", discrete = "barDiag", na = "naDiag")) +
theme_minimal(base_size = 14)
```
Column {data-width=400, .tabset .tabset-fade}
-------------------------------------------------------------------------------
### Data Dictionary
This dashboard showcases the data from the Lake Tahoe Basin Jeffrey pine beetle outbreak from 2003. Using this data, the minimum linear distance to the nearest brood tree (`DeadDist`) is predicted. The variables of interest--the *predictors* chosen for these models--are `TreeDiam` and `SDI_20th`.
| variable | description |
|:--------------|:-----------------------------------------------------------------------------|
| `DeadDist` | Minimum linear distance to nearest brood |
| `TreeDiam` | Tree diameter/size |
| `SDI_20th` | Stand Density Index @ 1/20th-acre neighborhood surrounding response tree |
### Base LM Coefficients
`DeadDist` ~ `TreeDiam` + `SDI_20th`
```{r fitted_coefficients}
lm_fit2 <- lm(DeadDist ~ TreeDiam + SDI_20th, data = data_beetles)
lm_fit2 %>%
tidy() %>%
knitr::kable()
car::vif(lm_fit2) %>%
knitr::kable(col.names = "variance inflation factor")
```
*The variance inflation factors are much lower after removing the `BA_20th` variable.*
Linear Regression
================================================================================
Column {data-width=600, .tabset .tabset-fade}
-----------------------------------------------------------------------
### Model
```{r lin-reg-model}
# This source was very helpful for this portion as well as the tidymodels recode lecture
# https://www.gmudatamining.com/lesson-10-r-tutorial.html
set.seed(1234)
pine_split <- initial_split(data_beetles, prop = 0.7)
pine_train <- pine_split %>%
training()
pine_test <- pine_split %>%
testing()
pine_rec <- data_beetles %>%
recipe(DeadDist ~ TreeDiam + SDI_20th) %>%
step_sqrt(all_outcomes()) %>%
step_corr(all_predictors()) %>%
prep()
lm_mod <-
linear_reg() %>%
set_engine("lm") %>%
set_mode("regression")
pine_wflow <-
workflow() %>%
add_model(lm_mod) %>%
add_recipe(pine_rec)
pine_fit <-
pine_wflow %>%
last_fit(split = pine_split)
pine_results <- pine_fit %>%
collect_predictions()
ggplot(data = pine_results,
mapping = aes(x = .pred, y = DeadDist)) +
geom_point(color = '#778A35', alpha = 0.25) +
geom_abline(intercept = 0, slope = 1, color = 'black') +
stat_poly_eq(formula = y ~ x,
eq.with.lhs = "italic(hat(y))~`=`~",
aes(label = paste(..eq.label.., ..rr.label.., sep = "*plain(\",\")~")),
parse = TRUE) + # https://stackoverflow.com/questions/7549694/add-regression-line-equation-and-r2-on-graph
labs(title = 'Multi-Linear Regression Results',
x = 'Predicted DeadDist',
y = 'Actual DeadDist') +
theme_minimal(base_size = 14)
```
### Predictor Importance
```{r predictors}
pine_fit %>%
extract_fit_parsnip() %>%
vip::vip(aesthetics = list(
size = 4,
fill = "#778A35"
)
) +
theme_minimal(base_size = 14)
```
### Check Model
```{r check-linear-model}
pine_fit %>%
extract_fit_parsnip() %>%
performance::check_model()
# Could not get this to work
#autoplot(pine_fit, which = 1, label.size = 2) +
# theme_bw()
```
Column {data-width=400, .tabset .tabset-fade}
-----------------------------------------------------------------------
### Fitted Coefficients
`DeadDist` ~ `TreeDiam` + `SDI_20th`
```{r fitted-linear-model}
pine_fit %>% # fitted coefficients
extract_fit_parsnip() %>%
tidy() %>%
knitr::kable()
```
### Evaluation
```{r evaluate-linear}
pine_fit2 <- pine_wflow %>%
fit(data = pine_train)
pine_fit2_extract <- pine_fit %>%
extract_fit_parsnip()
pine_test_results <- predict(pine_fit2_extract, new_data = pine_test) %>%
bind_cols(pine_test)
pine_fit2 %>%
glance() %>%
knitr::kable()
yardstick::rmse(pine_test_results,
truth = DeadDist,
estimate = .pred) %>%
knitr::kable()
yardstick::rsq(pine_test_results,
truth = DeadDist,
estimate = .pred) %>%
knitr::kable()
```
With a high `rmse` value and a low `rsq` value, this model is not a good fit.
Ridge Regression
================================================================================
Column {data-width=600, .tabset .tabset-fade}
-----------------------------------------------------------------------
### Model
```{r ridge-reg-model}
# picking Dr. Smirnova's best lambda estimate; can estimate with tune() - see below
ridge_mod <-
linear_reg(mixture = 0, penalty = 0.1629751) %>% #validation sample or resampling can estimate this
set_engine("glmnet")
pine_ridge_rec <- pine_train %>%
recipe(DeadDist ~ TreeDiam + SDI_20th) %>%
step_sqrt(all_outcomes()) %>%
step_corr(all_predictors()) %>%
step_normalize(all_numeric(), -all_outcomes()) %>%
step_zv(all_numeric(), -all_outcomes()) %>%
prep()
pine_ridge_wflow <-
workflow() %>%
add_model(ridge_mod) %>%
add_recipe(pine_ridge_rec)
pine_ridge_fit <-
pine_ridge_wflow %>%
last_fit(split = pine_split)
pine_ridge_results <- pine_ridge_fit %>%
collect_predictions()
ggplot(data = pine_ridge_results,
mapping = aes(x = .pred, y = DeadDist)) +
geom_point(color = '#778A35', alpha = 0.25) +
geom_abline(intercept = 0, slope = 1, color = 'black') +
stat_poly_eq(formula = y ~ x,
eq.with.lhs = "italic(hat(y))~`=`~",
aes(label = paste(..eq.label.., ..rr.label.., sep = "*plain(\",\")~")),
parse = TRUE) + # https://stackoverflow.com/questions/7549694/add-regression-line-equation-and-r2-on-graph
labs(title = 'Ridge Regression Results',
x = 'Predicted DeadDist',
y = 'Actual DeadDist') +
theme_minimal(base_size = 14)
```
### Predictor Importance
```{r ridge-predictors}
pine_ridge_fit %>%
extract_fit_parsnip() %>%
vip::vip(aesthetics = list(
size = 4,
fill = "#778A35"
)
) +
theme_minimal(base_size = 14)
```
Column {data-width=400, .tabset .tabset-fade}
-----------------------------------------------------------------------
### Fitted Coefficients
`DeadDist` ~ `TreeDiam` + `SDI_20th`
```{r fitted-ridge-model}
pine_ridge_fit %>% # fitted coefficients
extract_fit_parsnip() %>%
tidy() %>%
knitr::kable()
```
### Evaluation
```{r evaluate-ridge}
pine_ridge_fit2 <- pine_ridge_wflow %>%
fit(data = pine_train)
pine_ridge_fit2_extract <- pine_ridge_fit2 %>%
extract_fit_parsnip()
pine_ridge_test_results <- predict(pine_ridge_fit2_extract, new_data = pine_test) %>%
bind_cols(pine_test)
yardstick::rmse(pine_ridge_test_results,
truth = DeadDist,
estimate = .pred) %>%
knitr::kable()
yardstick::rsq(pine_ridge_test_results,
truth = DeadDist,
estimate = .pred) %>%
knitr::kable()
```
With a high `rmse` value and a low `rsq` value, this model is not a good fit.