---
title: "FlexDash Project"
author: "Bria Pierre"
date: "`r Sys.Date()`"
output:
  flexdashboard::flex_dashboard:
    logo: "favicon-32x32.png"
    orientation: rows
    theme: 
      version: 4
      bootswatch: minty
    source_code: embed
---
```{r setup, include=FALSE}
# Libraries
library(tidymodels)
library(vip)
library(tidyverse)
library(readxl)
library(broom)
library(car)
library(ggfortify)
library(performance)
library(knitr)
library(kableExtra)
library(dplyr)
# Read in the data (excel file)
pine_tbl <- read_excel("C:/Users/spier/OneDrive/HGEN612 Data Science II/hgen-612_temp/Data_1993.xlsx", sheet = 1)
```

# Setting the Stage: A Dive into the Data

Column {data-width=350}
------------------------------------------------------------------------
### Egan et al. (2016), “Multi-temporal ecological analysis of Jeffrey pine beetle outbreak dynamics within the Lake Tahoe Basin”: a 60-acre study area with 10,722 trees followed annually.
```{r JPB Attacked Trees}
# Map of every tree, colored by JPB attack status (Response: 0 = alive, 1 = attacked)
p.all <- ggplot(data = pine_tbl, aes(x = Easting, y = Northing)) +
  geom_point(aes(color = factor(Response), alpha = factor(Response))) +
  scale_alpha_discrete(range = c(0.5, 1), guide = "none") +
  scale_color_manual("", values = c("yellowgreen", "red"),
                     labels = c("Alive", "JPB-attacked")) +
  theme_bw() +
  xlab("UTM X") + ylab("UTM Y") +
  theme(legend.position = c(0, 1),
        legend.justification = c(0, 1),
        legend.text = element_text(size = 15))
p.all
```

Column {data-width=650}
------------------------------------------------------------------------
### The Troublemaker

```{r}
knitr::include_graphics("jbp.png")
```

# Pine Beetle Data
A glance at the pine beetle data!
```{r Glance at Pine Beetle Data}
# Drop the BA_20th through IND_BA_Infest_1.5 columns and keep the first 12 rows
subset_data <- pine_tbl %>%
  select(-(BA_20th:IND_BA_Infest_1.5)) %>%
  slice(1:12)

# Create a table
table <- kable(subset_data, format = "html") %>%
  kable_styling(full_width = FALSE)

# Display the table
table
```


# Script Ledger

<div style="text-align:center;">
  <img src="C:/Users/spier/OneDrive/HGEN612 Data Science II/hgen-612_temp/pine-beetle-variables.png" alt="Pine Beetle Variables" style="width:100%;"/>
</div>


# Relationship Between Variables
Column
------------------------------------
### Scatterplot Matrix
```{r}
library(GGally)
ggpairs(pine_tbl[, c("BA_Inf_20th", "DeadDist", "Neigh_1", "IND_BA_Infest_1", "Neigh_1.5")],
        diag = list(continuous = "barDiag", discrete = "barDiag", na = "naDiag"))
```

Column
-------------------------------------
### Variance Inflation Factors (VIF) of the predictors
```{r, VIF}
# Fit the full linear model
lm.fit <- lm(DeadDist ~ Neigh_1.5 + IND_BA_Infest_1 + BA_Inf_20th + Neigh_1, data = pine_tbl)
# Variance inflation factors; values above ~10 indicate strong multicollinearity
vif <- vif(lm.fit)
knitr::kable(tibble::enframe(vif, name = "Predictor", value = "VIF"), digits = 3)
```
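
Neigh_1 and Neigh_1.5 both have VIFs above 10, flagging them as nearly collinear (this is what `step_corr()` guards against in the recipes below). A minimal sketch, not part of the original analysis, of dropping `Neigh_1.5` and re-checking (shown but not evaluated):

```{r vif-reduced, echo=TRUE, eval=FALSE}
# Refit without Neigh_1.5 and confirm the remaining VIFs drop to acceptable levels
lm.reduced <- lm(DeadDist ~ IND_BA_Infest_1 + BA_Inf_20th + Neigh_1, data = pine_tbl)
vif(lm.reduced)
```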

### Residuals vs Fitted Values
```{r}
# Residuals vs fitted values for the full linear model
lm.fit <- lm(DeadDist ~ Neigh_1.5 + IND_BA_Infest_1 + BA_Inf_20th + Neigh_1, data = pine_tbl)
plot(lm.fit, 1)
```
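
The models below work with a square-root transform of `DeadDist` (via `step_sqrt()` in the recipe), so the same diagnostic can be repeated on that scale. A minimal sketch, not part of the original analysis (shown but not evaluated):

```{r resid-sqrt, echo=TRUE, eval=FALSE}
# Residuals vs fitted values after the square-root transform used in the recipe
lm.sqrt <- lm(sqrt(DeadDist) ~ Neigh_1.5 + IND_BA_Infest_1 + BA_Inf_20th + Neigh_1, data = pine_tbl)
plot(lm.sqrt, 1)
```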


# Ridge Regression

```{r, include=FALSE}
# Ridge Regression
# Create Ridge Regression Model ----
# penalty == lambda (regulates size of coefficients)
# mixture == alpha 
# Note: parsnip allows a formula method (the formula is specified in the recipe below)
# Remember that glmnet() requires a matrix specification
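
# Illustrative sketch only (not part of the original workflow): calling glmnet directly
# needs a hand-built design matrix, which is what the recipe/workflow below handles for us
x_demo <- model.matrix(DeadDist ~ Neigh_1.5 + IND_BA_Infest_1 + BA_Inf_20th + Neigh_1,
                       data = pine_tbl)[, -1]
glmnet::glmnet(x_demo, sqrt(pine_tbl$DeadDist), alpha = 0, lambda = 0.1629751)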

# Create training/testing data (seed added here for a reproducible split)
set.seed(1234)
pine_split <- initial_split(pine_tbl)
pine_train <- training(pine_split)
pine_test <- testing(pine_split)

# Dr. Smirnova's best lambda estimate
ridge_mod <-
  linear_reg(mixture = 0, penalty = 0.1629751) %>%  #validation sample or re-sampling can estimate this
  set_engine("glmnet")

# verify what we are doing
ridge_mod %>% 
  translate()

# create a new recipe
pine_rec <- pine_train %>% 
  recipe(DeadDist  ~ Neigh_1.5 + IND_BA_Infest_1 + BA_Inf_20th + Neigh_1) %>% 
  step_sqrt(all_outcomes()) %>% 
  step_corr(all_predictors()) %>% 
  step_normalize(all_numeric(), -all_outcomes()) %>% #mean 0, sd 1
  step_zv(all_numeric(), -all_outcomes())
# prep()

pine_ridge_wflow <- 
  workflow() %>% 
  add_model(ridge_mod) %>% 
  add_recipe(pine_rec)

pine_ridge_wflow


pine_ridge_fit <- 
  pine_ridge_wflow %>% 
  fit(data = pine_train)

pine_ridge_fit %>% 
  extract_fit_parsnip() %>% 
  tidy()

pine_ridge_fit %>% 
  extract_preprocessor()

pine_ridge_fit %>% 
  extract_spec_parsnip()


# refit best model on training and evaluate on testing data
last_fit(
  pine_ridge_wflow,
  pine_split
) %>%
  collect_metrics()
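
# Illustrative (not part of the original script): the held-out predictions can be inspected too
last_fit(
  pine_ridge_wflow,
  pine_split
) %>%
  collect_predictions() %>%
  head()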

# verify Ridge Regression performance with standard linear regression approach
lm(sqrt(DeadDist) ~ Neigh_1.5 + IND_BA_Infest_1 + BA_Inf_20th + Neigh_1, data = pine_tbl) %>% 
  glance()
#adjusted r-squared = 0.734
```

Row {data-height=450}
-------------------------------------
### Recipe       
```{r, echo= TRUE}
# create a new recipe
pine_rec <- pine_train %>% 
  recipe(DeadDist  ~ Neigh_1.5 + IND_BA_Infest_1 + BA_Inf_20th + Neigh_1) %>% 
  step_sqrt(all_outcomes()) %>% 
  step_corr(all_predictors()) %>% 
  step_normalize(all_numeric(), -all_outcomes()) %>% #mean 0, sd 1
  step_zv(all_numeric(), -all_outcomes())
pine_rec
```
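
To see what the recipe actually does to the training data, it can be prepped and baked. A minimal sketch (shown but not evaluated, so the dashboard layout is unchanged):

```{r recipe-bake, echo=TRUE, eval=FALSE}
# Estimate the preprocessing steps on the training set and inspect the processed data
pine_rec %>%
  prep() %>%
  bake(new_data = NULL) %>%
  glimpse()
```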

Row {data-height=350} 
-------------------------------------
### Coefficients of the predictors in the Ridge Regression model, ordered by their coefficient estimates.
```{r}
# Tidy the fitted ridge model and keep only the predictor coefficients
coefficients <- pine_ridge_fit %>% 
  extract_fit_parsnip() %>% 
  tidy() %>% 
  filter(term != "(Intercept)")

ggplot(coefficients, aes(x = reorder(term, estimate), y = estimate)) +
  geom_col(fill = "skyblue", color = "black") +
  labs(title = "Ridge Regression Coefficients",
       x = "Predictors",
       y = "Coefficient Estimate") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
```


### Adjusted R-squared: approximately 73.4% of the variance in the square-root-transformed response (DeadDist) is explained by the predictors Neigh_1.5, IND_BA_Infest_1, BA_Inf_20th, and Neigh_1 in the linear-model check.
```{r, Adjusted R-Squared}
library(flexdashboard)

# Adjusted R-squared value (multiply by 100 to convert to percentage)
adjusted_r_squared <- 0.734 * 100

# Create a gauge plot
gauge(adjusted_r_squared, min = 0, max = 100, symbol = '%', gaugeSectors(
  success = c(80, 100), warning = c(40, 79), danger = c(0, 39)
))
```
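
The 73.4% above is hard-coded from the linear-model check in the hidden Ridge Regression chunk. A minimal sketch (assuming that same `sqrt(DeadDist)` model; shown but not evaluated) of computing it programmatically instead:

```{r adj-r2-sketch, echo=TRUE, eval=FALSE}
# Pull the adjusted R-squared directly rather than hard-coding it
adj_r2 <- lm(sqrt(DeadDist) ~ Neigh_1.5 + IND_BA_Infest_1 + BA_Inf_20th + Neigh_1,
             data = pine_tbl) %>%
  glance() %>%
  pull(adj.r.squared)

gauge(round(adj_r2 * 100, 1), min = 0, max = 100, symbol = '%', gaugeSectors(
  success = c(80, 100), warning = c(40, 79), danger = c(0, 39)
))
```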


# Lasso Regression
```{r include=FALSE}
#Lasso
# create bootstrap samples for resampling and tuning the penalty parameter

set.seed(1234)
pine_boot <- bootstraps(pine_train) #re-sampling technique

# create a grid of tuning parameters
lambda_grid <- grid_regular(penalty(), levels = 50)
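# (Note/assumption: dials::penalty() defaults to a log10 grid from 1e-10 to 1,
#  so these 50 candidate penalties span that range.)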

lasso_mod <-
  linear_reg(mixture = 1, penalty = tune()) %>%
  set_engine("glmnet")

# verify what we are doing
lasso_mod %>% 
  translate()


# create workflow
pine_lasso_wflow <- 
  workflow() %>% 
  add_model(lasso_mod) %>% 
  add_recipe(pine_rec)


set.seed(2020)
lasso_grid <- tune_grid(
  pine_lasso_wflow,
  resamples = pine_boot,
  grid = lambda_grid
)

# let's look at bootstrap results
lasso_grid %>%
  collect_metrics()


lasso_grid %>%
  collect_metrics() %>%
  ggplot(aes(penalty, mean, color = .metric)) +
  geom_errorbar(aes(
    ymin = mean - std_err,
    ymax = mean + std_err
  ),
  alpha = 0.5
  ) +
  geom_line(linewidth = 1.5) +
  facet_wrap(~.metric, scales = "free", nrow = 2) +
  scale_x_log10() +
  theme(legend.position = "none")

# select the penalty value with the lowest RMSE
lowest_rmse <- lasso_grid %>%
  select_best(metric = "rmse")

# update our final model with lowest rmse
final_lasso <- finalize_workflow(
  pine_lasso_wflow,
  lowest_rmse
)

final_lasso %>% 
  fit(pine_train) %>%
  extract_fit_parsnip() %>% 
  tidy()
# note that penalty (lambda) is close to zero; hence near equivalent to lm() solution

# variable importance plot
final_lasso %>%
  fit(pine_train) %>%
  extract_fit_parsnip() %>%
  vi(lambda = lowest_rmse$penalty) %>%
  mutate(
    Importance = abs(Importance),
    Variable = fct_reorder(Variable, Importance)
  ) %>%
  ggplot(aes(x = Importance, y = Variable, fill = Sign)) +
  geom_col() +
  scale_x_continuous(expand = c(0, 0)) +
  labs(y = NULL)
```


Row {data-height=650}
-------------------------------------
### Final Lasso Coefficients
```{r Lasso Model Coefficients}
# Final Lasso Model Coefficients
final_lasso %>% 
  fit(pine_train) %>%
  extract_fit_parsnip() %>% 
  tidy() %>%
  ggplot(aes(x = term, y = estimate)) +
  geom_col(fill = "pink", color = "black") +
  coord_flip() +
  labs(title = "Final Lasso Model Coefficients",
       x = "Predictor Variable", y = "Coefficient Estimate") +
  theme_minimal() +
  theme(axis.text.y = element_text(size = 8))  # Adjust font size for y-axis labels
```

Row {data-height=350}
-------------------------------------
### Importance of the predictors! 
```{r Importance of Predictors}
final_lasso %>%
  fit(pine_train) %>%
  extract_fit_parsnip() %>%
  vi(lambda = lowest_rmse$penalty) %>%
  mutate(
    Importance = abs(Importance),
    Variable = fct_reorder(Variable, Importance)
  ) %>%
  ggplot(aes(x = Importance, y = Variable, fill = Sign)) +
  geom_col() +
  scale_x_continuous(expand = c(0, 0)) +
  labs(y = NULL)
```

### Plot of RMSE vs Penalty
```{r Plot of RMSE vs. Penalty}
# Plot of RMSE vs. Penalty
lasso_grid %>%
  collect_metrics() %>%
  filter(.metric == "rmse") %>%
  ggplot(aes(x = penalty, y = mean)) +
  geom_line(color = "blue") +
  geom_point(color = "blue") +
  scale_x_log10() +
  labs(title = "RMSE vs. Penalty",
       x = "Penalty (log scale)", y = "RMSE") +
  theme_minimal()
```
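
A small optional tweak, sketched here but not part of the original dashboard: the penalty chosen by `select_best()` could be marked on this curve with a dashed vertical line (shown but not evaluated):

```{r rmse-penalty-vline, echo=TRUE, eval=FALSE}
# Same RMSE curve with the selected penalty highlighted
lasso_grid %>%
  collect_metrics() %>%
  filter(.metric == "rmse") %>%
  ggplot(aes(x = penalty, y = mean)) +
  geom_line(color = "blue") +
  geom_point(color = "blue") +
  geom_vline(xintercept = lowest_rmse$penalty, linetype = "dashed") +
  scale_x_log10() +
  labs(title = "RMSE vs. Penalty",
       x = "Penalty (log scale)", y = "RMSE") +
  theme_minimal()
```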