HGEN 612 Data Science II: Project 2

---
title: "HGEN 612 Data Science II: Project 2"
output: 
  flexdashboard::flex_dashboard:
    orientation: rows
    vertical_layout: scroll
    source_code: embed
    theme: paper
---

```{r setup, include=FALSE}

## 1. Project Set Up

# load required packages

library(flexdashboard)
library(tidyverse)
library(DT)
library(ggplot2)
library(tidymodels)
library(GGally)
library(readxl)
library(tibble)
library(dplyr)
library(rsample)
library(broom)
library(vip)
library(parsnip)

# load dataset

pine <- read_excel("Data_1993.xlsx")

# convert to tibble

pine_tbl <- as_tibble(pine)

## 2. Linear Regression Model

# preparing data for modeling

model_data <- pine_tbl %>%
                select(DeadDist, 
                       TreeDiam,
                       Infest_Serv1,
                       SDI_20th,
                       BA_20th)

# create a recipe

pine_recipe <- recipe(DeadDist ~ ., data = model_data) %>%
               step_sqrt(all_outcomes()) %>%
               step_normalize(all_predictors())

# define linear regression model 

lm_model <- linear_reg() %>%
            set_engine("lm")

# create linear regression workflow

lm_workflow <- workflow() %>%
               add_model(lm_model) %>%
               add_recipe(pine_recipe)

# fit using training data 

lm_fit <- lm_workflow %>%
          fit(data = model_data)

# 3. LASSO Regression Model

# LASSO set up 

set.seed(123) 

# split for training/testing

pine_split <- initial_split(model_data, prop = 0.8)

pine_train <- training(pine_split)

pine_test  <- testing(pine_split)

# create a recipe

lasso_recipe <-
  recipe(DeadDist ~ ., 
         data = pine_train) %>%
  step_sqrt(all_outcomes()) %>%
  step_normalize(all_predictors())

# set a second seed for bootstrapping

set.seed(1234)

pine_bootstrap <- bootstraps(pine_train, 
                             times = 25)

# create parameter (lambda) grid

lambda_grid <- grid_regular(penalty(), 
                            levels = 50)

# define lasso model 

lasso_model <- linear_reg(mixture = 1,
                          penalty = tune()) %>%
  set_engine("glmnet")

# create workflow

lasso_workflow <- workflow() %>%
                  add_model(lasso_model)%>%
                  add_recipe(lasso_recipe)

# tune lambda using bootstrap resamples

# set new seed

set.seed(2020)

lasso_grid_results <- tune_grid(lasso_workflow,
                                resamples = pine_bootstrap,
                                grid = lambda_grid)

# check metrics

lasso_grid_results %>% collect_metrics()

# pick best lambda with lowest RMSE 

best_lambda <- lasso_grid_results %>%
              select_best(metric = "rmse")

# finalist the workflow

final_lasso <- finalize_workflow(lasso_workflow,
                                 best_lambda)

# fit the model on full training data

final_lasso_fit <- final_lasso %>% 
                   fit(pine_train)


```

# Data Summary

Row {data-height=220}
-----------------------------------------------------------------------
### **Project Overview**


The purpose of this project is to utilize the 'pine beetle' dataset to predict the minimum linear distance to the nearest brood tree (DeadDist). 

The findings from this project are presented in this dashboard which consists of two tabs that display dataset information, data exploration, and model results. 

- The **Data Summary** tab provides information on key study variables, descriptive statistics, relationships between variables, and a preview of the raw dataset. 

- The **Model Findings** tab provides information on model coefficients, variable importance, and model evaluation metrics for both linear regression and LASSO models to help identify variables that meaningfully predict 'DeadDist'

Additionally, the code used to generate this dashboard can be accessed by selecting the "source code"
button for transparency and reproducability.


Row {.tabset .tabset-fad data-height=350}
-----------------------------------------------------------------------

### **Data Dictionary**

#### Key Variables 

```{r, data dictionary}

# data dictionary 

data_dict <- tibble(Type = c("Outcome",
                             "Predictor",
                             "Predictor",
                             "Predictor",
                             "Predictor"),
                    Variable = c("DeadDist",
                                 "TreeDiam",
                                 "Infest_Serv1",
                                 "SDI_20th",
                                 "BA_20th"),
                    Group = c("Nearest Brood Tree",
                              "Tree Diameter",
                              "Infestation Severity",
                              "Forest Density",
                              "Beetle Population Pressure"),
                    Description = c("Minimum linear distance to nearest brood tree",
                                    "Tree diameter/size",
                                    "Infestation severity nearest to response tree",
                                    "Stand Density Index at 1/20th-acre neighborhood surrounding response                                        tree",
                                    "Basal area total for all infested trees within 1/20th-acre                                                  neighborhood"))

datatable(data_dict,
          rownames = FALSE,
          options = list(dom = 't',
                         paging = FALSE,
                         scrollY = '90%',
                         autoHeight = TRUE))

```


### **Descritpive Statistics**

#### Descriptive Statistics for Key Variables 

```{r, descritpive statistics}

desc_stats <- tibble(
  Variable = c("DeadDist", 
               "TreeDiam", 
               "Infest_Serv1",
               "SDI_20th", 
               "BA_20th"),
  Mean = round(c(mean(pine_tbl$DeadDist, na.rm = TRUE),
                 mean(pine_tbl$TreeDiam, na.rm = TRUE),
                 mean(pine_tbl$Infest_Serv1, na.rm = TRUE),
                 mean(pine_tbl$SDI_20th, na.rm = TRUE),
                 mean(pine_tbl$BA_20th, na.rm = TRUE)), 2),
  SD = round(c(sd(pine_tbl$DeadDist, na.rm = TRUE),
               sd(pine_tbl$TreeDiam, na.rm = TRUE),
               sd(pine_tbl$Infest_Serv1, na.rm = TRUE),
               sd(pine_tbl$SDI_20th, na.rm = TRUE),
               sd(pine_tbl$BA_20th, na.rm = TRUE)), 2),
  Min = round(c(min(pine_tbl$DeadDist, na.rm = TRUE),
                min(pine_tbl$TreeDiam, na.rm = TRUE),
                min(pine_tbl$Infest_Serv1, na.rm = TRUE),
                min(pine_tbl$SDI_20th, na.rm = TRUE), 
                min(pine_tbl$BA_20th, na.rm = TRUE)), 2), 
  Max = round(c(max(pine_tbl$DeadDist, na.rm = TRUE),
                max(pine_tbl$TreeDiam, na.rm = TRUE),
                max(pine_tbl$Infest_Serv1, na.rm = TRUE),
                max(pine_tbl$SDI_20th, na.rm = TRUE),
                max(pine_tbl$BA_20th, na.rm = TRUE)), 2))

datatable(desc_stats,
          rownames = FALSE,
          options = list(dom = 't',
                         paging = FALSE,
                         scrollY = '90%',
                         autoHeight = TRUE))

```

Row {data-width=340}
-----------------------------------------------------------------------

### **Exploration of Key Variables**

```{r, variable relationships}

pine_tbl %>%
  select(DeadDist,
         TreeDiam, 
         Infest_Serv1, SDI_20th,
         BA_20th) %>%
  ggpairs() 

```

### **Raw Data Preview**

```{r, raw data preview}

datatable(pine_tbl)

```


# Model Findings

Row {data-height=270}
-------------------------------------------------------------------------

### **Model Findings Summary**

Both the linear regression and LASSO model sought to predict the outcome 'DeadDist' using tree diameter, infestation severity, forest density, and beetle population pressure variables as predictors. Additionally, 
it was determined that the outcome and predictors should be normalized during analyses.

- Findings from the **linear regression model** indicate that these variables accounted for approximately 16% of the variance in 'DeadDist', with infestation severity appearing to be the most important predictor in this model. 

- Findings from the **LASSO model** focus on identifying the most important predictors of 'DeadDist' when reducing the influence of less important variables. This allows the model to emphasis the key variables that are most strongly associated with 'DeadDist'. Notably, this model suggests that beetle population pressure was the most important predictor and tree diameter was not a significant predictor of 'DeadDist.' 

Additional information about each model can be found below.

Row {data-height=400}
------------------------------------------------------------------------

### **Linear Regression Model Evaluation**

```{r, linear regression evaluation}

# linear regression model evaluation

model_summary <-
  lm_fit %>%
  glance() %>%
  select(r.squared,
         p.value) %>%
  mutate(r.squared = round(r.squared, 2), 
         p.value = if_else(p.value < 0.001, "<0.001",
                           as.character(round(p.value, 3)))) %>%
  datatable(colnames = c("R Squared", 
                         "p-Value"),
            rownames = FALSE,
            options = list(dom = 't'))

model_summary


```

### **Linear Regression Model VIP Plot**

```{r, linear regression VIP plot}

# generates general vip plot

lm_vip <- lm_fit %>%
          extract_fit_parsnip() %>%
          vip()

# VIP plot with formatting
  
lm_fit %>%
  extract_fit_parsnip() %>%
  vip::vi() %>%
  mutate(Variable = fct_reorder(Variable, 
                                Importance)) %>%
  ggplot(aes(x = Importance, 
             y = Variable)) +
  geom_col(fill = "#43699B")+
  labs(x = "Variable Importance",
       y = NULL)


```

### **Linear Regression Model Coefficients**

```{r, linear regression coefficients table}

# linear regression coefficients results table 

coefficients_table <-
  lm_fit %>%
  tidy() %>%
  select(term, 
         estimate,
         std.error, 
         p.value) %>%
  mutate(estimate = round(estimate, 2),
         std.error = round(std.error, 2),
         p.value = if_else(p.value < 0.001, "<0.001",
                           as.character(round(p.value,3)))) %>%
  datatable(colnames = c("Variable", 
                         "Estimate",
                         "Std. Error",
                         "p-value"),
            rownames = FALSE,
            options = list(dom = 't'))

# display coefficients table

coefficients_table

```

Row {data-height=400}
-------------------------------------------------------------------------

### **LASSO Model Evaluation**

```{r, LASSO model evaluation}

lasso_metrics <- 
  last_fit(final_lasso,
           pine_split) %>%
  collect_metrics()

lasso_metrics %>%
  select(.metric,
         .estimate) %>%
  mutate(.estimate = round(.estimate, 2)) %>%
  datatable(colnames = c("Metric", 
                         "Estimate"),
            rownames = FALSE,
            options = list(dom = 't',
                           paging = FALSE, 
                           ordering = FALSE))

```

### **LASSO Model VIP Plot**

```{r, LASSO VIP plot}


# VIP plot with formatting

final_lasso %>%
  fit(pine_train) %>%
  extract_fit_parsnip() %>%
  vi(lambda = best_lambda$penalty) %>%
  filter(Importance != 0) %>%
mutate(
  Variable = fct_reorder(Variable, Importance)
) %>%
  ggplot(aes(x = Importance, y = Variable)) +
  geom_col(fill = "#43699B") +
  labs(x = "Variable Importance",
       y = NULL,
       caption = "Predictors with zero coefficients removed from figure")


```


### **LASSO Model Predictor Estimates**

```{r, LASSO predictor estimates}

# LASSO regression coefficients results table 

lasso_coefs <- final_lasso %>%
  fit(pine_train) %>%
  extract_fit_parsnip() %>%
  tidy() %>%
  select(term, 
         estimate)%>%
  mutate(estimate = round(estimate, 2))

datatable(lasso_coefs,
          colnames = c("Variable",
                       "Estimate"), 
          rownames = FALSE,
          options = list(dom = 't',
                         paging = FALSE,
                         ordering = FALSE))
```