Here is the list of objectives met in this markdown document. Navigate to the respective code section to see the work done for each objective.

| Objective | Topic | Code Section |
|---|---|---|
| Objective 1 | Probability & Inference | Project 1 - Multiple Linear Regression; Homework 1 - Simple Linear Regression |
| Objective 2 | Generalized Linear Models | Project 2 - Multinomial Logistic Regression; Project 2 - LDA; Poisson Regression; Homework 4 - Multinomial Logistic Regression |
| Objective 3 | Model Selection & Diagnostics | Homework 6 - Model Selection; Homework 7 - Lasso, Ridge & PCA; Project 1 - Multiple Regression; Project 2 - Multinomial & LDA |
| Objective 4 | Communicating to Non-Technical Audiences | Project 1; Project 2 |
| Objective 5 | tidymodels Framework | All examples listed in this markdown |

set.seed(2025)
# Load necessary libraries
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(mosaic))
suppressPackageStartupMessages(library(caret))
suppressPackageStartupMessages(library(ISLR2))
suppressPackageStartupMessages(library(discrim))
suppressPackageStartupMessages(library(poissonreg))
suppressPackageStartupMessages(library(gridExtra))
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(tidymodels))
suppressPackageStartupMessages(library(ggformula))
suppressPackageStartupMessages(library(GGally))
suppressPackageStartupMessages(library(future))
suppressPackageStartupMessages(library(leaps))
suppressPackageStartupMessages(library(olsrr))
# set common theme for all the plots
theme_set(theme_classic())
residualAnalysis <- function(model = NULL) {
  if (!require(gridExtra)) stop('Install the gridExtra package.')
  if (!require(ggformula)) stop('Install the ggformula package.')
  # Standardized residuals vs. fitted values, plus a normal Q-Q plot
  df <- data.frame(Prediction = predict(model$fit),
                   Residual = rstandard(model$fit))
  p1 <- gf_point(Residual ~ Prediction, data = df) %>% gf_hline(yintercept = 0)
  p2 <- gf_qq(~Residual, data = df) %>% gf_qqline()
  grid.arrange(p1, p2, ncol = 2)
}

Homework 1

Simple Linear Regression

data("Auto")  # loads the Auto data frame into the environment

lm_spec <- linear_reg() |>
  set_mode("regression") |>
  set_engine("lm")

slr_mod <- lm_spec |> 
  fit(mpg ~ horsepower, data = Auto)

tidy(slr_mod)
glance(slr_mod$fit)
residualAnalysis(slr_mod)

pred_intervals <- cbind(
  Auto,
  predict(slr_mod,new_data=Auto),
  predict(slr_mod,new_data=Auto,type="conf_int"),
  predict(slr_mod,new_data=Auto,type="pred_int"))

colnames(pred_intervals) <- make.unique(names(pred_intervals))

pred_intervals |>
  gf_point(mpg ~ horsepower, alpha = 0.5) |>
  gf_line(.pred ~ horsepower, color = "blue") |>
  gf_ribbon(.pred_upper + .pred_lower ~ ., alpha = 0.2, fill = "red") |>
  gf_ribbon(.pred_upper.1 + .pred_lower.1 ~ ., alpha = 0.1, fill = "red") +
  gf_theme(theme_classic()) +
  labs(title = "Plot with confidence and prediction intervals")

The plot shows the raw data points, the regression line, and the 95% confidence and prediction intervals. The confidence interval represents the uncertainty in the estimation of the mean response for a given horsepower value, while the prediction interval represents the uncertainty in predicting a single observation for a given horsepower value. We can see that the prediction interval is wider than the confidence interval, reflecting the additional uncertainty associated with predicting a single observation.
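A quick numeric check makes this concrete. The sketch below uses the pred_intervals data frame built above, where make.unique() gave the prediction-interval columns a .1 suffix; the average widths confirm the prediction interval is wider.

pred_intervals |>
  summarize(
    mean_conf_width = mean(.pred_upper - .pred_lower),      # confidence interval
    mean_pred_width = mean(.pred_upper.1 - .pred_lower.1)   # prediction interval
  )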

Project 1 - Estimating diamond carat weight from visually observable characteristics

Multiple Regression

data("diamonds")

diamonds <- diamonds %>%
  mutate(
    magnitude = x * y * z,
    quality_cut = if_else(cut %in% c("Ideal", "Premium"), "High", "Low"),
    quality_color = if_else(color %in% c("D", "E", "F"), "High", "Low"),
    quality_clarity = if_else(clarity %in% c("IF", "VVS1", "VVS2"), "High", "Low"),
    quality_cut = as.factor(quality_cut),
    quality_color = as.factor(quality_color),
    quality_clarity = as.factor(quality_clarity)
  )

diamonds_split <- createDataPartition(diamonds$carat, p = 0.80, list = FALSE)

train_data <- diamonds[diamonds_split,]
test_data <- diamonds[-diamonds_split,]

mlr_mod <- lm_spec %>%
  fit(carat ~ magnitude * quality_cut + quality_clarity, data = train_data)

tidy(mlr_mod)

Here is the estimated regression equation for predicting carat:

\[ \begin{aligned} \hat{y}_{\text{Carat}} = & 0.0418 \\ & + 0.0056 \times Magnitude \\ & - 0.0563 \times quality\_cutLow \\ & + 0.0199 \times quality\_clarityLow \\ & + 0.00038 \times (Magnitude:quality\_cutLow) \end{aligned} \]
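To make the equation concrete, here is a hand calculation for a hypothetical diamond (magnitude = 100, Low-quality cut, High-quality clarity, which is the reference level, so the clarity term drops out), checked against the fitted model:

# Hand calculation with the rounded coefficients above
0.0418 + 0.0056 * 100 - 0.0563 + 0.00038 * 100  # = 0.5835 carats

# The same prediction from the fitted model (exact coefficients);
# the diamond's values are hypothetical
new_diamond <- tibble(
  magnitude = 100,
  quality_cut = factor("Low", levels = c("High", "Low")),
  quality_clarity = factor("High", levels = c("High", "Low"))
)
predict(mlr_mod, new_data = new_diamond)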

glance(mlr_mod$fit)
residualAnalysis(mlr_mod)

pred_intervals <- cbind(
  test_data,
  predict(mlr_mod, new_data=test_data),
  predict(mlr_mod, new_data=test_data, type="conf_int"),
  predict(mlr_mod, new_data=test_data, type="pred_int"))

colnames(pred_intervals) <- make.unique(names(pred_intervals))

pred_intervals <- pred_intervals |>
  rename(.conf_lower = .pred_lower, .conf_upper = .pred_upper, 
         .pred_lower = .pred_lower.1, .pred_upper = .pred_upper.1)

pred_intervals |>
  gf_point(carat ~ magnitude, col = "grey") |>
  gf_line(.pred ~ magnitude) |>
  gf_line(.conf_lower ~ magnitude, col='red') %>%
  gf_line(.conf_upper ~ magnitude, col='red') %>%
  gf_line(.pred_lower ~ magnitude, col='blue') %>%
  gf_line(.pred_upper ~ magnitude, col='blue') +
  facet_wrap(~ quality_cut + quality_clarity) +
  gf_theme(theme_classic()) +
  labs(title = "Confidence and prediction intervals", x = "Magnitude", y = "Carat") 

The plots show how diamond carat weight (y-axis) changes with magnitude (the x * y * z volume proxy), faceted by cut and clarity quality (High/Low). The black line is the model’s prediction, the inner red lines bound the confidence interval for the mean carat weight, and the outer blue lines bound the prediction interval for a single diamond, which is wider due to individual variability.

Polynomial Regression

polData <- diamonds %>%
  dplyr::select(carat, table, depth, x, y, z) %>%
  mutate(
    log_x = log(x + 1),
    log_y = log(y + 1),
    log_z = log(z + 1)
  )

trainIndex <- createDataPartition(polData$carat, p = 0.8, list = FALSE)
trainData <- polData[trainIndex, ]
testData  <- polData[-trainIndex, ]

polyModel <- lm(carat ~ poly(x, 2, raw = TRUE) +
                  poly(y, 2, raw = TRUE) +
                  poly(z, 2, raw = TRUE) +
                  depth + table,
                data = trainData)

tidy(polyModel)

Making predictions on the test data

testData$predicted <- predict(polyModel, testData)
rmse <- sqrt(mean((testData$predicted - testData$carat)^2))
print(paste("Test RMSE:", round(rmse, 4)))
## [1] "Test RMSE: 0.7625"

The root mean squared error (RMSE) on the test data is smaller than the residual standard error from training, which suggests the model is not overfitting.
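For reference, the training residual standard error used in this comparison can be read off the fitted model directly:

sigma(polyModel)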

Confidence Intervals

confint(polyModel, level = 0.95)
##                                 2.5 %        97.5 %
## (Intercept)             -0.5986503294 -0.5624668019
## poly(x, 2, raw = TRUE)1 -0.3321265794 -0.3042329998
## poly(x, 2, raw = TRUE)2  0.0456369513  0.0480814384
## poly(y, 2, raw = TRUE)1 -0.1489510804 -0.1208453162
## poly(y, 2, raw = TRUE)2  0.0232125819  0.0257087508
## poly(z, 2, raw = TRUE)1  0.0323262963  0.0392027077
## poly(z, 2, raw = TRUE)2 -0.0009868181 -0.0007594962
## depth                    0.0197563752  0.0202301127
## table                    0.0032480351  0.0034767025

Check the assumptions

plot(polyModel)

## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced

Project 2 - Predicting GVSU K-12 Students’ Educational Success

# Import the datasets
pfi_2016 <- readxl::read_xlsx("pfi-data.xlsx", sheet = "curated 2016")
pfi_2019 <- readxl::read_xlsx("pfi-data.xlsx", sheet = "curated 2019")

# Merge both the datasets
pfi_data <- rbind(pfi_2016 |>
                  mutate(HHPARNX = HHPARN16X) |>
                  dplyr::select(SEGRADES, HHPARNX, PARGRADEX, 
                                EDCPUB, INTACC, SEABSNT, TTLHHINC),
                pfi_2019 |>
                  mutate(HHPARNX = HHPARN19X) |>
                  dplyr::select(SEGRADES, HHPARNX, PARGRADEX, 
                                EDCPUB, INTACC, SEABSNT, TTLHHINC)
                )

# Encode all the variables and convert them to factors
pfi <- pfi_data |>
  mutate(GRADE = case_match(SEGRADES,
                    1 ~ "A",
                    2 ~ "B",
                    3 ~ "C",
                    4 ~ "D",
                    c(5,-1) ~ "Other"),
         PARENT = case_match(HHPARNX,
                             1 ~ "Both",
                             2 ~ "Mother",
                             3 ~ "Father",
                             4 ~ "None"),
         HAS_POST_SEC = case_match(PARGRADEX,
                                 c(1,2) ~ "No",
                                 c(3,4,5) ~ "Yes"),
         PUBLIC_SCHOOL = case_match(EDCPUB,
                                    1 ~ "Yes",
                                    2 ~ "No"),
         INTERNET_ACCESS = case_match(INTACC,
                                     c(1,2,3) ~ "Yes",
                                     4 ~ "No"),
         NO_OF_ABSENT = case_match(SEABSNT,
                                   c(1,2) ~ "0-10 days",
                                   c(3,4) ~ "11-20 days",
                                   .default = "Other"),
         INCOME_CAT = case_match(TTLHHINC,
                                 c(1:8) ~ "$0-100K",
                                 c(9:10) ~ "$100-200K",
                                 c(11:12) ~ "$200K or more"),
  GRADE = factor(GRADE, levels = c("Other", "D", "C", "B", "A")),
  PARENT = factor(PARENT, levels = c("None", "Father", "Mother", "Both")),
  HAS_POST_SEC = factor(HAS_POST_SEC, levels = c("No", "Yes")),
  PUBLIC_SCHOOL = factor(PUBLIC_SCHOOL, levels = c("No", "Yes")),
  INTERNET_ACCESS = factor(INTERNET_ACCESS, levels = c("No", "Yes")),
  NO_OF_ABSENT = factor(NO_OF_ABSENT, levels = c("0-10 days","11-20 days","Other")),
  INCOME_CAT = factor(INCOME_CAT, levels = c("$0-100K", "$100-200K", "$200K or more"))
  ) |>
  dplyr::select(GRADE, PARENT, HAS_POST_SEC, PUBLIC_SCHOOL, 
                INTERNET_ACCESS, NO_OF_ABSENT, INCOME_CAT)

Split the dataset into training and testing sets, and define k-fold cross-validation

# Split the dataset into training (75%) and testing (25%)
pfi_split <- initial_split(pfi, prop = 0.75, strata = GRADE)
pfi_train <- training(pfi_split)
pfi_test <- testing(pfi_split)

# Define k-fold cross-validation with 5 folds
pfi.folds <- vfold_cv(pfi_train, v=5, strata = GRADE)
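Because the split and the folds are both stratified on GRADE, a quick sanity check (optional) confirms the grade proportions are similar in the training and testing sets:

pfi_train |> count(GRADE) |> mutate(prop = n / sum(n))
pfi_test |> count(GRADE) |> mutate(prop = n / sum(n))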

Multinomial Regression

# Multinomial Regression
multi_spec <- multinom_reg() |>
  set_engine("nnet")

multi_wf <- workflow() |>
  add_model(multi_spec) |>
  add_formula(as.formula("GRADE ~ ."))

fit_multi <- multi_wf |>
  fit_resamples(pfi.folds, 
                control=control_resamples(save_pred=TRUE, 
                                          save_workflow=TRUE, 
                                          extract=extract_model))

multi_metrics_train <- collect_metrics(fit_multi) |>
  mutate(Model = "Multinomial Regression", Data = "Train")

# Confusion Matrix for Multinomial train data
cv_predictions_multi <- collect_predictions(fit_multi)

# Confusion matrix
conf_mat_multi <- cv_predictions_multi %>% 
  count(GRADE, .pred_class, .drop = FALSE)

conf_mat_multi %>% 
  pivot_wider(
    names_from = .pred_class,
    values_from = n
  )
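Equivalently, yardstick's conf_mat() builds the same table (pooled across the five folds) in matrix form:

cv_predictions_multi %>%
  conf_mat(truth = GRADE, estimate = .pred_class)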
# Fit the model
multi_model <- multinom_reg() |>
  set_engine("nnet") |>
  fit(GRADE ~ ., data = pfi_train)

tidy(multi_model)

The estimated log-odds of earning an “A” grade relative to the baseline “Other” category:

tidy(multi_model) %>%
  filter(y.level=='A') %>%
  dplyr::select(estimate,term)

\[ \begin{aligned} \log\left(\frac{\hat{p}_{\texttt{"A"}}}{\hat{p}_{\texttt{"Other"}}}\right) = & \ 1.30662064 \\ & + 0.34581034 \,\text{PARENTFather} \\ & + 0.06716781 \,\text{PARENTMother} \\ & + 0.20814503 \,\text{PARENTBoth} \\ & + 0.08797909 \,\text{HAS\_POST\_SECYes} \\ & - 0.26304969 \,\text{PUBLIC\_SCHOOLYes} \\ & + 0.03367262 \,\text{INTERNET\_ACCESSYes} \\ & - 0.19298460 \,\text{NO\_OF\_ABSENT11-20 days} \\ & - 0.10715015 \,\text{NO\_OF\_ABSENTOther} \\ & + 0.03941675 \,\text{INCOME\_CAT\$100-200K} \\ & + 0.01853510 \,\text{INCOME\_CAT\$200K or more} \end{aligned} \]
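Exponentiating a coefficient converts the log-odds effect into an odds ratio. For example, holding the other predictors fixed, attending a public school multiplies the estimated odds of an “A” (vs. “Other”) by:

exp(-0.26304969)  # about 0.77, i.e. roughly a 23% reduction in the odds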

Predict using the test data and check accuracy scores

# predict using the test data
pfi_multi_aug <- augment(multi_model, new_data = pfi_test) 

# check accuracy and roc_auc
multi_test_metrics <- rbind(
pfi_multi_aug |>
  metrics(truth = GRADE, estimate = .pred_class) |>
  dplyr::filter(.metric == "accuracy") |>
  mutate(Model = "Multinomial Regression", Data = "Test"),
pfi_multi_aug |>
  roc_auc(truth = GRADE, .pred_A, .pred_B, .pred_C, .pred_D, .pred_Other) |>
  mutate(Model = "Multinomial Regression", Data = "Test")
)

multi_test_metrics

Linear Discriminant Analysis

lda_spec <- discrim_linear() %>%
  set_mode("classification") %>%
  set_engine("MASS")

lda_wf <- workflow() |>
  add_model(lda_spec) |>
  add_formula(as.formula("GRADE ~ ."))

fit_lda <- lda_wf |>
  fit_resamples(pfi.folds, 
                control=control_resamples(save_pred=TRUE, 
                                          save_workflow=TRUE, 
                                          extract=extract_model))

lda_metrics_train <- collect_metrics(fit_lda) |>
  mutate(Model = "LDA", Data = "Train")

cv_predictions_lda <- collect_predictions(fit_lda)

# Confusion matrix
conf_mat_lda <- cv_predictions_lda %>% 
  count(GRADE, .pred_class, .drop = FALSE)

conf_mat_lda %>% 
  pivot_wider(
    names_from = .pred_class,
    values_from = n
  )

Fit the model, predict using the test data and check accuracy scores

# Fit the model
lda_model <- discrim_linear() %>%
  set_mode("classification") %>%
  set_engine("MASS") |>
  fit(GRADE ~ ., data = pfi_train)

# predict using test data
pfi_lda_aug <- augment(lda_model, new_data = pfi_test) 

# check accuracy and roc_auc
lda_test_metrics <- rbind(
pfi_lda_aug |>
  metrics(truth = GRADE, estimate = .pred_class) |>
  dplyr::filter(.metric == "accuracy") |>
  mutate(Model = "LDA", Data = "Test"),
pfi_lda_aug |>
  roc_auc(truth = GRADE, .pred_A, .pred_B, .pred_C, .pred_D, .pred_Other) |>
  mutate(Model = "LDA", Data = "Test")
)

Comparison of Model Performance (Train vs. Test)

# Combine train and test metrics
all_metrics <- bind_rows(multi_metrics_train |>
                           mutate(.estimate = mean) |>
                           dplyr::select(.metric, .estimate, Model, Data), 
                         lda_metrics_train  |>
                           mutate(.estimate = mean) |>
                           dplyr::select(.metric, .estimate, Model, Data), 
                         multi_test_metrics |>
                           dplyr::select(.metric, .estimate, Model, Data), 
                         lda_test_metrics |>
                           dplyr::select(.metric, .estimate, Model, Data)) |>
  filter(.metric %in% c("accuracy", "roc_auc")) 

ggplot(all_metrics, aes(x = Model, y = .estimate, fill = .metric)) +
  geom_col(position = "dodge") +
  facet_wrap(. ~ Data) +
  labs(title = "Model Performance Comparison (Train vs Test)", 
       y = "Metric Value", x = "Model", fill = "Metric") +
  scale_y_continuous(limits = c(0, 1), breaks = seq(0, 1, by = 0.2)) 

Homework 4 - Multinomial Regression

nonvoters <- read_csv('nonvoters.csv')
## Rows: 5836 Columns: 119
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (5): educ, race, gender, income_cat, voter_category
## dbl (114): RespId, weight, Q1, Q2_1, Q2_2, Q2_3, Q2_4, Q2_5, Q2_6, Q2_7, Q2_...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
nonvoters <- nonvoters %>%
  mutate(
    party = case_match(Q30,
                       1 ~ "Republican",
                       2 ~ "Democrat",
                       3 ~ "Independent",
                       c(4, 5, -1) ~ "Other"),
    party = factor(party, levels = c("Other", "Independent", "Democrat", "Republican")),
    voter_category = factor(voter_category,
                            levels = c("rarely/never", "sporadic", "always"))
  )

Fit the model

multi_mod_party <- multinom_reg() %>%
  set_engine("nnet") %>%
  fit(voter_category ~ ppage + educ + race + gender + income_cat + party, data = nonvoters)

multi_mod_party <- repair_call(multi_mod_party, data = nonvoters)

tidy(multi_mod_party)

The estimated regression equation for “always” relative to the baseline “rarely/never”:

tidy(multi_mod_party) |>
  filter(y.level=='always') |>
  dplyr::select(estimate,term)

\[ \begin{aligned} \log\left(\frac{\hat{p}_{\texttt{"always"}}}{\hat{p}_{\texttt{"rarely/never"}}}\right) = & -2.9195567635 \\ & + 0.0582072009 \,\text{ppage} \\ & - 1.2671996403 \,\text{educHigh school or less} \\ & - 0.3303178875 \,\text{educSome college} \\ & - 0.3410062108 \,\text{raceHispanic} \\ & - 0.6004728052 \,\text{raceOther/Mixed} \\ & + 0.1271973319 \,\text{raceWhite} \\ & - 0.1922012038 \,\text{genderMale} \\ & - 0.0003797663 \,\text{income\_cat\$40-75k} \\ & + 0.1651906989 \,\text{income\_cat\$75-125k} \\ & - 0.6641096103 \,\text{income\_catLess than \$40k} \\ & + 0.8387114497 \,\text{partyIndependent} \\ & + 1.4010025518 \,\text{partyDemocrat} \\ & + 1.2392411768 \,\text{partyRepublican} \end{aligned} \]
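As before, exponentiating turns these log-odds effects into odds ratios; for example, identifying as a Democrat (vs. the “Other” party baseline) multiplies the estimated odds of voting “always” rather than “rarely/never” by:

exp(1.4010025518)  # about 4.06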

Homework 6 - Model Selection: Best Subset, Forward, and Backward Selection

auto <- Auto %>%
  dplyr::select(acceleration, mpg:weight, year, origin) %>%
  mutate(origin = factor(origin, levels = 1:3,
                         labels = c('American', 'European', 'Japanese')))

model <- lm(acceleration ~ 
              horsepower + weight + cylinders + mpg + displacement + year + origin, 
            data = auto)

# Best subset selection
best_subset_model <- regsubsets(
  acceleration ~ horsepower + weight + cylinders + mpg + displacement + year + origin, 
                                data = auto, 
                                nvmax = 10)
best_subset_summary <- summary(best_subset_model)
coef(best_subset_model, which.max(best_subset_summary$adjr2))
##  (Intercept)   horsepower       weight displacement         year 
## 20.197361522 -0.085400840  0.003137086 -0.010431306 -0.040105886
# Forward Selection
forward_selection_model <- ols_step_forward_aic(model)
forward_selection_model[["model"]][["coefficients"]]
##  (Intercept)   horsepower       weight displacement         year 
## 20.197361522 -0.085400840  0.003137086 -0.010431306 -0.040105886
# Backward Selection
backward_selection_model <- ols_step_backward_aic(model)
backward_selection_model[["model"]][["coefficients"]]
##  (Intercept)   horsepower       weight displacement         year 
## 20.197361522 -0.085400840  0.003137086 -0.010431306 -0.040105886
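All three procedures select the same four predictors (horsepower, weight, displacement, year). As a quick check on that choice, the adjusted R-squared trace from the best-subset search can be plotted against subset size:

gf_point(best_subset_summary$adjr2 ~ seq_along(best_subset_summary$adjr2)) |>
  gf_labs(x = "Number of predictors", y = "Adjusted R-squared")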

Homework 7

# Load data and setup training/test/fold partitions
data(Auto)

auto <- Auto |>
  drop_na() |>
  dplyr::select(-name)

auto_split <- initial_split(auto, strata = "acceleration")
auto_train <- training(auto_split)
auto_test <- testing(auto_split)
auto_fold <- vfold_cv(auto_train, v = 10)

# Parallel processing setup: use all but one available core
cores <- availableCores()
plan(strategy = multisession, workers = cores - 1)

Ridge regression

ridge_recipe <- 
  recipe(formula = acceleration ~ ., data = auto_train) %>% 
  step_novel(all_nominal_predictors()) %>%  # add 'new' level to factors and chr -> fctr
  step_dummy(all_nominal_predictors()) %>%  # factors to dummy variables
  step_zv(all_predictors()) %>%             # remove zero variance predictors
  step_normalize(all_predictors())          # normalize all predictors
  
ridge_spec <- 
  linear_reg(penalty = tune(), mixture = 0) %>% 
  set_mode("regression") %>% 
  set_engine("glmnet")
  
ridge_workflow <- workflow() %>% 
  add_recipe(ridge_recipe) %>% 
  add_model(ridge_spec)
  
penalty_grid <- grid_regular(penalty(range = c(-5, 5)), levels = 50)

tune_res <- tune_grid(
  ridge_workflow,
  resamples = auto_fold,
  grid = penalty_grid,
  control = control_grid(parallel_over = "everything")
)
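Plotting RMSE across the penalty grid (as done for PCA below) helps confirm that the selected penalty sits in a stable region:

autoplot(tune_res)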

best_penalty <- select_best(tune_res, metric = "rmse")
ridge_final <- finalize_workflow(ridge_workflow, best_penalty)
ridge_final_fit <- fit(ridge_final, data = auto_train)
ridge_aug <- augment(ridge_final_fit, new_data = auto_test)
ridge_result <- rbind(
  rmse(ridge_aug,truth = acceleration, estimate = .pred),
  rsq(ridge_aug, truth = acceleration, estimate = .pred))
ridge_result

Final ridge regression model

ridge_final_fit <- fit(ridge_final, data=auto)
tidy(ridge_final_fit,conf_int=TRUE)
ridge_aug <- augment(ridge_final_fit, new_data=auto) %>% mutate(.resid=acceleration-.pred)
ridge_aug %>%
  gf_qq(~.resid) %>% gf_qqline()

ridge_aug %>%
  gf_point(.resid~.pred)

Principal components regression

lm_spec <- 
  linear_reg() %>% 
  set_mode("regression") %>% 
  set_engine("lm")
  
pca_recipe <- 
  recipe(formula = acceleration ~ ., data = auto_train) %>% 
  step_novel(all_nominal_predictors()) %>% 
  step_dummy(all_nominal_predictors()) %>% 
  step_zv(all_predictors()) %>% 
  step_normalize(all_predictors()) %>%
  step_pca(all_predictors(), threshold = tune())

pca_workflow <- 
  workflow() %>% 
  add_recipe(pca_recipe) %>% 
  add_model(lm_spec)
  
threshold_grid <- grid_regular(threshold(), levels = 10)

tune_res <- tune_grid(
  pca_workflow,
  resamples = auto_fold,
  grid = threshold_grid,
  control = control_grid(parallel_over = "everything")
)
autoplot(tune_res)

best_threshold <- select_best(tune_res, metric = "rmse")
pca_final <- finalize_workflow(pca_workflow, best_threshold)
pca_final_fit <- fit(pca_final, data = auto_train)
pca_aug <- augment(pca_final_fit, new_data = auto_test)
pca_result <- rbind(
  rmse(pca_aug,truth = acceleration, estimate = .pred),
  rsq(pca_aug, truth = acceleration, estimate = .pred))
pca_result

Final PCA model

pca_final_fit <- fit(pca_final, data=auto)
tidy(pca_final_fit,conf_int=TRUE)
pca_aug <- augment(pca_final_fit, new_data=auto) %>% mutate(.resid=acceleration-.pred)
pca_aug %>%
  gf_qq(~.resid) %>% gf_qqline()

pca_aug %>%
  gf_point(.resid~.pred)

Polynomial Tuning

poly_recipe <- recipe(formula = acceleration ~ ., data = auto_train) %>%
  step_rm(year, origin) %>%
  step_poly(weight, displacement, horsepower, degree = tune()) %>%
  step_normalize(all_predictors())

poly_spec <- linear_reg() %>%
  set_engine("lm") %>%
  set_mode("regression")

poly_workflow <- workflow() %>%
  add_recipe(poly_recipe) %>%
  add_model(poly_spec)

poly_grid <- grid_regular(degree(range = c(1, 10)), levels = 10)

poly_res <- tune_grid(
  poly_workflow,
  resamples = auto_fold,
  grid = poly_grid,
  control = control_grid(save_pred = TRUE, parallel_over = "everything")
)
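Before finalizing, it is worth inspecting the top-ranked polynomial degrees from the grid search:

show_best(poly_res, metric = "rmse")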

best_degree <- select_best(poly_res, metric = "rmse")

poly_final <- finalize_workflow(poly_workflow, best_degree)

poly_final_fit <- fit(poly_final, data = auto_train)

poly_aug <- augment(poly_final_fit, new_data = auto_test)

poly_result <- rbind(
  rmse(poly_aug,truth = acceleration, estimate = .pred),
  rsq(poly_aug, truth = acceleration, estimate = .pred))

poly_result

Final polynomial regression model

poly_final_fit <- fit(poly_final, data=auto)
tidy(poly_final_fit,conf_int=TRUE)
poly_aug <- augment(poly_final_fit, new_data=auto) %>% mutate(.resid=acceleration-.pred)
poly_aug %>%
  gf_qq(~.resid) %>% gf_qqline()

poly_aug %>%
  gf_point(.resid~.pred)

Poisson Regression

bikeshare <- Bikeshare
# Sum-to-zero contrasts: each factor's coefficients sum to zero, so the
# last level's coefficient is implied by the others
contrasts(bikeshare$hr) <- contr.sum(24)
contrasts(bikeshare$mnth) <- contr.sum(12)

pr <- poisson_reg() %>%
  set_mode("regression") %>%
  set_engine("glm")

# poisson_reg() with the "glm" engine already fits with family = poisson,
# so no family argument is needed at fit() time
mod.pois <- pr %>%
  fit(bikers ~ mnth + hr + workingday + temp + weathersit, data = bikeshare)

tidy(mod.pois)
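Poisson coefficients are on the log scale, so exponentiating gives multiplicative effects on the expected count. For example, the working-day effect expressed as a rate ratio:

tidy(mod.pois) |>
  filter(term == "workingday") |>
  mutate(rate_ratio = exp(estimate))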

Plot the coefficients associated with mnth and hr

# With sum-to-zero contrasts, the coefficient for the omitted last level
# (December; hour 23) is the negative sum of the estimated ones
coef.months <- c(tidy(mod.pois)$estimate[2:12], -sum(tidy(mod.pois)$estimate[2:12]))
coef.hours <- c(tidy(mod.pois)$estimate[13:35], -sum(tidy(mod.pois)$estimate[13:35]))

grid.arrange(
  gf_point(coef.months ~ 1:12, pch = 19) %>% gf_line(col = 'blue') %>%
    gf_labs(x = "Month", y = "Coefficient") +
    scale_x_continuous(breaks = seq(1, 12, 1),
                       labels = c("J","F","M","A","M","J","J","A","S","O","N","D")),
  gf_point(coef.hours ~ 1:24, pch = 19) %>% gf_line(col = 'blue') %>%
    gf_labs(x = "Hour", y = "Coefficient") +
    scale_x_continuous(breaks = seq(1, 24, 1)),
  nrow = 1
)