library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.3.0 ──
## ✔ broom        1.0.9     ✔ rsample      1.3.1
## ✔ dials        1.4.1     ✔ tune         1.3.0
## ✔ infer        1.0.9     ✔ workflows    1.2.0
## ✔ modeldata    1.5.0     ✔ workflowsets 1.1.1
## ✔ parsnip      1.3.2     ✔ yardstick    1.3.2
## ✔ recipes      1.3.1     
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
library(ISLR)
library(readxl)

1 Objective 1: Describe probability as a foundation of statistical modeling, including inference and maximum likelihood estimation

1.1 Simple Linear Regression

1.2 Loading and inspecting data

data(Auto, package = "ISLR")
summary(Auto)
##       mpg          cylinders      displacement     horsepower        weight    
##  Min.   : 9.00   Min.   :3.000   Min.   : 68.0   Min.   : 46.0   Min.   :1613  
##  1st Qu.:17.00   1st Qu.:4.000   1st Qu.:105.0   1st Qu.: 75.0   1st Qu.:2225  
##  Median :22.75   Median :4.000   Median :151.0   Median : 93.5   Median :2804  
##  Mean   :23.45   Mean   :5.472   Mean   :194.4   Mean   :104.5   Mean   :2978  
##  3rd Qu.:29.00   3rd Qu.:8.000   3rd Qu.:275.8   3rd Qu.:126.0   3rd Qu.:3615  
##  Max.   :46.60   Max.   :8.000   Max.   :455.0   Max.   :230.0   Max.   :5140  
##                                                                                
##   acceleration        year           origin                      name    
##  Min.   : 8.00   Min.   :70.00   Min.   :1.000   amc matador       :  5  
##  1st Qu.:13.78   1st Qu.:73.00   1st Qu.:1.000   ford pinto        :  5  
##  Median :15.50   Median :76.00   Median :1.000   toyota corolla    :  5  
##  Mean   :15.54   Mean   :75.98   Mean   :1.577   amc gremlin       :  4  
##  3rd Qu.:17.02   3rd Qu.:79.00   3rd Qu.:2.000   amc hornet        :  4  
##  Max.   :24.80   Max.   :82.00   Max.   :3.000   chevrolet chevette:  4  
##                                                  (Other)           :365

1.3 Specifying and fitting simple linear regression: mpg ~ horsepower

lm_spec <- linear_reg() %>%
set_mode("regression") %>%
set_engine("lm")
lm_spec
## Linear Regression Model Specification (regression)
## 
## Computational engine: lm
slr_mod <- lm_spec %>%
fit(mpg ~ horsepower, data = Auto)

slr_mod
## parsnip model object
## 
## 
## Call:
## stats::lm(formula = mpg ~ horsepower, data = data)
## 
## Coefficients:
## (Intercept)   horsepower  
##     39.9359      -0.1578
tidy(slr_mod) # estimates, SEs, t-stats, p-values (inference)
## # A tibble: 2 × 5
##   term        estimate std.error statistic   p.value
##   <chr>          <dbl>     <dbl>     <dbl>     <dbl>
## 1 (Intercept)   39.9     0.717        55.7 1.22e-187
## 2 horsepower    -0.158   0.00645     -24.5 7.03e- 81
summary(slr_mod$fit)
## 
## Call:
## stats::lm(formula = mpg ~ horsepower, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.5710  -3.2592  -0.3435   2.7630  16.9240 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 39.935861   0.717499   55.66   <2e-16 ***
## horsepower  -0.157845   0.006446  -24.49   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.906 on 390 degrees of freedom
## Multiple R-squared:  0.6059, Adjusted R-squared:  0.6049 
## F-statistic: 599.7 on 1 and 390 DF,  p-value: < 2.2e-16
glance(slr_mod)
## # A tibble: 1 × 12
##   r.squared adj.r.squared sigma statistic  p.value    df logLik   AIC   BIC
##       <dbl>         <dbl> <dbl>     <dbl>    <dbl> <dbl>  <dbl> <dbl> <dbl>
## 1     0.606         0.605  4.91      600. 7.03e-81     1 -1179. 2363. 2375.
## # ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>

1.4 Predicted mpg and 95% CI/PI for horsepower = 98

new_data <- tibble(horsepower = 98)

cbind(
new_data,
predict(slr_mod, new_data),
predict(slr_mod, new_data, type = "conf_int"),
predict(slr_mod, new_data, type = "pred_int")
)
##   horsepower    .pred .pred_lower .pred_upper .pred_lower .pred_upper
## 1         98 24.46708    23.97308    24.96108     14.8094    34.12476

1.5 Base R scatterplot + least-squares regression line

plot(Auto$horsepower, Auto$mpg,
xlab = "Horsepower",
ylab = "MPG",
main = "Simple Linear Regression: MPG vs Horsepower",
pch = 20, col = "blue")

abline(slr_mod %>% extract_fit_engine(), col = "red", lwd = 2)

1.6 Residual diagnostics to check model assumptions

slr_engine <- slr_mod %>% extract_fit_engine()

par(mfrow = c(2, 2))
plot(slr_engine)

par(mfrow = c(1, 1))

Comments: From Assignment 1, I fitted a simple linear regression model relating miles per gallon to horsepower using the Auto data set. That example helped me realize that statistical modeling is a probabilistic process rather than a deterministic one. In that assignment, I fitted the model using linear_reg() with “lm” as the engine, considering both the intercept and slope as estimates subject to sampling variability. In that case, the outputs of tidy(slr_mod) and summary(slr_mod$fit) gave me the coefficient estimates coupled with their respective standard errors, test statistics, and p-values. This reinforced that inference would involve probability rather than fixed outcomes. I furthered the exploration of uncertainty in this model by creating confidence and prediction intervals for a given horsepower value. Whereas the confidence interval indicates uncertainty in the estimated mean response, the prediction interval incorporates extra variability at the level of individual observations. This helped make sense of exactly how regression models can quantify different sources of uncertainty and how predictions for individual observations are necessarily more variable than estimates of the mean response. Finally, I explored diagnostic plots by the command plot(slr_mod$fit) to check the assumptions of least squares regression. These diagnostics keep me thinking critically about linearity, constant variance, and normality of residuals and emphasize that valid inference depends on how well these assumptions are met. This example demonstrates my understanding of probability as the foundation of statistical modeling, including how to use standard errors, interval estimates, hypothesis testing, and residual diagnostics to reason carefully about uncertainty in regression analysis.

2 Objective 2 : Apply the appropriate generalized linear model for a specific data context

# Loading PFI 2019 curated data (From Project 2)
df_2019 <- read_excel("pfi-data.xlsx", sheet = "curated 2019")
# Grade standardization and filtering (exactly as used in Project 2)

df <- df_2019 %>%
  mutate(
    ALLGRADEX_2019raw = ALLGRADEX,
    ALLGRADEX_std = case_when(
      ALLGRADEX %in% c(2, 3) ~ 0,
      ALLGRADEX >= 4 & ALLGRADEX <= 15 ~ ALLGRADEX - 3,
      TRUE ~ NA_real_
    )
  ) %>%
  filter(ALLGRADEX_std >= 6, ALLGRADEX_std <= 12) %>%
  mutate(
    SEGRADES = na_if(SEGRADES, -1),
    SEGRADES = na_if(SEGRADES, 5)
  ) %>%
  filter(!is.na(SEGRADES)) %>%
  mutate(
    success = case_when(
      SEGRADES %in% c(1, 2) ~ "high",
      SEGRADES %in% c(3, 4) ~ "low"
    ),
    success = factor(success, levels = c("low", "high"))
  )

2.1 Logistic regression (appropriate GLM)

library(tidyverse)
library(rsample)
library(glmnet)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## Loaded glmnet 4.1-10
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
set.seed(1986)

# 1. Train / test split, stratified by success
data_split <- initial_split(df, prop = 0.8, strata = success)
train <- training(data_split)
test  <- testing(data_split)

# 2. Design matrices for glmnet (no intercept column, drop response)
train_x <- model.matrix(~ . - success - 1, data = train)
test_x  <- model.matrix(~ . - success - 1, data = test)

train_y <- train$success
test_y  <- test$success

# 3. Fix cross-validation folds for reproducibility
set.seed(123)
foldid <- sample(1:10, size = nrow(train_x), replace = TRUE)

# 4. Cross-validated LASSO logistic regression (GLM with binomial link)
cv.lasso <- cv.glmnet(
  x = train_x,
  y = train_y,
  alpha = 1,             # LASSO penalty
  family = "binomial",   # logistic regression
  type.measure = "class",
  foldid = foldid        # fixed folds → reproducible
)

plot(cv.lasso)           # LASSO cross-validation curve

# 5. Final model at best lambda
best_lambda <- cv.lasso$lambda.min

lasso.final <- glmnet(
  x = train_x,
  y = train_y,
  alpha = 1,
  lambda = best_lambda,
  family = "binomial"
)

coef_final <- coef(lasso.final)  # selected variables & coefficients

# 6. Test-set predictions, accuracy, confusion matrix
lasso_prob <- predict(
  cv.lasso,
  newx = test_x,
  s = "lambda.min",
  type = "response"
)

lasso_pred <- ifelse(lasso_prob > 0.5, "high", "low") |>
  factor(levels = c("low", "high"))

# Overall accuracy
mean(lasso_pred == test_y)
## [1] 1
# 7. ROC curve to summarize classification performance
roc_obj <- roc(test_y, as.numeric(lasso_prob))
## Setting levels: control = low, case = high
## Setting direction: controls < cases
plot(roc_obj)

Comments: In this analysis, building on the work completed in Project 2, I utilized a LASSO-penalized logistic regression to model student academic success, which for this problem was treated as a binary outcome distinguishing between high and low grade earners. Since the response variable takes two possible outcomes, a binomial generalized linear model with a logistic link function is an appropriate modeling framework. Working through Project 2 helped further my understanding of how generalized linear models extend classical regression to non-normal response variables and how the choice of link function is closely tied to the structure of the data. Rather than fitting a standard logistic regression, I opted for a LASSO-regularized version because Project 2 involved a large number of potential predictors. This approach allowed the model to remain interpretable while addressing overfitting and multicollinearity, which are common challenges in high-dimensional data settings.

A major conceptual takeaway from completing Project 2 was understanding how regularization directly impacts variable selection and coefficient stability. The LASSO coefficient path plot provided clear insight into how predictors enter the model as the penalty parameter λ is relaxed. Only a small subset of predictors exhibited meaningful non-zero coefficients early in the path, while many were shrunk to zero across most values of λ. This demonstrated how LASSO effectively filters out noise variables and retains predictors that contribute most strongly to academic success. Observing these coefficient paths helped me recognize that LASSO is not only a predictive technique but also a principled, data-driven method for simplifying complex models.

This understanding was further reinforced through the cross-validation curve produced in Project 2. Examining misclassification error across a wide range of λ values allowed me to identify a penalty that properly balanced model complexity and predictive performance. Selecting λ based on cross-validation improved the model’s ability to generalize to unseen data, rather than overfitting to the training sample. Additional evaluation using test-set accuracy and the ROC curve confirmed strong classification performance and clear separation between high- and low-success students. Overall, Project 2 strengthened my ability to justify model choice, interpret regularization behavior, and connect graphical diagnostics with statistical reasoning. This analysis demonstrated how generalized linear models, when combined with modern regularization techniques such as LASSO, can be applied effectively to real-world educational data.

3 Objective 3 : Demonstrate model selection given a set of candidate models

3.1 K-Fold Classification

library(tidyverse)
library(tidymodels)
library(ISLR2)
## 
## Attaching package: 'ISLR2'
## The following object is masked _by_ '.GlobalEnv':
## 
##     Auto
## The following objects are masked from 'package:ISLR':
## 
##     Auto, Credit
library(boot)
set.seed(123)
# Load Auto data (similar to Homework 3)
data("Auto", package = "ISLR2")

auto_data <- Auto %>%
  select(-name) %>%
  mutate(origin = as.factor(origin))

glimpse(auto_data)
## Rows: 392
## Columns: 8
## $ mpg          <dbl> 18, 15, 18, 16, 17, 15, 14, 14, 14, 15, 15, 14, 15, 14, 2…
## $ cylinders    <int> 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4, 6, 6, 6, 4, …
## $ displacement <dbl> 307, 350, 318, 304, 302, 429, 454, 440, 455, 390, 383, 34…
## $ horsepower   <int> 130, 165, 150, 150, 140, 198, 220, 215, 225, 190, 170, 16…
## $ weight       <int> 3504, 3693, 3436, 3433, 3449, 4341, 4354, 4312, 4425, 385…
## $ acceleration <dbl> 12.0, 11.5, 11.0, 12.0, 10.5, 10.0, 9.0, 8.5, 10.0, 8.5, …
## $ year         <int> 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 7…
## $ origin       <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 3, …
# Candidate single-predictor models for acceleration (exact HW3 style)
K <- 5

single_predictors <- list(
  acceleration ~ mpg,
  acceleration ~ cylinders,
  acceleration ~ displacement,
  acceleration ~ horsepower,
  acceleration ~ weight,
  acceleration ~ year,
  acceleration ~ origin
)

predictors <- c(
  "mpg", "cylinders", "displacement",
  "horsepower", "weight", "year", "origin"
)
# 5-fold CV errors for each candidate model (Homework 3 loop)
cv.single <- rep(0, 7)

for (i in 1:7) {
  glm.fit <- glm(single_predictors[[i]], data = auto_data)
  cv.single[i] <- cv.glm(auto_data, glm.fit, K = K)$delta[1]
}

cv_results <- data.frame(
  Predictor_with_accel = paste("accel ~", predictors),
  CV_Error = round(cv.single, 6)
)

cv_results
##   Predictor_with_accel CV_Error
## 1          accel ~ mpg 6.330363
## 2    accel ~ cylinders 5.680925
## 3 accel ~ displacement 5.405033
## 4   accel ~ horsepower 4.051086
## 5       accel ~ weight 6.310960
## 6         accel ~ year 7.005236
## 7       accel ~ origin 7.200010
# Final model chosen in Homework 3
final_model <- lm(
  acceleration ~ origin + horsepower + weight + displacement,
  data = auto_data
)

summary(final_model)
## 
## Call:
## lm(formula = acceleration ~ origin + horsepower + weight + displacement, 
##     data = auto_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.1175 -1.0562 -0.1803  0.9303  6.7607 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  17.0465482  0.5225105  32.624   <2e-16 ***
## origin2       0.1216945  0.2896750   0.420   0.6746    
## origin3       0.0209772  0.2861930   0.073   0.9416    
## horsepower   -0.0838621  0.0054500 -15.388   <2e-16 ***
## weight        0.0030547  0.0002931  10.421   <2e-16 ***
## displacement -0.0095935  0.0029606  -3.240   0.0013 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.72 on 386 degrees of freedom
## Multiple R-squared:  0.6161, Adjusted R-squared:  0.6111 
## F-statistic: 123.9 on 5 and 386 DF,  p-value: < 2.2e-16
plot(final_model$fitted.values, final_model$residuals,
xlab = "Fitted Values",
ylab = "Residuals",
main = "plot of Residuals vs Fitted",
pch = 19, col = "darkgreen")
abline(h = 0, lwd = 2, col = "black")

auto_data$pred <- predict(final_model, newdata = auto_data)

plot(
  auto_data$pred,
  auto_data$acceleration,
  xlab = "Predicted Acceleration",
  ylab = "Observed Acceleration",
  main = "Predicted vs Observed Acceleration",
  pch = 19,
  col = "blue4"
)

abline(0, 1, col = "black", lwd = 2)

Comments: For this objective, I applied model selection to identify a final multiple linear regression model that best explains vehicle acceleration using variables from the Auto data set. Given prior exploratory analysis and comparisons of candidate predictors from Homework 3, I included origin, horsepower, weight, and displacement in the final model. Once this model was determined, I checked its performance by looking at summary statistics and diagnostic plots. A residuals-versus-fitted plot examined linearity and constant variance, while a plot of predicted versus observed helped me gauge how well the model captured the overall patterns of acceleration. One problem that I encountered in this process was redundancy between the predictors-for instance, horsepower, weight, and displacement-made choices less than obvious. I resolved this by basing my choice of predictor on both statistical significance and interpretability rather than including all possible predictors. The process of model validation furthered my understanding of the importance of care in the evaluation of candidate models and the validation of a final choice with diagnostic and predictive checks, as opposed to relying on any one metric.

4 Objective 4: Communicate and interpret statistical modeling results clearly for a general audience.

4.1 Multiple linear regression

library(ggplot2)

# Load diamonds dataset and create diamonds1 (clean modeling data)
data(diamonds, package = "ggplot2")

diamonds1 <- diamonds %>%
  rename(
    length = x,
    width  = y,
    height = z,
    depth_percentage = depth
  ) %>%
  select(
    carat,
    length,
    width,
    height,
    depth_percentage,
    table,
    cut,
    color,
    clarity
  ) %>%
  filter(
    length > 0,
    width > 0,
    height > 0
  )

glimpse(diamonds1)
## Rows: 53,920
## Columns: 9
## $ carat            <dbl> 0.23, 0.21, 0.23, 0.29, 0.31, 0.24, 0.24, 0.26, 0.22,…
## $ length           <dbl> 3.95, 3.89, 4.05, 4.20, 4.34, 3.94, 3.95, 4.07, 3.87,…
## $ width            <dbl> 3.98, 3.84, 4.07, 4.23, 4.35, 3.96, 3.98, 4.11, 3.78,…
## $ height           <dbl> 2.43, 2.31, 2.31, 2.63, 2.75, 2.48, 2.47, 2.53, 2.49,…
## $ depth_percentage <dbl> 61.5, 59.8, 56.9, 62.4, 63.3, 62.8, 62.3, 61.9, 65.1,…
## $ table            <dbl> 55, 61, 65, 58, 58, 57, 57, 55, 61, 61, 55, 56, 61, 5…
## $ cut              <ord> Ideal, Premium, Good, Premium, Good, Very Good, Very …
## $ color            <ord> E, E, E, I, J, J, I, H, E, H, J, J, F, J, E, E, I, J,…
## $ clarity          <ord> SI2, SI1, VS1, VS2, SI2, VVS2, VVS1, SI1, VS2, VS1, S…

4.2 Exploratory Data Analysis (EDA)

# Summary of response
summary(diamonds1$carat)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.2000  0.4000  0.7000  0.7977  1.0400  5.0100
# Carat vs length (geometric relationship)
ggplot(diamonds1, aes(x = length, y = carat)) +
  geom_point(alpha = 0.3) +
  labs(
    title = "Carat vs Length",
    x = "Length",
    y = "Carat"
  )

# Carat by cut
ggplot(diamonds1, aes(x = cut, y = carat)) +
  geom_boxplot() +
  labs(
    title = "Carat by Cut",
    x = "Cut",
    y = "Carat"
  )

# Carat by clarity
ggplot(diamonds1, aes(x = clarity, y = carat)) +
  geom_boxplot() +
  labs(
    title = "Carat by Clarity",
    x = "Clarity",
    y = "Carat"
  )

4.3 Train / test split

set.seed(1986)

# Train / test split
data_split <- initial_split(diamonds1, prop = 0.8)
diamonds1_train <- training(data_split)
diamonds1_test  <- testing(data_split)

4.4 Multiple Linear Regression Model

# Linear regression spec
lm_spec <- linear_reg() %>%
  set_mode("regression") %>%
  set_engine("lm")

# Fit multiple regression: quantitative + qualitative predictors
carat_fit <- lm_spec %>%
  fit(
    carat ~ length + cut + color + clarity + depth_percentage + table,
    data = diamonds1_train
  )
carat_fit
## parsnip model object
## 
## 
## Call:
## stats::lm(formula = carat ~ length + cut + color + clarity + 
##     depth_percentage + table, data = data)
## 
## Coefficients:
##      (Intercept)            length             cut.L             cut.Q  
##        -2.807944          0.412757          0.012509         -0.010353  
##            cut.C             cut^4           color.L           color.Q  
##         0.001530          0.002408          0.041797          0.023124  
##          color.C           color^4           color^5           color^6  
##        -0.002561         -0.008012          0.002919         -0.001208  
##        clarity.L         clarity.Q         clarity.C         clarity^4  
##         0.010290          0.044776         -0.026236          0.002507  
##        clarity^5         clarity^6         clarity^7  depth_percentage  
##        -0.004163         -0.002100          0.007856          0.018591  
##            table  
##         0.001845
tidy(carat_fit)
## # A tibble: 21 × 5
##    term        estimate std.error statistic   p.value
##    <chr>          <dbl>     <dbl>     <dbl>     <dbl>
##  1 (Intercept) -2.81     0.0328      -85.7  0        
##  2 length       0.413    0.000466    885.   0        
##  3 cut.L        0.0125   0.00210       5.95 2.71e-  9
##  4 cut.Q       -0.0104   0.00168      -6.16 7.33e- 10
##  5 cut.C        0.00153  0.00144       1.06 2.89e-  1
##  6 cut^4        0.00241  0.00115       2.09 3.68e-  2
##  7 color.L      0.0418   0.00160      26.1  1.35e-148
##  8 color.Q      0.0231   0.00146      15.8  5.31e- 56
##  9 color.C     -0.00256  0.00137      -1.87 6.20e-  2
## 10 color^4     -0.00801  0.00126      -6.35 2.13e- 10
## # ℹ 11 more rows
# Train performance
train_pred <- predict(carat_fit, diamonds1_train) %>%
  bind_cols(diamonds1_train %>% select(carat))

train_metrics <- train_pred %>%
  metrics(truth = carat, estimate = .pred)

# Test performance
test_pred <- predict(carat_fit, diamonds1_test) %>%
  bind_cols(diamonds1_test %>% select(carat))

test_metrics <- test_pred %>%
  metrics(truth = carat, estimate = .pred)

# Compare train vs test metrics
train_metrics %>%
  select(.metric, train = .estimate) %>%
  inner_join(
    test_metrics %>% select(.metric, test = .estimate),
    by = ".metric"
  )
## # A tibble: 3 × 3
##   .metric  train   test
##   <chr>    <dbl>  <dbl>
## 1 rmse    0.0942 0.0938
## 2 rsq     0.961  0.960 
## 3 mae     0.0721 0.0715

4.5 # Observed vs predicted carat

ggplot(test_pred, aes(x = carat, y = .pred)) +
  geom_point(alpha = 0.3) +
  geom_abline(slope = 1, intercept = 0, color = "red") +
  labs(
    title = "Observed vs Predicted Carat",
    x = "Observed carat",
    y = "Predicted carat"
  )

Comments: In this, I executed multiple linear regression applying both quantitative and qualitative predictors to model diamond carat with respect to geometric measurements and categorical quality attributes. It started with thorough exploratory data analysis, examining the relationships between carat and predictors of interest, including length, width, height, cut, color, and clarity. Such visualizations helped to reveal high nonlinear geometric relationships and highlighted the issue of multicollinearity among the physical dimensions to provide information for later modeling choices. Finally, using the tidymodels framework, I implemented a clean train-test split and fit the regression model only on the training data, allowing for an honest evaluation of predictive performance on held-out data.

Perhaps the most valuable lesson learned in modeling involves recognizing how early decisions within a model can inadvertently introduce performance evaluation bias. Variable selection, to include multicollinearity and the refinement of predictor sets, was directly guided by the full set in the case of exploratory analysis during the original group project 1. While this had been quite useful in drawing valuable insight, it also meant that information from the test data indeed had indirectly trickled into the final model and resulted in overly optimistic test performance. The instructor explained that such a practice is termed data peeking and undermines the validity of predictive assessments.

In this portfolio version, I rectified that with strict data separation in the modeling workflow: data were split into training and test sets prior to model fitting; models were fit only on the training data; and all evaluation metrics along with observed-versus-predicted plots were generated only on the test data. That adjustment made the performance metrics from the models reflect real out-of-sample behavior. More importantly, the errors were assessed through various metrics-RMSE, MAE, R-squared-together with visual diagnostics, which allowed me to clearly communicate the results to a general audience. All in all, this objective furthered my understanding of responsible statistical modeling, the use of validation, and how thoughtful workflow design directly impacts the credibility of the conclusions drawn from modeling.

5 Objective 5 : Apply an appropriate classification method to model a categorical response and evaluate how predictor variables separate outcome groups.

5.1 Load and preprocess the PFI 2019 data (same as Project 2)

df_2019 <- read_excel("pfi-data.xlsx", sheet = "curated 2019")

5.2 Standardize grade and filter to middle/high school

set.seed(123)
df_2019 <- df_2019 %>%
mutate(
ALLGRADEX_2019raw = ALLGRADEX,
ALLGRADEX_std = case_when(
ALLGRADEX %in% c(2, 3) ~ 0, # both K flavors → Kindergarten
ALLGRADEX >= 4 & ALLGRADEX <= 15 ~ ALLGRADEX - 3, # 1st–12th → 1–12
TRUE ~ NA_real_
)
)

df <- df_2019 %>%
filter(ALLGRADEX_std >= 6 & ALLGRADEX_std <= 12)

5.3 Create binary success outcome and clean SEGRADES

df <- df %>%
mutate(
SEGRADES = na_if(SEGRADES, -1),
SEGRADES = na_if(SEGRADES, 5)
) %>%
filter(!is.na(SEGRADES)) %>%
mutate(
success = case_when(
SEGRADES %in% c(1, 2) ~ "high",
SEGRADES %in% c(3, 4) ~ "low"
),
success = factor(success, levels = c("low", "high"))
)
df <- df %>%
  select(-ZIPLOCL, -CDOBMM, -CDOBYY, -BASMID, -ALLGRADEX_std, -MOSTIMPT, -INTNUM, -ALLGRADEX_2019raw, -ALLGRADEX, -SEGRADES, - HHPARN19_BRD)

5.4 Choosing predictors for LDA

predictor_variables <- c(
"HHPARN19X", # number of parents in household
"EDCPUB", # public vs other school
"SCCHOICE", # school choice
"EINTNET", # internet at home
"SCHLHRSWK", # school hours per week
"PARGRADEX", # parent education
"NUMSIBSX" # number of siblings
)

df_model <- df %>%
select(success, all_of(predictor_variables))

# Check that all predictors are numeric

df_model %>%
summarise(across(all_of(predictor_variables), ~ class(.)))
## # A tibble: 1 × 7
##   HHPARN19X EDCPUB  SCCHOICE EINTNET SCHLHRSWK PARGRADEX NUMSIBSX
##   <chr>     <chr>   <chr>    <chr>   <chr>     <chr>     <chr>   
## 1 numeric   numeric numeric  numeric numeric   numeric   numeric

5.5 Stratified train/test split on success

data_split <- initial_split(df_model, prop = 0.8, strata = success)
train_data <- training(data_split)
test_data  <- testing(data_split)

nrow(train_data)
## [1] 7472
nrow(test_data)
## [1] 1869

5.6 LDA model using tidymodels (MASS engine with equal priors)

library(discrim)
## 
## Attaching package: 'discrim'
## The following object is masked from 'package:dials':
## 
##     smoothness
lda_model <- discrim_linear() %>%
  set_mode("classification") %>%
  set_engine("MASS", prior = c(low = 0.5, high = 0.5)) %>%
  fit(
    success ~ HHPARN19X + EDCPUB + SCCHOICE +
      EINTNET + SCHLHRSWK + PARGRADEX + NUMSIBSX,
    data = train_data
  )

lda_model
## parsnip model object
## 
## Call:
## lda(success ~ HHPARN19X + EDCPUB + SCCHOICE + EINTNET + SCHLHRSWK + 
##     PARGRADEX + NUMSIBSX, data = data, prior = ~c(low = 0.5, 
##     high = 0.5))
## 
## Prior probabilities of groups:
##  low high 
##  0.5  0.5 
## 
## Group means:
##      HHPARN19X   EDCPUB SCCHOICE  EINTNET SCHLHRSWK PARGRADEX  NUMSIBSX
## low   1.786437 1.042510 1.471660 3.872470  3.688259  3.126518 0.9321862
## high  1.409007 1.111814 1.344695 3.910703  3.776218  3.676434 1.0413325
## 
## Coefficients of linear discriminants:
##                   LD1
## HHPARN19X -0.70532349
## EDCPUB     0.68012945
## SCCHOICE  -0.59014236
## EINTNET    0.31094456
## SCHLHRSWK  0.12774174
## PARGRADEX  0.46740115
## NUMSIBSX   0.08922897

5.7 Predictions and performance on test data

lda_test <- augment(lda_model, new_data = test_data)

head(lda_test)
## # A tibble: 6 × 11
##   .pred_class .pred_low .pred_high success HHPARN19X EDCPUB SCCHOICE EINTNET
##   <fct>           <dbl>      <dbl> <fct>       <dbl>  <dbl>    <dbl>   <dbl>
## 1 high            0.275      0.725 high            1      1        1       4
## 2 high            0.275      0.725 high            1      1        1       4
## 3 high            0.348      0.652 high            1      1        2       4
## 4 high            0.376      0.624 high            1      1        2       4
## 5 low             0.517      0.483 high            1      1        2       4
## 6 high            0.236      0.764 high            1      2        1       4
## # ℹ 3 more variables: SCHLHRSWK <dbl>, PARGRADEX <dbl>, NUMSIBSX <dbl>
conf_mat(lda_test, truth = success, estimate = .pred_class)
##           Truth
## Prediction  low high
##       low   144  530
##       high  103 1092

5.8 Confusion matrix heatmap

cm <- conf_mat(lda_test, truth = success, estimate = .pred_class)
cm_df <- as.data.frame(cm$table)

ggplot(cm_df, aes(x = Prediction, y = Truth, fill = Freq)) +
geom_tile() +
geom_text(aes(label = Freq), color = "white", size = 6) +
scale_fill_gradient(low = "lightblue", high = "darkblue") +
labs(
title = "LDA Confusion Matrix – Low vs High Success",
x = "Predicted class",
y = "True class"
)

Comments: For this goal, I implemented LDA under the tidymodels framework using the MASS engine, directly continuing from Project 2. I utilized the same PFI 2019 dataset, steps in preprocessing, the set of predictors, and a stratified train–test split to ensure reproducibility and consistency. I validated model performance on the test set using a confusion matrix, with results identical to those obtained in Project 2. Based on this confirmation that my implementation was correct and stable, I went on to explore how the predictors like parental education, internet access, and school characteristics contributed to separating students into two groups according to their success, furthering my understanding of how LDA works in classification.

This objective required a great deal of effort and learning in parallel with implementation. Understanding conceptual issues, such as how LDA differs from logistic regression, especially between group separation versus probability estimation, was one challenge. Other challenges included working with the tidymodels–parsnip interface, getting all the predictors correctly formatted, and reproducing identical results through careful control of data filtering and splitting. In revisiting this analysis beyond Project 2, I strengthened both my technical confidence and my conceptual understandings around classification methods, learning how to interpret clearly outputs from LDA and how to build reproducible modeling pipelines.

6 Objective 6 : Apply an appropriate classification method for a categorical response with more than two outcome levels and evaluate model performance.

6.1 Multinomial regression with multiple predictors

library(nnet) 
set.seed(123)

6.2 Use cleaned diamonds1 data from Project 1

diamonds_multiclass <- diamonds1 %>%
select(cut, carat, length, width, depth_percentage, color, clarity) %>%
drop_na() %>%
mutate(
cut = fct_drop(cut), # make sure factor has only used levels
color = fct_drop(color),
clarity = fct_drop(clarity)
)

diamonds_multiclass %>%
count(cut)
## # A tibble: 5 × 2
##   cut           n
##   <ord>     <int>
## 1 Fair       1609
## 2 Good       4902
## 3 Very Good 12081
## 4 Premium   13780
## 5 Ideal     21548

6.3 Carat by cut (shows separation across classes)

ggplot(diamonds_multiclass, aes(x = cut, y = carat)) +
  geom_boxplot() +
  labs(
    title = "Carat by Diamond Cut",
    x = "Cut",
    y = "Carat"
  )

6.4 Train / test split stratified on cut

mn_split <- initial_split(diamonds_multiclass, prop = 0.8, strata = cut)
mn_train <- training(mn_split)
mn_test <- testing(mn_split)

6.5 Recipe: dummy-code factors and standardize predictors

mn_rec <- recipe(
cut ~ carat + length + width + depth_percentage + color + clarity,
data = mn_train
) %>%
step_dummy(all_nominal_predictors()) %>%
step_normalize(all_predictors())

6.6 Multinomial logistic regression specification

mn_spec <- multinom_reg() %>%
set_mode("classification") %>%
set_engine("nnet", trace = FALSE)

mn_wf <- workflow() %>%
add_model(mn_spec) %>%
add_recipe(mn_rec)

mn_fit <- fit(mn_wf, data = mn_train)

mn_fit
## ══ Workflow [trained] ══════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: multinom_reg()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 2 Recipe Steps
## 
## • step_dummy()
## • step_normalize()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## Call:
## nnet::multinom(formula = ..y ~ ., data = data, trace = ~FALSE)
## 
## Coefficients:
##           (Intercept)     carat     length     width depth_percentage
## Good         2.162662 0.3346835 -25.810662 25.141096       -0.5425625
## Very Good    3.094372 0.7244061 -33.876372 32.742486       -1.0596477
## Premium      3.107019 0.9182927  -1.893564  0.735827       -1.6315121
## Ideal        3.823593 0.1564855 -21.845664 21.187865       -1.1756534
##                color_1     color_2     color_3     color_4      color_5
## Good      -0.008345848  0.02704029 -0.03668973 -0.09333443 -0.039724897
## Very Good -0.027299436 -0.01931523 -0.06232950 -0.09427425  0.011562211
## Premium    0.036813993 -0.04696361 -0.07884707 -0.02868667  0.005842831
## Ideal     -0.010229247 -0.01815031 -0.11602554 -0.05988645 -0.024045432
##               color_6  clarity_1   clarity_2 clarity_3  clarity_4   clarity_5
## Good      -0.01643402 0.04069123 -0.12048053 0.1023798 -0.1708981 -0.09622782
## Very Good -0.00984659 0.25437360 -0.19778385 0.1077597 -0.2471526 -0.06052508
## Premium   -0.04239995 0.26580447 -0.21962990 0.2078173 -0.1755349 -0.05210220
## Ideal     -0.05180139 0.49822380 -0.09634712 0.1242807 -0.1216619 -0.05361314
##             clarity_6    clarity_7
## Good      -0.08192723 -0.055394532
## Very Good -0.04454605  0.012898510
## Premium   -0.11630372  0.003367068
## Ideal     -0.07661510  0.050342994
## 
## Residual Deviance: 101032.8 
## AIC: 101176.8

6.7 Predictions and performance on test data

mn_test_pred <- predict(mn_fit, mn_test, type = "prob") %>%
bind_cols(predict(mn_fit, mn_test, type = "class")) %>%
bind_cols(mn_test %>% select(cut))

6.8 Overall accuracy and log-loss

mn_test_pred %>%
metrics(truth = cut, estimate = .pred_class)
## # A tibble: 2 × 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy multiclass     0.517
## 2 kap      multiclass     0.278

6.9 Confusion matrix for the 5 cut categories

mn_test_pred %>%
conf_mat(truth = cut, estimate = .pred_class)
##            Truth
## Prediction  Fair Good Very Good Premium Ideal
##   Fair       208   17         4       2     2
##   Good        19   19         0       0     2
##   Very Good   15  230       544     100   364
##   Premium     61  185       227    1589   710
##   Ideal       56  564      1622    1029  3217

Comments: Here, I expanded on earlier coursework by incorporating a multiclass classification model that I had not used previously in my projects. I know and learnt concepts from the classes you taught and tried as per my understanding. Even though I had worked on the diamonds dataset for multiple linear regression, I had not used it in a multiclass classification context. In service of this goal, I modeled the categorical outcome cut using the diamonds data, which includes more than two levels of quality and therefore requires a multinomial classification. This provided an opportunity to go beyond binary classification methods and to apply a more general framework suitable for multi-category responses. Prior to model fitting, I performed a brief exploratory analysis to confirm the multiclass nature of the outcome and to understand how key predictors vary across categories. In particular, the carat-by-cut boxplot showed clear differences in typical carat size across cut levels, motivating the use of a multiclass classification approach. I trained the model using several quantitative and qualitative predictors and then evaluated its performance on a held-out test set using a confusion matrix and overall accuracy. One challenge in doing this was thinking about classification performance across multiple categories rather than a single success-failure outcome. The value of this exercise was in learning how to organize multiclass models within the tidymodels framework, how to prepare predictors accordingly, and how classification methods can be generalized to handle more complex outcome structures. Overall, this objective reflects how I used the portfolio to build on earlier work and develop modeling skills that I had not fully implemented previously in the course.

7 Objective 7: Extend linear regression by incorporating polynomial terms to model nonlinear predictor–response relationships

7.1 Polynomial regression

7.2 Load Auto data (same dataset used in homework1)

data("Auto", package = "ISLR")

auto_poly <- Auto %>%
drop_na(mpg, horsepower)

glimpse(auto_poly)
## Rows: 392
## Columns: 9
## $ mpg          <dbl> 18, 15, 18, 16, 17, 15, 14, 14, 14, 15, 15, 14, 15, 14, 2…
## $ cylinders    <dbl> 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4, 6, 6, 6, 4, …
## $ displacement <dbl> 307, 350, 318, 304, 302, 429, 454, 440, 455, 390, 383, 34…
## $ horsepower   <dbl> 130, 165, 150, 150, 140, 198, 220, 215, 225, 190, 170, 16…
## $ weight       <dbl> 3504, 3693, 3436, 3433, 3449, 4341, 4354, 4312, 4425, 385…
## $ acceleration <dbl> 12.0, 11.5, 11.0, 12.0, 10.5, 10.0, 9.0, 8.5, 10.0, 8.5, …
## $ year         <dbl> 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 7…
## $ origin       <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 3, …
## $ name         <fct> chevrolet chevelle malibu, buick skylark 320, plymouth sa…

7.3 Train / test split

poly_split <- initial_split(auto_poly, prop = 0.8, strata = mpg)
poly_train <- training(poly_split)
poly_test <- testing(poly_split)

7.4 Recipe: add quadratic term in horsepower

poly_rec <- recipe(mpg ~ horsepower, data = poly_train) %>%
step_poly(horsepower, degree = 2) %>% # creates hp and hp^2
step_normalize(all_predictors())

7.5 Linear regression spec

poly_spec <- linear_reg() %>%
set_mode("regression") %>%
set_engine("lm")


poly_wf <- workflow() %>%
add_model(poly_spec) %>%
add_recipe(poly_rec)

poly_fit <- fit(poly_wf, data = poly_train)

tidy(poly_fit)
## # A tibble: 3 × 5
##   term              estimate std.error statistic   p.value
##   <chr>                <dbl>     <dbl>     <dbl>     <dbl>
## 1 (Intercept)          23.5      0.244     96.3  1.58e-232
## 2 horsepower_poly_1    -6.18     0.244    -25.3  3.19e- 77
## 3 horsepower_poly_2     2.27     0.244      9.29 2.94e- 18

7.6 Show the curved relationship captured by the polynomial

hp_grid <- tibble(horsepower = seq(
min(auto_poly$horsepower),
max(auto_poly$horsepower),
length.out = 100
))

curve_pred <- predict(poly_fit, hp_grid) %>%
bind_cols(hp_grid)

ggplot(auto_poly, aes(x = horsepower, y = mpg)) +
geom_point(alpha = 0.3) +
geom_line(data = curve_pred, aes(y = .pred), color = "red", size = 1) +
labs(
title = "Polynomial Regression: MPG vs Horsepower",
x = "Horsepower",
y = "MPG"
)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Comments:

I learned how to extend linear regression in order to capture nonlinear relationships with polynomial regression within the tidymodels framework. Even though I had worked with the Auto dataset for simple linear regression, I had not, until now, modeled nonlinear effects explicitly. I know and learnt concepts from the classes you taught and tried as per my understanding. In this example, I added a quadratic term for horsepower to better represent the relationship between horsepower and miles per gallon, mpg, that was curved across exploratory plots. I did this with a recipe that uses step_poly() to create polynomial terms and then used a train-test split to validate the model rather than fit on the full dataset. The main difficulty was understanding how the polynomial terms come in via preprocessing without touching the formula manually; that helped crystallize the separation between data preparation and model specification in tidymodels. The visualization of the results clearly shows that a quadratic relationship captures the trend of decreasing and leveling-off in mpg better than the simple linear fit. This exercise gave me a new skill of modeling beyond what I originally implemented during course projects and strengthened how to model nonlinear patterns responsibly with proper validation.This graph shows that the relationship between horsepower and MPG is nonlinear, and the polynomial regression curve captures this curved trend more accurately than a straight-line model.