1 Multiple Regression with Quantitative and Qualitative Predictors

data(Auto)

auto_clean <- Auto |>
  mutate(origin = as.factor(origin)) |>
  select(-name) |>
  drop_na()

auto_split <- initial_split(auto_clean, prop = 0.8, strata = acceleration)
auto_train <- training(auto_split)
auto_test <- testing(auto_split)

multi_lm_recipe <- recipe(acceleration ~ weight + horsepower + origin, data = auto_train) |>
  step_dummy(origin) |>
  step_normalize(all_numeric_predictors())

multi_lm_model <- linear_reg() |>
  set_engine("lm")

multi_lm_workflow <- workflow() |>
  add_recipe(multi_lm_recipe) |>
  add_model(multi_lm_model)

multi_lm_fit <- fit(multi_lm_workflow, data = auto_train)

multi_lm_preds <- predict(multi_lm_fit, auto_test) |>
  bind_cols(auto_test)

rmse(data = multi_lm_preds, truth = acceleration, estimate = .pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        1.73

2 Multinomial Logistic Regression with Multiple Predictors

df_2016 <- read_excel("pfi-data.xlsx", sheet = "curated 2016") |>
  mutate(SEFUTUREX = as.factor(SEFUTUREX))

numeric_vars_2016 <- df_2016 |>
  select(-BASMID, -SEFUTUREX) |>
  select(where(is.numeric))

pca_result_2016 <- prcomp(numeric_vars_2016, scale. = TRUE)

pca_scores <- as.data.frame(pca_result_2016$x[, 1:3])
colnames(pca_scores) <- c("PC1", "PC2", "PC3")

multi_data <- bind_cols(SEFUTUREX = df_2016$SEFUTUREX, pca_scores) |>
  drop_na()

multi_split <- initial_split(multi_data, prop = 0.7, strata = SEFUTUREX)
multi_train <- training(multi_split)
multi_test <- testing(multi_split)

multi_spec <- multinom_reg() |>
  set_engine("nnet") |>
  set_mode("classification")

multi_recipe <- recipe(SEFUTUREX ~ ., data = multi_train)

multi_workflow <- workflow() |>
  add_model(multi_spec) |>
  add_recipe(multi_recipe)

multi_fit <- fit(multi_workflow, data = multi_train)

multi_preds <- predict(multi_fit, new_data = multi_test, type = "class") |>
  bind_cols(multi_test)

metrics(multi_preds, truth = SEFUTUREX, estimate = .pred_class)
## # A tibble: 2 × 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy multiclass    0.445 
## 2 kap      multiclass    0.0981
accuracy(multi_preds, truth = SEFUTUREX, estimate = .pred_class)
## # A tibble: 1 × 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy multiclass     0.445
conf_mat(multi_preds, truth = SEFUTUREX, estimate = .pred_class)
##           Truth
## Prediction    1    2    3    4    5    6
##          1    0    0    0    0    0    0
##          2    7   57   24   34   23   24
##          3    0    1    1    0    0    0
##          4    0   29   16   51   41   38
##          5    4   54   66   92  168  152
##          6   14  121  144  351 1006 1522

3 Poisson Regression

df_2019 <- read_excel("pfi-data.xlsx", sheet = "curated 2019")

numeric_vars <- df_2019 |>
  select(-BASMID, -NUMSIBSX)

pca_result <- prcomp(numeric_vars, scale. = TRUE)

pca_scores <- as.data.frame(pca_result$x[, 1:3])
poisson_data <- cbind(pca_scores, y = df_2019$NUMSIBSX)

split <- initial_split(poisson_data, prop = 0.7)
train_data <- training(split)
test_data  <- testing(split)

poisson_model <- glm(y ~ PC1 + PC2 + PC3,
                     data = train_data,
                     family = poisson(link = "log"))

predicted <- predict(poisson_model, newdata = test_data, type = "response")
results <- data.frame(actual = test_data$y, predicted = predicted)

rmse <- sqrt(mean((results$actual - results$predicted)^2))
rmse
## [1] 0.9667707
poisson_data$predicted <- predict(poisson_model, newdata = poisson_data, type = "response")

ggplot(poisson_data, aes(x = y, y = predicted)) +
  geom_point(alpha = 0.5, color = "steelblue") +
  geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") +
  labs(
    title = paste("Poisson Regression: Actual vs Predicted (RMSE =", round(rmse, 3), ")"),
    x = "Actual Number of Siblings",
    y = "Predicted Number of Siblings"
  ) +
  theme_minimal()

4 Ridge Regression

data(Auto)

auto_clean <- Auto |>
  select(-name) |>
  drop_na()

auto_split <- initial_split(auto_clean, prop = 0.8, strata = acceleration)
auto_train <- training(auto_split)
auto_test <- testing(auto_split)

auto_recipe <- recipe(acceleration ~ ., data = auto_train)

ridge_spec <- linear_reg(penalty = tune(), mixture = 0) |>
  set_engine("glmnet")

ridge_workflow <- workflow() |>
  add_model(ridge_spec) |>
  add_recipe(auto_recipe)


ridge_res <- tune_grid(
  ridge_workflow,
  resamples = vfold_cv(auto_train, v = 5),
  grid = 20,
  metrics = yardstick::metric_set(yardstick::rmse)
)

ridge_best <- select_best(ridge_res, metric = "rmse")
ridge_final <- finalize_workflow(ridge_workflow, ridge_best)

ridge_fit <- fit(ridge_final, auto_train)

ridge_rmse <- rmse(
  data = bind_cols(auto_test, predict(ridge_fit, auto_test)),
  truth = acceleration,
  estimate = .pred
)
ridge_rmse
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        1.44
ridge_preds <- predict(ridge_fit, auto_test) |>
  bind_cols(auto_test)

ggplot(ridge_preds, aes(x = acceleration, y = .pred)) +
  geom_point(alpha = 0.6, color = "#2c3e50") +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") +
  labs(
    title = "Predicted vs Actual - Ridge Regression",
    x = "Actual Acceleration",
    y = "Predicted Acceleration"
  ) +
  theme_minimal()

5 Polynomial Regression

auto_poly_recipe <- recipe(acceleration ~ ., data = auto_train) |>
  step_rm(year, origin) |>
  step_poly(weight, horsepower, displacement, degree = tune()) |>
  step_normalize(all_predictors())

poly_spec <- linear_reg() |> set_engine("lm")

poly_workflow <- workflow() |>
  add_model(poly_spec) |>
  add_recipe(auto_poly_recipe)

poly_res <- tune_grid(
  poly_workflow,
  resamples = vfold_cv(auto_train, v = 5),
  grid = expand.grid(degree = 1:5),
  metrics = yardstick::metric_set(yardstick::rmse)
)

poly_best <- select_best(poly_res, metric = "rmse")
poly_final <- finalize_workflow(poly_workflow, poly_best)
poly_fit <- fit(poly_final, auto_train)

poly_rmse <- rmse(
  data = bind_cols(auto_test, predict(poly_fit, auto_test)),
  truth = acceleration,
  estimate = .pred
)
poly_rmse
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        1.33
poly_preds <- predict(poly_fit, auto_test) |>
  bind_cols(auto_test)

ggplot(poly_preds, aes(x = acceleration, y = .pred)) +
  geom_point(alpha = 0.6, color = "#2c3e50") +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") +
  labs(
    title = "Predicted vs Actual - Polynomial Regression",
    x = "Actual Acceleration",
    y = "Predicted Acceleration"
  ) +
  theme_minimal()