Multiple Regression
with Quantitative and Qualitative Predictors
data(Auto)
auto_clean <- Auto |>
mutate(origin = as.factor(origin)) |>
select(-name) |>
drop_na()
auto_split <- initial_split(auto_clean, prop = 0.8, strata = acceleration)
auto_train <- training(auto_split)
auto_test <- testing(auto_split)
multi_lm_recipe <- recipe(acceleration ~ weight + horsepower + origin, data = auto_train) |>
step_dummy(origin) |>
step_normalize(all_numeric_predictors())
multi_lm_model <- linear_reg() |>
set_engine("lm")
multi_lm_workflow <- workflow() |>
add_recipe(multi_lm_recipe) |>
add_model(multi_lm_model)
multi_lm_fit <- fit(multi_lm_workflow, data = auto_train)
multi_lm_preds <- predict(multi_lm_fit, auto_test) |>
bind_cols(auto_test)
rmse(data = multi_lm_preds, truth = acceleration, estimate = .pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 1.73
Multinomial Logistic
Regression with Multiple Predictors
df_2016 <- read_excel("pfi-data.xlsx", sheet = "curated 2016") |>
mutate(SEFUTUREX = as.factor(SEFUTUREX))
numeric_vars_2016 <- df_2016 |>
select(-BASMID, -SEFUTUREX) |>
select(where(is.numeric))
pca_result_2016 <- prcomp(numeric_vars_2016, scale. = TRUE)
pca_scores <- as.data.frame(pca_result_2016$x[, 1:3])
colnames(pca_scores) <- c("PC1", "PC2", "PC3")
multi_data <- bind_cols(SEFUTUREX = df_2016$SEFUTUREX, pca_scores) |>
drop_na()
multi_split <- initial_split(multi_data, prop = 0.7, strata = SEFUTUREX)
multi_train <- training(multi_split)
multi_test <- testing(multi_split)
multi_spec <- multinom_reg() |>
set_engine("nnet") |>
set_mode("classification")
multi_recipe <- recipe(SEFUTUREX ~ ., data = multi_train)
multi_workflow <- workflow() |>
add_model(multi_spec) |>
add_recipe(multi_recipe)
multi_fit <- fit(multi_workflow, data = multi_train)
multi_preds <- predict(multi_fit, new_data = multi_test, type = "class") |>
bind_cols(multi_test)
metrics(multi_preds, truth = SEFUTUREX, estimate = .pred_class)
## # A tibble: 2 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy multiclass 0.445
## 2 kap multiclass 0.0981
accuracy(multi_preds, truth = SEFUTUREX, estimate = .pred_class)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy multiclass 0.445
conf_mat(multi_preds, truth = SEFUTUREX, estimate = .pred_class)
## Truth
## Prediction 1 2 3 4 5 6
## 1 0 0 0 0 0 0
## 2 7 57 24 34 23 24
## 3 0 1 1 0 0 0
## 4 0 29 16 51 41 38
## 5 4 54 66 92 168 152
## 6 14 121 144 351 1006 1522
Poisson Regression
df_2019 <- read_excel("pfi-data.xlsx", sheet = "curated 2019")
numeric_vars <- df_2019 |>
select(-BASMID, -NUMSIBSX)
pca_result <- prcomp(numeric_vars, scale. = TRUE)
pca_scores <- as.data.frame(pca_result$x[, 1:3])
poisson_data <- cbind(pca_scores, y = df_2019$NUMSIBSX)
split <- initial_split(poisson_data, prop = 0.7)
train_data <- training(split)
test_data <- testing(split)
poisson_model <- glm(y ~ PC1 + PC2 + PC3,
data = train_data,
family = poisson(link = "log"))
predicted <- predict(poisson_model, newdata = test_data, type = "response")
results <- data.frame(actual = test_data$y, predicted = predicted)
rmse <- sqrt(mean((results$actual - results$predicted)^2))
rmse
## [1] 0.9667707
poisson_data$predicted <- predict(poisson_model, newdata = poisson_data, type = "response")
ggplot(poisson_data, aes(x = y, y = predicted)) +
geom_point(alpha = 0.5, color = "steelblue") +
geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") +
labs(
title = paste("Poisson Regression: Actual vs Predicted (RMSE =", round(rmse, 3), ")"),
x = "Actual Number of Siblings",
y = "Predicted Number of Siblings"
) +
theme_minimal()

Ridge Regression
data(Auto)
auto_clean <- Auto |>
select(-name) |>
drop_na()
auto_split <- initial_split(auto_clean, prop = 0.8, strata = acceleration)
auto_train <- training(auto_split)
auto_test <- testing(auto_split)
auto_recipe <- recipe(acceleration ~ ., data = auto_train)
ridge_spec <- linear_reg(penalty = tune(), mixture = 0) |>
set_engine("glmnet")
ridge_workflow <- workflow() |>
add_model(ridge_spec) |>
add_recipe(auto_recipe)
ridge_res <- tune_grid(
ridge_workflow,
resamples = vfold_cv(auto_train, v = 5),
grid = 20,
metrics = yardstick::metric_set(yardstick::rmse)
)
ridge_best <- select_best(ridge_res, metric = "rmse")
ridge_final <- finalize_workflow(ridge_workflow, ridge_best)
ridge_fit <- fit(ridge_final, auto_train)
ridge_rmse <- rmse(
data = bind_cols(auto_test, predict(ridge_fit, auto_test)),
truth = acceleration,
estimate = .pred
)
ridge_rmse
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 1.44
ridge_preds <- predict(ridge_fit, auto_test) |>
bind_cols(auto_test)
ggplot(ridge_preds, aes(x = acceleration, y = .pred)) +
geom_point(alpha = 0.6, color = "#2c3e50") +
geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") +
labs(
title = "Predicted vs Actual - Ridge Regression",
x = "Actual Acceleration",
y = "Predicted Acceleration"
) +
theme_minimal()

Polynomial
Regression
auto_poly_recipe <- recipe(acceleration ~ ., data = auto_train) |>
step_rm(year, origin) |>
step_poly(weight, horsepower, displacement, degree = tune()) |>
step_normalize(all_predictors())
poly_spec <- linear_reg() |> set_engine("lm")
poly_workflow <- workflow() |>
add_model(poly_spec) |>
add_recipe(auto_poly_recipe)
poly_res <- tune_grid(
poly_workflow,
resamples = vfold_cv(auto_train, v = 5),
grid = expand.grid(degree = 1:5),
metrics = yardstick::metric_set(yardstick::rmse)
)
poly_best <- select_best(poly_res, metric = "rmse")
poly_final <- finalize_workflow(poly_workflow, poly_best)
poly_fit <- fit(poly_final, auto_train)
poly_rmse <- rmse(
data = bind_cols(auto_test, predict(poly_fit, auto_test)),
truth = acceleration,
estimate = .pred
)
poly_rmse
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 1.33
poly_preds <- predict(poly_fit, auto_test) |>
bind_cols(auto_test)
ggplot(poly_preds, aes(x = acceleration, y = .pred)) +
geom_point(alpha = 0.6, color = "#2c3e50") +
geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") +
labs(
title = "Predicted vs Actual - Polynomial Regression",
x = "Actual Acceleration",
y = "Predicted Acceleration"
) +
theme_minimal()
