library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidymodels)
## Warning: package 'tidymodels' was built under R version 4.4.2
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom 1.0.6 ✔ rsample 1.2.1
## ✔ dials 1.3.0 ✔ tune 1.2.1
## ✔ infer 1.0.7 ✔ workflows 1.1.4
## ✔ modeldata 1.4.0 ✔ workflowsets 1.1.0
## ✔ parsnip 1.2.1 ✔ yardstick 1.3.1
## ✔ recipes 1.1.0
## Warning: package 'dials' was built under R version 4.4.2
## Warning: package 'infer' was built under R version 4.4.2
## Warning: package 'modeldata' was built under R version 4.4.2
## Warning: package 'recipes' was built under R version 4.4.2
## Warning: package 'rsample' was built under R version 4.4.2
## Warning: package 'tune' was built under R version 4.4.2
## Warning: package 'workflows' was built under R version 4.4.2
## Warning: package 'workflowsets' was built under R version 4.4.2
## Warning: package 'yardstick' was built under R version 4.4.2
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Search for functions across packages at https://www.tidymodels.org/find/
load("C:/Users/User/OneDrive/CSC 530 Data Analysis/Data Analysis/cdc3.Rdata")
We will work on feature engineering for the purpose of predicting gender based on other characteristics. Use cdc3.
Put 70% of the data in training. Set the strata to gender. Create cdc3_training and cdc3_test.
set.seed(123)
cdc3_split <- initial_split(cdc3,
prop = .7,
strata = gender)
cdc3_training <- cdc3_split %>%
training()
cdc3_test <- cdc3_split %>%
testing()
This workflow is complete for a bivariate choice model using logistic regression. It does not include hyperparameter tuning, which is not required for logistic regression.
# Specify a logistic regression model
logistic_model <- logistic_reg() %>%
# Set the engine
set_engine('glm') %>%
# Set the mode
set_mode('classification')
# Split the data
cdc3_split = initial_split(cdc3,
prop = .7,
strata = gender)
cdc3_training = cdc3_split %>%
training()
cdc3_test = cdc3_split %>%
testing()
cdc3_recipe <- recipe(gender ~ ., data = cdc3_training) %>%
# Removed correlated predictors
# You may want to use step_rm() instead
step_corr(all_numeric(), threshold = 0.8) %>%
# Log transform numeric predictors
# This is optional
step_log(all_numeric(), base = 10) %>%
# Normalize numeric predictors
step_normalize(all_numeric()) %>%
# Create dummy variables
step_dummy(all_nominal(), -all_outcomes())
# Train recipe
cdc3_recipe_prep <- cdc3_recipe %>%
prep(training = cdc3_training)
# Transform training data
cdc3_training_prep <- cdc3_recipe_prep %>%
bake(new_data = NULL)
# Transform test data
cdc3_test_prep <- cdc3_recipe_prep %>%
bake(new_data = cdc3_test)
# Train logistic model
logistic_fit <- logistic_model %>%
fit(gender ~ ., data = cdc3_training_prep)
# Obtain class predictions
class_preds <- predict(logistic_fit, new_data = cdc3_test_prep,
type = 'class')
# Obtain estimated probabilities
prob_preds <- predict(logistic_fit, new_data = cdc3_test_prep,
type = 'prob')
# Create a confusion matrix
cdc3_results <- cdc3_test_prep %>%
select(gender) %>%
bind_cols(class_preds, prob_preds)
cdc3_results %>%
conf_mat(truth = gender, estimate = .pred_class)
## Truth
## Prediction f m
## f 2836 303
## m 294 2567
# Calculate sensitivity
cdc3_results %>%
sens(truth = gender, estimate = .pred_class)
# Calculate specificity
cdc3_results %>%
spec(truth = gender, estimate = .pred_class)
# Plot ROC curve
# Predicted prob is pred_f in this case.
cdc3_results %>%
roc_curve(truth = gender, .pred_f) %>%
autoplot()
roc_auc(cdc3_results,
truth = gender,
.pred_f)
##Change 1 Modifying by changing the correlation threshold to 0.2:
# Feature Engineering
cdc3_recipe <- recipe(gender ~ ., data = cdc3_training) %>%
step_corr(all_numeric(), threshold = 0.2) %>%
step_log(all_numeric(), base = 10) %>%
step_normalize(all_numeric()) %>%
step_dummy(all_nominal(), -all_outcomes())
# Train recipe
cdc3_recipe_prep <- cdc3_recipe %>%
prep(training = cdc3_training)
# Transform training data
cdc3_training_prep <- cdc3_recipe_prep %>%
bake(new_data = NULL)
# Transform test data
cdc3_test_prep <- cdc3_recipe_prep %>%
bake(new_data = cdc3_test)
# Train logistic model
logistic_fit <- logistic_model %>%
fit(gender ~ ., data = cdc3_training_prep)
# Obtain class predictions
class_preds <- predict(logistic_fit, new_data = cdc3_test_prep,
type = 'class')
# Obtain estimated probabilities
prob_preds <- predict(logistic_fit, new_data = cdc3_test_prep,
type = 'prob')
# Create a confusion matrix
cdc3_results <- cdc3_test_prep %>%
select(gender) %>%
bind_cols(class_preds, prob_preds)
cdc3_results %>%
conf_mat(truth = gender, estimate = .pred_class)
## Truth
## Prediction f m
## f 2820 364
## m 310 2506
# Calculate sensitivity
cdc3_results %>%
sens(truth = gender, estimate = .pred_class)
# Calculate specificity
cdc3_results %>%
spec(truth = gender, estimate = .pred_class)
# Plot ROC curve
# Predicted prob is pred_f in this case.
cdc3_results %>%
roc_curve(truth = gender, .pred_f) %>%
autoplot()
roc_auc(cdc3_results,
truth = gender,
.pred_f)
##Change 1 reults:
The first thing I changed was the correlation threshold to 0.2, causing a decrease in the AUC estimate which came out to 0.9521409. By lowering the correlation threshold from 0.8 to 0.2 I removed more predictors that were considered “highly correlated”. While removing the correlation threshold can reduce redundancy, having it low can remove important predictors that the model relies on causing the AUC to decrease. In this example critical information might have been lost when more features were excluded.
##Change 2 Modifying by adding interaction terms:
# Feature Engineering
cdc3_recipe <- recipe(gender ~ ., data = cdc3_training) %>%
step_corr(all_numeric(), threshold = 0.8) %>%
step_log(all_numeric(), base = 10) %>%
step_normalize(all_numeric()) %>%
step_dummy(all_nominal(), -all_outcomes()) %>%
step_interact(~ all_numeric_predictors() * all_numeric_predictors()) # Add interaction terms
# Train recipe
cdc3_recipe_prep <- cdc3_recipe %>%
prep(training = cdc3_training)
# Transform training data
cdc3_training_prep <- cdc3_recipe_prep %>%
bake(new_data = NULL)
# Transform test data
cdc3_test_prep <- cdc3_recipe_prep %>%
bake(new_data = cdc3_test)
# Train logistic model
logistic_fit <- logistic_model %>%
fit(gender ~ ., data = cdc3_training_prep)
# Obtain class predictions
class_preds <- predict(logistic_fit, new_data = cdc3_test_prep,
type = 'class')
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from rank-deficient fit; attr(*, "non-estim") has doubtful cases
# Obtain estimated probabilities
prob_preds <- predict(logistic_fit, new_data = cdc3_test_prep,
type = 'prob')
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from rank-deficient fit; attr(*, "non-estim") has doubtful cases
# Create a confusion matrix
cdc3_results <- cdc3_test_prep %>%
select(gender) %>%
bind_cols(class_preds, prob_preds)
cdc3_results %>%
conf_mat(truth = gender, estimate = .pred_class)
## Truth
## Prediction f m
## f 2837 309
## m 293 2561
# Calculate sensitivity
cdc3_results %>%
sens(truth = gender, estimate = .pred_class)
# Calculate specificity
cdc3_results %>%
spec(truth = gender, estimate = .pred_class)
# Plot ROC curve
# Predicted prob is pred_f in this case.
cdc3_results %>%
roc_curve(truth = gender, .pred_f) %>%
autoplot()
roc_auc(cdc3_results,
truth = gender,
.pred_f)
##Change 2 results:
The second change I made was to add interaction terms, causing an increase in the AUC estimate which came out to 0.9598483. By adding the interaction term, I allowed the model to capture relationships between pairs of numeric features. This additional information more than likely helped the model to make better predictions, improving its ability to rank probabilities correctly. This is likely the reason that the AUC increased.
##Change 3 Modifying by removing log transformation:
# Feature Engineering
cdc3_recipe <- recipe(gender ~ ., data = cdc3_training) %>%
step_corr(all_numeric(), threshold = 0.8) %>%
#step_log(all_numeric(), base = 10) %>%
step_normalize(all_numeric()) %>%
step_dummy(all_nominal(), -all_outcomes())
# Train recipe
cdc3_recipe_prep <- cdc3_recipe %>%
prep(training = cdc3_training)
# Transform training data
cdc3_training_prep <- cdc3_recipe_prep %>%
bake(new_data = NULL)
# Transform test data
cdc3_test_prep <- cdc3_recipe_prep %>%
bake(new_data = cdc3_test)
# Train logistic model
logistic_fit <- logistic_model %>%
fit(gender ~ ., data = cdc3_training_prep)
# Obtain class predictions
class_preds <- predict(logistic_fit, new_data = cdc3_test_prep,
type = 'class')
# Obtain estimated probabilities
prob_preds <- predict(logistic_fit, new_data = cdc3_test_prep,
type = 'prob')
# Create a confusion matrix
cdc3_results <- cdc3_test_prep %>%
select(gender) %>%
bind_cols(class_preds, prob_preds)
cdc3_results %>%
conf_mat(truth = gender, estimate = .pred_class)
## Truth
## Prediction f m
## f 2847 314
## m 283 2556
# Calculate sensitivity
cdc3_results %>%
sens(truth = gender, estimate = .pred_class)
# Calculate specificity
cdc3_results %>%
spec(truth = gender, estimate = .pred_class)
# Plot ROC curve
# Predicted prob is pred_f in this case.
cdc3_results %>%
roc_curve(truth = gender, .pred_f) %>%
autoplot()
roc_auc(cdc3_results,
truth = gender,
.pred_f)
##Change 3 results:
The third change that I made was to remove the log transformation, causing a decrease in the AUC estimate which came out to 0.9587525. The log transformation is used to reduce skewness in the numeric predictors, making them easier for the model to interpret. This removal allowed the model to work with raw, skewed data which might have made it harder to correctly rank predictions. The slight decrease in AUC suggests that the log transformation was providing a small improvement in the model’s performance overall.
##Change 4 Modifying by using discretize (converting continuious numeric variables to bins):
# Feature Engineering
cdc3_recipe <- recipe(gender ~ ., data = cdc3_training) %>%
step_corr(all_numeric(), threshold = 0.8) %>%
step_log(all_numeric(), base = 10) %>%
step_normalize(all_numeric()) %>%
step_dummy(all_nominal(), -all_outcomes()) %>%
step_discretize(all_numeric(), options = list(num_breaks = 4))
# Train recipe
cdc3_recipe_prep <- cdc3_recipe %>%
prep(training = cdc3_training)
## Warning: Data not binned; too few unique values per bin. Adjust `min_unique` as needed.
## Data not binned; too few unique values per bin. Adjust `min_unique` as needed.
## Data not binned; too few unique values per bin. Adjust `min_unique` as needed.
## Data not binned; too few unique values per bin. Adjust `min_unique` as needed.
## Data not binned; too few unique values per bin. Adjust `min_unique` as needed.
## Data not binned; too few unique values per bin. Adjust `min_unique` as needed.
## Data not binned; too few unique values per bin. Adjust `min_unique` as needed.
## Data not binned; too few unique values per bin. Adjust `min_unique` as needed.
## Data not binned; too few unique values per bin. Adjust `min_unique` as needed.
## Data not binned; too few unique values per bin. Adjust `min_unique` as needed.
## Data not binned; too few unique values per bin. Adjust `min_unique` as needed.
## Data not binned; too few unique values per bin. Adjust `min_unique` as needed.
## Data not binned; too few unique values per bin. Adjust `min_unique` as needed.
## Data not binned; too few unique values per bin. Adjust `min_unique` as needed.
## Data not binned; too few unique values per bin. Adjust `min_unique` as needed.
## Data not binned; too few unique values per bin. Adjust `min_unique` as needed.
## Data not binned; too few unique values per bin. Adjust `min_unique` as needed.
## Data not binned; too few unique values per bin. Adjust `min_unique` as needed.
## Data not binned; too few unique values per bin. Adjust `min_unique` as needed.
## Data not binned; too few unique values per bin. Adjust `min_unique` as needed.
## Data not binned; too few unique values per bin. Adjust `min_unique` as needed.
# Transform training data
cdc3_training_prep <- cdc3_recipe_prep %>%
bake(new_data = NULL)
# Transform test data
cdc3_test_prep <- cdc3_recipe_prep %>%
bake(new_data = cdc3_test)
# Train logistic model
logistic_fit <- logistic_model %>%
fit(gender ~ ., data = cdc3_training_prep)
# Obtain class predictions
class_preds <- predict(logistic_fit, new_data = cdc3_test_prep,
type = 'class')
# Obtain estimated probabilities
prob_preds <- predict(logistic_fit, new_data = cdc3_test_prep,
type = 'prob')
# Create a confusion matrix
cdc3_results <- cdc3_test_prep %>%
select(gender) %>%
bind_cols(class_preds, prob_preds)
cdc3_results %>%
conf_mat(truth = gender, estimate = .pred_class)
## Truth
## Prediction f m
## f 2844 322
## m 286 2548
# Calculate sensitivity
cdc3_results %>%
sens(truth = gender, estimate = .pred_class)
# Calculate specificity
cdc3_results %>%
spec(truth = gender, estimate = .pred_class)
# Plot ROC curve
# Predicted prob is pred_f in this case.
cdc3_results %>%
roc_curve(truth = gender, .pred_f) %>%
autoplot()
roc_auc(cdc3_results,
truth = gender,
.pred_f)
#Change 4 results:
The fourth change that I made was using discretization, or binning of numeric features. This caused an increase in the AUC estimate which came out to 0.9591692. Using this feature, it groups continuous numeric variables into categories which can help to capture threshold effects or non-linear patterns in the data. This approach simplifies numeric features into categories, which can help capture relationships, but it risks oversimplifying data if too many details are lost. With this increasing the AUC it indicates that the binning made some relationships easier for the model to interpret without overly simplifying the data.