Setup

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(tidymodels)

## Warning: package 'tidymodels' was built under R version 4.4.2

## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom        1.0.6     ✔ rsample      1.2.1
## ✔ dials        1.3.0     ✔ tune         1.2.1
## ✔ infer        1.0.7     ✔ workflows    1.1.4
## ✔ modeldata    1.4.0     ✔ workflowsets 1.1.0
## ✔ parsnip      1.2.1     ✔ yardstick    1.3.1
## ✔ recipes      1.1.0

## Warning: package 'dials' was built under R version 4.4.2

## Warning: package 'infer' was built under R version 4.4.2

## Warning: package 'modeldata' was built under R version 4.4.2

## Warning: package 'recipes' was built under R version 4.4.2

## Warning: package 'rsample' was built under R version 4.4.2

## Warning: package 'tune' was built under R version 4.4.2

## Warning: package 'workflows' was built under R version 4.4.2

## Warning: package 'workflowsets' was built under R version 4.4.2

## Warning: package 'yardstick' was built under R version 4.4.2

## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Search for functions across packages at https://www.tidymodels.org/find/

load("C:/Users/User/OneDrive/CSC 530 Data Analysis/Data Analysis/cdc3.Rdata")

Intro

We will work on feature engineering for the purpose of predicting gender based on other characteristics. Use cdc3.

Split the data

Put 70% of the data in training. Set the strata to gender. Create cdc3_training and cdc3_test.

Solution

set.seed(123)
cdc3_split <- initial_split(cdc3, 
                  prop = .7, 
                  strata = gender)

cdc3_training <- cdc3_split %>%
 training()

cdc3_test <- cdc3_split %>%
 testing()

Complete Modeling Workflow

This workflow is complete for a bivariate choice model using logistic regression. It does not include hyperparameter tuning, which is not required for logistic regression.

Solution

# Specify a logistic regression model
logistic_model <- logistic_reg() %>% 
  # Set the engine
  set_engine('glm') %>% 
  # Set the mode
  set_mode('classification')

# Split the data 

cdc3_split = initial_split(cdc3,
  prop = .7,
  strata = gender)

cdc3_training = cdc3_split %>% 
  training()

cdc3_test = cdc3_split %>% 
  testing()

Feature Engineering

cdc3_recipe <- recipe(gender ~ ., data = cdc3_training) %>% 
  # Removed correlated predictors
  # You may want to use step_rm() instead
  step_corr(all_numeric(), threshold = 0.8) %>% 
  
  # Log transform numeric predictors
  # This is optional
  step_log(all_numeric(), base = 10) %>%
  # Normalize numeric predictors
  step_normalize(all_numeric()) %>% 
  # Create dummy variables
  step_dummy(all_nominal(), -all_outcomes())

# Train recipe
cdc3_recipe_prep <- cdc3_recipe %>% 
  prep(training = cdc3_training)

# Transform training data
cdc3_training_prep <- cdc3_recipe_prep %>% 
  bake(new_data = NULL)

# Transform test data
cdc3_test_prep <- cdc3_recipe_prep %>% 
  bake(new_data = cdc3_test)


# Train logistic model
logistic_fit <- logistic_model %>% 
  fit(gender ~ ., data = cdc3_training_prep)

# Obtain class predictions
class_preds <- predict(logistic_fit, new_data = cdc3_test_prep,
                       type = 'class')

# Obtain estimated probabilities
prob_preds <- predict(logistic_fit, new_data = cdc3_test_prep, 
                      type = 'prob')

# Create a confusion matrix

cdc3_results <- cdc3_test_prep %>% 
  select(gender) %>% 
  bind_cols(class_preds, prob_preds)

cdc3_results %>% 
  conf_mat(truth = gender, estimate = .pred_class)

##           Truth
## Prediction    f    m
##          f 2836  303
##          m  294 2567

# Calculate sensitivity
cdc3_results %>% 
  sens(truth = gender, estimate = .pred_class)

# Calculate specificity
cdc3_results %>% 
  spec(truth = gender, estimate = .pred_class)

# Plot ROC curve
# Predicted prob is pred_f in this case.
cdc3_results %>% 
  roc_curve(truth = gender, .pred_f) %>% 
  autoplot()

roc_auc(cdc3_results,
        truth = gender, 
        .pred_f)

##Change 1 Modifying by changing the correlation threshold to 0.2:

# Feature Engineering

cdc3_recipe <- recipe(gender ~ ., data = cdc3_training) %>% 
  step_corr(all_numeric(), threshold = 0.2) %>% 
  step_log(all_numeric(), base = 10) %>%
  step_normalize(all_numeric()) %>% 
  step_dummy(all_nominal(), -all_outcomes())

# Train recipe
cdc3_recipe_prep <- cdc3_recipe %>% 
  prep(training = cdc3_training)

# Transform training data
cdc3_training_prep <- cdc3_recipe_prep %>% 
  bake(new_data = NULL)

# Transform test data
cdc3_test_prep <- cdc3_recipe_prep %>% 
  bake(new_data = cdc3_test)


# Train logistic model
logistic_fit <- logistic_model %>% 
  fit(gender ~ ., data = cdc3_training_prep)

# Obtain class predictions
class_preds <- predict(logistic_fit, new_data = cdc3_test_prep,
                       type = 'class')

# Obtain estimated probabilities
prob_preds <- predict(logistic_fit, new_data = cdc3_test_prep, 
                      type = 'prob')

# Create a confusion matrix

cdc3_results <- cdc3_test_prep %>% 
  select(gender) %>% 
  bind_cols(class_preds, prob_preds)

cdc3_results %>% 
  conf_mat(truth = gender, estimate = .pred_class)

##           Truth
## Prediction    f    m
##          f 2820  364
##          m  310 2506

# Calculate sensitivity
cdc3_results %>% 
  sens(truth = gender, estimate = .pred_class)

# Calculate specificity
cdc3_results %>% 
  spec(truth = gender, estimate = .pred_class)

# Plot ROC curve
# Predicted prob is pred_f in this case.
cdc3_results %>% 
  roc_curve(truth = gender, .pred_f) %>% 
  autoplot()

roc_auc(cdc3_results,
        truth = gender, 
        .pred_f)

##Change 1 reults:

The first thing I changed was the correlation threshold to 0.2, causing a decrease in the AUC estimate which came out to 0.9521409. By lowering the correlation threshold from 0.8 to 0.2 I removed more predictors that were considered “highly correlated”. While removing the correlation threshold can reduce redundancy, having it low can remove important predictors that the model relies on causing the AUC to decrease. In this example critical information might have been lost when more features were excluded.

##Change 2 Modifying by adding interaction terms:

# Feature Engineering

cdc3_recipe <- recipe(gender ~ ., data = cdc3_training) %>% 
  step_corr(all_numeric(), threshold = 0.8) %>% 
  step_log(all_numeric(), base = 10) %>%
  step_normalize(all_numeric()) %>% 
  step_dummy(all_nominal(), -all_outcomes()) %>%
  step_interact(~ all_numeric_predictors() * all_numeric_predictors())  # Add interaction terms

# Train recipe
cdc3_recipe_prep <- cdc3_recipe %>% 
  prep(training = cdc3_training)

# Transform training data
cdc3_training_prep <- cdc3_recipe_prep %>% 
  bake(new_data = NULL)

# Transform test data
cdc3_test_prep <- cdc3_recipe_prep %>% 
  bake(new_data = cdc3_test)


# Train logistic model
logistic_fit <- logistic_model %>% 
  fit(gender ~ ., data = cdc3_training_prep)

# Obtain class predictions
class_preds <- predict(logistic_fit, new_data = cdc3_test_prep,
                       type = 'class')

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from rank-deficient fit; attr(*, "non-estim") has doubtful cases

# Obtain estimated probabilities
prob_preds <- predict(logistic_fit, new_data = cdc3_test_prep, 
                      type = 'prob')

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from rank-deficient fit; attr(*, "non-estim") has doubtful cases

# Create a confusion matrix

cdc3_results <- cdc3_test_prep %>% 
  select(gender) %>% 
  bind_cols(class_preds, prob_preds)

cdc3_results %>% 
  conf_mat(truth = gender, estimate = .pred_class)

##           Truth
## Prediction    f    m
##          f 2837  309
##          m  293 2561

# Calculate sensitivity
cdc3_results %>% 
  sens(truth = gender, estimate = .pred_class)

# Calculate specificity
cdc3_results %>% 
  spec(truth = gender, estimate = .pred_class)

# Plot ROC curve
# Predicted prob is pred_f in this case.
cdc3_results %>% 
  roc_curve(truth = gender, .pred_f) %>% 
  autoplot()

roc_auc(cdc3_results,
        truth = gender, 
        .pred_f)

##Change 2 results:

The second change I made was to add interaction terms, causing an increase in the AUC estimate which came out to 0.9598483. By adding the interaction term, I allowed the model to capture relationships between pairs of numeric features. This additional information more than likely helped the model to make better predictions, improving its ability to rank probabilities correctly. This is likely the reason that the AUC increased.

##Change 3 Modifying by removing log transformation:

# Feature Engineering

cdc3_recipe <- recipe(gender ~ ., data = cdc3_training) %>% 
  step_corr(all_numeric(), threshold = 0.8) %>% 
  #step_log(all_numeric(), base = 10) %>%
  step_normalize(all_numeric()) %>% 
  step_dummy(all_nominal(), -all_outcomes())

# Train recipe
cdc3_recipe_prep <- cdc3_recipe %>% 
  prep(training = cdc3_training)

# Transform training data
cdc3_training_prep <- cdc3_recipe_prep %>% 
  bake(new_data = NULL)

# Transform test data
cdc3_test_prep <- cdc3_recipe_prep %>% 
  bake(new_data = cdc3_test)


# Train logistic model
logistic_fit <- logistic_model %>% 
  fit(gender ~ ., data = cdc3_training_prep)

# Obtain class predictions
class_preds <- predict(logistic_fit, new_data = cdc3_test_prep,
                       type = 'class')

# Obtain estimated probabilities
prob_preds <- predict(logistic_fit, new_data = cdc3_test_prep, 
                      type = 'prob')

# Create a confusion matrix

cdc3_results <- cdc3_test_prep %>% 
  select(gender) %>% 
  bind_cols(class_preds, prob_preds)

cdc3_results %>% 
  conf_mat(truth = gender, estimate = .pred_class)

##           Truth
## Prediction    f    m
##          f 2847  314
##          m  283 2556

# Calculate sensitivity
cdc3_results %>% 
  sens(truth = gender, estimate = .pred_class)

# Calculate specificity
cdc3_results %>% 
  spec(truth = gender, estimate = .pred_class)

# Plot ROC curve
# Predicted prob is pred_f in this case.
cdc3_results %>% 
  roc_curve(truth = gender, .pred_f) %>% 
  autoplot()

roc_auc(cdc3_results,
        truth = gender, 
        .pred_f)

##Change 3 results:

The third change that I made was to remove the log transformation, causing a decrease in the AUC estimate which came out to 0.9587525. The log transformation is used to reduce skewness in the numeric predictors, making them easier for the model to interpret. This removal allowed the model to work with raw, skewed data which might have made it harder to correctly rank predictions. The slight decrease in AUC suggests that the log transformation was providing a small improvement in the model’s performance overall.

##Change 4 Modifying by using discretize (converting continuious numeric variables to bins):

# Feature Engineering

cdc3_recipe <- recipe(gender ~ ., data = cdc3_training) %>% 
  step_corr(all_numeric(), threshold = 0.8) %>% 
  step_log(all_numeric(), base = 10) %>%
  step_normalize(all_numeric()) %>% 
  step_dummy(all_nominal(), -all_outcomes()) %>%
  step_discretize(all_numeric(), options = list(num_breaks = 4)) 

# Train recipe
cdc3_recipe_prep <- cdc3_recipe %>% 
  prep(training = cdc3_training)

## Warning: Data not binned; too few unique values per bin. Adjust `min_unique` as needed.
## Data not binned; too few unique values per bin. Adjust `min_unique` as needed.
## Data not binned; too few unique values per bin. Adjust `min_unique` as needed.
## Data not binned; too few unique values per bin. Adjust `min_unique` as needed.
## Data not binned; too few unique values per bin. Adjust `min_unique` as needed.
## Data not binned; too few unique values per bin. Adjust `min_unique` as needed.
## Data not binned; too few unique values per bin. Adjust `min_unique` as needed.
## Data not binned; too few unique values per bin. Adjust `min_unique` as needed.
## Data not binned; too few unique values per bin. Adjust `min_unique` as needed.
## Data not binned; too few unique values per bin. Adjust `min_unique` as needed.
## Data not binned; too few unique values per bin. Adjust `min_unique` as needed.
## Data not binned; too few unique values per bin. Adjust `min_unique` as needed.
## Data not binned; too few unique values per bin. Adjust `min_unique` as needed.
## Data not binned; too few unique values per bin. Adjust `min_unique` as needed.
## Data not binned; too few unique values per bin. Adjust `min_unique` as needed.
## Data not binned; too few unique values per bin. Adjust `min_unique` as needed.
## Data not binned; too few unique values per bin. Adjust `min_unique` as needed.
## Data not binned; too few unique values per bin. Adjust `min_unique` as needed.
## Data not binned; too few unique values per bin. Adjust `min_unique` as needed.
## Data not binned; too few unique values per bin. Adjust `min_unique` as needed.
## Data not binned; too few unique values per bin. Adjust `min_unique` as needed.

# Transform training data
cdc3_training_prep <- cdc3_recipe_prep %>% 
  bake(new_data = NULL)

# Transform test data
cdc3_test_prep <- cdc3_recipe_prep %>% 
  bake(new_data = cdc3_test)


# Train logistic model
logistic_fit <- logistic_model %>% 
  fit(gender ~ ., data = cdc3_training_prep)

# Obtain class predictions
class_preds <- predict(logistic_fit, new_data = cdc3_test_prep,
                       type = 'class')

# Obtain estimated probabilities
prob_preds <- predict(logistic_fit, new_data = cdc3_test_prep, 
                      type = 'prob')

# Create a confusion matrix

cdc3_results <- cdc3_test_prep %>% 
  select(gender) %>% 
  bind_cols(class_preds, prob_preds)

cdc3_results %>% 
  conf_mat(truth = gender, estimate = .pred_class)

##           Truth
## Prediction    f    m
##          f 2844  322
##          m  286 2548

# Calculate sensitivity
cdc3_results %>% 
  sens(truth = gender, estimate = .pred_class)

# Calculate specificity
cdc3_results %>% 
  spec(truth = gender, estimate = .pred_class)

# Plot ROC curve
# Predicted prob is pred_f in this case.
cdc3_results %>% 
  roc_curve(truth = gender, .pred_f) %>% 
  autoplot()

roc_auc(cdc3_results,
        truth = gender, 
        .pred_f)

#Change 4 results:

The fourth change that I made was using discretization, or binning of numeric features. This caused an increase in the AUC estimate which came out to 0.9591692. Using this feature, it groups continuous numeric variables into categories which can help to capture threshold effects or non-linear patterns in the data. This approach simplifies numeric features into categories, which can help capture relationships, but it risks oversimplifying data if too many details are lost. With this increasing the AUC it indicates that the binning made some relationships easier for the model to interpret without overly simplifying the data.

Feature Engineering 3

Connie Rodriguez

2024-11-26

Setup

Intro

Split the data

Solution

Complete Modeling Workflow

Solution

Feature Engineering