Feature Engineering

Harold Nelson

2024-04-15

Setup

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom        1.0.5      ✔ rsample      1.2.1 
## ✔ dials        1.2.1      ✔ tune         1.2.0 
## ✔ infer        1.0.7      ✔ workflows    1.1.4 
## ✔ modeldata    1.3.0      ✔ workflowsets 1.1.0 
## ✔ parsnip      1.2.1      ✔ yardstick    1.3.1 
## ✔ recipes      1.0.10     
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Use suppressPackageStartupMessages() to eliminate package startup messages
load("cdc2.Rdata")

Intro

We will work on feature engineering for the purpose of predicting gender based on other characteristics. Use cdc2.

Split the data

Put 70% of the data in training. Set the strata to gender. Create cdc2_training and cdc2_test.

Solution

set.seed(123)
cdc2_split <- initial_split(cdc2, 
                            prop = .7, 
                            strata = gender)

cdc2_training <- cdc2_split %>%
  training()

cdc2_test <- cdc2_split %>%
  testing()

Basic Recipe

Create a recipe object with gender as the outcome variable and all other variables in cdc2 as predictors. Then add a step to transform height and weight to logs base 10. Print the recipe object.

Solution

cdc2_log_rec <- recipe(gender ~ ., 
                          data = cdc2_training) %>%
  # Add log transformation step
  step_log(weight, height, base = 10)

# Print recipe object
cdc2_log_rec
## 
## ── Recipe ──────────────────────────────────────────────────────────────────────
## 
## ── Inputs
## Number of variables by role
## outcome:    1
## predictor: 14
## 
## ── Operations
## • Log transformation on: weight and height

Train

Train the recipe you just created using the prep function.

Solution

cdc2_log_rec_prep <- cdc2_log_rec %>% 
  prep(training = cdc2_training)

# View results
cdc2_log_rec_prep
## 
## ── Recipe ──────────────────────────────────────────────────────────────────────
## 
## ── Inputs
## Number of variables by role
## outcome:    1
## predictor: 14
## 
## ── Training information
## Training data contained 13997 data points and no incomplete rows.
## 
## ── Operations
## • Log transformation on: weight and height | Trained
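
Although the exercise only asks for training the recipe, the trained recipe can already be applied with bake(). A quick optional sketch using the test set:

# Optional sketch (not part of the exercise): apply the trained recipe to the
# test data to see the log-transformed height and weight
cdc2_log_rec_prep %>% 
  bake(new_data = cdc2_test)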

Correlated Predictors

Get the correlation matrix of all the numeric variables. See if any variables are highly correlated.

Solution

cdc2_training %>% 
  # Select numeric columns
  select_if(is.numeric) %>% 
  # Calculate correlation matrix
  cor()
##                 exerany     hlthplan     smoke100      height       weight
## exerany      1.00000000  0.070955212 -0.043527207  0.11738043 -0.027142201
## hlthplan     0.07095521  1.000000000 -0.035244745  0.02587647  0.012615287
## smoke100    -0.04352721 -0.035244745  1.000000000  0.08741575  0.059794660
## height       0.11738043  0.025876473  0.087415754  1.00000000  0.555625537
## weight      -0.02714220  0.012615287  0.059794660  0.55562554  1.000000000
## wtdesire     0.04379964  0.008970419  0.070332357  0.76527666  0.808262442
## age         -0.10044332  0.193724415  0.108791751 -0.12485019  0.001249224
## BMI         -0.10953520 -0.003864531  0.015994418  0.03038970  0.840865080
## BMIDes      -0.04582997 -0.011297148  0.025311973  0.23228399  0.705095721
## DesActRatio  0.09292346 -0.013114220  0.006036446  0.18416809 -0.488159874
##                 wtdesire          age          BMI      BMIDes  DesActRatio
## exerany      0.043799642 -0.100443317 -0.109535195 -0.04582997  0.092923460
## hlthplan     0.008970419  0.193724415 -0.003864531 -0.01129715 -0.013114220
## smoke100     0.070332357  0.108791751  0.015994418  0.02531197  0.006036446
## height       0.765276662 -0.124850194  0.030389699  0.23228399  0.184168094
## weight       0.808262442  0.001249224  0.840865080  0.70509572 -0.488159874
## wtdesire     1.000000000 -0.023945258  0.475335478  0.79782114  0.090697377
## age         -0.023945258  1.000000000  0.080982717  0.08866912 -0.043142798
## BMI          0.475335478  0.080982717  1.000000000  0.69989874 -0.711127325
## BMIDes       0.797821140  0.088669124  0.699898738  1.00000000 -0.037189701
## DesActRatio  0.090697377 -0.043142798 -0.711127325 -0.03718970  1.000000000
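
Scanning a 10 x 10 matrix by eye is error prone, so here is an optional helper (not part of the exercise) that flags the pairs whose absolute correlation exceeds an illustrative cutoff of 0.8; cor_mat is just a temporary name:

# Optional helper: flag pairs of variables with |r| above an illustrative 0.8
cor_mat <- cdc2_training %>% 
  select_if(is.numeric) %>% 
  cor()
# Row/column names of the flagged pairs (upper triangle only, to avoid duplicates)
which(abs(cor_mat) > 0.8 & upper.tri(cor_mat), arr.ind = TRUE)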

Plot Correlated Predictors

BMI and weight have the highest correlation at .84. Create a scatterplot of these two.

Solution

# Plot correlated predictors
ggplot(cdc2, aes(x = weight, y = BMI)) + 
  # Add points
  geom_point(size = .2)  + 
  # Add title
  labs(title = "Weight vs. BMI",
       y = 'BMI', x = 'weight') 

The Decision

We now know that BMI and weight are highly correlated and that we should probably remove one of them. I decided to ask ChatGPT how tidymodels makes this decision; here’s a link to the conversation: https://chat.openai.com/share/3162f941-f906-4771-abc1-d6936ed3d586.
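
For reference, tidymodels can also make this kind of decision automatically: step_corr() (used in the complete workflow at the end of this document) drops enough numeric predictors to keep all pairwise correlations below a chosen threshold. A minimal sketch, assuming the same training split; the 0.8 threshold is only an illustrative choice:

# Sketch: let step_corr() decide which correlated predictors to drop
# (0.8 is an illustrative threshold, not part of the assignment)
recipe(gender ~ ., data = cdc2_training) %>% 
  step_corr(all_numeric(), threshold = 0.8) %>% 
  prep(training = cdc2_training)

Removing BMI by hand, as in the next section, keeps the choice explicit instead of delegating it to the correlation filter.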

Remove BMI

I decided to remove BMI since it was constructed from weight and height. Here’s a modification of the DataCamp code that does this.

Solution

# Specify a recipe object
cdc2_cor_rec <- recipe(gender ~ .,
                          data = cdc2_training) %>% 
  # Remove correlated variables
  step_rm(BMI)

# Train the recipe
cdc2_cor_rec_prep <- cdc2_cor_rec %>% 
  prep(training = cdc2_training)

# Apply to training data
cdc2_cor_rec_prep %>% 
  bake(new_data = NULL)
## # A tibble: 13,997 × 14
##    genhlth   exerany hlthplan smoke100 height weight wtdesire   age BMIDes
##    <fct>       <dbl>    <dbl>    <dbl>  <dbl>  <int>    <int> <int>  <dbl>
##  1 good            0        1        1     64    125      115    33   19.7
##  2 good            1        1        1     60    105      105    49   20.5
##  3 very good       1        1        0     64    114      114    55   19.6
##  4 very good       1        1        0     67    125      120    33   18.8
##  5 very good       1        1        0     69    200      150    48   22.1
##  6 good            1        1        1     65    160      140    54   23.3
##  7 good            0        1        0     68    190      150    33   22.8
##  8 excellent       1        0        1     67    190      165    44   25.8
##  9 excellent       1        0        1     69    160      150    42   22.1
## 10 very good       1        1        0     61    115      105    32   19.8
## # ℹ 13,987 more rows
## # ℹ 5 more variables: DesActRatio <dbl>, BMICat <fct>, BMIDesCat <fct>,
## #   ageCat <fct>, gender <fct>
# Apply to test data
cdc2_cor_rec_prep %>% 
  bake(cdc2_test)
## # A tibble: 6,000 × 14
##    genhlth   exerany hlthplan smoke100 height weight wtdesire   age BMIDes
##    <fct>       <dbl>    <dbl>    <dbl>  <dbl>  <int>    <int> <int>  <dbl>
##  1 good            1        1        0     66    132      124    42   20.0
##  2 very good       0        1        0     61    150      130    55   24.6
##  3 good            0        1        1     65    150      130    27   21.6
##  4 excellent       1        1        1     70    170      170    69   24.4
##  5 fair            0        1        1     71    185      185    76   25.8
##  6 very good       0        0        1     64    105      120    27   20.6
##  7 good            1        1        1     68    185      160    67   24.3
##  8 very good       1        1        0     67    160      145    29   22.7
##  9 very good       1        1        1     73    210      170    36   22.4
## 10 excellent       1        1        1     64    200      160    65   27.5
## # ℹ 5,990 more rows
## # ℹ 5 more variables: DesActRatio <dbl>, BMICat <fct>, BMIDesCat <fct>,
## #   ageCat <fct>, gender <fct>

Add a Step

Revise the code in the previous chunk. Call the recipe cdc2_norm_rec. Repeat the step that removes BMI and add another step to normalize all the numeric variables.

Solution

# Specify a recipe object
cdc2_norm_rec <- recipe(gender ~ .,
                          data = cdc2_training) %>% 
  # Remove correlated variables
  step_rm(BMI) %>% 
  # Normalize numeric predictors
  step_normalize(all_numeric())

# Train the recipe
cdc2_norm_rec_prep <- cdc2_norm_rec %>% 
  prep(training = cdc2_training)

# Apply to test data
cdc2_norm_rec_prep %>% 
  bake(new_data = cdc2_test)
## # A tibble: 6,000 × 14
##    genhlth   exerany hlthplan smoke100  height  weight wtdesire    age BMIDes
##    <fct>       <dbl>    <dbl>    <dbl>   <dbl>   <dbl>    <dbl>  <dbl>  <dbl>
##  1 good        0.585    0.375   -0.940 -0.285  -0.937    -0.979 -0.175 -1.23 
##  2 very good  -1.71     0.375   -0.940 -1.50   -0.487    -0.790  0.578  0.186
##  3 good       -1.71     0.375    1.06  -0.528  -0.487    -0.790 -1.04  -0.723
##  4 excellent   0.585    0.375    1.06   0.684   0.0134    0.474  1.39   0.133
##  5 fair       -1.71     0.375    1.06   0.926   0.389     0.947  1.79   0.570
##  6 very good  -1.71    -2.67     1.06  -0.770  -1.61     -1.11  -1.04  -1.04 
##  7 good        0.585    0.375    1.06   0.199   0.389     0.158  1.27   0.113
##  8 very good   0.585    0.375   -0.940 -0.0430 -0.237    -0.316 -0.928 -0.389
##  9 very good   0.585    0.375    1.06   1.41    1.01      0.474 -0.523 -0.476
## 10 excellent   0.585    0.375    1.06  -0.770   0.764     0.158  1.16   1.09 
## # ℹ 5,990 more rows
## # ℹ 5 more variables: DesActRatio <dbl>, BMICat <fct>, BMIDesCat <fct>,
## #   ageCat <fct>, gender <fct>
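
An optional sanity check (not part of the exercise): baking the training data itself should show that the normalized columns now have mean close to 0 and standard deviation close to 1.

# Optional check: normalized training columns should have mean ~0 and sd ~1
cdc2_norm_rec_prep %>% 
  bake(new_data = NULL) %>% 
  summarise(across(where(is.numeric), list(mean = mean, sd = sd)))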

Complete Recipe

Create a new cdc2_recipe which does the following:

  1. Remove BMI.
  2. Normalize all the numeric variables.
  3. Convert all the nominal variables to dummy variables.

Solution

# Create a recipe that predicts gender using the training data
cdc2_recipe <- recipe(gender ~ ., data = cdc2_training) %>% 
  # Remove correlated predictors
  step_rm(BMI) %>% 
  # Normalize numeric predictors
  step_normalize(all_numeric()) %>% 
  # Create dummy variables
  step_dummy(all_nominal(), -all_outcomes())

Finish

Train your recipe and apply it to the test data.

Solution

cdc2_recipe %>% 
  prep(training = cdc2_training) %>% 
  bake(new_data = cdc2_test)
## # A tibble: 6,000 × 25
##    exerany hlthplan smoke100  height  weight wtdesire    age BMIDes DesActRatio
##      <dbl>    <dbl>    <dbl>   <dbl>   <dbl>    <dbl>  <dbl>  <dbl>       <dbl>
##  1   0.585    0.375   -0.940 -0.285  -0.937    -0.979 -0.175 -1.23        0.109
##  2  -1.71     0.375   -0.940 -1.50   -0.487    -0.790  0.578  0.186      -0.550
##  3  -1.71     0.375    1.06  -0.528  -0.487    -0.790 -1.04  -0.723      -0.550
##  4   0.585    0.375    1.06   0.684   0.0134    0.474  1.39   0.133       0.658
##  5  -1.71     0.375    1.06   0.926   0.389     0.947  1.79   0.570       0.658
##  6  -1.71    -2.67     1.06  -0.770  -1.61     -1.11  -1.04  -1.04        1.95 
##  7   0.585    0.375    1.06   0.199   0.389     0.158  1.27   0.113      -0.567
##  8   0.585    0.375   -0.940 -0.0430 -0.237    -0.316 -0.928 -0.389      -0.191
##  9   0.585    0.375    1.06   1.41    1.01      0.474 -0.523 -0.476      -1.07 
## 10   0.585    0.375    1.06  -0.770   0.764     0.158  1.16   1.09       -1.15 
## # ℹ 5,990 more rows
## # ℹ 16 more variables: gender <fct>, genhlth_very.good <dbl>,
## #   genhlth_good <dbl>, genhlth_fair <dbl>, genhlth_poor <dbl>,
## #   BMICat_Normal <dbl>, BMICat_Overweight <dbl>, BMICat_Obese <dbl>,
## #   BMICat_Morbidly.Obese <dbl>, BMIDesCat_Normal <dbl>,
## #   BMIDesCat_Overweight <dbl>, BMIDesCat_Obese <dbl>,
## #   BMIDesCat_Morbidly.Obese <dbl>, ageCat_X32.43 <dbl>, ageCat_X44.57 <dbl>, …

Complete Modeling Workflow

This workflow is complete for a binary choice model using logistic regression. It does not include hyperparameter tuning, which is not required for logistic regression.

Solution

# Specify a logistic regression model
logistic_model <- logistic_reg() %>% 
  # Set the engine
  set_engine('glm') %>% 
  # Set the mode
  set_mode('classification')

# Split the data 

cdc2_split <- initial_split(cdc2,
                            prop = .7,
                            strata = gender)

cdc2_training <- cdc2_split %>% 
  training()

cdc2_test <- cdc2_split %>% 
  testing()

# Check
nrow(cdc2_training)
## [1] 13997
nrow(cdc2_test)
## [1] 6000
summary(cdc2_test$gender)/nrow(cdc2_test)
##         m         f 
## 0.4783333 0.5216667
summary(cdc2_training$gender)/nrow(cdc2_training)
##         m         f 
## 0.4783882 0.5216118
# Feature Engineering

cdc2_recipe <- recipe(gender ~ ., data = cdc2_training) %>% 
  # Remove highly correlated predictors
  # (step_rm(BMI) would be a more explicit alternative)
  step_corr(all_numeric(), threshold = 0.8) %>% 
  # Remove the 0/1 indicators, which contain zeros and cannot be log transformed
  step_rm(exerany, smoke100, hlthplan) %>% 
  # Log transform the remaining numeric predictors (optional)
  step_log(all_numeric(), base = 10) %>%
  # Normalize numeric predictors
  step_normalize(all_numeric()) %>% 
  # Create dummy variables
  step_dummy(all_nominal(), -all_outcomes())

# Train recipe
cdc2_recipe_prep <- cdc2_recipe %>% 
  prep(training = cdc2_training)

# Transform training data
cdc2_training_prep <- cdc2_recipe_prep %>% 
  bake(new_data = NULL)

# Transform test data
cdc2_test_prep <- cdc2_recipe_prep %>% 
  bake(new_data = cdc2_test)


# Train logistic model
logistic_fit <- logistic_model %>% 
  fit(gender ~ ., data = cdc2_training_prep)

# Obtain class predictions
class_preds <- predict(logistic_fit, new_data = cdc2_test_prep,
                       type = 'class')

# Obtain estimated probabilities
prob_preds <- predict(logistic_fit, new_data = cdc2_test_prep, 
                      type = 'prob')

# Create a confusion matrix

cdc2_results <- cdc2_test_prep %>% 
  select(gender) %>% 
  bind_cols(class_preds, prob_preds)

cdc2_results %>% 
  conf_mat(truth = gender, estimate = .pred_class)
##           Truth
## Prediction    m    f
##          m 2577  288
##          f  293 2842
# Calculate sensitivity
cdc2_results %>% 
  sens(truth = gender, estimate = .pred_class)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 sens    binary         0.898
# Calculate specificity
cdc2_results %>% 
  spec(truth = gender, estimate = .pred_class)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 spec    binary         0.908
# Plot ROC curve
# The predicted probability column is .pred_m in this case
cdc2_results %>% 
  roc_curve(truth = gender, .pred_m) %>% 
  autoplot()

roc_auc(cdc2_results,
        truth = gender, 
        .pred_m)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 roc_auc binary         0.961
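
As an optional follow-up (not part of the assignment), the recipe and model specification above could be bundled into a workflow object, which runs prep() and bake() automatically during fitting and prediction. A minimal sketch; the object names cdc2_wf and cdc2_wf_fit are just illustrative:

# Sketch only: bundle the recipe and model so prep()/bake() happen automatically
cdc2_wf <- workflow() %>% 
  add_recipe(cdc2_recipe) %>% 
  add_model(logistic_model)

# Fitting the workflow preps the recipe on the training data and fits the model
cdc2_wf_fit <- cdc2_wf %>% 
  fit(data = cdc2_training)

# New data are baked with the trained recipe before prediction
predict(cdc2_wf_fit, new_data = cdc2_test, type = 'class')

This should give the same kind of class predictions as the manual prep()/bake() pipeline above, with less bookkeeping when the recipe changes.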