Feature Engineering

Harold Nelson

2023-11-21

Setup

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.1     ✔ purrr   1.0.1
## ✔ tibble  3.2.1     ✔ dplyr   1.1.1
## ✔ tidyr   1.3.0     ✔ stringr 1.5.0
## ✔ readr   2.1.2     ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.0.0 ──
## ✔ broom        1.0.1     ✔ rsample      1.1.1
## ✔ dials        1.2.0     ✔ tune         1.1.1
## ✔ infer        1.0.3     ✔ workflows    1.1.3
## ✔ modeldata    1.1.0     ✔ workflowsets 1.0.1
## ✔ parsnip      1.1.0     ✔ yardstick    1.2.0
## ✔ recipes      1.0.5     
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Search for functions across packages at https://www.tidymodels.org/find/
load("cdc2.Rdata")

Intro

We will work on feature engineering for predicting gender from the other variables in cdc2.

Split the data

Put 70% of the data in training. Set the strata to gender. Create cdc2_training and cdc2_test.

Solution

set.seed(123)
cdc2_split <- initial_split(cdc2, 
                  prop = .7, 
                  strata = gender)

cdc2_training <- cdc2_split %>%
 training()

cdc2_test <- cdc2_split %>%
 testing()
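
As an optional sanity check, you can confirm the 70/30 split and that stratification kept the gender proportions nearly identical in the two sets. A short sketch using only the objects created above:

# Row counts for each piece of the split
nrow(cdc2_training)
nrow(cdc2_test)

# Gender proportions should match closely across training and test
cdc2_training %>% count(gender) %>% mutate(prop = n / sum(n))
cdc2_test %>% count(gender) %>% mutate(prop = n / sum(n))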

Basic Recipe

Create a recipe object with gender as the outcome variable and all other variables in cdc2 as predictors. Then add a step to transform height and weight to logs base 10. Print the recipe object.

Solution

cdc2_log_rec <- recipe(gender ~ ., 
                          data = cdc2_training) %>%
  # Add log transformation step
  step_log(weight, height, base = 10)

# Print recipe object
cdc2_log_rec
## 
## ── Recipe ──────────────────────────────────────────────────────────────────────
## 
## ── Inputs
## Number of variables by role
## outcome:    1
## predictor: 14
## 
## ── Operations
## • Log transformation on: weight, height
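
If you want more detail than the printed recipe, summary() on a recipe object returns one row per variable with its type, role, and source. This is an optional check, not part of the exercise.

# List each variable with its type and role
summary(cdc2_log_rec)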

Train

Train the recipe you just created using the prep function.

Solution

cdc2_log_rec_prep <- cdc2_log_rec %>% 
  prep(training = cdc2_training)

# View results
cdc2_log_rec_prep
## 
## ── Recipe ──────────────────────────────────────────────────────────────────────
## 
## ── Inputs
## Number of variables by role
## outcome:    1
## predictor: 14
## 
## ── Training information
## Training data contained 13997 data points and no incomplete rows.
## 
## ── Operations
## • Log transformation on: weight, height | Trained
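
To see what the trained step actually did, you could bake the prepped recipe on the training data (new_data = NULL) and inspect the transformed columns. This is an optional check; bake() is used the same way later in this document.

# Apply the trained recipe to the training data and look at the logged columns
cdc2_log_rec_prep %>% 
  bake(new_data = NULL) %>% 
  select(height, weight) %>% 
  head()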

Correlated Predictors

Get the correlation matrix of all the numeric variables. See if any variables are highly correlated.

Solution

cdc2_training %>% 
  # Select numeric columns
  select_if(is.numeric) %>% 
  # Calculate correlation matrix
  cor()
##                 exerany     hlthplan     smoke100      height       weight
## exerany      1.00000000  0.070955212 -0.043527207  0.11738043 -0.027142201
## hlthplan     0.07095521  1.000000000 -0.035244745  0.02587647  0.012615287
## smoke100    -0.04352721 -0.035244745  1.000000000  0.08741575  0.059794660
## height       0.11738043  0.025876473  0.087415754  1.00000000  0.555625537
## weight      -0.02714220  0.012615287  0.059794660  0.55562554  1.000000000
## wtdesire     0.04379964  0.008970419  0.070332357  0.76527666  0.808262442
## age         -0.10044332  0.193724415  0.108791751 -0.12485019  0.001249224
## BMI         -0.10953520 -0.003864531  0.015994418  0.03038970  0.840865080
## BMIDes      -0.04582997 -0.011297148  0.025311973  0.23228399  0.705095721
## DesActRatio  0.09292346 -0.013114220  0.006036446  0.18416809 -0.488159874
##                 wtdesire          age          BMI      BMIDes  DesActRatio
## exerany      0.043799642 -0.100443317 -0.109535195 -0.04582997  0.092923460
## hlthplan     0.008970419  0.193724415 -0.003864531 -0.01129715 -0.013114220
## smoke100     0.070332357  0.108791751  0.015994418  0.02531197  0.006036446
## height       0.765276662 -0.124850194  0.030389699  0.23228399  0.184168094
## weight       0.808262442  0.001249224  0.840865080  0.70509572 -0.488159874
## wtdesire     1.000000000 -0.023945258  0.475335478  0.79782114  0.090697377
## age         -0.023945258  1.000000000  0.080982717  0.08866912 -0.043142798
## BMI          0.475335478  0.080982717  1.000000000  0.69989874 -0.711127325
## BMIDes       0.797821140  0.088669124  0.699898738  1.00000000 -0.037189701
## DesActRatio  0.090697377 -0.043142798 -0.711127325 -0.03718970  1.000000000
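
Scanning a 10 x 10 matrix by eye is error-prone, so here is a small base-R sketch that lists the pairs whose absolute correlation exceeds a cutoff. The 0.8 threshold is my choice, not something required by the exercise.

# Recompute the correlation matrix of the numeric predictors
cor_mat <- cdc2_training %>% 
  select_if(is.numeric) %>% 
  cor()

# Keep the upper triangle so each pair appears once, then report
# the pairs whose absolute correlation exceeds 0.8
high <- which(abs(cor_mat) > 0.8 & upper.tri(cor_mat), arr.ind = TRUE)
data.frame(var1 = rownames(cor_mat)[high[, 1]],
           var2 = colnames(cor_mat)[high[, 2]],
           corr = cor_mat[high])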

Plot Correlated Predictors

BMI and weight have the highest correlation at .84. Create a scatterplot of these two.

Solution

# Plot correlated predictors
ggplot(cdc2, aes(x = weight, y = BMI)) + 
  # Add points
  geom_point(size = .2)  + 
  # Add title
  labs(title = "Weight vs. BMI",
       y = 'BMI', x = 'weight') 

The Decision

We now know that BMI and weight are highly correlated, so we should probably remove one of them. I decided to ask ChatGPT how tidymodels makes this decision. Here’s a link to the conversation: https://chat.openai.com/share/3162f941-f906-4771-abc1-d6936ed3d586.
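
tidymodels can also make this decision automatically: step_corr() drops enough numeric predictors to keep every remaining pairwise correlation below a threshold. A minimal sketch, with a 0.8 threshold that I chose (the complete workflow at the end of this document uses the same step):

# Let the recipe drop one variable from each highly correlated pair
cdc2_corr_auto_rec <- recipe(gender ~ ., data = cdc2_training) %>% 
  step_corr(all_numeric(), threshold = 0.8)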

Remove BMI

I decided to remove BMI since I know that it was constructed from weight and height. Here’s a modification of the DataCamp code that does this.

Solution

# Specify a recipe object
cdc2_cor_rec <- recipe(gender ~ .,
                          data = cdc2_training) %>% 
  # Remove correlated variables
  step_rm(BMI)

# Train the recipe
cdc2_cor_rec_prep <- cdc2_cor_rec %>% 
  prep(training = cdc2_training)

# Apply to training data
cdc2_cor_rec_prep %>% 
  bake(new_data = NULL)
## # A tibble: 13,997 × 14
##    genhlth   exerany hlthplan smoke100 height weight wtdesire   age BMIDes
##    <fct>       <dbl>    <dbl>    <dbl>  <dbl>  <int>    <int> <int>  <dbl>
##  1 good            0        1        1     64    125      115    33   19.7
##  2 good            1        1        1     60    105      105    49   20.5
##  3 very good       1        1        0     64    114      114    55   19.6
##  4 very good       1        1        0     67    125      120    33   18.8
##  5 very good       1        1        0     69    200      150    48   22.1
##  6 good            1        1        1     65    160      140    54   23.3
##  7 good            0        1        0     68    190      150    33   22.8
##  8 excellent       1        0        1     67    190      165    44   25.8
##  9 excellent       1        0        1     69    160      150    42   22.1
## 10 very good       1        1        0     61    115      105    32   19.8
## # ℹ 13,987 more rows
## # ℹ 5 more variables: DesActRatio <dbl>, BMICat <fct>, BMIDesCat <fct>,
## #   ageCat <fct>, gender <fct>
# Apply to test data
cdc2_cor_rec_prep %>% 
  bake(cdc2_test)
## # A tibble: 6,000 × 14
##    genhlth   exerany hlthplan smoke100 height weight wtdesire   age BMIDes
##    <fct>       <dbl>    <dbl>    <dbl>  <dbl>  <int>    <int> <int>  <dbl>
##  1 good            1        1        0     66    132      124    42   20.0
##  2 very good       0        1        0     61    150      130    55   24.6
##  3 good            0        1        1     65    150      130    27   21.6
##  4 excellent       1        1        1     70    170      170    69   24.4
##  5 fair            0        1        1     71    185      185    76   25.8
##  6 very good       0        0        1     64    105      120    27   20.6
##  7 good            1        1        1     68    185      160    67   24.3
##  8 very good       1        1        0     67    160      145    29   22.7
##  9 very good       1        1        1     73    210      170    36   22.4
## 10 excellent       1        1        1     64    200      160    65   27.5
## # ℹ 5,990 more rows
## # ℹ 5 more variables: DesActRatio <dbl>, BMICat <fct>, BMIDesCat <fct>,
## #   ageCat <fct>, gender <fct>

Add a Step

Revise the code in the previous chunk. Call the recipe cdc2_norm_rec. Repeat the step that removes BMI and add another step to normalize all the numeric variables.

Solution

# Specify a recipe object
cdc2_norm_rec <- recipe(gender ~ .,
                        data = cdc2_training) %>% 
  # Remove correlated variables
  step_rm(BMI) %>% 
  # Normalize numeric predictors
  step_normalize(all_numeric())

# Train the recipe
cdc2_norm_rec_prep <- cdc2_norm_rec %>% 
  prep(training = cdc2_training)

# Apply to test data
cdc2_norm_rec_prep %>% 
  bake(new_data = cdc2_test)
## # A tibble: 6,000 × 14
##    genhlth   exerany hlthplan smoke100  height  weight wtdesire    age BMIDes
##    <fct>       <dbl>    <dbl>    <dbl>   <dbl>   <dbl>    <dbl>  <dbl>  <dbl>
##  1 good        0.585    0.375   -0.940 -0.285  -0.937    -0.979 -0.175 -1.23 
##  2 very good  -1.71     0.375   -0.940 -1.50   -0.487    -0.790  0.578  0.186
##  3 good       -1.71     0.375    1.06  -0.528  -0.487    -0.790 -1.04  -0.723
##  4 excellent   0.585    0.375    1.06   0.684   0.0134    0.474  1.39   0.133
##  5 fair       -1.71     0.375    1.06   0.926   0.389     0.947  1.79   0.570
##  6 very good  -1.71    -2.67     1.06  -0.770  -1.61     -1.11  -1.04  -1.04 
##  7 good        0.585    0.375    1.06   0.199   0.389     0.158  1.27   0.113
##  8 very good   0.585    0.375   -0.940 -0.0430 -0.237    -0.316 -0.928 -0.389
##  9 very good   0.585    0.375    1.06   1.41    1.01      0.474 -0.523 -0.476
## 10 excellent   0.585    0.375    1.06  -0.770   0.764     0.158  1.16   1.09 
## # ℹ 5,990 more rows
## # ℹ 5 more variables: DesActRatio <dbl>, BMICat <fct>, BMIDesCat <fct>,
## #   ageCat <fct>, gender <fct>
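
One point worth noting: the centering and scaling statistics are learned from the training data during prep(), so the baked training columns have mean 0 and standard deviation 1 exactly, while the baked test columns are only approximately standardized. An optional check:

# Means should be essentially 0 and sds essentially 1 on the training data
cdc2_norm_rec_prep %>% 
  bake(new_data = NULL) %>% 
  summarise(across(where(is.numeric), list(mean = mean, sd = sd)))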

Complete Recipe

Create a new cdc2_recipe which does the following:

  1. Remove BMI.
  2. Normalize all the numeric variables.
  3. Convert all the nominal variables to dummy variables.

Solution

# Create a recipe that predicts gender using the training data
cdc2_recipe <- recipe(gender ~ ., data = cdc2_training) %>% 
  # Remove correlated predictors
  step_rm(BMI) %>% 
  # Normalize numeric predictors
  step_normalize(all_numeric()) %>% 
  # Create dummy variables
  step_dummy(all_nominal(), -all_outcomes())

Finish

Train your recipe and apply it to the test data.

Solution

cdc2_recipe %>% 
  prep(training = cdc2_training) %>% 
  bake(new_data = cdc2_test)
## # A tibble: 6,000 × 25
##    exerany hlthplan smoke100  height  weight wtdesire    age BMIDes DesActRatio
##      <dbl>    <dbl>    <dbl>   <dbl>   <dbl>    <dbl>  <dbl>  <dbl>       <dbl>
##  1   0.585    0.375   -0.940 -0.285  -0.937    -0.979 -0.175 -1.23        0.109
##  2  -1.71     0.375   -0.940 -1.50   -0.487    -0.790  0.578  0.186      -0.550
##  3  -1.71     0.375    1.06  -0.528  -0.487    -0.790 -1.04  -0.723      -0.550
##  4   0.585    0.375    1.06   0.684   0.0134    0.474  1.39   0.133       0.658
##  5  -1.71     0.375    1.06   0.926   0.389     0.947  1.79   0.570       0.658
##  6  -1.71    -2.67     1.06  -0.770  -1.61     -1.11  -1.04  -1.04        1.95 
##  7   0.585    0.375    1.06   0.199   0.389     0.158  1.27   0.113      -0.567
##  8   0.585    0.375   -0.940 -0.0430 -0.237    -0.316 -0.928 -0.389      -0.191
##  9   0.585    0.375    1.06   1.41    1.01      0.474 -0.523 -0.476      -1.07 
## 10   0.585    0.375    1.06  -0.770   0.764     0.158  1.16   1.09       -1.15 
## # ℹ 5,990 more rows
## # ℹ 16 more variables: gender <fct>, genhlth_very.good <dbl>,
## #   genhlth_good <dbl>, genhlth_fair <dbl>, genhlth_poor <dbl>,
## #   BMICat_Normal <dbl>, BMICat_Overweight <dbl>, BMICat_Obese <dbl>,
## #   BMICat_Morbidly.Obese <dbl>, BMIDesCat_Normal <dbl>,
## #   BMIDesCat_Overweight <dbl>, BMIDesCat_Obese <dbl>,
## #   BMIDesCat_Morbidly.Obese <dbl>, ageCat_X32.43 <dbl>, ageCat_X44.57 <dbl>, …

Complete Modeling Workflow

This workflow is complete for a binary choice model using logistic regression. It does not include hyperparameter tuning, which plain logistic regression does not require. A workflow()-based alternative to the manual prep and bake steps is sketched at the end of this section.

Solution

# Specify a logistic regression model
logistic_model <- logistic_reg() %>% 
  # Set the engine
  set_engine('glm') %>% 
  # Set the mode
  set_mode('classification')

# Split the data 

cdc2_split <- initial_split(cdc2,
  prop = .7,
  strata = gender)

cdc2_training <- cdc2_split %>% 
  training()

cdc2_test <- cdc2_split %>% 
  testing()

# Check
nrow(cdc2_training)
## [1] 13997
nrow(cdc2_test)
## [1] 6000
summary(cdc2_test$gender)/nrow(cdc2_test)
##         m         f 
## 0.4783333 0.5216667
summary(cdc2_training$gender)/nrow(cdc2_training)
##         m         f 
## 0.4783882 0.5216118
# Feature Engineering

cdc2_recipe <- recipe(gender ~ ., data = cdc2_training) %>% 
  # Remove highly correlated predictors automatically
  # (step_rm(BMI) would do the same job by hand)
  step_corr(all_numeric(), threshold = 0.8) %>% 
  # Remove the 0/1 indicator variables so the log transform
  # below doesn't produce -Inf for zero values
  step_rm(exerany, smoke100, hlthplan) %>% 
  # Log transform the remaining numeric predictors (optional)
  step_log(all_numeric(), base = 10) %>%
  # Normalize numeric predictors
  step_normalize(all_numeric()) %>% 
  # Create dummy variables
  step_dummy(all_nominal(), -all_outcomes())

# Train recipe
cdc2_recipe_prep <- cdc2_recipe %>% 
  prep(training = cdc2_training)

# Transform training data
cdc2_training_prep <- cdc2_recipe_prep %>% 
  bake(new_data = NULL)

# Transform test data
cdc2_test_prep <- cdc2_recipe_prep %>% 
  bake(new_data = cdc2_test)


# Train logistic model
logistic_fit <- logistic_model %>% 
  fit(gender ~ ., data = cdc2_training_prep)

# Obtain class predictions
class_preds <- predict(logistic_fit, new_data = cdc2_test_prep,
                       type = 'class')
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
# Obtain estimated probabilities
prob_preds <- predict(logistic_fit, new_data = cdc2_test_prep, 
                      type = 'prob')
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
# Create a confusion matrix

cdc2_results <- cdc2_test_prep %>% 
  select(gender) %>% 
  bind_cols(class_preds, prob_preds)

cdc2_results %>% 
  conf_mat(truth = gender, estimate = .pred_class)
##           Truth
## Prediction    m    f
##          m 2577  288
##          f  293 2842
# Calculate sensitivity
cdc2_results %>% 
  sens(truth = gender, estimate = .pred_class)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 sens    binary         0.898
# Calculate specificity
cdc2_results %>% 
  spec(truth = gender, estimate = .pred_class)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 spec    binary         0.908
# Plot ROC curve
# The first level of gender is m, so the event probability column is .pred_m
cdc2_results %>% 
  roc_curve(truth = gender, .pred_m) %>% 
  autoplot()

roc_auc(cdc2_results,
        truth = gender, 
        .pred_m)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 roc_auc binary         0.961
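
The chunk above preps the recipe and bakes both data sets by hand before fitting. The same pieces could instead be bundled with workflow(), which handles the prep and bake bookkeeping automatically. This is a minimal sketch using the objects defined above, not a replacement for the solution.

# Bundle the recipe and the model specification
cdc2_wflow <- workflow() %>% 
  add_recipe(cdc2_recipe) %>% 
  add_model(logistic_model)

# fit() preps the recipe on the training data and fits the model in one call
cdc2_wflow_fit <- cdc2_wflow %>% 
  fit(data = cdc2_training)

# predict() applies the trained recipe to new data before predicting
predict(cdc2_wflow_fit, new_data = cdc2_test, type = 'class')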