Feature Selection

Here we try to identify the predictors that are most important for predicting Balance.

# CHUNK 1
# Uncomment the next line the first time you use ISLR
# install.packages("ISLR")
library(ISLR)
## Warning: package 'ISLR' was built under R version 4.3.3
data(Credit)

# Delete the first column containing row indices
Credit$ID <- NULL

summary(Credit)
##      Income           Limit           Rating          Cards      
##  Min.   : 10.35   Min.   :  855   Min.   : 93.0   Min.   :1.000  
##  1st Qu.: 21.01   1st Qu.: 3088   1st Qu.:247.2   1st Qu.:2.000  
##  Median : 33.12   Median : 4622   Median :344.0   Median :3.000  
##  Mean   : 45.22   Mean   : 4736   Mean   :354.9   Mean   :2.958  
##  3rd Qu.: 57.47   3rd Qu.: 5873   3rd Qu.:437.2   3rd Qu.:4.000  
##  Max.   :186.63   Max.   :13913   Max.   :982.0   Max.   :9.000  
##       Age          Education        Gender    Student   Married  
##  Min.   :23.00   Min.   : 5.00    Male :193   No :360   No :155  
##  1st Qu.:41.75   1st Qu.:11.00   Female:207   Yes: 40   Yes:245  
##  Median :56.00   Median :14.00                                   
##  Mean   :55.67   Mean   :13.45                                   
##  3rd Qu.:70.00   3rd Qu.:16.00                                   
##  Max.   :98.00   Max.   :20.00                                   
##             Ethnicity      Balance       
##  African American: 99   Min.   :   0.00  
##  Asian           :102   1st Qu.:  68.75  
##  Caucasian       :199   Median : 459.50  
##                         Mean   : 520.01  
##                         3rd Qu.: 863.00  
##                         Max.   :1999.00
# CHUNK 2
library(ggplot2)
ggplot(Credit, aes(x = Balance)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

nrow(Credit[Credit$Balance == 0, ])  # OR sum(Credit$Balance == 0)
## [1] 90
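
The `stat_bin()` message above suggests setting `binwidth` explicitly rather than relying on the default 30 bins. A minimal sketch (the binwidth of 100 is an arbitrary illustrative choice, not part of the original chunk):

# Optional: redraw the histogram with an explicit binwidth (100 is an arbitrary choice)
ggplot(Credit, aes(x = Balance)) +
  geom_histogram(binwidth = 100)
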
# CHUNK 3
# Calculate the correlation matrix for numeric variables
# The numeric predictors are in the first 6 columns
cor.matrix <- cor(Credit[, c(1:6, 11)])
print("Correlation Matrix")
## [1] "Correlation Matrix"
round(cor.matrix, digits = 4)
##            Income   Limit  Rating   Cards    Age Education Balance
## Income     1.0000  0.7921  0.7914 -0.0183 0.1753   -0.0277  0.4637
## Limit      0.7921  1.0000  0.9969  0.0102 0.1009   -0.0235  0.8617
## Rating     0.7914  0.9969  1.0000  0.0532 0.1032   -0.0301  0.8636
## Cards     -0.0183  0.0102  0.0532  1.0000 0.0429   -0.0511  0.0865
## Age        0.1753  0.1009  0.1032  0.0429 1.0000    0.0036  0.0018
## Education -0.0277 -0.0235 -0.0301 -0.0511 0.0036    1.0000 -0.0081
## Balance    0.4637  0.8617  0.8636  0.0865 0.0018   -0.0081  1.0000
# Notice the very high correlation (0.9969) between Limit and Rating
# CHUNK 4
ggplot(Credit, aes(x = Limit, y = Rating)) +
  geom_point()

# Delete Limit
Credit$Limit <- NULL

Next, we examine the association between each predictor variable and the target variable, Balance.

# CHUNK 5
# first save the names of the numeric predictors as a vector
vars.numeric <- colnames(Credit[, 1:5])
for (i in vars.numeric) {
  plot <- ggplot(Credit, aes(x = Credit[, i], y = Balance)) +
    geom_point() +
    geom_smooth(method = "lm", se = FALSE) +
    labs(x = i)
  print(plot)
}
## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'
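
The loop above maps `Credit[, i]` inside `aes()`, which works but is discouraged in current ggplot2. A sketch of the same loop using the `.data` pronoun (assuming a reasonably recent ggplot2 that supports tidy evaluation):

# Alternative to CHUNK 5 using the .data pronoun instead of Credit[, i]
for (i in vars.numeric) {
  plot <- ggplot(Credit, aes(x = .data[[i]], y = Balance)) +
    geom_point() +
    geom_smooth(method = "lm", se = FALSE) +
    labs(x = i)
  print(plot)
}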

# CHUNK 6
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.3.3
## Warning: package 'tibble' was built under R version 4.3.2
## Warning: package 'tidyr' was built under R version 4.3.3
## Warning: package 'readr' was built under R version 4.3.3
## Warning: package 'purrr' was built under R version 4.3.3
## Warning: package 'dplyr' was built under R version 4.3.3
## Warning: package 'stringr' was built under R version 4.3.3
## Warning: package 'lubridate' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.5.1
## ✔ lubridate 1.9.4     ✔ tibble    3.2.1
## ✔ purrr     1.0.4     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Save the names of the categorical predictors as a vector
vars.categorical <- c("Gender", "Student", "Married", "Ethnicity")
for (i in vars.categorical) {
  x <- Credit %>%
    group_by_(i) %>%
    summarize(
      mean = mean(Balance),
      median = median(Balance),
      n = n()
      )
  print(x)
}
## Warning: `group_by_()` was deprecated in dplyr 0.7.0.
## ℹ Please use `group_by()` instead.
## ℹ See vignette('programming') for more help
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## # A tibble: 2 × 4
##   Gender    mean median     n
##   <fct>    <dbl>  <int> <int>
## 1 " Male"   510.    463   193
## 2 "Female"  530.    456   207
## Warning: `group_by_()` was deprecated in dplyr 0.7.0.
## ℹ Please use `group_by()` instead.
## ℹ See vignette('programming') for more help
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## # A tibble: 2 × 4
##   Student  mean median     n
##   <fct>   <dbl>  <dbl> <int>
## 1 No       480.    424   360
## 2 Yes      877.    953    40
## Warning: `group_by_()` was deprecated in dplyr 0.7.0.
## ℹ Please use `group_by()` instead.
## ℹ See vignette('programming') for more help
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## # A tibble: 2 × 4
##   Married  mean median     n
##   <fct>   <dbl>  <int> <int>
## 1 No       523.    467   155
## 2 Yes      518.    454   245
## Warning: `group_by_()` was deprecated in dplyr 0.7.0.
## ℹ Please use `group_by()` instead.
## ℹ See vignette('programming') for more help
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## # A tibble: 3 × 4
##   Ethnicity         mean median     n
##   <fct>            <dbl>  <dbl> <int>
## 1 African American  531     480    99
## 2 Asian             512.    414   102
## 3 Caucasian         518.    465   199
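
The warnings above come from `group_by_()`, which is deprecated. One current replacement, as the warning suggests, is the `group_by()` interface; a sketch using `across(all_of(i))` to group by the column whose name is stored in `i` (assuming a recent dplyr):

# Alternative to CHUNK 6 without the deprecated group_by_()
for (i in vars.categorical) {
  x <- Credit %>%
    group_by(across(all_of(i))) %>%
    summarize(
      mean = mean(Balance),
      median = median(Balance),
      n = n()
    )
  print(x)
}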
# CHUNK 7
for (i in vars.categorical) {
  plot <- ggplot(Credit, aes(x = Credit[, i], y = Balance)) +
    geom_boxplot() +
    labs(x = i)
  print(plot)
}

Here we check whether we should consider any interaction terms between the numeric and categorical variables. Think about the difference between a “correlation” and an “interaction”: correlation measures the association between two predictors, whereas an interaction means that the effect of one predictor on Balance depends on the value of another predictor.

# CHUNK 8
ggplot(Credit, aes(x = Student, y = Age)) +
  geom_boxplot()

ggplot(Credit, aes(x = Income, y = Balance, color = Student)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula = 'y ~ x'
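
The separate regression lines above suggest that the Income slope may differ between students and non-students. As a rough numerical check (an illustrative sketch, not part of the original analysis), one can compare models with and without the Income-by-Student interaction:

# Rough check of the Income-by-Student interaction (illustration only)
fit.no.int <- lm(Balance ~ Income + Student, data = Credit)
fit.int    <- lm(Balance ~ Income * Student, data = Credit)
anova(fit.no.int, fit.int)  # partial F-test for the interaction term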

Here, we partition the dataset into training and test sets and fit the linear model (OLS).

# CHUNK 9
library(caret)
## Warning: package 'caret' was built under R version 4.3.3
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
set.seed(8964)
partition <- createDataPartition(Credit$Balance, p = 0.75, list = FALSE)
data.train <- Credit[partition, ]
data.test <- Credit[-partition, ]

print("TRAIN")
## [1] "TRAIN"
mean(data.train$Balance)
## [1] 520.4352
print("TEST")
## [1] "TEST"
mean(data.test$Balance)
## [1] 518.7374

In R, binarization of the factor variables is done automatically when fitting a model, with the baseline level determined by the alphanumeric order of the levels.
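
For example, Student has levels "No" and "Yes"; since "No" comes first alphabetically, it serves as the baseline and `lm()` creates a single dummy variable StudentYes (as seen in the CHUNK 10 output below). A minimal sketch to inspect the level order and the resulting dummy coding:

# Inspect the level order and the dummy coding R will use
levels(Credit$Student)     # the first level listed is the baseline
contrasts(Credit$Student)  # shows the StudentYes dummy variable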

# CHUNK 10
model.full <- lm(Balance ~ . + Income:Student, data = data.train)
summary(model.full)
## 
## Call:
## lm(formula = Balance ~ . + Income:Student, data = data.train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -189.17  -76.46  -15.65   66.69  296.17 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -552.34353   40.34543 -13.690   <2e-16 ***
## Income               -7.93864    0.29465 -26.943   <2e-16 ***
## Rating                4.01246    0.06416  62.540   <2e-16 ***
## Cards                 2.89772    4.69168   0.618   0.5373    
## Age                  -0.78778    0.35994  -2.189   0.0294 *  
## Education             0.88922    1.92035   0.463   0.6437    
## GenderFemale        -18.37030   12.00867  -1.530   0.1272    
## StudentYes          395.67744   30.72302  12.879   <2e-16 ***
## MarriedYes          -20.46469   12.39735  -1.651   0.0999 .  
## EthnicityAsian        6.53269   17.13794   0.381   0.7033    
## EthnicityCaucasian   11.95386   14.82444   0.806   0.4207    
## Income:StudentYes     0.50102    0.50028   1.001   0.3174    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 103.1 on 289 degrees of freedom
## Multiple R-squared:  0.952,  Adjusted R-squared:  0.9502 
## F-statistic: 521.4 on 11 and 289 DF,  p-value: < 2.2e-16

Now do the binarization manually, setting the baseline of each factor to its most populous level.

# CHUNK 11
for (i in vars.categorical){
  # Use the table() function to calculate the frequencies for each factor
  table <- as.data.frame(table(Credit[, i]))
  # Determine the level with the highest frequency
  max <- which.max(table[, 2])
  # Save the name of the level with the highest frequency
  level.name <- as.character(table[max, 1])
  # Set the baseline level to the most populous level
  Credit[, i] <- relevel(Credit[, i], ref = level.name) ## ref is the baseline
}

summary(Credit[, vars.categorical])
##     Gender    Student   Married              Ethnicity  
##  Female:207   No :360   Yes:245   Caucasian       :199  
##   Male :193   Yes: 40   No :155   African American: 99  
##                                   Asian           :102

Refit the model after changing the baseline levels.

# CHUNK 12
# To make sure the factors in both the training and test sets are releveled
# (data.test is used again in CHUNK 24; its design matrix must use the same
# dummy coding as X.train)
data.train <- Credit[partition, ]
data.test <- Credit[-partition, ]

model.full <- lm(Balance ~ . + Income:Student, data = data.train)
summary(model.full)
## 
## Call:
## lm(formula = Balance ~ . + Income:Student, data = data.train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -189.17  -76.46  -15.65   66.69  296.17 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               -579.22466   40.08600 -14.450   <2e-16 ***
## Income                      -7.93864    0.29465 -26.943   <2e-16 ***
## Rating                       4.01246    0.06416  62.540   <2e-16 ***
## Cards                        2.89772    4.69168   0.618   0.5373    
## Age                         -0.78778    0.35994  -2.189   0.0294 *  
## Education                    0.88922    1.92035   0.463   0.6437    
## Gender Male                 18.37030   12.00867   1.530   0.1272    
## StudentYes                 395.67744   30.72302  12.879   <2e-16 ***
## MarriedNo                   20.46469   12.39735   1.651   0.0999 .  
## EthnicityAfrican American  -11.95386   14.82444  -0.806   0.4207    
## EthnicityAsian              -5.42117   14.68523  -0.369   0.7123    
## Income:StudentYes            0.50102    0.50028   1.001   0.3174    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 103.1 on 289 degrees of freedom
## Multiple R-squared:  0.952,  Adjusted R-squared:  0.9502 
## F-statistic: 521.4 on 11 and 289 DF,  p-value: < 2.2e-16
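
Releveling changes only the parameterization, not the fitted model: the residuals, R-squared, and F-statistic here are identical to those in CHUNK 10. In fact, the new intercept can be recovered from the CHUNK 10 estimates by adding the coefficients of the levels that have become the new baselines (Female, Married = Yes, Caucasian); a quick arithmetic check:

# CHUNK 10 intercept plus the GenderFemale, MarriedYes, and EthnicityCaucasian coefficients
-552.34353 + (-18.37030) + (-20.46469) + 11.95386  # = -579.22466, the CHUNK 12 intercept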

Create new columns with the dummy variables based on the manual binarization.

# CHUNK 13
library(caret)
binarizer <- dummyVars(paste("~", paste(vars.categorical, collapse = "+")),
                       data = Credit, fullRank = TRUE)
# OR type out the categorical predictors one by one
#binarizer <- dummyVars(~ Gender + Student + Married + Ethnicity,
#                       data = Credit, fullRank = TRUE)
binarized_vars <- data.frame(predict(binarizer, newdata = Credit))
head(binarized_vars)
##   Gender..Male Student.Yes Married.No Ethnicity.African.American
## 1            1           0          0                          0
## 2            0           1          0                          0
## 3            1           0          1                          0
## 4            0           0          1                          0
## 5            1           0          0                          0
## 6            1           0          1                          0
##   Ethnicity.Asian
## 1               0
## 2               1
## 3               1
## 4               1
## 5               0
## 6               0

Here, we add the dummy variables to the Credit data frame and delete the original categorical variables. Later, in the feature selection part, you will see the reason for the manual binarization.

# CHUNK 14
Credit.bin <- cbind(Credit, binarized_vars)
head(Credit.bin)
##    Income Rating Cards Age Education Gender Student Married Ethnicity Balance
## 1  14.891    283     2  34        11   Male      No     Yes Caucasian     333
## 2 106.025    483     3  82        15 Female     Yes     Yes     Asian     903
## 3 104.593    514     4  71        11   Male      No      No     Asian     580
## 4 148.924    681     3  36        11 Female      No      No     Asian     964
## 5  55.882    357     2  68        16   Male      No     Yes Caucasian     331
## 6  80.180    569     4  77        10   Male      No      No Caucasian    1151
##   Gender..Male Student.Yes Married.No Ethnicity.African.American
## 1            1           0          0                          0
## 2            0           1          0                          0
## 3            1           0          1                          0
## 4            0           0          1                          0
## 5            1           0          0                          0
## 6            1           0          1                          0
##   Ethnicity.Asian
## 1               0
## 2               1
## 3               1
## 4               1
## 5               0
## 6               0
Credit.bin$Gender <- NULL
Credit.bin$Student <- NULL
Credit.bin$Married <- NULL
Credit.bin$Ethnicity <- NULL
head(Credit.bin)
##    Income Rating Cards Age Education Balance Gender..Male Student.Yes
## 1  14.891    283     2  34        11     333            1           0
## 2 106.025    483     3  82        15     903            0           1
## 3 104.593    514     4  71        11     580            1           0
## 4 148.924    681     3  36        11     964            0           0
## 5  55.882    357     2  68        16     331            1           0
## 6  80.180    569     4  77        10    1151            1           0
##   Married.No Ethnicity.African.American Ethnicity.Asian
## 1          0                          0               0
## 2          0                          0               1
## 3          1                          0               1
## 4          1                          0               1
## 5          0                          0               0
## 6          1                          0               0
data.train.bin <- Credit.bin[partition, ]
data.test.bin <- Credit.bin[-partition, ]

Fit the linear model to the new training set (with the manually created dummy variables).

# CHUNK 15
# The interaction term is now Income:Student.Yes, not Income:Student
model.full.bin <- lm(Balance ~ . + Income:Student.Yes, data = data.train.bin)
summary(model.full.bin)
## 
## Call:
## lm(formula = Balance ~ . + Income:Student.Yes, data = data.train.bin)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -189.17  -76.46  -15.65   66.69  296.17 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                -579.22466   40.08600 -14.450   <2e-16 ***
## Income                       -7.93864    0.29465 -26.943   <2e-16 ***
## Rating                        4.01246    0.06416  62.540   <2e-16 ***
## Cards                         2.89772    4.69168   0.618   0.5373    
## Age                          -0.78778    0.35994  -2.189   0.0294 *  
## Education                     0.88922    1.92035   0.463   0.6437    
## Gender..Male                 18.37030   12.00867   1.530   0.1272    
## Student.Yes                 395.67744   30.72302  12.879   <2e-16 ***
## Married.No                   20.46469   12.39735   1.651   0.0999 .  
## Ethnicity.African.American  -11.95386   14.82444  -0.806   0.4207    
## Ethnicity.Asian              -5.42117   14.68523  -0.369   0.7123    
## Income:Student.Yes            0.50102    0.50028   1.001   0.3174    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 103.1 on 289 degrees of freedom
## Multiple R-squared:  0.952,  Adjusted R-squared:  0.9502 
## F-statistic: 521.4 on 11 and 289 DF,  p-value: < 2.2e-16

Select the best subset of predictors using the backward and forward stepwise selection methods.

# CHUNK 16
# install.packages("MASS")  # uncomment this line the first time you use MASS
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
model.backward.AIC <- stepAIC(model.full.bin)
## Start:  AIC=2802.25
## Balance ~ Income + Rating + Cards + Age + Education + Gender..Male + 
##     Student.Yes + Married.No + Ethnicity.African.American + Ethnicity.Asian + 
##     Income:Student.Yes
## 
##                              Df Sum of Sq      RSS    AIC
## - Ethnicity.Asian             1      1448  3071419 2800.4
## - Education                   1      2278  3072249 2800.5
## - Cards                       1      4052  3074024 2800.6
## - Ethnicity.African.American  1      6907  3076879 2800.9
## - Income:Student.Yes          1     10654  3080626 2801.3
## <none>                                     3069972 2802.3
## - Gender..Male                1     24859  3094830 2802.7
## - Married.No                  1     28946  3098918 2803.1
## - Age                         1     50885  3120857 2805.2
## - Rating                      1  41548705 44618677 3605.9
## 
## Step:  AIC=2800.39
## Balance ~ Income + Rating + Cards + Age + Education + Gender..Male + 
##     Student.Yes + Married.No + Ethnicity.African.American + Income:Student.Yes
## 
##                              Df Sum of Sq      RSS    AIC
## - Education                   1      2255  3073675 2798.6
## - Cards                       1      3911  3075331 2798.8
## - Ethnicity.African.American  1      5595  3077014 2798.9
## - Income:Student.Yes          1     10176  3081595 2799.4
## <none>                                     3071419 2800.4
## - Gender..Male                1     25165  3096584 2800.8
## - Married.No                  1     29351  3100771 2801.3
## - Age                         1     51462  3122881 2803.4
## - Rating                      1  41550276 44621695 3603.9
## 
## Step:  AIC=2798.61
## Balance ~ Income + Rating + Cards + Age + Gender..Male + Student.Yes + 
##     Married.No + Ethnicity.African.American + Income:Student.Yes
## 
##                              Df Sum of Sq      RSS    AIC
## - Cards                       1      3865  3077540 2797.0
## - Ethnicity.African.American  1      5586  3079261 2797.2
## - Income:Student.Yes          1     10214  3083888 2797.6
## <none>                                     3073675 2798.6
## - Gender..Male                1     25155  3098830 2799.1
## - Married.No                  1     28499  3102174 2799.4
## - Age                         1     51652  3125327 2801.6
## - Rating                      1  41550265 44623939 3601.9
## 
## Step:  AIC=2796.99
## Balance ~ Income + Rating + Age + Gender..Male + Student.Yes + 
##     Married.No + Ethnicity.African.American + Income:Student.Yes
## 
##                              Df Sum of Sq      RSS    AIC
## - Ethnicity.African.American  1      5579  3083119 2795.5
## - Income:Student.Yes          1     10767  3088307 2796.0
## <none>                                     3077540 2797.0
## - Gender..Male                1     23659  3101198 2797.3
## - Married.No                  1     27917  3105457 2797.7
## - Age                         1     49660  3127200 2799.8
## - Rating                      1  42161655 45239195 3604.0
## 
## Step:  AIC=2795.54
## Balance ~ Income + Rating + Age + Gender..Male + Student.Yes + 
##     Married.No + Income:Student.Yes
## 
##                      Df Sum of Sq      RSS    AIC
## - Income:Student.Yes  1      9873  3092992 2794.5
## <none>                             3083119 2795.5
## - Gender..Male        1     23435  3106554 2795.8
## - Married.No          1     24716  3107835 2795.9
## - Age                 1     50672  3133791 2798.4
## - Rating              1  42165674 45248793 3602.1
## 
## Step:  AIC=2794.5
## Balance ~ Income + Rating + Age + Gender..Male + Student.Yes + 
##     Married.No
## 
##                Df Sum of Sq      RSS    AIC
## <none>                       3092992 2794.5
## - Gender..Male  1     22314  3115306 2794.7
## - Married.No    1     25027  3118019 2794.9
## - Age           1     49133  3142125 2797.2
## - Student.Yes   1   4931600  8024592 3079.5
## - Income        1   8357794 11450786 3186.5
## - Rating        1  42247226 45340218 3600.7
summary(model.backward.AIC)
## 
## Call:
## lm(formula = Balance ~ Income + Rating + Age + Gender..Male + 
##     Student.Yes + Married.No, data = data.train.bin)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -195.16  -76.45  -14.03   65.33  285.30 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -565.22267   26.73779 -21.139   <2e-16 ***
## Income         -7.87666    0.27946 -28.186   <2e-16 ***
## Rating          4.01180    0.06331  63.370   <2e-16 ***
## Age            -0.77016    0.35638  -2.161   0.0315 *  
## Gender..Male   17.32103   11.89326   1.456   0.1464    
## Student.Yes   418.87031   19.34645  21.651   <2e-16 ***
## Married.No     18.73773   12.14860   1.542   0.1241    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 102.6 on 294 degrees of freedom
## Multiple R-squared:  0.9517, Adjusted R-squared:  0.9507 
## F-statistic: 964.8 on 6 and 294 DF,  p-value: < 2.2e-16

What if we run the stepwise selection on the model fitted without the manually binarized variables? You can see that, without manual binarization, each categorical variable is added or dropped as a whole, so its individual levels cannot be kept or removed separately.

** For demonstration purposes, only one step is included.

# CHUNK 17
model.no.binarize <- stepAIC(model.full, steps = 1)
## Start:  AIC=2802.25
## Balance ~ Income + Rating + Cards + Age + Education + Gender + 
##     Student + Married + Ethnicity + Income:Student
## 
##                  Df Sum of Sq      RSS    AIC
## - Ethnicity       2      7042  3077014 2798.9
## - Education       1      2278  3072249 2800.5
## - Cards           1      4052  3074024 2800.6
## - Income:Student  1     10654  3080626 2801.3
## <none>                         3069972 2802.3
## - Gender          1     24859  3094830 2802.7
## - Married         1     28946  3098918 2803.1
## - Age             1     50885  3120857 2805.2
## - Rating          1  41548705 44618677 3605.9
## 
## Step:  AIC=2798.94
## Balance ~ Income + Rating + Cards + Age + Education + Gender + 
##     Student + Married + Income:Student

Here, we use forward selection with the BIC.

# CHUNK 18
# first fit the null model (i.e., model with no predictors)
model.null <- lm(Balance ~ 1, data = data.train.bin)

model.forward.BIC <- stepAIC(model.null,
                             direction = "forward",
                             scope = list(upper = model.full.bin,
                                          lower = model.null),
                             # k = 2 (the default) gives the AIC;
                             # k = log(nrow(data.train.bin)) gives the BIC
                             k = log(nrow(data.train.bin)))
## Start:  AIC=3698.12
## Balance ~ 1
## 
##                              Df Sum of Sq      RSS    AIC
## + Rating                      1  47082983 16908423 3303.2
## + Income                      1  12910007 51081399 3636.0
## + Student.Yes                 1   5425335 58566071 3677.2
## <none>                                    63991406 3698.1
## + Cards                       1    163291 63828115 3703.1
## + Gender..Male                1    127201 63864205 3703.2
## + Age                         1     79880 63911526 3703.5
## + Education                   1     13121 63978284 3703.8
## + Married.No                  1     10920 63980486 3703.8
## + Ethnicity.Asian             1      3049 63988357 3703.8
## + Ethnicity.African.American  1         0 63991406 3703.8
## 
## Step:  AIC=3303.21
## Balance ~ Rating
## 
##                              Df Sum of Sq      RSS    AIC
## + Income                      1   8650956  8257467 3093.2
## + Student.Yes                 1   4817765 12090659 3208.0
## + Age                         1    707560 16200863 3296.1
## <none>                                    16908423 3303.2
## + Married.No                  1    174547 16733876 3305.8
## + Education                   1     71402 16837022 3307.6
## + Ethnicity.Asian             1     49620 16858804 3308.0
## + Gender..Male                1     31887 16876536 3308.4
## + Cards                       1     30049 16878374 3308.4
## + Ethnicity.African.American  1      4304 16904120 3308.8
## 
## Step:  AIC=3093.2
## Balance ~ Rating + Income
## 
##                              Df Sum of Sq     RSS    AIC
## + Student.Yes                 1   5069114 3188353 2812.5
## <none>                                    8257467 3093.2
## + Married.No                  1    127904 8129563 3094.2
## + Age                         1     95208 8162259 3095.4
## + Education                   1     44496 8212971 3097.3
## + Ethnicity.Asian             1     40115 8217352 3097.4
## + Cards                       1     22019 8235448 3098.1
## + Ethnicity.African.American  1      9464 8248003 3098.6
## + Gender..Male                1       244 8257223 3098.9
## 
## Step:  AIC=2812.47
## Balance ~ Rating + Income + Student.Yes
## 
##                              Df Sum of Sq     RSS    AIC
## <none>                                    3188353 2812.5
## + Age                         1     46257 3142096 2813.8
## + Gender..Male                1     24128 3164225 2815.9
## + Married.No                  1     23759 3164595 2815.9
## + Income:Student.Yes          1      7569 3180784 2817.5
## + Ethnicity.African.American  1      2346 3186007 2817.9
## + Education                   1      1600 3186753 2818.0
## + Cards                       1       865 3187488 2818.1
## + Ethnicity.Asian             1       747 3187606 2818.1
summary(model.forward.BIC)
## 
## Call:
## lm(formula = Balance ~ Rating + Income + Student.Yes, data = data.train.bin)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -220.292  -78.120   -7.473   63.273  299.895 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -589.16636   16.11012  -36.57   <2e-16 ***
## Rating         4.01634    0.06353   63.22   <2e-16 ***
## Income        -7.97413    0.27691  -28.80   <2e-16 ***
## Student.Yes  421.17339   19.38206   21.73   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 103.6 on 297 degrees of freedom
## Multiple R-squared:  0.9502, Adjusted R-squared:  0.9497 
## F-statistic:  1888 on 3 and 297 DF,  p-value: < 2.2e-16
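
For comparison, the same forward search with the AIC instead of the BIC simply omits the k argument, since the default k = 2 corresponds to the AIC (as noted in the comment in CHUNK 18); a sketch, not run here:

# Forward selection with the AIC (k = 2 is the stepAIC default)
model.forward.AIC <- stepAIC(model.null,
                             direction = "forward",
                             scope = list(upper = model.full.bin,
                                          lower = model.null))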

Compute the RMSE on the test set for each model; this measures prediction accuracy.

# CHUNK 19
RMSE(data.test$Balance, predict(model.null, newdata = data.test.bin)) # intercept only (no predictors)
## [1] 453.3665
RMSE(data.test$Balance, predict(model.full.bin, newdata = data.test.bin)) # 11 predictors 
## [1] 105.0656
RMSE(data.test$Balance, predict(model.backward.AIC, newdata = data.test.bin)) # 6 predictors
## [1] 104.2375
RMSE(data.test$Balance, predict(model.forward.BIC, newdata = data.test.bin)) # 3 predictors
## [1] 102.6785
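
`RMSE()` here is the caret helper; it is simply the square root of the mean squared prediction error. A minimal sketch computing it by hand for the forward-selection model:

# RMSE by hand: square root of the mean squared prediction error on the test set
pred.fwd <- predict(model.forward.BIC, newdata = data.test.bin)
sqrt(mean((data.test$Balance - pred.fwd)^2))  # should match the value above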

We choose the model with the smallest test RMSE; here, the model selected by forward selection with the BIC performs best. Now, let's do the model diagnostics for that model.

# CHUNK 20
plot(model.forward.BIC)
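
`plot()` on an lm object produces the four standard diagnostic plots one after another; to see them on one page, a 2-by-2 plotting grid can be set first (an optional convenience, not part of the original chunk):

# Optional: display all four diagnostic plots in a 2 x 2 grid
par(mfrow = c(2, 2))
plot(model.forward.BIC)
par(mfrow = c(1, 1))  # reset the plotting layout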

# CHUNK 21
# glmnet requires a numeric design matrix, so create it with model.matrix()
X.train <- model.matrix(Balance ~ . + Income:Student, data = data.train)
head(X.train)  # print out a few rows of the design matrix
##   (Intercept)  Income Rating Cards Age Education Gender Male StudentYes
## 1           1  14.891    283     2  34        11           1          0
## 2           1 106.025    483     3  82        15           0          1
## 3           1 104.593    514     4  71        11           1          0
## 6           1  80.180    569     4  77        10           1          0
## 8           1  71.408    512     2  87         9           1          0
## 9           1  15.125    266     5  66        13           0          0
##   MarriedNo EthnicityAfrican American EthnicityAsian Income:StudentYes
## 1         0                         0              0             0.000
## 2         0                         0              1           106.025
## 3         1                         0              1             0.000
## 6         1                         0              0             0.000
## 8         1                         0              1             0.000
## 9         1                         0              0             0.000
# CHUNK 22
# install.packages("glmnet")  # uncomment this line the first time you use glmnet
library(glmnet)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## Loaded glmnet 4.1-10
# Fit an elastic net (alpha = 0.5 mixes the ridge and lasso penalties)
# at five prespecified values of the regularization parameter lambda
m.lambda <- glmnet(x = X.train,
                   y = data.train$Balance,
                   family = "gaussian",
                   lambda = c(0, 10, 100, 500, 1000),
                   alpha = 0.5)

# first way to extract coefficient estimates
m.lambda$a0
##        s0        s1        s2        s3        s4 
##  520.4352  301.2973 -224.9669 -506.0179 -579.1925
m.lambda$beta
## 12 x 5 sparse Matrix of class "dgCMatrix"
##                           s0        s1         s2           s3          s4
## (Intercept)                . .           .          .            .        
## Income                     . .           .         -6.63360837  -7.9320588
## Rating                     . 0.6182676   2.034434   3.71042914   4.0114839
## Cards                      . .           .          0.51433943   2.9161946
## Age                        . .           .         -0.70550849  -0.7885101
## Education                  . .           .          .            0.8874960
## Gender Male                . .           .          3.08510246  18.3671409
## StudentYes                 . .         228.762987 396.62014836 396.2181065
## MarriedNo                  . .           .          9.36968005  20.4646766
## EthnicityAfrican American  . .           .          .          -11.9281110
## EthnicityAsian             . .           .          .           -5.3983474
## Income:StudentYes          . .           .          0.01473463   0.4929168
# second way
coef(m.lambda)
## 13 x 5 sparse Matrix of class "dgCMatrix"
##                                 s0          s1          s2            s3
## (Intercept)               520.4352 301.2973356 -224.966926 -506.01794831
## (Intercept)                 .        .            .           .         
## Income                      .        .            .          -6.63360837
## Rating                      .        0.6182676    2.034434    3.71042914
## Cards                       .        .            .           0.51433943
## Age                         .        .            .          -0.70550849
## Education                   .        .            .           .         
## Gender Male                 .        .            .           3.08510246
## StudentYes                  .        .          228.762987  396.62014836
## MarriedNo                   .        .            .           9.36968005
## EthnicityAfrican American   .        .            .           .         
## EthnicityAsian              .        .            .           .         
## Income:StudentYes           .        .            .           0.01473463
##                                     s4
## (Intercept)               -579.1924633
## (Intercept)                  .        
## Income                      -7.9320588
## Rating                       4.0114839
## Cards                        2.9161946
## Age                         -0.7885101
## Education                    0.8874960
## Gender Male                 18.3671409
## StudentYes                 396.2181065
## MarriedNo                   20.4646766
## EthnicityAfrican American  -11.9281110
## EthnicityAsian              -5.3983474
## Income:StudentYes            0.4929168
mean(data.train$Balance)
## [1] 520.4352
# CHUNK 23
set.seed(1111)
# Use cross-validation to choose lambda for the elastic net (alpha = 0.5)
m <- cv.glmnet(x = X.train,
               y = data.train$Balance,
               family = "gaussian",
               alpha = 0.5)
plot(m)

m$lambda.min
## [1] 0.8885539
m$lambda.1se
## [1] 7.550517
# CHUNK 24
# Fit the regularized model using lambda.min
m.min <- glmnet(x = X.train,
                y = data.train$Balance,
                family = "gaussian",
                lambda = m$lambda.min,
                alpha = 0.5)

# List the coefficient estimates
m.min$beta
## 12 x 1 sparse Matrix of class "dgCMatrix"
##                                    s0
## (Intercept)                 .        
## Income                     -7.8131829
## Rating                      3.9839963
## Cards                       2.6915109
## Age                        -0.7829239
## Education                   0.7557949
## Gender Male                17.0049498
## StudentYes                396.3329355
## MarriedNo                  19.4182020
## EthnicityAfrican American  -9.9708020
## EthnicityAsian             -3.6479011
## Income:StudentYes           0.4458288
# Set up the design matrix for the test set
X.test <- model.matrix(Balance ~ . + Income:Student, data = data.test)

# Make predictions on the test set and calculate test RMSE
m.min.pred <- predict(m.min, newx = X.test)
RMSE(data.test$Balance, m.min.pred)
## [1] 103.9201
# CHUNK 25
# Fit the regularized model using lambda.1se
m.1se <- glmnet(x = X.train,
                y = data.train$Balance,
                family = "gaussian",
                lambda = m$lambda.1se,
                alpha = 0.5)

# List the coefficient estimates
m.1se$beta
## 12 x 1 sparse Matrix of class "dgCMatrix"
##                                    s0
## (Intercept)                 .        
## Income                     -6.9378441
## Rating                      3.7813573
## Cards                       1.1075391
## Age                        -0.7305808
## Education                   .        
## Gender Male                 6.7717419
## StudentYes                397.5199346
## MarriedNo                  11.6768447
## EthnicityAfrican American   .        
## EthnicityAsian              .        
## Income:StudentYes           0.1148304
# Make predictions on the test set and calculate test RMSE
m.1se.pred <- predict(m.1se, newx = X.test)
RMSE(data.test$Balance, m.1se.pred)
## [1] 102.8241