credit <- read.csv('Credit.csv')
str(credit)
## 'data.frame':    400 obs. of  12 variables:
##  $ X        : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Income   : num  14.9 106 104.6 148.9 55.9 ...
##  $ Limit    : int  3606 6645 7075 9504 4897 8047 3388 7114 3300 6819 ...
##  $ Rating   : int  283 483 514 681 357 569 259 512 266 491 ...
##  $ Cards    : int  2 3 4 3 2 4 2 2 5 3 ...
##  $ Age      : int  34 82 71 36 68 77 37 87 66 41 ...
##  $ Education: int  11 15 11 11 16 10 12 9 13 19 ...
##  $ Gender   : chr  "Males" "Female" "Males" "Female" ...
##  $ Student  : chr  "No" "Yes" "No" "No" ...
##  $ Married  : chr  "Yes" "Yes" "No" "No" ...
##  $ Ethnicity: chr  "Caucasian" "Asian" "Asian" "Asian" ...
##  $ Balance  : int  333 903 580 964 331 1151 203 872 279 1350 ...
credit$X <- NULL
summary(credit)
##      Income           Limit           Rating          Cards      
##  Min.   : 10.35   Min.   :  855   Min.   : 93.0   Min.   :1.000  
##  1st Qu.: 21.01   1st Qu.: 3088   1st Qu.:247.2   1st Qu.:2.000  
##  Median : 33.12   Median : 4622   Median :344.0   Median :3.000  
##  Mean   : 45.22   Mean   : 4736   Mean   :354.9   Mean   :2.958  
##  3rd Qu.: 57.47   3rd Qu.: 5873   3rd Qu.:437.2   3rd Qu.:4.000  
##  Max.   :186.63   Max.   :13913   Max.   :982.0   Max.   :9.000  
##       Age          Education        Gender            Student         
##  Min.   :23.00   Min.   : 5.00   Length:400         Length:400        
##  1st Qu.:41.75   1st Qu.:11.00   Class :character   Class :character  
##  Median :56.00   Median :14.00   Mode  :character   Mode  :character  
##  Mean   :55.67   Mean   :13.45                                        
##  3rd Qu.:70.00   3rd Qu.:16.00                                        
##  Max.   :98.00   Max.   :20.00                                        
##    Married           Ethnicity            Balance       
##  Length:400         Length:400         Min.   :   0.00  
##  Class :character   Class :character   1st Qu.:  68.75  
##  Mode  :character   Mode  :character   Median : 459.50  
##                                        Mean   : 520.01  
##                                        3rd Qu.: 863.00  
##                                        Max.   :1999.00

distribution of Balance

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0     ✔ purrr   1.0.1
## ✔ tibble  3.1.8     ✔ dplyr   1.1.0
## ✔ tidyr   1.3.0     ✔ stringr 1.5.0
## ✔ readr   2.1.4     ✔ forcats 1.0.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
credit %>% ggplot(aes(x=Balance)) +
  geom_histogram(bins = 15) +
  labs(x = 'Balance')

calculate the number of 0 Balance

nrow(credit[credit$Balance == 0, ])
## [1] 90

relationship between Balance and numeric predictors

vars.numeric <- colnames(credit)[1:6]
numeric_df <- cbind(credit[, 1:6], credit$Balance)

for (col_i in 1:6) {
  plot <- numeric_df %>% ggplot(
    aes(x = numeric_df[, col_i], y = credit$Balance)
  ) +
    geom_point() +
    geom_smooth(method='lm', se = FALSE) +
    labs(x = vars.numeric[col_i])
  print(plot)
}
## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

explore the relationship between Rating and Limit

numeric_df %>% ggplot(aes(x = Limit, y = Rating)) +
  geom_point() +
  geom_smooth(method='lm', se=F)
## `geom_smooth()` using formula = 'y ~ x'

credit$Limit <- NULL

relationship between Balance and categorical predictors

vars.category <- colnames(credit)[6:9]
category_df <- cbind(credit[, 6:9], credit$Balance)

for (col_i in 1:4) {
  plot <- category_df %>% ggplot(
    aes(x = category_df[, col_i], y = credit$Balance)
  ) +
    geom_boxplot() +
    labs(x = vars.category[col_i])
  print(plot)
}

get the interaction relationship

interaction between Student and Income

credit %>% ggplot(aes(x = Income, y = Balance)) +
  geom_point(aes(color = Student)) +
  geom_smooth(aes(color = Student), method='lm', se = F)
## `geom_smooth()` using formula = 'y ~ x'

interaction between Student and Rating

credit %>% ggplot(aes(x=Rating, y=Balance)) +
  geom_point(aes(color=Student)) +
  geom_smooth(aes(color=Student), method='lm', se=F)
## `geom_smooth()` using formula = 'y ~ x'

change the categorical variables to factor and set the base level with the most common level

for (var in vars.category) {
  var_table <- as.data.frame(table(credit[, var]))
  most_common <- which.max(var_table$Freq)
  base_level <- as.character(var_table[most_common, 'Var1'])
  credit[[var]] <- relevel(factor(credit[, var]), ref=base_level)
}
str(credit)
## 'data.frame':    400 obs. of  10 variables:
##  $ Income   : num  14.9 106 104.6 148.9 55.9 ...
##  $ Rating   : int  283 483 514 681 357 569 259 512 266 491 ...
##  $ Cards    : int  2 3 4 3 2 4 2 2 5 3 ...
##  $ Age      : int  34 82 71 36 68 77 37 87 66 41 ...
##  $ Education: int  11 15 11 11 16 10 12 9 13 19 ...
##  $ Gender   : Factor w/ 2 levels "Female","Males": 2 1 2 1 2 2 1 2 1 1 ...
##  $ Student  : Factor w/ 2 levels "No","Yes": 1 2 1 1 1 1 1 1 1 2 ...
##  $ Married  : Factor w/ 2 levels "Yes","No": 1 1 2 2 1 2 2 2 2 1 ...
##  $ Ethnicity: Factor w/ 3 levels "Caucasian","African American",..: 1 3 3 3 1 1 2 3 1 2 ...
##  $ Balance  : int  333 903 580 964 331 1151 203 872 279 1350 ...

training and test dataset splitting

library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
set.seed(123)
partition <- createDataPartition(credit$Balance, p=0.7, list=F)
trains <- credit[partition, ]
tests <- credit[-partition,]

summary(trains$Balance)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0    70.0   463.0   512.4   863.0  1779.0
summary(tests$Balance)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0    60.5   453.0   538.0   853.5  1999.0

model fitting with full variables

model.full <- lm(Balance ~ . + Income:Student + Rating:Student, data=trains)
summary(model.full)
## 
## Call:
## lm(formula = Balance ~ . + Income:Student + Rating:Student, data = trains)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -217.28  -73.14  -13.93   71.91  282.53 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               -532.27287   41.58440 -12.800  < 2e-16 ***
## Income                      -7.46589    0.31320 -23.838  < 2e-16 ***
## Rating                       3.85969    0.06905  55.899  < 2e-16 ***
## Cards                        4.51095    4.71744   0.956 0.339819    
## Age                         -0.64226    0.37220  -1.726 0.085574 .  
## Education                   -0.25118    2.01099  -0.125 0.900693    
## GenderMales                  8.34677   12.68353   0.658 0.511052    
## StudentYes                 191.55757   70.17185   2.730 0.006756 ** 
## MarriedNo                   19.06950   13.26737   1.437 0.151793    
## EthnicityAfrican American  -20.67684   15.57792  -1.327 0.185533    
## EthnicityAsian              -3.14081   15.34472  -0.205 0.837975    
## Income:StudentYes           -2.56262    1.28661  -1.992 0.047412 *  
## Rating:StudentYes            0.96083    0.26575   3.616 0.000358 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 104.2 on 268 degrees of freedom
## Multiple R-squared:  0.9476, Adjusted R-squared:  0.9452 
## F-statistic: 403.5 on 12 and 268 DF,  p-value: < 2.2e-16

Dummy variables

binarizer <- dummyVars(paste('~ ', paste(vars.category, collapse = '+')), data=credit, fullRank = TRUE)

binarized_vars <- data.frame(predict(binarizer, newdata=credit))

binarized_vars %>% head()
##   Gender.Males Student.Yes Married.No Ethnicity.African.American
## 1            1           0          0                          0
## 2            0           1          0                          0
## 3            1           0          1                          0
## 4            0           0          1                          0
## 5            1           0          0                          0
## 6            1           0          1                          0
##   Ethnicity.Asian
## 1               0
## 2               1
## 3               1
## 4               1
## 5               0
## 6               0

combine the original credit data and binarized data

credit.bin <- cbind(credit, binarized_vars)
credit.bin$Gender <- NULL
credit.bin$Student <- NULL
credit.bin$Married <- NULL
credit.bin$Ethnicity <- NULL

head(credit.bin)
##    Income Rating Cards Age Education Balance Gender.Males Student.Yes
## 1  14.891    283     2  34        11     333            1           0
## 2 106.025    483     3  82        15     903            0           1
## 3 104.593    514     4  71        11     580            1           0
## 4 148.924    681     3  36        11     964            0           0
## 5  55.882    357     2  68        16     331            1           0
## 6  80.180    569     4  77        10    1151            1           0
##   Married.No Ethnicity.African.American Ethnicity.Asian
## 1          0                          0               0
## 2          0                          0               1
## 3          1                          0               1
## 4          1                          0               1
## 5          0                          0               0
## 6          1                          0               0
trains.bin <- credit.bin[partition, ]
tests.bin <- credit.bin[-partition, ]
model.full.bin <- lm(Balance ~.+Income:Student.Yes+Rating:Student.Yes, data=trains.bin)
summary(model.full.bin)
## 
## Call:
## lm(formula = Balance ~ . + Income:Student.Yes + Rating:Student.Yes, 
##     data = trains.bin)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -217.28  -73.14  -13.93   71.91  282.53 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                -532.27287   41.58440 -12.800  < 2e-16 ***
## Income                       -7.46589    0.31320 -23.838  < 2e-16 ***
## Rating                        3.85969    0.06905  55.899  < 2e-16 ***
## Cards                         4.51095    4.71744   0.956 0.339819    
## Age                          -0.64226    0.37220  -1.726 0.085574 .  
## Education                    -0.25118    2.01099  -0.125 0.900693    
## Gender.Males                  8.34677   12.68353   0.658 0.511052    
## Student.Yes                 191.55757   70.17185   2.730 0.006756 ** 
## Married.No                   19.06950   13.26737   1.437 0.151793    
## Ethnicity.African.American  -20.67684   15.57792  -1.327 0.185533    
## Ethnicity.Asian              -3.14081   15.34472  -0.205 0.837975    
## Income:Student.Yes           -2.56262    1.28661  -1.992 0.047412 *  
## Rating:Student.Yes            0.96083    0.26575   3.616 0.000358 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 104.2 on 268 degrees of freedom
## Multiple R-squared:  0.9476, Adjusted R-squared:  0.9452 
## F-statistic: 403.5 on 12 and 268 DF,  p-value: < 2.2e-16

feature selections

using drop1() function

drop1(model.full)
## Single term deletions
## 
## Model:
## Balance ~ Income + Rating + Cards + Age + Education + Gender + 
##     Student + Married + Ethnicity + Income:Student + Rating:Student
##                Df Sum of Sq     RSS    AIC
## <none>                      2908151 2623.8
## Cards           1      9922 2918074 2622.7
## Age             1     32311 2940463 2624.9
## Education       1       169 2908321 2621.8
## Gender          1      4699 2912851 2622.2
## Married         1     22418 2930569 2623.9
## Ethnicity       2     19759 2927910 2621.7
## Income:Student  1     43048 2951200 2625.9
## Rating:Student  1    141853 3050005 2635.1
drop1(model.full.bin)
## Single term deletions
## 
## Model:
## Balance ~ Income + Rating + Cards + Age + Education + Gender.Males + 
##     Student.Yes + Married.No + Ethnicity.African.American + Ethnicity.Asian + 
##     Income:Student.Yes + Rating:Student.Yes
##                            Df Sum of Sq     RSS    AIC
## <none>                                  2908151 2623.8
## Cards                       1      9922 2918074 2622.7
## Age                         1     32311 2940463 2624.9
## Education                   1       169 2908321 2621.8
## Gender.Males                1      4699 2912851 2622.2
## Married.No                  1     22418 2930569 2623.9
## Ethnicity.African.American  1     19118 2927269 2623.6
## Ethnicity.Asian             1       455 2908606 2621.8
## Income:Student.Yes          1     43048 2951200 2625.9
## Rating:Student.Yes          1    141853 3050005 2635.1

using stepAIC() function with direction = ‘backward’

library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
model.backward <- stepAIC(model.full.bin)
## Start:  AIC=2623.75
## Balance ~ Income + Rating + Cards + Age + Education + Gender.Males + 
##     Student.Yes + Married.No + Ethnicity.African.American + Ethnicity.Asian + 
##     Income:Student.Yes + Rating:Student.Yes
## 
##                              Df Sum of Sq     RSS    AIC
## - Education                   1       169 2908321 2621.8
## - Ethnicity.Asian             1       455 2908606 2621.8
## - Gender.Males                1      4699 2912851 2622.2
## - Cards                       1      9922 2918074 2622.7
## - Ethnicity.African.American  1     19118 2927269 2623.6
## <none>                                    2908151 2623.8
## - Married.No                  1     22418 2930569 2623.9
## - Age                         1     32311 2940463 2624.9
## - Income:Student.Yes          1     43048 2951200 2625.9
## - Rating:Student.Yes          1    141853 3050005 2635.1
## 
## Step:  AIC=2621.77
## Balance ~ Income + Rating + Cards + Age + Gender.Males + Student.Yes + 
##     Married.No + Ethnicity.African.American + Ethnicity.Asian + 
##     Income:Student.Yes + Rating:Student.Yes
## 
##                              Df Sum of Sq     RSS    AIC
## - Ethnicity.Asian             1       502 2908823 2619.8
## - Gender.Males                1      4739 2913060 2620.2
## - Cards                       1      9985 2918306 2620.7
## - Ethnicity.African.American  1     19591 2927911 2621.7
## <none>                                    2908321 2621.8
## - Married.No                  1     22882 2931203 2622.0
## - Age                         1     32222 2940543 2622.9
## - Income:Student.Yes          1     43236 2951557 2623.9
## - Rating:Student.Yes          1    141697 3050018 2633.1
## 
## Step:  AIC=2619.82
## Balance ~ Income + Rating + Cards + Age + Gender.Males + Student.Yes + 
##     Married.No + Ethnicity.African.American + Income:Student.Yes + 
##     Rating:Student.Yes
## 
##                              Df Sum of Sq     RSS    AIC
## - Gender.Males                1      4947 2913770 2618.3
## - Cards                       1      9785 2918608 2618.8
## - Ethnicity.African.American  1     19665 2928488 2619.7
## <none>                                    2908823 2619.8
## - Married.No                  1     23493 2932316 2620.1
## - Age                         1     31983 2940806 2620.9
## - Income:Student.Yes          1     42736 2951559 2621.9
## - Rating:Student.Yes          1    141344 3050167 2631.2
## 
## Step:  AIC=2618.3
## Balance ~ Income + Rating + Cards + Age + Student.Yes + Married.No + 
##     Ethnicity.African.American + Income:Student.Yes + Rating:Student.Yes
## 
##                              Df Sum of Sq     RSS    AIC
## - Cards                       1     10369 2924139 2617.3
## - Ethnicity.African.American  1     18936 2932706 2618.1
## <none>                                    2913770 2618.3
## - Married.No                  1     22375 2936144 2618.4
## - Age                         1     33957 2947727 2619.6
## - Income:Student.Yes          1     42294 2956063 2620.3
## - Rating:Student.Yes          1    141371 3055141 2629.6
## 
## Step:  AIC=2617.29
## Balance ~ Income + Rating + Age + Student.Yes + Married.No + 
##     Ethnicity.African.American + Income:Student.Yes + Rating:Student.Yes
## 
##                              Df Sum of Sq     RSS    AIC
## - Ethnicity.African.American  1     19299 2943438 2617.1
## <none>                                    2924139 2617.3
## - Married.No                  1     24906 2949045 2617.7
## - Age                         1     32954 2957093 2618.4
## - Income:Student.Yes          1     45904 2970043 2619.7
## - Rating:Student.Yes          1    144824 3068963 2628.9
## 
## Step:  AIC=2617.14
## Balance ~ Income + Rating + Age + Student.Yes + Married.No + 
##     Income:Student.Yes + Rating:Student.Yes
## 
##                      Df Sum of Sq     RSS    AIC
## - Married.No          1     19081 2962519 2617.0
## <none>                            2943438 2617.1
## - Age                 1     34920 2978358 2618.5
## - Income:Student.Yes  1     50232 2993669 2619.9
## - Rating:Student.Yes  1    144746 3088184 2628.6
## 
## Step:  AIC=2616.96
## Balance ~ Income + Rating + Age + Student.Yes + Income:Student.Yes + 
##     Rating:Student.Yes
## 
##                      Df Sum of Sq     RSS    AIC
## <none>                            2962519 2617.0
## - Age                 1     30873 2993392 2617.9
## - Income:Student.Yes  1     46530 3009049 2619.3
## - Rating:Student.Yes  1    141946 3104465 2628.1
summary(model.backward)
## 
## Call:
## lm(formula = Balance ~ Income + Rating + Age + Student.Yes + 
##     Income:Student.Yes + Rating:Student.Yes, data = trains.bin)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -226.047  -75.284   -8.434   67.997  270.338 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -519.53416   26.39998 -19.679  < 2e-16 ***
## Income               -7.48106    0.30934 -24.184  < 2e-16 ***
## Rating                3.86492    0.06819  56.679  < 2e-16 ***
## Age                  -0.62272    0.36852  -1.690 0.092207 .  
## Student.Yes         195.14000   69.79429   2.796 0.005541 ** 
## Income:Student.Yes   -2.62991    1.26774  -2.074 0.038967 *  
## Rating:Student.Yes    0.95495    0.26356   3.623 0.000347 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 104 on 274 degrees of freedom
## Multiple R-squared:  0.9466, Adjusted R-squared:  0.9454 
## F-statistic: 809.1 on 6 and 274 DF,  p-value: < 2.2e-16

using stepAIC() function with direction = ‘forward’

model.null <- lm(Balance ~ 1, data=trains.bin)
model.forward <- stepAIC(model.null, direction = 'forward', 
                         scope = list(upper = model.full.bin, lower = model.null))
## Start:  AIC=3428.15
## Balance ~ 1
## 
##                              Df Sum of Sq      RSS    AIC
## + Rating                      1  41741772 13711196 3037.5
## + Income                      1  10817597 44635371 3369.2
## + Student.Yes                 1   2516926 52936042 3417.1
## + Cards                       1    841494 54611473 3425.8
## <none>                                    55452968 3428.1
## + Education                   1     10792 55442176 3430.1
## + Ethnicity.Asian             1      7512 55445456 3430.1
## + Ethnicity.African.American  1      5377 55447591 3430.1
## + Married.No                  1      2997 55449971 3430.1
## + Age                         1      2145 55450823 3430.1
## + Gender.Males                1      1839 55451129 3430.1
## 
## Step:  AIC=3037.5
## Balance ~ Rating
## 
##                              Df Sum of Sq      RSS    AIC
## + Income                      1   7423815  6287380 2820.4
## + Student.Yes                 1   3270058 10441138 2962.9
## + Age                         1    340310 13370885 3032.4
## + Cards                       1    302801 13408394 3033.2
## + Gender.Males                1    105665 13605530 3037.3
## <none>                                    13711196 3037.5
## + Ethnicity.Asian             1     53549 13657647 3038.4
## + Education                   1     42676 13668519 3038.6
## + Married.No                  1     15971 13695225 3039.2
## + Ethnicity.African.American  1      7749 13703447 3039.3
## 
## Step:  AIC=2820.41
## Balance ~ Rating + Income
## 
##                              Df Sum of Sq     RSS    AIC
## + Student.Yes                 1   3146343 3141037 2627.4
## + Age                         1     65582 6221798 2819.5
## + Married.No                  1     58475 6228905 2819.8
## <none>                                    6287380 2820.4
## + Cards                       1     32787 6254593 2820.9
## + Education                   1     28101 6259279 2821.2
## + Gender.Males                1     17478 6269902 2821.6
## + Ethnicity.Asian             1      8109 6279271 2822.1
## + Ethnicity.African.American  1        23 6287357 2822.4
## 
## Step:  AIC=2627.4
## Balance ~ Rating + Income + Student.Yes
## 
##                              Df Sum of Sq     RSS    AIC
## + Rating:Student.Yes          1     97940 3043098 2620.5
## + Age                         1     33213 3107824 2626.4
## <none>                                    3141037 2627.4
## + Ethnicity.African.American  1     14529 3126508 2628.1
## + Cards                       1     14116 3126922 2628.1
## + Married.No                  1     13083 3127954 2628.2
## + Gender.Males                1      6046 3134991 2628.9
## + Income:Student.Yes          1      2671 3138366 2629.2
## + Ethnicity.Asian             1      2253 3138784 2629.2
## + Education                   1        78 3140959 2629.4
## 
## Step:  AIC=2620.5
## Balance ~ Rating + Income + Student.Yes + Rating:Student.Yes
## 
##                              Df Sum of Sq     RSS    AIC
## + Income:Student.Yes          1     49706 2993392 2617.9
## + Age                         1     34048 3009049 2619.3
## <none>                                    3043098 2620.5
## + Ethnicity.African.American  1     20131 3022967 2620.6
## + Cards                       1     15298 3027800 2621.1
## + Married.No                  1     11516 3031582 2621.4
## + Gender.Males                1      5312 3037786 2622.0
## + Ethnicity.Asian             1      2138 3040960 2622.3
## + Education                   1      1421 3041677 2622.4
## 
## Step:  AIC=2617.87
## Balance ~ Rating + Income + Student.Yes + Rating:Student.Yes + 
##     Income:Student.Yes
## 
##                              Df Sum of Sq     RSS    AIC
## + Age                         1   30872.6 2962519 2617.0
## <none>                                    2993392 2617.9
## + Ethnicity.African.American  1   15567.0 2977825 2618.4
## + Married.No                  1   15033.6 2978358 2618.5
## + Cards                       1   11605.2 2981787 2618.8
## + Gender.Males                1    5613.0 2987779 2619.3
## + Education                   1    1105.5 2992286 2619.8
## + Ethnicity.Asian             1     306.9 2993085 2619.8
## 
## Step:  AIC=2616.96
## Balance ~ Rating + Income + Student.Yes + Age + Rating:Student.Yes + 
##     Income:Student.Yes
## 
##                              Df Sum of Sq     RSS    AIC
## <none>                                    2962519 2617.0
## + Married.No                  1   19081.3 2943438 2617.1
## + Ethnicity.African.American  1   13473.9 2949045 2617.7
## + Cards                       1   12927.6 2949591 2617.7
## + Gender.Males                1    3889.0 2958630 2618.6
## + Education                   1    1461.7 2961057 2618.8
## + Ethnicity.Asian             1      79.7 2962439 2618.9
summary(model.forward)
## 
## Call:
## lm(formula = Balance ~ Rating + Income + Student.Yes + Age + 
##     Rating:Student.Yes + Income:Student.Yes, data = trains.bin)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -226.047  -75.284   -8.434   67.997  270.338 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -519.53416   26.39998 -19.679  < 2e-16 ***
## Rating                3.86492    0.06819  56.679  < 2e-16 ***
## Income               -7.48106    0.30934 -24.184  < 2e-16 ***
## Student.Yes         195.14000   69.79429   2.796 0.005541 ** 
## Age                  -0.62272    0.36852  -1.690 0.092207 .  
## Rating:Student.Yes    0.95495    0.26356   3.623 0.000347 ***
## Income:Student.Yes   -2.62991    1.26774  -2.074 0.038967 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 104 on 274 degrees of freedom
## Multiple R-squared:  0.9466, Adjusted R-squared:  0.9454 
## F-statistic: 809.1 on 6 and 274 DF,  p-value: < 2.2e-16

using stepAIC() function with direction = ‘backward’ and BIC criterion

model.backward.BIC <- stepAIC(model.full.bin, k = log(nrow(trains.bin)))
## Start:  AIC=2671.05
## Balance ~ Income + Rating + Cards + Age + Education + Gender.Males + 
##     Student.Yes + Married.No + Ethnicity.African.American + Ethnicity.Asian + 
##     Income:Student.Yes + Rating:Student.Yes
## 
##                              Df Sum of Sq     RSS    AIC
## - Education                   1       169 2908321 2665.4
## - Ethnicity.Asian             1       455 2908606 2665.5
## - Gender.Males                1      4699 2912851 2665.9
## - Cards                       1      9922 2918074 2666.4
## - Ethnicity.African.American  1     19118 2927269 2667.2
## - Married.No                  1     22418 2930569 2667.6
## - Age                         1     32311 2940463 2668.5
## - Income:Student.Yes          1     43048 2951200 2669.5
## <none>                                    2908151 2671.1
## - Rating:Student.Yes          1    141853 3050005 2678.8
## 
## Step:  AIC=2665.43
## Balance ~ Income + Rating + Cards + Age + Gender.Males + Student.Yes + 
##     Married.No + Ethnicity.African.American + Ethnicity.Asian + 
##     Income:Student.Yes + Rating:Student.Yes
## 
##                              Df Sum of Sq     RSS    AIC
## - Ethnicity.Asian             1       502 2908823 2659.8
## - Gender.Males                1      4739 2913060 2660.2
## - Cards                       1      9985 2918306 2660.8
## - Ethnicity.African.American  1     19591 2927911 2661.7
## - Married.No                  1     22882 2931203 2662.0
## - Age                         1     32222 2940543 2662.9
## - Income:Student.Yes          1     43236 2951557 2663.9
## <none>                                    2908321 2665.4
## - Rating:Student.Yes          1    141697 3050018 2673.2
## 
## Step:  AIC=2659.84
## Balance ~ Income + Rating + Cards + Age + Gender.Males + Student.Yes + 
##     Married.No + Ethnicity.African.American + Income:Student.Yes + 
##     Rating:Student.Yes
## 
##                              Df Sum of Sq     RSS    AIC
## - Gender.Males                1      4947 2913770 2654.7
## - Cards                       1      9785 2918608 2655.2
## - Ethnicity.African.American  1     19665 2928488 2656.1
## - Married.No                  1     23493 2932316 2656.5
## - Age                         1     31983 2940806 2657.3
## - Income:Student.Yes          1     42736 2951559 2658.3
## <none>                                    2908823 2659.8
## - Rating:Student.Yes          1    141344 3050167 2667.5
## 
## Step:  AIC=2654.68
## Balance ~ Income + Rating + Cards + Age + Student.Yes + Married.No + 
##     Ethnicity.African.American + Income:Student.Yes + Rating:Student.Yes
## 
##                              Df Sum of Sq     RSS    AIC
## - Cards                       1     10369 2924139 2650.0
## - Ethnicity.African.American  1     18936 2932706 2650.9
## - Married.No                  1     22375 2936144 2651.2
## - Age                         1     33957 2947727 2652.3
## - Income:Student.Yes          1     42294 2956063 2653.1
## <none>                                    2913770 2654.7
## - Rating:Student.Yes          1    141371 3055141 2662.3
## 
## Step:  AIC=2650.04
## Balance ~ Income + Rating + Age + Student.Yes + Married.No + 
##     Ethnicity.African.American + Income:Student.Yes + Rating:Student.Yes
## 
##                              Df Sum of Sq     RSS    AIC
## - Ethnicity.African.American  1     19299 2943438 2646.2
## - Married.No                  1     24906 2949045 2646.8
## - Age                         1     32954 2957093 2647.6
## - Income:Student.Yes          1     45904 2970043 2648.8
## <none>                                    2924139 2650.0
## - Rating:Student.Yes          1    144824 3068963 2658.0
## 
## Step:  AIC=2646.25
## Balance ~ Income + Rating + Age + Student.Yes + Married.No + 
##     Income:Student.Yes + Rating:Student.Yes
## 
##                      Df Sum of Sq     RSS    AIC
## - Married.No          1     19081 2962519 2642.4
## - Age                 1     34920 2978358 2643.9
## - Income:Student.Yes  1     50232 2993669 2645.4
## <none>                            2943438 2646.2
## - Rating:Student.Yes  1    144746 3088184 2654.1
## 
## Step:  AIC=2642.43
## Balance ~ Income + Rating + Age + Student.Yes + Income:Student.Yes + 
##     Rating:Student.Yes
## 
##                      Df Sum of Sq     RSS    AIC
## - Age                 1     30873 2993392 2639.7
## - Income:Student.Yes  1     46530 3009049 2641.2
## <none>                            2962519 2642.4
## - Rating:Student.Yes  1    141946 3104465 2649.9
## 
## Step:  AIC=2639.7
## Balance ~ Income + Rating + Student.Yes + Income:Student.Yes + 
##     Rating:Student.Yes
## 
##                      Df Sum of Sq     RSS    AIC
## - Income:Student.Yes  1     49706 3043098 2638.7
## <none>                            2993392 2639.7
## - Rating:Student.Yes  1    144974 3138366 2647.3
## 
## Step:  AIC=2638.69
## Balance ~ Income + Rating + Student.Yes + Rating:Student.Yes
## 
##                      Df Sum of Sq      RSS    AIC
## <none>                             3043098 2638.7
## - Rating:Student.Yes  1     97940  3141037 2641.9
## - Income              1   7197076 10240174 2974.0
summary(model.backward.BIC)
## 
## Call:
## lm(formula = Balance ~ Income + Rating + Student.Yes + Rating:Student.Yes, 
##     data = trains.bin)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -226.919  -81.823   -9.319   67.552  280.877 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -555.91531   17.40197 -31.946  < 2e-16 ***
## Income               -7.69738    0.30128 -25.549  < 2e-16 ***
## Rating                3.89654    0.06757  57.663  < 2e-16 ***
## Student.Yes         216.70361   69.85880   3.102  0.00212 ** 
## Rating:Student.Yes    0.57813    0.19398   2.980  0.00314 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 105 on 276 degrees of freedom
## Multiple R-squared:  0.9451, Adjusted R-squared:  0.9443 
## F-statistic:  1188 on 4 and 276 DF,  p-value: < 2.2e-16

using stepAIC() function with direction = ‘forward’ and AIC criterion

model.forward.BIC <- stepAIC(model.null, direction = 'forward', 
                         scope = list(upper = model.full.bin, lower = model.null),
                         k = log(nrow(trains.bin)))
## Start:  AIC=3431.78
## Balance ~ 1
## 
##                              Df Sum of Sq      RSS    AIC
## + Rating                      1  41741772 13711196 3044.8
## + Income                      1  10817597 44635371 3376.4
## + Student.Yes                 1   2516926 52936042 3424.4
## <none>                                    55452968 3431.8
## + Cards                       1    841494 54611473 3433.1
## + Education                   1     10792 55442176 3437.4
## + Ethnicity.Asian             1      7512 55445456 3437.4
## + Ethnicity.African.American  1      5377 55447591 3437.4
## + Married.No                  1      2997 55449971 3437.4
## + Age                         1      2145 55450823 3437.4
## + Gender.Males                1      1839 55451129 3437.4
## 
## Step:  AIC=3044.78
## Balance ~ Rating
## 
##                              Df Sum of Sq      RSS    AIC
## + Income                      1   7423815  6287380 2831.3
## + Student.Yes                 1   3270058 10441138 2973.8
## + Age                         1    340310 13370885 3043.3
## + Cards                       1    302801 13408394 3044.1
## <none>                                    13711196 3044.8
## + Gender.Males                1    105665 13605530 3048.2
## + Ethnicity.Asian             1     53549 13657647 3049.3
## + Education                   1     42676 13668519 3049.5
## + Married.No                  1     15971 13695225 3050.1
## + Ethnicity.African.American  1      7749 13703447 3050.2
## 
## Step:  AIC=2831.33
## Balance ~ Rating + Income
## 
##                              Df Sum of Sq     RSS    AIC
## + Student.Yes                 1   3146343 3141037 2641.9
## <none>                                    6287380 2831.3
## + Age                         1     65582 6221798 2834.0
## + Married.No                  1     58475 6228905 2834.3
## + Cards                       1     32787 6254593 2835.5
## + Education                   1     28101 6259279 2835.7
## + Gender.Males                1     17478 6269902 2836.2
## + Ethnicity.Asian             1      8109 6279271 2836.6
## + Ethnicity.African.American  1        23 6287357 2837.0
## 
## Step:  AIC=2641.95
## Balance ~ Rating + Income + Student.Yes
## 
##                              Df Sum of Sq     RSS    AIC
## + Rating:Student.Yes          1     97940 3043098 2638.7
## <none>                                    3141037 2641.9
## + Age                         1     33213 3107824 2644.6
## + Ethnicity.African.American  1     14529 3126508 2646.3
## + Cards                       1     14116 3126922 2646.3
## + Married.No                  1     13083 3127954 2646.4
## + Gender.Males                1      6046 3134991 2647.1
## + Income:Student.Yes          1      2671 3138366 2647.3
## + Ethnicity.Asian             1      2253 3138784 2647.4
## + Education                   1        78 3140959 2647.6
## 
## Step:  AIC=2638.69
## Balance ~ Rating + Income + Student.Yes + Rating:Student.Yes
## 
##                              Df Sum of Sq     RSS    AIC
## <none>                                    3043098 2638.7
## + Income:Student.Yes          1     49706 2993392 2639.7
## + Age                         1     34048 3009049 2641.2
## + Ethnicity.African.American  1     20131 3022967 2642.5
## + Cards                       1     15298 3027800 2642.9
## + Married.No                  1     11516 3031582 2643.3
## + Gender.Males                1      5312 3037786 2643.8
## + Ethnicity.Asian             1      2138 3040960 2644.1
## + Education                   1      1421 3041677 2644.2
summary(model.forward.BIC)
## 
## Call:
## lm(formula = Balance ~ Rating + Income + Student.Yes + Rating:Student.Yes, 
##     data = trains.bin)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -226.919  -81.823   -9.319   67.552  280.877 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -555.91531   17.40197 -31.946  < 2e-16 ***
## Rating                3.89654    0.06757  57.663  < 2e-16 ***
## Income               -7.69738    0.30128 -25.549  < 2e-16 ***
## Student.Yes         216.70361   69.85880   3.102  0.00212 ** 
## Rating:Student.Yes    0.57813    0.19398   2.980  0.00314 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 105 on 276 degrees of freedom
## Multiple R-squared:  0.9451, Adjusted R-squared:  0.9443 
## F-statistic:  1188 on 4 and 276 DF,  p-value: < 2.2e-16

model validatioin with RMSE

RMSE(predict(model.null, newdata=tests.bin), tests.bin$Balance)
## [1] 492.8913
RMSE(predict(model.full.bin, newdata=tests.bin), tests.bin$Balance)
## [1] 90.10518
RMSE(predict(model.backward.BIC, newdata=tests.bin), tests.bin$Balance)
## [1] 95.2667
RMSE(predict(model.forward.BIC, newdata=tests.bin), tests.bin$Balance)
## [1] 95.2667

model assumption diagnostic with plot

plot(model.backward.BIC)

Fitting model with regularization - glmnet() function

change the data to matrix type

X.train <- model.matrix(Balance ~ . + Rating:Student + Income:Student,
                        data = trains)
head(X.train)
##   (Intercept)  Income Rating Cards Age Education GenderMales StudentYes
## 1           1  14.891    283     2  34        11           1          0
## 3           1 104.593    514     4  71        11           1          0
## 4           1 148.924    681     3  36        11           0          0
## 6           1  80.180    569     4  77        10           1          0
## 8           1  71.408    512     2  87         9           1          0
## 9           1  15.125    266     5  66        13           0          0
##   MarriedNo EthnicityAfrican American EthnicityAsian Rating:StudentYes
## 1         0                         0              0                 0
## 3         1                         0              1                 0
## 4         1                         0              1                 0
## 6         1                         0              0                 0
## 8         1                         0              1                 0
## 9         1                         0              0                 0
##   Income:StudentYes
## 1                 0
## 3                 0
## 4                 0
## 6                 0
## 8                 0
## 9                 0

model fitting

# install.packages('glmnet')
library(glmnet)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## Loaded glmnet 4.1-6
lasso <- glmnet(x = X.train, y = trains$Balance,
                lambda = 10^(0:3), family = "gaussian", alpha = 1)

summary(lasso)
##           Length Class     Mode   
## a0         4     -none-    numeric
## beta      52     dgCMatrix S4     
## df         4     -none-    numeric
## dim        2     -none-    numeric
## lambda     4     -none-    numeric
## dev.ratio  4     -none-    numeric
## nulldev    1     -none-    numeric
## npasses    1     -none-    numeric
## jerr       1     -none-    numeric
## offset     1     -none-    logical
## call       6     -none-    call   
## nobs       1     -none-    numeric

get the coefficient estimates

lasso$a0
##        s0        s1        s2        s3 
##  512.4093 -168.7323 -497.5085 -532.5532
lasso$beta
## 13 x 4 sparse Matrix of class "dgCMatrix"
##                           s0        s1          s2           s3
## (Intercept)                . .           .           .         
## Income                     . .          -6.3144833  -7.38152149
## Rating                     . 1.9075860   3.5890555   3.83905399
## Cards                      . .           1.6482442   4.41507745
## Age                        . .          -0.2351995  -0.60617563
## Education                  . .           .          -0.01266776
## GenderMales                . .           .           5.74946970
## StudentYes                 . .         158.6984028 196.47780589
## MarriedNo                  . .           .          16.01901966
## EthnicityAfrican American  . .           .         -17.60124298
## EthnicityAsian             . .           .           .         
## Rating:StudentYes          . 0.1628325   0.6326980   0.82441046
## Income:StudentYes          . .           .          -1.66140890
coef(lasso)
## 14 x 4 sparse Matrix of class "dgCMatrix"
##                                 s0           s1           s2            s3
## (Intercept)               512.4093 -168.7323079 -497.5085492 -532.55320909
## (Intercept)                 .         .            .            .         
## Income                      .         .           -6.3144833   -7.38152149
## Rating                      .         1.9075860    3.5890555    3.83905399
## Cards                       .         .            1.6482442    4.41507745
## Age                         .         .           -0.2351995   -0.60617563
## Education                   .         .            .           -0.01266776
## GenderMales                 .         .            .            5.74946970
## StudentYes                  .         .          158.6984028  196.47780589
## MarriedNo                   .         .            .           16.01901966
## EthnicityAfrican American   .         .            .          -17.60124298
## EthnicityAsian              .         .            .            .         
## Rating:StudentYes           .         0.1628325    0.6326980    0.82441046
## Income:StudentYes           .         .            .           -1.66140890

hyperparameter tuning on lambda

set.seed(1234)

cv.model <- cv.glmnet(x = X.train, y = trains$Balance, family="gaussian", alpha=1)
plot(cv.model)

print(cv.model$lambda.min)
## [1] 0.4751616
print(cv.model$lambda.1se)
## [1] 7.743951

fitting the lasso with the optimal lambda

lasso.best <- glmnet(x = X.train, y = trains$Balance, lambda=cv.model$lambda.min,
                     alpha=1, family="gaussian")
coef(lasso.best)
## 14 x 1 sparse Matrix of class "dgCMatrix"
##                                     s0
## (Intercept)               -532.2864406
## (Intercept)                  .        
## Income                      -7.4216677
## Rating                       3.8488378
## Cards                        4.4386545
## Age                         -0.6240161
## Education                   -0.1500531
## GenderMales                  7.1516664
## StudentYes                 190.9096211
## MarriedNo                   17.6798239
## EthnicityAfrican American  -19.0066617
## EthnicityAsian              -1.0573350
## Rating:StudentYes            0.9110834
## Income:StudentYes           -2.1852890

make prediction best on the best lasso model

x.test <- model.matrix(Balance ~ . + Rating:Student + Income:Student, data = tests)

y.pred <- predict(lasso.best, newx=x.test)

RMSE(y.pred, tests$Balance)
## [1] 90.15426