credit <- read.csv('Credit.csv')
str(credit)
## 'data.frame': 400 obs. of 12 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Income : num 14.9 106 104.6 148.9 55.9 ...
## $ Limit : int 3606 6645 7075 9504 4897 8047 3388 7114 3300 6819 ...
## $ Rating : int 283 483 514 681 357 569 259 512 266 491 ...
## $ Cards : int 2 3 4 3 2 4 2 2 5 3 ...
## $ Age : int 34 82 71 36 68 77 37 87 66 41 ...
## $ Education: int 11 15 11 11 16 10 12 9 13 19 ...
## $ Gender : chr "Males" "Female" "Males" "Female" ...
## $ Student : chr "No" "Yes" "No" "No" ...
## $ Married : chr "Yes" "Yes" "No" "No" ...
## $ Ethnicity: chr "Caucasian" "Asian" "Asian" "Asian" ...
## $ Balance : int 333 903 580 964 331 1151 203 872 279 1350 ...
credit$X <- NULL
summary(credit)
## Income Limit Rating Cards
## Min. : 10.35 Min. : 855 Min. : 93.0 Min. :1.000
## 1st Qu.: 21.01 1st Qu.: 3088 1st Qu.:247.2 1st Qu.:2.000
## Median : 33.12 Median : 4622 Median :344.0 Median :3.000
## Mean : 45.22 Mean : 4736 Mean :354.9 Mean :2.958
## 3rd Qu.: 57.47 3rd Qu.: 5873 3rd Qu.:437.2 3rd Qu.:4.000
## Max. :186.63 Max. :13913 Max. :982.0 Max. :9.000
## Age Education Gender Student
## Min. :23.00 Min. : 5.00 Length:400 Length:400
## 1st Qu.:41.75 1st Qu.:11.00 Class :character Class :character
## Median :56.00 Median :14.00 Mode :character Mode :character
## Mean :55.67 Mean :13.45
## 3rd Qu.:70.00 3rd Qu.:16.00
## Max. :98.00 Max. :20.00
## Married Ethnicity Balance
## Length:400 Length:400 Min. : 0.00
## Class :character Class :character 1st Qu.: 68.75
## Mode :character Mode :character Median : 459.50
## Mean : 520.01
## 3rd Qu.: 863.00
## Max. :1999.00
distribution of Balance
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 1.0.1
## ✔ tibble 3.1.8 ✔ dplyr 1.1.0
## ✔ tidyr 1.3.0 ✔ stringr 1.5.0
## ✔ readr 2.1.4 ✔ forcats 1.0.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
credit %>% ggplot(aes(x=Balance)) +
geom_histogram(bins = 15) +
labs(x = 'Balance')

calculate the number of 0 Balance
nrow(credit[credit$Balance == 0, ])
## [1] 90
relationship between Balance and numeric predictors
vars.numeric <- colnames(credit)[1:6]
numeric_df <- cbind(credit[, 1:6], credit$Balance)
for (col_i in 1:6) {
plot <- numeric_df %>% ggplot(
aes(x = numeric_df[, col_i], y = credit$Balance)
) +
geom_point() +
geom_smooth(method='lm', se = FALSE) +
labs(x = vars.numeric[col_i])
print(plot)
}
## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

explore the relationship between Rating and Limit
numeric_df %>% ggplot(aes(x = Limit, y = Rating)) +
geom_point() +
geom_smooth(method='lm', se=F)
## `geom_smooth()` using formula = 'y ~ x'

credit$Limit <- NULL
get the interaction relationship
interaction between Student and Income
credit %>% ggplot(aes(x = Income, y = Balance)) +
geom_point(aes(color = Student)) +
geom_smooth(aes(color = Student), method='lm', se = F)
## `geom_smooth()` using formula = 'y ~ x'

interaction between Student and Rating
credit %>% ggplot(aes(x=Rating, y=Balance)) +
geom_point(aes(color=Student)) +
geom_smooth(aes(color=Student), method='lm', se=F)
## `geom_smooth()` using formula = 'y ~ x'

change the categorical variables to factor and set the base level
with the most common level
for (var in vars.category) {
var_table <- as.data.frame(table(credit[, var]))
most_common <- which.max(var_table$Freq)
base_level <- as.character(var_table[most_common, 'Var1'])
credit[[var]] <- relevel(factor(credit[, var]), ref=base_level)
}
str(credit)
## 'data.frame': 400 obs. of 10 variables:
## $ Income : num 14.9 106 104.6 148.9 55.9 ...
## $ Rating : int 283 483 514 681 357 569 259 512 266 491 ...
## $ Cards : int 2 3 4 3 2 4 2 2 5 3 ...
## $ Age : int 34 82 71 36 68 77 37 87 66 41 ...
## $ Education: int 11 15 11 11 16 10 12 9 13 19 ...
## $ Gender : Factor w/ 2 levels "Female","Males": 2 1 2 1 2 2 1 2 1 1 ...
## $ Student : Factor w/ 2 levels "No","Yes": 1 2 1 1 1 1 1 1 1 2 ...
## $ Married : Factor w/ 2 levels "Yes","No": 1 1 2 2 1 2 2 2 2 1 ...
## $ Ethnicity: Factor w/ 3 levels "Caucasian","African American",..: 1 3 3 3 1 1 2 3 1 2 ...
## $ Balance : int 333 903 580 964 331 1151 203 872 279 1350 ...
training and test dataset splitting
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
set.seed(123)
partition <- createDataPartition(credit$Balance, p=0.7, list=F)
trains <- credit[partition, ]
tests <- credit[-partition,]
summary(trains$Balance)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 70.0 463.0 512.4 863.0 1779.0
summary(tests$Balance)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 60.5 453.0 538.0 853.5 1999.0
model fitting with full variables
model.full <- lm(Balance ~ . + Income:Student + Rating:Student, data=trains)
summary(model.full)
##
## Call:
## lm(formula = Balance ~ . + Income:Student + Rating:Student, data = trains)
##
## Residuals:
## Min 1Q Median 3Q Max
## -217.28 -73.14 -13.93 71.91 282.53
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -532.27287 41.58440 -12.800 < 2e-16 ***
## Income -7.46589 0.31320 -23.838 < 2e-16 ***
## Rating 3.85969 0.06905 55.899 < 2e-16 ***
## Cards 4.51095 4.71744 0.956 0.339819
## Age -0.64226 0.37220 -1.726 0.085574 .
## Education -0.25118 2.01099 -0.125 0.900693
## GenderMales 8.34677 12.68353 0.658 0.511052
## StudentYes 191.55757 70.17185 2.730 0.006756 **
## MarriedNo 19.06950 13.26737 1.437 0.151793
## EthnicityAfrican American -20.67684 15.57792 -1.327 0.185533
## EthnicityAsian -3.14081 15.34472 -0.205 0.837975
## Income:StudentYes -2.56262 1.28661 -1.992 0.047412 *
## Rating:StudentYes 0.96083 0.26575 3.616 0.000358 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 104.2 on 268 degrees of freedom
## Multiple R-squared: 0.9476, Adjusted R-squared: 0.9452
## F-statistic: 403.5 on 12 and 268 DF, p-value: < 2.2e-16
Dummy variables
binarizer <- dummyVars(paste('~ ', paste(vars.category, collapse = '+')), data=credit, fullRank = TRUE)
binarized_vars <- data.frame(predict(binarizer, newdata=credit))
binarized_vars %>% head()
## Gender.Males Student.Yes Married.No Ethnicity.African.American
## 1 1 0 0 0
## 2 0 1 0 0
## 3 1 0 1 0
## 4 0 0 1 0
## 5 1 0 0 0
## 6 1 0 1 0
## Ethnicity.Asian
## 1 0
## 2 1
## 3 1
## 4 1
## 5 0
## 6 0
combine the original credit data and binarized data
credit.bin <- cbind(credit, binarized_vars)
credit.bin$Gender <- NULL
credit.bin$Student <- NULL
credit.bin$Married <- NULL
credit.bin$Ethnicity <- NULL
head(credit.bin)
## Income Rating Cards Age Education Balance Gender.Males Student.Yes
## 1 14.891 283 2 34 11 333 1 0
## 2 106.025 483 3 82 15 903 0 1
## 3 104.593 514 4 71 11 580 1 0
## 4 148.924 681 3 36 11 964 0 0
## 5 55.882 357 2 68 16 331 1 0
## 6 80.180 569 4 77 10 1151 1 0
## Married.No Ethnicity.African.American Ethnicity.Asian
## 1 0 0 0
## 2 0 0 1
## 3 1 0 1
## 4 1 0 1
## 5 0 0 0
## 6 1 0 0
trains.bin <- credit.bin[partition, ]
tests.bin <- credit.bin[-partition, ]
model.full.bin <- lm(Balance ~.+Income:Student.Yes+Rating:Student.Yes, data=trains.bin)
summary(model.full.bin)
##
## Call:
## lm(formula = Balance ~ . + Income:Student.Yes + Rating:Student.Yes,
## data = trains.bin)
##
## Residuals:
## Min 1Q Median 3Q Max
## -217.28 -73.14 -13.93 71.91 282.53
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -532.27287 41.58440 -12.800 < 2e-16 ***
## Income -7.46589 0.31320 -23.838 < 2e-16 ***
## Rating 3.85969 0.06905 55.899 < 2e-16 ***
## Cards 4.51095 4.71744 0.956 0.339819
## Age -0.64226 0.37220 -1.726 0.085574 .
## Education -0.25118 2.01099 -0.125 0.900693
## Gender.Males 8.34677 12.68353 0.658 0.511052
## Student.Yes 191.55757 70.17185 2.730 0.006756 **
## Married.No 19.06950 13.26737 1.437 0.151793
## Ethnicity.African.American -20.67684 15.57792 -1.327 0.185533
## Ethnicity.Asian -3.14081 15.34472 -0.205 0.837975
## Income:Student.Yes -2.56262 1.28661 -1.992 0.047412 *
## Rating:Student.Yes 0.96083 0.26575 3.616 0.000358 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 104.2 on 268 degrees of freedom
## Multiple R-squared: 0.9476, Adjusted R-squared: 0.9452
## F-statistic: 403.5 on 12 and 268 DF, p-value: < 2.2e-16
feature selections
using drop1() function
drop1(model.full)
## Single term deletions
##
## Model:
## Balance ~ Income + Rating + Cards + Age + Education + Gender +
## Student + Married + Ethnicity + Income:Student + Rating:Student
## Df Sum of Sq RSS AIC
## <none> 2908151 2623.8
## Cards 1 9922 2918074 2622.7
## Age 1 32311 2940463 2624.9
## Education 1 169 2908321 2621.8
## Gender 1 4699 2912851 2622.2
## Married 1 22418 2930569 2623.9
## Ethnicity 2 19759 2927910 2621.7
## Income:Student 1 43048 2951200 2625.9
## Rating:Student 1 141853 3050005 2635.1
drop1(model.full.bin)
## Single term deletions
##
## Model:
## Balance ~ Income + Rating + Cards + Age + Education + Gender.Males +
## Student.Yes + Married.No + Ethnicity.African.American + Ethnicity.Asian +
## Income:Student.Yes + Rating:Student.Yes
## Df Sum of Sq RSS AIC
## <none> 2908151 2623.8
## Cards 1 9922 2918074 2622.7
## Age 1 32311 2940463 2624.9
## Education 1 169 2908321 2621.8
## Gender.Males 1 4699 2912851 2622.2
## Married.No 1 22418 2930569 2623.9
## Ethnicity.African.American 1 19118 2927269 2623.6
## Ethnicity.Asian 1 455 2908606 2621.8
## Income:Student.Yes 1 43048 2951200 2625.9
## Rating:Student.Yes 1 141853 3050005 2635.1
using stepAIC() function with direction = ‘backward’
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
model.backward <- stepAIC(model.full.bin)
## Start: AIC=2623.75
## Balance ~ Income + Rating + Cards + Age + Education + Gender.Males +
## Student.Yes + Married.No + Ethnicity.African.American + Ethnicity.Asian +
## Income:Student.Yes + Rating:Student.Yes
##
## Df Sum of Sq RSS AIC
## - Education 1 169 2908321 2621.8
## - Ethnicity.Asian 1 455 2908606 2621.8
## - Gender.Males 1 4699 2912851 2622.2
## - Cards 1 9922 2918074 2622.7
## - Ethnicity.African.American 1 19118 2927269 2623.6
## <none> 2908151 2623.8
## - Married.No 1 22418 2930569 2623.9
## - Age 1 32311 2940463 2624.9
## - Income:Student.Yes 1 43048 2951200 2625.9
## - Rating:Student.Yes 1 141853 3050005 2635.1
##
## Step: AIC=2621.77
## Balance ~ Income + Rating + Cards + Age + Gender.Males + Student.Yes +
## Married.No + Ethnicity.African.American + Ethnicity.Asian +
## Income:Student.Yes + Rating:Student.Yes
##
## Df Sum of Sq RSS AIC
## - Ethnicity.Asian 1 502 2908823 2619.8
## - Gender.Males 1 4739 2913060 2620.2
## - Cards 1 9985 2918306 2620.7
## - Ethnicity.African.American 1 19591 2927911 2621.7
## <none> 2908321 2621.8
## - Married.No 1 22882 2931203 2622.0
## - Age 1 32222 2940543 2622.9
## - Income:Student.Yes 1 43236 2951557 2623.9
## - Rating:Student.Yes 1 141697 3050018 2633.1
##
## Step: AIC=2619.82
## Balance ~ Income + Rating + Cards + Age + Gender.Males + Student.Yes +
## Married.No + Ethnicity.African.American + Income:Student.Yes +
## Rating:Student.Yes
##
## Df Sum of Sq RSS AIC
## - Gender.Males 1 4947 2913770 2618.3
## - Cards 1 9785 2918608 2618.8
## - Ethnicity.African.American 1 19665 2928488 2619.7
## <none> 2908823 2619.8
## - Married.No 1 23493 2932316 2620.1
## - Age 1 31983 2940806 2620.9
## - Income:Student.Yes 1 42736 2951559 2621.9
## - Rating:Student.Yes 1 141344 3050167 2631.2
##
## Step: AIC=2618.3
## Balance ~ Income + Rating + Cards + Age + Student.Yes + Married.No +
## Ethnicity.African.American + Income:Student.Yes + Rating:Student.Yes
##
## Df Sum of Sq RSS AIC
## - Cards 1 10369 2924139 2617.3
## - Ethnicity.African.American 1 18936 2932706 2618.1
## <none> 2913770 2618.3
## - Married.No 1 22375 2936144 2618.4
## - Age 1 33957 2947727 2619.6
## - Income:Student.Yes 1 42294 2956063 2620.3
## - Rating:Student.Yes 1 141371 3055141 2629.6
##
## Step: AIC=2617.29
## Balance ~ Income + Rating + Age + Student.Yes + Married.No +
## Ethnicity.African.American + Income:Student.Yes + Rating:Student.Yes
##
## Df Sum of Sq RSS AIC
## - Ethnicity.African.American 1 19299 2943438 2617.1
## <none> 2924139 2617.3
## - Married.No 1 24906 2949045 2617.7
## - Age 1 32954 2957093 2618.4
## - Income:Student.Yes 1 45904 2970043 2619.7
## - Rating:Student.Yes 1 144824 3068963 2628.9
##
## Step: AIC=2617.14
## Balance ~ Income + Rating + Age + Student.Yes + Married.No +
## Income:Student.Yes + Rating:Student.Yes
##
## Df Sum of Sq RSS AIC
## - Married.No 1 19081 2962519 2617.0
## <none> 2943438 2617.1
## - Age 1 34920 2978358 2618.5
## - Income:Student.Yes 1 50232 2993669 2619.9
## - Rating:Student.Yes 1 144746 3088184 2628.6
##
## Step: AIC=2616.96
## Balance ~ Income + Rating + Age + Student.Yes + Income:Student.Yes +
## Rating:Student.Yes
##
## Df Sum of Sq RSS AIC
## <none> 2962519 2617.0
## - Age 1 30873 2993392 2617.9
## - Income:Student.Yes 1 46530 3009049 2619.3
## - Rating:Student.Yes 1 141946 3104465 2628.1
summary(model.backward)
##
## Call:
## lm(formula = Balance ~ Income + Rating + Age + Student.Yes +
## Income:Student.Yes + Rating:Student.Yes, data = trains.bin)
##
## Residuals:
## Min 1Q Median 3Q Max
## -226.047 -75.284 -8.434 67.997 270.338
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -519.53416 26.39998 -19.679 < 2e-16 ***
## Income -7.48106 0.30934 -24.184 < 2e-16 ***
## Rating 3.86492 0.06819 56.679 < 2e-16 ***
## Age -0.62272 0.36852 -1.690 0.092207 .
## Student.Yes 195.14000 69.79429 2.796 0.005541 **
## Income:Student.Yes -2.62991 1.26774 -2.074 0.038967 *
## Rating:Student.Yes 0.95495 0.26356 3.623 0.000347 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 104 on 274 degrees of freedom
## Multiple R-squared: 0.9466, Adjusted R-squared: 0.9454
## F-statistic: 809.1 on 6 and 274 DF, p-value: < 2.2e-16
using stepAIC() function with direction = ‘forward’
model.null <- lm(Balance ~ 1, data=trains.bin)
model.forward <- stepAIC(model.null, direction = 'forward',
scope = list(upper = model.full.bin, lower = model.null))
## Start: AIC=3428.15
## Balance ~ 1
##
## Df Sum of Sq RSS AIC
## + Rating 1 41741772 13711196 3037.5
## + Income 1 10817597 44635371 3369.2
## + Student.Yes 1 2516926 52936042 3417.1
## + Cards 1 841494 54611473 3425.8
## <none> 55452968 3428.1
## + Education 1 10792 55442176 3430.1
## + Ethnicity.Asian 1 7512 55445456 3430.1
## + Ethnicity.African.American 1 5377 55447591 3430.1
## + Married.No 1 2997 55449971 3430.1
## + Age 1 2145 55450823 3430.1
## + Gender.Males 1 1839 55451129 3430.1
##
## Step: AIC=3037.5
## Balance ~ Rating
##
## Df Sum of Sq RSS AIC
## + Income 1 7423815 6287380 2820.4
## + Student.Yes 1 3270058 10441138 2962.9
## + Age 1 340310 13370885 3032.4
## + Cards 1 302801 13408394 3033.2
## + Gender.Males 1 105665 13605530 3037.3
## <none> 13711196 3037.5
## + Ethnicity.Asian 1 53549 13657647 3038.4
## + Education 1 42676 13668519 3038.6
## + Married.No 1 15971 13695225 3039.2
## + Ethnicity.African.American 1 7749 13703447 3039.3
##
## Step: AIC=2820.41
## Balance ~ Rating + Income
##
## Df Sum of Sq RSS AIC
## + Student.Yes 1 3146343 3141037 2627.4
## + Age 1 65582 6221798 2819.5
## + Married.No 1 58475 6228905 2819.8
## <none> 6287380 2820.4
## + Cards 1 32787 6254593 2820.9
## + Education 1 28101 6259279 2821.2
## + Gender.Males 1 17478 6269902 2821.6
## + Ethnicity.Asian 1 8109 6279271 2822.1
## + Ethnicity.African.American 1 23 6287357 2822.4
##
## Step: AIC=2627.4
## Balance ~ Rating + Income + Student.Yes
##
## Df Sum of Sq RSS AIC
## + Rating:Student.Yes 1 97940 3043098 2620.5
## + Age 1 33213 3107824 2626.4
## <none> 3141037 2627.4
## + Ethnicity.African.American 1 14529 3126508 2628.1
## + Cards 1 14116 3126922 2628.1
## + Married.No 1 13083 3127954 2628.2
## + Gender.Males 1 6046 3134991 2628.9
## + Income:Student.Yes 1 2671 3138366 2629.2
## + Ethnicity.Asian 1 2253 3138784 2629.2
## + Education 1 78 3140959 2629.4
##
## Step: AIC=2620.5
## Balance ~ Rating + Income + Student.Yes + Rating:Student.Yes
##
## Df Sum of Sq RSS AIC
## + Income:Student.Yes 1 49706 2993392 2617.9
## + Age 1 34048 3009049 2619.3
## <none> 3043098 2620.5
## + Ethnicity.African.American 1 20131 3022967 2620.6
## + Cards 1 15298 3027800 2621.1
## + Married.No 1 11516 3031582 2621.4
## + Gender.Males 1 5312 3037786 2622.0
## + Ethnicity.Asian 1 2138 3040960 2622.3
## + Education 1 1421 3041677 2622.4
##
## Step: AIC=2617.87
## Balance ~ Rating + Income + Student.Yes + Rating:Student.Yes +
## Income:Student.Yes
##
## Df Sum of Sq RSS AIC
## + Age 1 30872.6 2962519 2617.0
## <none> 2993392 2617.9
## + Ethnicity.African.American 1 15567.0 2977825 2618.4
## + Married.No 1 15033.6 2978358 2618.5
## + Cards 1 11605.2 2981787 2618.8
## + Gender.Males 1 5613.0 2987779 2619.3
## + Education 1 1105.5 2992286 2619.8
## + Ethnicity.Asian 1 306.9 2993085 2619.8
##
## Step: AIC=2616.96
## Balance ~ Rating + Income + Student.Yes + Age + Rating:Student.Yes +
## Income:Student.Yes
##
## Df Sum of Sq RSS AIC
## <none> 2962519 2617.0
## + Married.No 1 19081.3 2943438 2617.1
## + Ethnicity.African.American 1 13473.9 2949045 2617.7
## + Cards 1 12927.6 2949591 2617.7
## + Gender.Males 1 3889.0 2958630 2618.6
## + Education 1 1461.7 2961057 2618.8
## + Ethnicity.Asian 1 79.7 2962439 2618.9
summary(model.forward)
##
## Call:
## lm(formula = Balance ~ Rating + Income + Student.Yes + Age +
## Rating:Student.Yes + Income:Student.Yes, data = trains.bin)
##
## Residuals:
## Min 1Q Median 3Q Max
## -226.047 -75.284 -8.434 67.997 270.338
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -519.53416 26.39998 -19.679 < 2e-16 ***
## Rating 3.86492 0.06819 56.679 < 2e-16 ***
## Income -7.48106 0.30934 -24.184 < 2e-16 ***
## Student.Yes 195.14000 69.79429 2.796 0.005541 **
## Age -0.62272 0.36852 -1.690 0.092207 .
## Rating:Student.Yes 0.95495 0.26356 3.623 0.000347 ***
## Income:Student.Yes -2.62991 1.26774 -2.074 0.038967 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 104 on 274 degrees of freedom
## Multiple R-squared: 0.9466, Adjusted R-squared: 0.9454
## F-statistic: 809.1 on 6 and 274 DF, p-value: < 2.2e-16
using stepAIC() function with direction = ‘backward’ and BIC
criterion
model.backward.BIC <- stepAIC(model.full.bin, k = log(nrow(trains.bin)))
## Start: AIC=2671.05
## Balance ~ Income + Rating + Cards + Age + Education + Gender.Males +
## Student.Yes + Married.No + Ethnicity.African.American + Ethnicity.Asian +
## Income:Student.Yes + Rating:Student.Yes
##
## Df Sum of Sq RSS AIC
## - Education 1 169 2908321 2665.4
## - Ethnicity.Asian 1 455 2908606 2665.5
## - Gender.Males 1 4699 2912851 2665.9
## - Cards 1 9922 2918074 2666.4
## - Ethnicity.African.American 1 19118 2927269 2667.2
## - Married.No 1 22418 2930569 2667.6
## - Age 1 32311 2940463 2668.5
## - Income:Student.Yes 1 43048 2951200 2669.5
## <none> 2908151 2671.1
## - Rating:Student.Yes 1 141853 3050005 2678.8
##
## Step: AIC=2665.43
## Balance ~ Income + Rating + Cards + Age + Gender.Males + Student.Yes +
## Married.No + Ethnicity.African.American + Ethnicity.Asian +
## Income:Student.Yes + Rating:Student.Yes
##
## Df Sum of Sq RSS AIC
## - Ethnicity.Asian 1 502 2908823 2659.8
## - Gender.Males 1 4739 2913060 2660.2
## - Cards 1 9985 2918306 2660.8
## - Ethnicity.African.American 1 19591 2927911 2661.7
## - Married.No 1 22882 2931203 2662.0
## - Age 1 32222 2940543 2662.9
## - Income:Student.Yes 1 43236 2951557 2663.9
## <none> 2908321 2665.4
## - Rating:Student.Yes 1 141697 3050018 2673.2
##
## Step: AIC=2659.84
## Balance ~ Income + Rating + Cards + Age + Gender.Males + Student.Yes +
## Married.No + Ethnicity.African.American + Income:Student.Yes +
## Rating:Student.Yes
##
## Df Sum of Sq RSS AIC
## - Gender.Males 1 4947 2913770 2654.7
## - Cards 1 9785 2918608 2655.2
## - Ethnicity.African.American 1 19665 2928488 2656.1
## - Married.No 1 23493 2932316 2656.5
## - Age 1 31983 2940806 2657.3
## - Income:Student.Yes 1 42736 2951559 2658.3
## <none> 2908823 2659.8
## - Rating:Student.Yes 1 141344 3050167 2667.5
##
## Step: AIC=2654.68
## Balance ~ Income + Rating + Cards + Age + Student.Yes + Married.No +
## Ethnicity.African.American + Income:Student.Yes + Rating:Student.Yes
##
## Df Sum of Sq RSS AIC
## - Cards 1 10369 2924139 2650.0
## - Ethnicity.African.American 1 18936 2932706 2650.9
## - Married.No 1 22375 2936144 2651.2
## - Age 1 33957 2947727 2652.3
## - Income:Student.Yes 1 42294 2956063 2653.1
## <none> 2913770 2654.7
## - Rating:Student.Yes 1 141371 3055141 2662.3
##
## Step: AIC=2650.04
## Balance ~ Income + Rating + Age + Student.Yes + Married.No +
## Ethnicity.African.American + Income:Student.Yes + Rating:Student.Yes
##
## Df Sum of Sq RSS AIC
## - Ethnicity.African.American 1 19299 2943438 2646.2
## - Married.No 1 24906 2949045 2646.8
## - Age 1 32954 2957093 2647.6
## - Income:Student.Yes 1 45904 2970043 2648.8
## <none> 2924139 2650.0
## - Rating:Student.Yes 1 144824 3068963 2658.0
##
## Step: AIC=2646.25
## Balance ~ Income + Rating + Age + Student.Yes + Married.No +
## Income:Student.Yes + Rating:Student.Yes
##
## Df Sum of Sq RSS AIC
## - Married.No 1 19081 2962519 2642.4
## - Age 1 34920 2978358 2643.9
## - Income:Student.Yes 1 50232 2993669 2645.4
## <none> 2943438 2646.2
## - Rating:Student.Yes 1 144746 3088184 2654.1
##
## Step: AIC=2642.43
## Balance ~ Income + Rating + Age + Student.Yes + Income:Student.Yes +
## Rating:Student.Yes
##
## Df Sum of Sq RSS AIC
## - Age 1 30873 2993392 2639.7
## - Income:Student.Yes 1 46530 3009049 2641.2
## <none> 2962519 2642.4
## - Rating:Student.Yes 1 141946 3104465 2649.9
##
## Step: AIC=2639.7
## Balance ~ Income + Rating + Student.Yes + Income:Student.Yes +
## Rating:Student.Yes
##
## Df Sum of Sq RSS AIC
## - Income:Student.Yes 1 49706 3043098 2638.7
## <none> 2993392 2639.7
## - Rating:Student.Yes 1 144974 3138366 2647.3
##
## Step: AIC=2638.69
## Balance ~ Income + Rating + Student.Yes + Rating:Student.Yes
##
## Df Sum of Sq RSS AIC
## <none> 3043098 2638.7
## - Rating:Student.Yes 1 97940 3141037 2641.9
## - Income 1 7197076 10240174 2974.0
summary(model.backward.BIC)
##
## Call:
## lm(formula = Balance ~ Income + Rating + Student.Yes + Rating:Student.Yes,
## data = trains.bin)
##
## Residuals:
## Min 1Q Median 3Q Max
## -226.919 -81.823 -9.319 67.552 280.877
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -555.91531 17.40197 -31.946 < 2e-16 ***
## Income -7.69738 0.30128 -25.549 < 2e-16 ***
## Rating 3.89654 0.06757 57.663 < 2e-16 ***
## Student.Yes 216.70361 69.85880 3.102 0.00212 **
## Rating:Student.Yes 0.57813 0.19398 2.980 0.00314 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 105 on 276 degrees of freedom
## Multiple R-squared: 0.9451, Adjusted R-squared: 0.9443
## F-statistic: 1188 on 4 and 276 DF, p-value: < 2.2e-16
using stepAIC() function with direction = ‘forward’ and AIC
criterion
model.forward.BIC <- stepAIC(model.null, direction = 'forward',
scope = list(upper = model.full.bin, lower = model.null),
k = log(nrow(trains.bin)))
## Start: AIC=3431.78
## Balance ~ 1
##
## Df Sum of Sq RSS AIC
## + Rating 1 41741772 13711196 3044.8
## + Income 1 10817597 44635371 3376.4
## + Student.Yes 1 2516926 52936042 3424.4
## <none> 55452968 3431.8
## + Cards 1 841494 54611473 3433.1
## + Education 1 10792 55442176 3437.4
## + Ethnicity.Asian 1 7512 55445456 3437.4
## + Ethnicity.African.American 1 5377 55447591 3437.4
## + Married.No 1 2997 55449971 3437.4
## + Age 1 2145 55450823 3437.4
## + Gender.Males 1 1839 55451129 3437.4
##
## Step: AIC=3044.78
## Balance ~ Rating
##
## Df Sum of Sq RSS AIC
## + Income 1 7423815 6287380 2831.3
## + Student.Yes 1 3270058 10441138 2973.8
## + Age 1 340310 13370885 3043.3
## + Cards 1 302801 13408394 3044.1
## <none> 13711196 3044.8
## + Gender.Males 1 105665 13605530 3048.2
## + Ethnicity.Asian 1 53549 13657647 3049.3
## + Education 1 42676 13668519 3049.5
## + Married.No 1 15971 13695225 3050.1
## + Ethnicity.African.American 1 7749 13703447 3050.2
##
## Step: AIC=2831.33
## Balance ~ Rating + Income
##
## Df Sum of Sq RSS AIC
## + Student.Yes 1 3146343 3141037 2641.9
## <none> 6287380 2831.3
## + Age 1 65582 6221798 2834.0
## + Married.No 1 58475 6228905 2834.3
## + Cards 1 32787 6254593 2835.5
## + Education 1 28101 6259279 2835.7
## + Gender.Males 1 17478 6269902 2836.2
## + Ethnicity.Asian 1 8109 6279271 2836.6
## + Ethnicity.African.American 1 23 6287357 2837.0
##
## Step: AIC=2641.95
## Balance ~ Rating + Income + Student.Yes
##
## Df Sum of Sq RSS AIC
## + Rating:Student.Yes 1 97940 3043098 2638.7
## <none> 3141037 2641.9
## + Age 1 33213 3107824 2644.6
## + Ethnicity.African.American 1 14529 3126508 2646.3
## + Cards 1 14116 3126922 2646.3
## + Married.No 1 13083 3127954 2646.4
## + Gender.Males 1 6046 3134991 2647.1
## + Income:Student.Yes 1 2671 3138366 2647.3
## + Ethnicity.Asian 1 2253 3138784 2647.4
## + Education 1 78 3140959 2647.6
##
## Step: AIC=2638.69
## Balance ~ Rating + Income + Student.Yes + Rating:Student.Yes
##
## Df Sum of Sq RSS AIC
## <none> 3043098 2638.7
## + Income:Student.Yes 1 49706 2993392 2639.7
## + Age 1 34048 3009049 2641.2
## + Ethnicity.African.American 1 20131 3022967 2642.5
## + Cards 1 15298 3027800 2642.9
## + Married.No 1 11516 3031582 2643.3
## + Gender.Males 1 5312 3037786 2643.8
## + Ethnicity.Asian 1 2138 3040960 2644.1
## + Education 1 1421 3041677 2644.2
summary(model.forward.BIC)
##
## Call:
## lm(formula = Balance ~ Rating + Income + Student.Yes + Rating:Student.Yes,
## data = trains.bin)
##
## Residuals:
## Min 1Q Median 3Q Max
## -226.919 -81.823 -9.319 67.552 280.877
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -555.91531 17.40197 -31.946 < 2e-16 ***
## Rating 3.89654 0.06757 57.663 < 2e-16 ***
## Income -7.69738 0.30128 -25.549 < 2e-16 ***
## Student.Yes 216.70361 69.85880 3.102 0.00212 **
## Rating:Student.Yes 0.57813 0.19398 2.980 0.00314 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 105 on 276 degrees of freedom
## Multiple R-squared: 0.9451, Adjusted R-squared: 0.9443
## F-statistic: 1188 on 4 and 276 DF, p-value: < 2.2e-16
model validatioin with RMSE
RMSE(predict(model.null, newdata=tests.bin), tests.bin$Balance)
## [1] 492.8913
RMSE(predict(model.full.bin, newdata=tests.bin), tests.bin$Balance)
## [1] 90.10518
RMSE(predict(model.backward.BIC, newdata=tests.bin), tests.bin$Balance)
## [1] 95.2667
RMSE(predict(model.forward.BIC, newdata=tests.bin), tests.bin$Balance)
## [1] 95.2667
Fitting model with regularization - glmnet() function
change the data to matrix type
X.train <- model.matrix(Balance ~ . + Rating:Student + Income:Student,
data = trains)
head(X.train)
## (Intercept) Income Rating Cards Age Education GenderMales StudentYes
## 1 1 14.891 283 2 34 11 1 0
## 3 1 104.593 514 4 71 11 1 0
## 4 1 148.924 681 3 36 11 0 0
## 6 1 80.180 569 4 77 10 1 0
## 8 1 71.408 512 2 87 9 1 0
## 9 1 15.125 266 5 66 13 0 0
## MarriedNo EthnicityAfrican American EthnicityAsian Rating:StudentYes
## 1 0 0 0 0
## 3 1 0 1 0
## 4 1 0 1 0
## 6 1 0 0 0
## 8 1 0 1 0
## 9 1 0 0 0
## Income:StudentYes
## 1 0
## 3 0
## 4 0
## 6 0
## 8 0
## 9 0
model fitting
# install.packages('glmnet')
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loaded glmnet 4.1-6
lasso <- glmnet(x = X.train, y = trains$Balance,
lambda = 10^(0:3), family = "gaussian", alpha = 1)
summary(lasso)
## Length Class Mode
## a0 4 -none- numeric
## beta 52 dgCMatrix S4
## df 4 -none- numeric
## dim 2 -none- numeric
## lambda 4 -none- numeric
## dev.ratio 4 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## call 6 -none- call
## nobs 1 -none- numeric
get the coefficient estimates
lasso$a0
## s0 s1 s2 s3
## 512.4093 -168.7323 -497.5085 -532.5532
lasso$beta
## 13 x 4 sparse Matrix of class "dgCMatrix"
## s0 s1 s2 s3
## (Intercept) . . . .
## Income . . -6.3144833 -7.38152149
## Rating . 1.9075860 3.5890555 3.83905399
## Cards . . 1.6482442 4.41507745
## Age . . -0.2351995 -0.60617563
## Education . . . -0.01266776
## GenderMales . . . 5.74946970
## StudentYes . . 158.6984028 196.47780589
## MarriedNo . . . 16.01901966
## EthnicityAfrican American . . . -17.60124298
## EthnicityAsian . . . .
## Rating:StudentYes . 0.1628325 0.6326980 0.82441046
## Income:StudentYes . . . -1.66140890
coef(lasso)
## 14 x 4 sparse Matrix of class "dgCMatrix"
## s0 s1 s2 s3
## (Intercept) 512.4093 -168.7323079 -497.5085492 -532.55320909
## (Intercept) . . . .
## Income . . -6.3144833 -7.38152149
## Rating . 1.9075860 3.5890555 3.83905399
## Cards . . 1.6482442 4.41507745
## Age . . -0.2351995 -0.60617563
## Education . . . -0.01266776
## GenderMales . . . 5.74946970
## StudentYes . . 158.6984028 196.47780589
## MarriedNo . . . 16.01901966
## EthnicityAfrican American . . . -17.60124298
## EthnicityAsian . . . .
## Rating:StudentYes . 0.1628325 0.6326980 0.82441046
## Income:StudentYes . . . -1.66140890
hyperparameter tuning on lambda
set.seed(1234)
cv.model <- cv.glmnet(x = X.train, y = trains$Balance, family="gaussian", alpha=1)
plot(cv.model)

print(cv.model$lambda.min)
## [1] 0.4751616
print(cv.model$lambda.1se)
## [1] 7.743951
fitting the lasso with the optimal lambda
lasso.best <- glmnet(x = X.train, y = trains$Balance, lambda=cv.model$lambda.min,
alpha=1, family="gaussian")
coef(lasso.best)
## 14 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) -532.2864406
## (Intercept) .
## Income -7.4216677
## Rating 3.8488378
## Cards 4.4386545
## Age -0.6240161
## Education -0.1500531
## GenderMales 7.1516664
## StudentYes 190.9096211
## MarriedNo 17.6798239
## EthnicityAfrican American -19.0066617
## EthnicityAsian -1.0573350
## Rating:StudentYes 0.9110834
## Income:StudentYes -2.1852890
make prediction best on the best lasso model
x.test <- model.matrix(Balance ~ . + Rating:Student + Income:Student, data = tests)
y.pred <- predict(lasso.best, newx=x.test)
RMSE(y.pred, tests$Balance)
## [1] 90.15426