Here we try to figure out the important predictors to predict the Balance
# CHUNK 1
# Uncomment the next line the first time you use ISLR
# install.packages("ISLR")
library(ISLR)
## Warning: package 'ISLR' was built under R version 4.3.3
data(Credit)
# Delete the first column containing row indices
Credit$ID <- NULL
summary(Credit)
## Income Limit Rating Cards
## Min. : 10.35 Min. : 855 Min. : 93.0 Min. :1.000
## 1st Qu.: 21.01 1st Qu.: 3088 1st Qu.:247.2 1st Qu.:2.000
## Median : 33.12 Median : 4622 Median :344.0 Median :3.000
## Mean : 45.22 Mean : 4736 Mean :354.9 Mean :2.958
## 3rd Qu.: 57.47 3rd Qu.: 5873 3rd Qu.:437.2 3rd Qu.:4.000
## Max. :186.63 Max. :13913 Max. :982.0 Max. :9.000
## Age Education Gender Student Married
## Min. :23.00 Min. : 5.00 Male :193 No :360 No :155
## 1st Qu.:41.75 1st Qu.:11.00 Female:207 Yes: 40 Yes:245
## Median :56.00 Median :14.00
## Mean :55.67 Mean :13.45
## 3rd Qu.:70.00 3rd Qu.:16.00
## Max. :98.00 Max. :20.00
## Ethnicity Balance
## African American: 99 Min. : 0.00
## Asian :102 1st Qu.: 68.75
## Caucasian :199 Median : 459.50
## Mean : 520.01
## 3rd Qu.: 863.00
## Max. :1999.00
# CHUNK 2
library(ggplot2)
ggplot(Credit, aes(x = Balance)) +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
nrow(Credit[Credit$Balance == 0, ]) # OR sum(Credit$Balance == 0)
## [1] 90
# CHUNK 3
# Calculate the correlation matrix for numeric variables
# The numeric predictors are in the first 6 columns
cor.matrix <- cor(Credit[, c(1:6, 11)])
print("Correlation Matrix")
## [1] "Correlation Matrix"
round(cor.matrix, digits = 4)
## Income Limit Rating Cards Age Education Balance
## Income 1.0000 0.7921 0.7914 -0.0183 0.1753 -0.0277 0.4637
## Limit 0.7921 1.0000 0.9969 0.0102 0.1009 -0.0235 0.8617
## Rating 0.7914 0.9969 1.0000 0.0532 0.1032 -0.0301 0.8636
## Cards -0.0183 0.0102 0.0532 1.0000 0.0429 -0.0511 0.0865
## Age 0.1753 0.1009 0.1032 0.0429 1.0000 0.0036 0.0018
## Education -0.0277 -0.0235 -0.0301 -0.0511 0.0036 1.0000 -0.0081
## Balance 0.4637 0.8617 0.8636 0.0865 0.0018 -0.0081 1.0000
### Notice the correlation coefficient between the limit and rating
# CHUNK 4
ggplot(Credit, aes(x = Limit, y = Rating)) +
geom_point()
# Delete Limit
Credit$Limit <- NULL
Association of the predictor variables and the target variable…
# CHUNK 5
# first save the names of the numeric predictors as a vector
vars.numeric <- colnames(Credit[, 1:5])
for (i in vars.numeric) {
plot <- ggplot(Credit, aes(x = Credit[, i], y = Balance)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
labs(x = i)
print(plot)
}
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
# CHUNK 6
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.3.3
## Warning: package 'tibble' was built under R version 4.3.2
## Warning: package 'tidyr' was built under R version 4.3.3
## Warning: package 'readr' was built under R version 4.3.3
## Warning: package 'purrr' was built under R version 4.3.3
## Warning: package 'dplyr' was built under R version 4.3.3
## Warning: package 'stringr' was built under R version 4.3.3
## Warning: package 'lubridate' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.1 ✔ stringr 1.5.1
## ✔ lubridate 1.9.4 ✔ tibble 3.2.1
## ✔ purrr 1.0.4 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Save the names of the categorical predictors as a vector
vars.categorical <- c("Gender", "Student", "Married", "Ethnicity")
for (i in vars.categorical) {
x <- Credit %>%
group_by_(i) %>%
summarize(
mean = mean(Balance),
median = median(Balance),
n = n()
)
print(x)
}
## Warning: `group_by_()` was deprecated in dplyr 0.7.0.
## ℹ Please use `group_by()` instead.
## ℹ See vignette('programming') for more help
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## # A tibble: 2 × 4
## Gender mean median n
## <fct> <dbl> <int> <int>
## 1 " Male" 510. 463 193
## 2 "Female" 530. 456 207
## Warning: `group_by_()` was deprecated in dplyr 0.7.0.
## ℹ Please use `group_by()` instead.
## ℹ See vignette('programming') for more help
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## # A tibble: 2 × 4
## Student mean median n
## <fct> <dbl> <dbl> <int>
## 1 No 480. 424 360
## 2 Yes 877. 953 40
## Warning: `group_by_()` was deprecated in dplyr 0.7.0.
## ℹ Please use `group_by()` instead.
## ℹ See vignette('programming') for more help
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## # A tibble: 2 × 4
## Married mean median n
## <fct> <dbl> <int> <int>
## 1 No 523. 467 155
## 2 Yes 518. 454 245
## Warning: `group_by_()` was deprecated in dplyr 0.7.0.
## ℹ Please use `group_by()` instead.
## ℹ See vignette('programming') for more help
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## # A tibble: 3 × 4
## Ethnicity mean median n
## <fct> <dbl> <dbl> <int>
## 1 African American 531 480 99
## 2 Asian 512. 414 102
## 3 Caucasian 518. 465 199
# CHUNK 7
for (i in vars.categorical) {
plot <- ggplot(Credit, aes(x = Credit[, i], y = Balance)) +
geom_boxplot() +
labs(x = i)
print(plot)
}
Here we try to see if we should consider any interaction terms between numerical and cat variables. Think about the difference between the “correlation” and the “interaction”.
# CHUNK 8
ggplot(Credit, aes(x = Student, y = Age)) +
geom_boxplot()
ggplot(Credit, aes(x = Income, y = Balance, color = Student)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula = 'y ~ x'
Here, we start partitioning the dataset and fit the lm (ols).
# CHUNK 9
library(caret)
## Warning: package 'caret' was built under R version 4.3.3
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
set.seed(8964)
partition <- createDataPartition(Credit$Balance, p = 0.75, list = FALSE)
data.train <- Credit[partition, ]
data.test <- Credit[-partition, ]
print("TRAIN")
## [1] "TRAIN"
mean(data.train$Balance)
## [1] 520.4352
print("TEST")
## [1] "TEST"
mean(data.test$Balance)
## [1] 518.7374
In R, the binarization is done automatically, but using the alphanumeric order of the levels.
# CHUNK 10
model.full <- lm(Balance ~ . + Income:Student, data = data.train)
summary(model.full)
##
## Call:
## lm(formula = Balance ~ . + Income:Student, data = data.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -189.17 -76.46 -15.65 66.69 296.17
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -552.34353 40.34543 -13.690 <2e-16 ***
## Income -7.93864 0.29465 -26.943 <2e-16 ***
## Rating 4.01246 0.06416 62.540 <2e-16 ***
## Cards 2.89772 4.69168 0.618 0.5373
## Age -0.78778 0.35994 -2.189 0.0294 *
## Education 0.88922 1.92035 0.463 0.6437
## GenderFemale -18.37030 12.00867 -1.530 0.1272
## StudentYes 395.67744 30.72302 12.879 <2e-16 ***
## MarriedYes -20.46469 12.39735 -1.651 0.0999 .
## EthnicityAsian 6.53269 17.13794 0.381 0.7033
## EthnicityCaucasian 11.95386 14.82444 0.806 0.4207
## Income:StudentYes 0.50102 0.50028 1.001 0.3174
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 103.1 on 289 degrees of freedom
## Multiple R-squared: 0.952, Adjusted R-squared: 0.9502
## F-statistic: 521.4 on 11 and 289 DF, p-value: < 2.2e-16
Do the binarization manually.
# CHUNK 11
for (i in vars.categorical){
# Use the table() function to calculate the frequencies for each factor
table <- as.data.frame(table(Credit[, i]))
# Determine the level with the highest frequency
max <- which.max(table[, 2])
# Save the name of the level with the highest frequency
level.name <- as.character(table[max, 1])
# Set the baseline level to the most populous level
Credit[, i] <- relevel(Credit[, i], ref = level.name) ## ref is the baseline
}
summary(Credit[, vars.categorical])
## Gender Student Married Ethnicity
## Female:207 No :360 Yes:245 Caucasian :199
## Male :193 Yes: 40 No :155 African American: 99
## Asian :102
fitting the model by changing the baseline level.
# CHUNK 12
# To make sure factors in the training set are releveled
data.train <- Credit[partition, ]
model.full <- lm(Balance ~ . + Income:Student, data = data.train)
summary(model.full)
##
## Call:
## lm(formula = Balance ~ . + Income:Student, data = data.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -189.17 -76.46 -15.65 66.69 296.17
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -579.22466 40.08600 -14.450 <2e-16 ***
## Income -7.93864 0.29465 -26.943 <2e-16 ***
## Rating 4.01246 0.06416 62.540 <2e-16 ***
## Cards 2.89772 4.69168 0.618 0.5373
## Age -0.78778 0.35994 -2.189 0.0294 *
## Education 0.88922 1.92035 0.463 0.6437
## Gender Male 18.37030 12.00867 1.530 0.1272
## StudentYes 395.67744 30.72302 12.879 <2e-16 ***
## MarriedNo 20.46469 12.39735 1.651 0.0999 .
## EthnicityAfrican American -11.95386 14.82444 -0.806 0.4207
## EthnicityAsian -5.42117 14.68523 -0.369 0.7123
## Income:StudentYes 0.50102 0.50028 1.001 0.3174
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 103.1 on 289 degrees of freedom
## Multiple R-squared: 0.952, Adjusted R-squared: 0.9502
## F-statistic: 521.4 on 11 and 289 DF, p-value: < 2.2e-16
Create new columns with the dummy variables based on the manual binarization.
# CHUNK 13
library(caret)
binarizer <- dummyVars(paste("~", paste(vars.categorical, collapse = "+")),
data = Credit, fullRank = TRUE)
# OR type out the categorical predictors one by one
#binarizer <- dummyVars(~ Gender + Student + Married + Ethnicity,
# data = Credit, fullRank = TRUE)
binarized_vars <- data.frame(predict(binarizer, newdata = Credit))
head(binarized_vars)
## Gender..Male Student.Yes Married.No Ethnicity.African.American
## 1 1 0 0 0
## 2 0 1 0 0
## 3 1 0 1 0
## 4 0 0 1 0
## 5 1 0 0 0
## 6 1 0 1 0
## Ethnicity.Asian
## 1 0
## 2 1
## 3 1
## 4 1
## 5 0
## 6 0
Here, we include the dummy variables in our CREDIT dataframe and delete the original categorical variables. Later, you will see the reason for manual binarization (feature selection part)
# CHUNK 14
Credit.bin <- cbind(Credit, binarized_vars)
head(Credit.bin)
## Income Rating Cards Age Education Gender Student Married Ethnicity Balance
## 1 14.891 283 2 34 11 Male No Yes Caucasian 333
## 2 106.025 483 3 82 15 Female Yes Yes Asian 903
## 3 104.593 514 4 71 11 Male No No Asian 580
## 4 148.924 681 3 36 11 Female No No Asian 964
## 5 55.882 357 2 68 16 Male No Yes Caucasian 331
## 6 80.180 569 4 77 10 Male No No Caucasian 1151
## Gender..Male Student.Yes Married.No Ethnicity.African.American
## 1 1 0 0 0
## 2 0 1 0 0
## 3 1 0 1 0
## 4 0 0 1 0
## 5 1 0 0 0
## 6 1 0 1 0
## Ethnicity.Asian
## 1 0
## 2 1
## 3 1
## 4 1
## 5 0
## 6 0
Credit.bin$Gender <- NULL
Credit.bin$Student <- NULL
Credit.bin$Married <- NULL
Credit.bin$Ethnicity <- NULL
head(Credit.bin)
## Income Rating Cards Age Education Balance Gender..Male Student.Yes
## 1 14.891 283 2 34 11 333 1 0
## 2 106.025 483 3 82 15 903 0 1
## 3 104.593 514 4 71 11 580 1 0
## 4 148.924 681 3 36 11 964 0 0
## 5 55.882 357 2 68 16 331 1 0
## 6 80.180 569 4 77 10 1151 1 0
## Married.No Ethnicity.African.American Ethnicity.Asian
## 1 0 0 0
## 2 0 0 1
## 3 1 0 1
## 4 1 0 1
## 5 0 0 0
## 6 1 0 0
data.train.bin <- Credit.bin[partition, ]
data.test.bin <- Credit.bin[-partition, ]
Fit the linear model to the new training data set (with the manually created dummy variables)
# CHUNK 15
# The interaction term is now Income:Student.Yes, not Income:Student
model.full.bin <- lm(Balance ~ . + Income:Student.Yes, data = data.train.bin)
summary(model.full.bin)
##
## Call:
## lm(formula = Balance ~ . + Income:Student.Yes, data = data.train.bin)
##
## Residuals:
## Min 1Q Median 3Q Max
## -189.17 -76.46 -15.65 66.69 296.17
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -579.22466 40.08600 -14.450 <2e-16 ***
## Income -7.93864 0.29465 -26.943 <2e-16 ***
## Rating 4.01246 0.06416 62.540 <2e-16 ***
## Cards 2.89772 4.69168 0.618 0.5373
## Age -0.78778 0.35994 -2.189 0.0294 *
## Education 0.88922 1.92035 0.463 0.6437
## Gender..Male 18.37030 12.00867 1.530 0.1272
## Student.Yes 395.67744 30.72302 12.879 <2e-16 ***
## Married.No 20.46469 12.39735 1.651 0.0999 .
## Ethnicity.African.American -11.95386 14.82444 -0.806 0.4207
## Ethnicity.Asian -5.42117 14.68523 -0.369 0.7123
## Income:Student.Yes 0.50102 0.50028 1.001 0.3174
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 103.1 on 289 degrees of freedom
## Multiple R-squared: 0.952, Adjusted R-squared: 0.9502
## F-statistic: 521.4 on 11 and 289 DF, p-value: < 2.2e-16
Selecting the best predictor substet using backward and forward feature selection method.
# CHUNK 16
# install.packages("MASS") # uncomment this line the first time you use MASS
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
model.backward.AIC <- stepAIC(model.full.bin)
## Start: AIC=2802.25
## Balance ~ Income + Rating + Cards + Age + Education + Gender..Male +
## Student.Yes + Married.No + Ethnicity.African.American + Ethnicity.Asian +
## Income:Student.Yes
##
## Df Sum of Sq RSS AIC
## - Ethnicity.Asian 1 1448 3071419 2800.4
## - Education 1 2278 3072249 2800.5
## - Cards 1 4052 3074024 2800.6
## - Ethnicity.African.American 1 6907 3076879 2800.9
## - Income:Student.Yes 1 10654 3080626 2801.3
## <none> 3069972 2802.3
## - Gender..Male 1 24859 3094830 2802.7
## - Married.No 1 28946 3098918 2803.1
## - Age 1 50885 3120857 2805.2
## - Rating 1 41548705 44618677 3605.9
##
## Step: AIC=2800.39
## Balance ~ Income + Rating + Cards + Age + Education + Gender..Male +
## Student.Yes + Married.No + Ethnicity.African.American + Income:Student.Yes
##
## Df Sum of Sq RSS AIC
## - Education 1 2255 3073675 2798.6
## - Cards 1 3911 3075331 2798.8
## - Ethnicity.African.American 1 5595 3077014 2798.9
## - Income:Student.Yes 1 10176 3081595 2799.4
## <none> 3071419 2800.4
## - Gender..Male 1 25165 3096584 2800.8
## - Married.No 1 29351 3100771 2801.3
## - Age 1 51462 3122881 2803.4
## - Rating 1 41550276 44621695 3603.9
##
## Step: AIC=2798.61
## Balance ~ Income + Rating + Cards + Age + Gender..Male + Student.Yes +
## Married.No + Ethnicity.African.American + Income:Student.Yes
##
## Df Sum of Sq RSS AIC
## - Cards 1 3865 3077540 2797.0
## - Ethnicity.African.American 1 5586 3079261 2797.2
## - Income:Student.Yes 1 10214 3083888 2797.6
## <none> 3073675 2798.6
## - Gender..Male 1 25155 3098830 2799.1
## - Married.No 1 28499 3102174 2799.4
## - Age 1 51652 3125327 2801.6
## - Rating 1 41550265 44623939 3601.9
##
## Step: AIC=2796.99
## Balance ~ Income + Rating + Age + Gender..Male + Student.Yes +
## Married.No + Ethnicity.African.American + Income:Student.Yes
##
## Df Sum of Sq RSS AIC
## - Ethnicity.African.American 1 5579 3083119 2795.5
## - Income:Student.Yes 1 10767 3088307 2796.0
## <none> 3077540 2797.0
## - Gender..Male 1 23659 3101198 2797.3
## - Married.No 1 27917 3105457 2797.7
## - Age 1 49660 3127200 2799.8
## - Rating 1 42161655 45239195 3604.0
##
## Step: AIC=2795.54
## Balance ~ Income + Rating + Age + Gender..Male + Student.Yes +
## Married.No + Income:Student.Yes
##
## Df Sum of Sq RSS AIC
## - Income:Student.Yes 1 9873 3092992 2794.5
## <none> 3083119 2795.5
## - Gender..Male 1 23435 3106554 2795.8
## - Married.No 1 24716 3107835 2795.9
## - Age 1 50672 3133791 2798.4
## - Rating 1 42165674 45248793 3602.1
##
## Step: AIC=2794.5
## Balance ~ Income + Rating + Age + Gender..Male + Student.Yes +
## Married.No
##
## Df Sum of Sq RSS AIC
## <none> 3092992 2794.5
## - Gender..Male 1 22314 3115306 2794.7
## - Married.No 1 25027 3118019 2794.9
## - Age 1 49133 3142125 2797.2
## - Student.Yes 1 4931600 8024592 3079.5
## - Income 1 8357794 11450786 3186.5
## - Rating 1 42247226 45340218 3600.7
summary(model.backward.AIC)
##
## Call:
## lm(formula = Balance ~ Income + Rating + Age + Gender..Male +
## Student.Yes + Married.No, data = data.train.bin)
##
## Residuals:
## Min 1Q Median 3Q Max
## -195.16 -76.45 -14.03 65.33 285.30
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -565.22267 26.73779 -21.139 <2e-16 ***
## Income -7.87666 0.27946 -28.186 <2e-16 ***
## Rating 4.01180 0.06331 63.370 <2e-16 ***
## Age -0.77016 0.35638 -2.161 0.0315 *
## Gender..Male 17.32103 11.89326 1.456 0.1464
## Student.Yes 418.87031 19.34645 21.651 <2e-16 ***
## Married.No 18.73773 12.14860 1.542 0.1241
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 102.6 on 294 degrees of freedom
## Multiple R-squared: 0.9517, Adjusted R-squared: 0.9507
## F-statistic: 964.8 on 6 and 294 DF, p-value: < 2.2e-16
What if we use the model without the bins (manual binarization). You can see that the levels of the categorical variables will not stay in the model if we do not use the manual binarization.
** Just dfor the demonstration Only one step is included.
# CHUNK 17
model.no.binarize <- stepAIC(model.full, steps = 1)
## Start: AIC=2802.25
## Balance ~ Income + Rating + Cards + Age + Education + Gender +
## Student + Married + Ethnicity + Income:Student
##
## Df Sum of Sq RSS AIC
## - Ethnicity 2 7042 3077014 2798.9
## - Education 1 2278 3072249 2800.5
## - Cards 1 4052 3074024 2800.6
## - Income:Student 1 10654 3080626 2801.3
## <none> 3069972 2802.3
## - Gender 1 24859 3094830 2802.7
## - Married 1 28946 3098918 2803.1
## - Age 1 50885 3120857 2805.2
## - Rating 1 41548705 44618677 3605.9
##
## Step: AIC=2798.94
## Balance ~ Income + Rating + Cards + Age + Education + Gender +
## Student + Married + Income:Student
Here, we use the forward selection, with BIC.
# CHUNK 18
# first fit the null model (i.e., model with no predictors)
model.null <- lm(Balance ~ 1, data = data.train.bin)
model.forward.BIC <- stepAIC(model.null,
direction = "forward",
scope = list(upper = model.full.bin,
lower = model.null),
k = log(nrow(data.train.bin))) ## k=2 is AIC (default) k = log(nrow(data.train.bin)) generates the BIC
## Start: AIC=3698.12
## Balance ~ 1
##
## Df Sum of Sq RSS AIC
## + Rating 1 47082983 16908423 3303.2
## + Income 1 12910007 51081399 3636.0
## + Student.Yes 1 5425335 58566071 3677.2
## <none> 63991406 3698.1
## + Cards 1 163291 63828115 3703.1
## + Gender..Male 1 127201 63864205 3703.2
## + Age 1 79880 63911526 3703.5
## + Education 1 13121 63978284 3703.8
## + Married.No 1 10920 63980486 3703.8
## + Ethnicity.Asian 1 3049 63988357 3703.8
## + Ethnicity.African.American 1 0 63991406 3703.8
##
## Step: AIC=3303.21
## Balance ~ Rating
##
## Df Sum of Sq RSS AIC
## + Income 1 8650956 8257467 3093.2
## + Student.Yes 1 4817765 12090659 3208.0
## + Age 1 707560 16200863 3296.1
## <none> 16908423 3303.2
## + Married.No 1 174547 16733876 3305.8
## + Education 1 71402 16837022 3307.6
## + Ethnicity.Asian 1 49620 16858804 3308.0
## + Gender..Male 1 31887 16876536 3308.4
## + Cards 1 30049 16878374 3308.4
## + Ethnicity.African.American 1 4304 16904120 3308.8
##
## Step: AIC=3093.2
## Balance ~ Rating + Income
##
## Df Sum of Sq RSS AIC
## + Student.Yes 1 5069114 3188353 2812.5
## <none> 8257467 3093.2
## + Married.No 1 127904 8129563 3094.2
## + Age 1 95208 8162259 3095.4
## + Education 1 44496 8212971 3097.3
## + Ethnicity.Asian 1 40115 8217352 3097.4
## + Cards 1 22019 8235448 3098.1
## + Ethnicity.African.American 1 9464 8248003 3098.6
## + Gender..Male 1 244 8257223 3098.9
##
## Step: AIC=2812.47
## Balance ~ Rating + Income + Student.Yes
##
## Df Sum of Sq RSS AIC
## <none> 3188353 2812.5
## + Age 1 46257 3142096 2813.8
## + Gender..Male 1 24128 3164225 2815.9
## + Married.No 1 23759 3164595 2815.9
## + Income:Student.Yes 1 7569 3180784 2817.5
## + Ethnicity.African.American 1 2346 3186007 2817.9
## + Education 1 1600 3186753 2818.0
## + Cards 1 865 3187488 2818.1
## + Ethnicity.Asian 1 747 3187606 2818.1
summary(model.forward.BIC)
##
## Call:
## lm(formula = Balance ~ Rating + Income + Student.Yes, data = data.train.bin)
##
## Residuals:
## Min 1Q Median 3Q Max
## -220.292 -78.120 -7.473 63.273 299.895
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -589.16636 16.11012 -36.57 <2e-16 ***
## Rating 4.01634 0.06353 63.22 <2e-16 ***
## Income -7.97413 0.27691 -28.80 <2e-16 ***
## Student.Yes 421.17339 19.38206 21.73 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 103.6 on 297 degrees of freedom
## Multiple R-squared: 0.9502, Adjusted R-squared: 0.9497
## F-statistic: 1888 on 3 and 297 DF, p-value: < 2.2e-16
RMSE for the test datasets. This explain the prediction accuracy.
# CHUNK 19
RMSE(data.test$Balance, predict(model.null, newdata = data.test.bin)) # none.
## [1] 453.3665
RMSE(data.test$Balance, predict(model.full.bin, newdata = data.test.bin)) # 11 predictors
## [1] 105.0656
RMSE(data.test$Balance, predict(model.backward.AIC, newdata = data.test.bin)) # 7 predictors
## [1] 104.2375
RMSE(data.test$Balance, predict(model.forward.BIC, newdata = data.test.bin)) ## 5 predictors
## [1] 102.6785
We choose the model with the smallest RMSE, and the one we choose using forward selection is the best. Now, let do the model diagnostics for the best model.
# CHUNK 20
plot(model.forward.BIC)
# CHUNK 21
X.train <- model.matrix(Balance ~ . + Income:Student, data = data.train)
head(X.train) # print out a few rows of the design matrix
## (Intercept) Income Rating Cards Age Education Gender Male StudentYes
## 1 1 14.891 283 2 34 11 1 0
## 2 1 106.025 483 3 82 15 0 1
## 3 1 104.593 514 4 71 11 1 0
## 6 1 80.180 569 4 77 10 1 0
## 8 1 71.408 512 2 87 9 1 0
## 9 1 15.125 266 5 66 13 0 0
## MarriedNo EthnicityAfrican American EthnicityAsian Income:StudentYes
## 1 0 0 0 0.000
## 2 0 0 1 106.025
## 3 1 0 1 0.000
## 6 1 0 0 0.000
## 8 1 0 1 0.000
## 9 1 0 0 0.000
# CHUNK 22
# install.packages("glmnet") # uncomment this line the first time you use glmnet
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loaded glmnet 4.1-10
m.lambda <- glmnet(x = X.train,
y = data.train$Balance,
family = "gaussian",
lambda = c(0, 10, 100, 500, 1000),
alpha = 0.5)
# first way to extract coefficient estimates
m.lambda$a0
## s0 s1 s2 s3 s4
## 520.4352 301.2973 -224.9669 -506.0179 -579.1925
m.lambda$beta
## 12 x 5 sparse Matrix of class "dgCMatrix"
## s0 s1 s2 s3 s4
## (Intercept) . . . . .
## Income . . . -6.63360837 -7.9320588
## Rating . 0.6182676 2.034434 3.71042914 4.0114839
## Cards . . . 0.51433943 2.9161946
## Age . . . -0.70550849 -0.7885101
## Education . . . . 0.8874960
## Gender Male . . . 3.08510246 18.3671409
## StudentYes . . 228.762987 396.62014836 396.2181065
## MarriedNo . . . 9.36968005 20.4646766
## EthnicityAfrican American . . . . -11.9281110
## EthnicityAsian . . . . -5.3983474
## Income:StudentYes . . . 0.01473463 0.4929168
# second way
coef(m.lambda)
## 13 x 5 sparse Matrix of class "dgCMatrix"
## s0 s1 s2 s3
## (Intercept) 520.4352 301.2973356 -224.966926 -506.01794831
## (Intercept) . . . .
## Income . . . -6.63360837
## Rating . 0.6182676 2.034434 3.71042914
## Cards . . . 0.51433943
## Age . . . -0.70550849
## Education . . . .
## Gender Male . . . 3.08510246
## StudentYes . . 228.762987 396.62014836
## MarriedNo . . . 9.36968005
## EthnicityAfrican American . . . .
## EthnicityAsian . . . .
## Income:StudentYes . . . 0.01473463
## s4
## (Intercept) -579.1924633
## (Intercept) .
## Income -7.9320588
## Rating 4.0114839
## Cards 2.9161946
## Age -0.7885101
## Education 0.8874960
## Gender Male 18.3671409
## StudentYes 396.2181065
## MarriedNo 20.4646766
## EthnicityAfrican American -11.9281110
## EthnicityAsian -5.3983474
## Income:StudentYes 0.4929168
mean(data.train$Balance)
## [1] 520.4352
# CHUNK 23
set.seed(1111)
m <- cv.glmnet(x = X.train,
y = data.train$Balance,
family = "gaussian",
alpha = 0.5)
plot(m)
m$lambda.min
## [1] 0.8885539
m$lambda.1se
## [1] 7.550517
# CHUNK 24
# Fit the regularized model using lambda.min
m.min <- glmnet(x = X.train,
y = data.train$Balance,
family = "gaussian",
lambda = m$lambda.min,
alpha = 0.5)
# List the coefficient estimates
m.min$beta
## 12 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) .
## Income -7.8131829
## Rating 3.9839963
## Cards 2.6915109
## Age -0.7829239
## Education 0.7557949
## Gender Male 17.0049498
## StudentYes 396.3329355
## MarriedNo 19.4182020
## EthnicityAfrican American -9.9708020
## EthnicityAsian -3.6479011
## Income:StudentYes 0.4458288
# Set up the design matrix for the test set
X.test <- model.matrix(Balance ~ . + Income:Student, data = data.test)
# Make predictions on the test set and calculate test RMSE
m.min.pred <- predict(m.min, newx = X.test)
RMSE(data.test$Balance, m.min.pred)
## [1] 103.9201
# CHUNK 25
# Fit the regularized model using lambda.1se
m.1se <- glmnet(x = X.train,
y = data.train$Balance,
family = "gaussian",
lambda = m$lambda.1se,
alpha = 0.5)
# List the coefficient estimates
m.1se$beta
## 12 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) .
## Income -6.9378441
## Rating 3.7813573
## Cards 1.1075391
## Age -0.7305808
## Education .
## Gender Male 6.7717419
## StudentYes 397.5199346
## MarriedNo 11.6768447
## EthnicityAfrican American .
## EthnicityAsian .
## Income:StudentYes 0.1148304
# Make predictions on the test set and calculate test RMSE
m.1se.pred <- predict(m.1se, newx = X.test)
RMSE(data.test$Balance, m.1se.pred)
## [1] 102.8241