To build the best GLM usiing lending club dataset to predict interest rate
The best GLM model is using all the remain features (45 features) and alpha = 0.5 (elastic net). RMSE = 3.8, R^2 = 0.24, MSE = 14.67
This document includes
General EDA reference
Feature engineering
GLM
## Loading required package: lattice
## Loading required package: ggplot2
## corrplot 0.84 loaded
## For NA columns
mydata = readRDS('mytable.csv')
num.NA <- sort(sapply(mydata, function(x) {sum(is.na(x))}), decreasing = T)
remain.col <- names(num.NA)[which(num.NA <= 0.8 * dim(mydata)[1])]
myremain = mydata[,remain.col ]
num_NA_remain <- sort(sapply(myremain,function(x) {sum(is.na(x))}), decreasing = T)
## remove pymnt columns
no_pymnt_col = names(myremain)[!grepl('pymnt', names(myremain))]
length(no_pymnt_col)
## [1] 51
remain_data = myremain[, no_pymnt_col]
dim(remain_data)
## [1] 887379 51
## remove int columns
no_int_col = names(remain_data)[!grepl('int', names(remain_data))]
remain_data_2 = cbind(remain_data[, no_int_col], remain_data$int_rate)
names(remain_data_2)[49] = 'int_rate'
## remove univariate and id variables
summary(remain_data_2)
## mths_since_last_major_derog mths_since_last_delinq tot_coll_amt
## Min. : 0.0 Min. : 0.0 Min. : 0
## 1st Qu.: 27.0 1st Qu.: 15.0 1st Qu.: 0
## Median : 44.0 Median : 31.0 Median : 0
## Mean : 44.1 Mean : 34.1 Mean : 226
## 3rd Qu.: 61.0 3rd Qu.: 50.0 3rd Qu.: 0
## Max. :188.0 Max. :188.0 Max. :9152545
## NA's :665676 NA's :454312 NA's :70276
## tot_cur_bal total_rev_hi_lim revol_util
## Min. : 0 Min. : 0 Min. : 0.00
## 1st Qu.: 29853 1st Qu.: 13900 1st Qu.: 37.70
## Median : 80559 Median : 23700 Median : 56.00
## Mean : 139458 Mean : 32069 Mean : 55.07
## 3rd Qu.: 208205 3rd Qu.: 39800 3rd Qu.: 73.60
## Max. :8000078 Max. :9999999 Max. :892.30
## NA's :70276 NA's :70276 NA's :502
## collections_12_mths_ex_med delinq_2yrs inq_last_6mths
## Min. : 0.00000 Min. : 0.0000 Min. : 0.0000
## 1st Qu.: 0.00000 1st Qu.: 0.0000 1st Qu.: 0.0000
## Median : 0.00000 Median : 0.0000 Median : 0.0000
## Mean : 0.01438 Mean : 0.3144 Mean : 0.6946
## 3rd Qu.: 0.00000 3rd Qu.: 0.0000 3rd Qu.: 1.0000
## Max. :20.00000 Max. :39.0000 Max. :33.0000
## NA's :145 NA's :29 NA's :29
## open_acc pub_rec total_acc acc_now_delinq
## Min. : 0.00 Min. : 0.0000 Min. : 1.00 Min. : 0.000000
## 1st Qu.: 8.00 1st Qu.: 0.0000 1st Qu.: 17.00 1st Qu.: 0.000000
## Median :11.00 Median : 0.0000 Median : 24.00 Median : 0.000000
## Mean :11.55 Mean : 0.1953 Mean : 25.27 Mean : 0.004991
## 3rd Qu.:14.00 3rd Qu.: 0.0000 3rd Qu.: 32.00 3rd Qu.: 0.000000
## Max. :90.00 Max. :86.0000 Max. :169.00 Max. :14.000000
## NA's :29 NA's :29 NA's :29 NA's :29
## annual_inc id member_id loan_amnt
## Min. : 0 Min. : 54734 Min. : 70473 Min. : 500
## 1st Qu.: 45000 1st Qu.: 9206643 1st Qu.:10877134 1st Qu.: 8000
## Median : 65000 Median :34433267 Median :37095283 Median :13000
## Mean : 75028 Mean :32465133 Mean :35001825 Mean :14755
## 3rd Qu.: 90000 3rd Qu.:54908135 3rd Qu.:58471347 3rd Qu.:20000
## Max. :9500000 Max. :68617057 Max. :73544841 Max. :35000
## NA's :4
## funded_amnt funded_amnt_inv term installment
## Min. : 500 Min. : 0 Length:887379 Min. : 15.67
## 1st Qu.: 8000 1st Qu.: 8000 Class :character 1st Qu.: 260.70
## Median :13000 Median :13000 Mode :character Median : 382.55
## Mean :14742 Mean :14702 Mean : 436.72
## 3rd Qu.:20000 3rd Qu.:20000 3rd Qu.: 572.60
## Max. :35000 Max. :35000 Max. :1445.46
##
## grade sub_grade emp_title
## Length:887379 Length:887379 Length:887379
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## emp_length home_ownership verification_status
## Length:887379 Length:887379 Length:887379
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## issue_d loan_status url
## Length:887379 Length:887379 Length:887379
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## desc purpose title
## Length:887379 Length:887379 Length:887379
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## zip_code addr_state dti
## Length:887379 Length:887379 Min. : 0.00
## Class :character Class :character 1st Qu.: 11.91
## Mode :character Mode :character Median : 17.65
## Mean : 18.16
## 3rd Qu.: 23.95
## Max. :9999.00
##
## earliest_cr_line revol_bal initial_list_status out_prncp
## Length:887379 Min. : 0 Length:887379 Min. : 0
## Class :character 1st Qu.: 6443 Class :character 1st Qu.: 0
## Mode :character Median : 11875 Mode :character Median : 6458
## Mean : 16921 Mean : 8403
## 3rd Qu.: 20829 3rd Qu.:13659
## Max. :2904836 Max. :49373
##
## out_prncp_inv total_rec_prncp total_rec_late_fee recoveries
## Min. : 0 Min. : 0 Min. : 0.0000 Min. : 0.00
## 1st Qu.: 0 1st Qu.: 1201 1st Qu.: 0.0000 1st Qu.: 0.00
## Median : 6456 Median : 3215 Median : 0.0000 Median : 0.00
## Mean : 8400 Mean : 5758 Mean : 0.3967 Mean : 45.92
## 3rd Qu.:13654 3rd Qu.: 8000 3rd Qu.: 0.0000 3rd Qu.: 0.00
## Max. :49373 Max. :35000 Max. :358.6800 Max. :33520.27
##
## collection_recovery_fee last_credit_pull_d policy_code
## Min. : 0.000 Length:887379 Min. :1
## 1st Qu.: 0.000 Class :character 1st Qu.:1
## Median : 0.000 Mode :character Median :1
## Mean : 4.881 Mean :1
## 3rd Qu.: 0.000 3rd Qu.:1
## Max. :7002.190 Max. :1
##
## application_type int_rate
## Length:887379 Min. : 5.32
## Class :character 1st Qu.: 9.99
## Mode :character Median :12.99
## Mean :13.25
## 3rd Qu.:16.20
## Max. :28.99
##
#loan = remain_data_2
#num.value <- sapply(loan, function(x){return(length(unique(x)))})
#which(num.value == 1)
#which(num.value == dim(loan)[1])
remain_data_2 = subset(remain_data_2, select = -c(policy_code, id, member_id))
num_na <- sort(sapply(remain_data_2, function(x) {sum(is.na(x))}), decreasing = T)
num_na
## mths_since_last_major_derog mths_since_last_delinq
## 665676 454312
## tot_coll_amt tot_cur_bal
## 70276 70276
## total_rev_hi_lim revol_util
## 70276 502
## collections_12_mths_ex_med delinq_2yrs
## 145 29
## inq_last_6mths open_acc
## 29 29
## pub_rec total_acc
## 29 29
## acc_now_delinq annual_inc
## 29 4
## loan_amnt funded_amnt
## 0 0
## funded_amnt_inv term
## 0 0
## installment grade
## 0 0
## sub_grade emp_title
## 0 0
## emp_length home_ownership
## 0 0
## verification_status issue_d
## 0 0
## loan_status url
## 0 0
## desc purpose
## 0 0
## title zip_code
## 0 0
## addr_state dti
## 0 0
## earliest_cr_line revol_bal
## 0 0
## initial_list_status out_prncp
## 0 0
## out_prncp_inv total_rec_prncp
## 0 0
## total_rec_late_fee recoveries
## 0 0
## collection_recovery_fee last_credit_pull_d
## 0 0
## application_type int_rate
## 0 0
#F1 <- createDataPartition(myremain$int_rate, p = 0.8, groups = 2)
#data_sub = myremain[F1$Resample1,]
P1 = createDataPartition(remain_data_2$int_rate, p = 0.7, groups = 2)
loan_train = remain_data_2[P1$Resample1, ]
loan_test = remain_data_2[-P1$Resample1, ]
boxplot(int_rate~addr_state, data = loan_train)
# possible to see NA if features has missing value
col_type = sapply(loan_train, class)
num_col = names(loan_train)[col_type == 'numeric']
cate_col = names(loan_train)[col_type != 'numeric']
correlations <- cor(loan_train[, num_col],
use = "pairwise.complete.obs")
corrplot(correlations, method = "square", tl.cex = 1, type = 'lower')
## imputate state name
int_state <- by(loan_train, loan_train$addr_state, function(x) {
return(mean(x$int_rate))
})
high_cor_f = which(abs(correlations[,27]) > 0.1 )
## this needs to be an encoder when doing prediction
loan_train$state_mean_int <-
ifelse(loan_train$addr_state %in% names(int_state)[which(int_state <= quantile(int_state, 0.25))],
'low', ifelse(loan_train$addr_state %in% names(int_state)[which(int_state <= quantile(int_state, 0.5))],
'lowmedium', ifelse(loan_train$addr_state %in% names(int_state)[which(int_state <= quantile(int_state, 0.75))],
'mediumhigh', 'high')))
## imputate
The best model so far measured by mean square error (MSE) and R_square are glm_1, glm_2 and glm_3.
library(h2o)
##
## ----------------------------------------------------------------------
##
## Your next step is to start H2O:
## > h2o.init()
##
## For H2O package documentation, ask for help:
## > ??h2o
##
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit http://docs.h2o.ai
##
## ----------------------------------------------------------------------
##
## Attaching package: 'h2o'
## The following objects are masked from 'package:stats':
##
## cor, sd, var
## The following objects are masked from 'package:base':
##
## &&, %*%, %in%, ||, apply, as.factor, as.numeric, colnames,
## colnames<-, ifelse, is.character, is.factor, is.numeric, log,
## log10, log1p, log2, round, signif, trunc
h2o.init()
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 1 hours 43 minutes
## H2O cluster timezone: America/Denver
## H2O data parsing timezone: UTC
## H2O cluster version: 3.18.0.8
## H2O cluster version age: 9 days
## H2O cluster name: H2O_from_python_xiguo_yh2i23
## H2O cluster total nodes: 1
## H2O cluster total memory: 1.44 GB
## H2O cluster total cores: 8
## H2O cluster allowed cores: 8
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## H2O API Extensions: XGBoost, Algos, AutoML, Core V3, Core V4
## R Version: R version 3.4.4 (2018-03-15)
hdf = as.h2o(remain_data_2, destination_frame = 'myhdf')
##
|
| | 0%
|
|=================================================================| 100%
hdf_split = h2o.splitFrame(hdf, ratios = .7)
hdf_train = hdf_split[[1]]
hdf_test = hdf_split[[2]]
y = 'int_rate'
x = h2o.colnames(hdf)[-46]
x2 = h2o.colnames(hdf)[1:15] ## best feature
x3 = h2o.colnames(hdf)[15:25] ##
x4 = h2o.colnames(hdf)[25:35] ## worst performance
x5 = h2o.colnames(hdf)[c(5, 6, 9, 15,16, 17, 19, 42)]
x6 = h2o.colnames(hdf)[c(5:15,16, 17, 19, 42)]
glm_1 <- h2o.glm(y = y, x = x, training_frame=hdf_train, family="gaussian", model_id = 'glm1')
## Warning in .h2o.startModelJob(algo, params, h2oRestApiVersion): Dropping bad and constant columns: [last_credit_pull_d, loan_status, emp_length, purpose, application_type, issue_d, emp_title, home_ownership, verification_status, title, url, zip_code, addr_state, grade, initial_list_status, earliest_cr_line, term, sub_grade, desc].
##
|
| | 0%
|
|= | 2%
|
|=================================================================| 100%
glm_2 <- h2o.glm(y = y, x = x, training_frame=hdf_train, family="gaussian", nfolds=5, alpha=1, model_id = 'glm2')
## Warning in .h2o.startModelJob(algo, params, h2oRestApiVersion): Dropping bad and constant columns: [last_credit_pull_d, loan_status, emp_length, purpose, application_type, issue_d, emp_title, home_ownership, verification_status, title, url, zip_code, addr_state, grade, initial_list_status, earliest_cr_line, term, sub_grade, desc].
##
|
| | 0%
|
|================================= | 51%
|
|=================================================================| 100%
glm_3 <- h2o.glm(y = y, x=x2, training_frame = hdf_train, family = 'gaussian' , model_id = 'glm3')
##
|
| | 0%
|
|= | 2%
|
|=================================================================| 100%
glm_4 <- h2o.glm(y = y, x=x3, training_frame = hdf_train, family = 'gaussian' , model_id = 'glm4')
## Warning in .h2o.startModelJob(algo, params, h2oRestApiVersion): Dropping bad and constant columns: [emp_length, grade, term, emp_title, home_ownership, verification_status, sub_grade].
##
|
| | 0%
|
|=== | 4%
|
|=================================================================| 100%
glm_5 <- h2o.glm(y = y, x=x4, training_frame = hdf_train, family = 'gaussian', model_id = 'glm5' ) ##worst performance
## Warning in .h2o.startModelJob(algo, params, h2oRestApiVersion): Dropping bad and constant columns: [loan_status, addr_state, purpose, issue_d, earliest_cr_line, verification_status, title, url, zip_code, desc].
##
|
| | 0%
|
|=== | 4%
|
|=================================================================| 100%
glm_6 <- h2o.glm(y = y, x=x5, training_frame = hdf_train, family = 'gaussian' , model_id = 'glm6')
##
|
| | 0%
|
|= | 2%
|
|=================================================================| 100%
h2o.performance(glm_1, newdata = hdf_test)
## H2ORegressionMetrics: glm
##
## MSE: 15.24599
## RMSE: 3.904611
## MAE: 3.006649
## RMSLE: NaN
## Mean Residual Deviance : 15.24599
## R^2 : 0.2041887
## Null Deviance :5103178
## Null D.o.F. :266375
## Residual Deviance :4061166
## Residual D.o.F. :266349
## AIC :1481691
h2o.performance(glm_2, newdata = hdf_test)
## H2ORegressionMetrics: glm
##
## MSE: 15.25099
## RMSE: 3.905252
## MAE: 3.006187
## RMSLE: NaN
## Mean Residual Deviance : 15.25099
## R^2 : 0.2039276
## Null Deviance :5103178
## Null D.o.F. :266375
## Residual Deviance :4062499
## Residual D.o.F. :266351
## AIC :1481774
h2o.performance(glm_3, newdata = hdf_test)
## H2ORegressionMetrics: glm
##
## MSE: 15.46468
## RMSE: 3.932516
## MAE: 3.071922
## RMSLE: NaN
## Mean Residual Deviance : 15.46468
## R^2 : 0.1927733
## Null Deviance :5103178
## Null D.o.F. :266375
## Residual Deviance :4119421
## Residual D.o.F. :266360
## AIC :1485463
h2o.performance(glm_4, newdata = hdf_test)
## H2ORegressionMetrics: glm
##
## MSE: 18.75463
## RMSE: 4.330662
## MAE: 3.479641
## RMSLE: 0.3189626
## Mean Residual Deviance : 18.75463
## R^2 : 0.02104445
## Null Deviance :5103178
## Null D.o.F. :266375
## Residual Deviance :4995784
## Residual D.o.F. :266371
## AIC :1536820
h2o.performance(glm_5, newdata = hdf_test)
## H2ORegressionMetrics: glm
##
## MSE: 19.13386
## RMSE: 4.374226
## MAE: 3.486454
## RMSLE: 0.3194138
## Mean Residual Deviance : 19.13386
## R^2 : 0.001249623
## Null Deviance :5103178
## Null D.o.F. :266375
## Residual Deviance :5096800
## Residual D.o.F. :266374
## AIC :1542146
h2o.performance(glm_6, newdata = hdf_test)
## H2ORegressionMetrics: glm
##
## MSE: 15.85458
## RMSE: 3.981781
## MAE: 3.115822
## RMSLE: NaN
## Mean Residual Deviance : 15.85458
## R^2 : 0.1724215
## Null Deviance :5103178
## Null D.o.F. :266375
## Residual Deviance :4223280
## Residual D.o.F. :266368
## AIC :1492079
## you can use autoML in h2o to find a better model
#auto_ml = h2o.automl(x=x, y=y, training_frame = hdf_train, nfolds = 3)
#h2o.colnames(hdf_train)