loans <- read_fst(paste0(path,"loan_data_fico_prob_Oct272020.fst"),as.data.table = T)
loans[,credithistorycat:=ifelse(individual_credit_history_days<120*12,"lt10","gt10")]
loans[,year_term:=paste(loanyear,loan_term)]
loans[,zip_year:=paste(zip,loanyear)]
loans[,ficoby100:=cr_fico/100]
setnames(loans,
         c("annual_income.x","borrower_age","dti_at_pricing","number_of_accounts_at_pricing","g098_number_or_inquiries_in_last_6_months","individual_credit_history_days","months_at_job_as_of_approval"),
         c("income","age","dti","noofaccounts","noinq6months","crhistorydays","monthsatjob"))
loans[,stateyear:=paste0(state,loanyear)]
loans[,mtgbalancepct:=(totalbalance-totalbalanceminusmortgage)/totalbalance]
loans[,mtgpmttoincome:=mortgage_payment_at_pricing/income]
loans[,credithistorytoage:=crhistorydays/(365*age)]
loans[,ccbalancetoincome:=totalcreditcardbalance/income]

1 Recursive Feature Elimination with Random Forests

Objective: identify the most important variables that predict Upstart Score

R Package documentation here: https://topepo.github.io/caret/recursive-feature-elimination.html

1.1 3-year loans

Sample after 2017.

1.1.1 Full list of variables

x <- loans[loan_term==36 & loanyear>=2017,c("ficoby100","income","age","dti","noofaccounts","noinq6months","crhistorydays","totalbalance","yearssincegraduation","education","debtconsolidation","monthsatjob","employed_hourly","computer","loanaggrigator","default_probability","loanyear","loan_amount","referral","primary_income_verified","num_delinquencies_past_2_years_at_pricing","mtgpmttoincome","noof90ormoretradesever","noof30ormoretrades","creditcardutilization","ccbalancetoincome","noofopencreditcardtrades","noofopenautotrades","mtgbalancepct","credithistorytoage")] #,"home_ownership_type","employment_status"
x <- x[complete.cases(x)]

x[,education:=as.factor(education)]
x[,loanyear:=as.factor(loanyear)]
x[,age2:=age*age]

setnames(x,
         c("income","noofaccounts","noinq6months","crhistorydays","totalbalance","monthsatjob","loan_amount","num_delinquencies_past_2_years_at_pricing"),
         paste0("log_",c("income","noofaccounts","noinq6months","crhistorydays","totalbalance","monthsatjob","loan_amount","delq2yrs")))

x[,log_income:=log(1+log_income)]
x[,log_noofaccounts:=log(1+log_noofaccounts)]
x[,log_noinq6months:=log(1+log_noinq6months)]
x[,log_totalbalance:=log(6+log_totalbalance)]
x[,log_crhistorydays:=log(1+log_crhistorydays)]
x[,log_monthsatjob:=log(14+log_monthsatjob)]
x[,log_loan_amount:=log(1+log_loan_amount)]
x[,primary_income_verified:=ifelse(primary_income_verified==T,1,0)]
x[,referral:=ifelse(referral==T,1,0)]

x[,collegedegree:=ifelse(education==1,1,0)]
x[,advanceddegree:=ifelse(education==2,1,0)]
x[,yr2018:=ifelse(loanyear==2018,1,0)]
x[,yr2019:=ifelse(loanyear==2019,1,0)]

y <- x$default_probability

x[,c("default_probability","education","loanyear"):=list(NULL)]

x <- as.data.frame(x)
str(x)

## 'data.frame':    116262 obs. of  32 variables:
##  $ ficoby100               : num  6.7 6.3 7.42 6.99 6.5 7.31 6.54 6.43 6.45 6.99 ...
##  $ log_income              : num  10.8 10.9 11.5 11 10.6 ...
##  $ age                     : int  28 37 34 29 36 29 36 34 29 32 ...
##  $ dti                     : num  21.2 11.4 8.5 19 23.2 ...
##  $ log_noofaccounts        : num  3.22 3.18 3.04 2.71 2.48 ...
##  $ log_noinq6months        : num  0 0.693 0 0.693 0 ...
##  $ log_crhistorydays       : num  8.22 8.82 8.74 8.35 8.37 ...
##  $ log_totalbalance        : num  11.9 11.3 12.3 13.4 10.3 ...
##  $ yearssincegraduation    : num  2 9 6 7 6 6 13 4 8 8 ...
##  $ debtconsolidation       : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ log_monthsatjob         : num  3.22 3.26 3.04 3.81 3.78 ...
##  $ employed_hourly         : num  0 0 0 0 1 0 0 0 0 0 ...
##  $ computer                : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ loanaggrigator          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ log_loan_amount         : num  9.62 8.94 9.33 9.68 8.85 ...
##  $ referral                : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ primary_income_verified : num  1 1 1 1 1 0 1 1 0 1 ...
##  $ log_delq2yrs            : int  1 1 0 2 1 0 0 1 0 1 ...
##  $ mtgpmttoincome          : num  0 0 0.0121 0.0687 0 ...
##  $ noof90ormoretradesever  : int  2 6 0 0 0 0 0 7 9 0 ...
##  $ noof30ormoretrades      : int  0 3 0 2 1 0 0 0 0 0 ...
##  $ creditcardutilization   : int  71 86 101 87 99 98 92 66 100 65 ...
##  $ ccbalancetoincome       : num  0.152 0.279 0.117 0.264 0.156 ...
##  $ noofopencreditcardtrades: int  3 4 1 3 3 2 3 10 5 5 ...
##  $ noofopenautotrades      : int  1 0 1 1 1 1 0 1 -1 -1 ...
##  $ mtgbalancepct           : num  0 0 0.788 0.942 0 ...
##  $ credithistorytoage      : num  0.363 0.499 0.505 0.4 0.329 ...
##  $ age2                    : int  784 1369 1156 841 1296 841 1296 1156 841 1024 ...
##  $ collegedegree           : num  0 1 1 1 1 1 1 1 1 1 ...
##  $ advanceddegree          : num  1 0 0 0 0 0 0 0 0 0 ...
##  $ yr2018                  : num  1 1 1 0 0 0 1 0 1 0 ...
##  $ yr2019                  : num  0 0 0 0 0 1 0 0 0 1 ...

1.1.2 Contribution of each variable

RMSE imporvement is only marginal after 15 variables.

x <- x[,1:30]
normalization <- preProcess(x)
x <- as.data.frame(predict(normalization, x))
subsets <- c(1:ncol(x))



set.seed(10)

ctrl <- rfeControl(functions = lmFuncs,
                   method = "repeatedcv",
                   repeats = 5,
                   verbose = FALSE)

lmProfile <- rfe(x, y,
                 sizes = subsets,
                 rfeControl = ctrl)

plotdata <- data.table(lmProfile$results)

ggplot(plotdata,aes(x=Variables,y=RMSE))+geom_line(color="dodgerblue4",size=1)+geom_point(color="dodgerblue4",size=2)+theme_minimal()+
  labs(x="Variables",y="RMSE (Repeated Cross-Validation)")+geom_vline(xintercept = 15,color="darkred")

1.1.3 Selected variables

Here are the variables selected in the order of importance.

threeyearpred <- predictors(lmProfile)[1:15]
print(threeyearpred)

##  [1] "age"                   "ficoby100"             "dti"                   "age2"                  "log_crhistorydays"     "log_noinq6months"      "collegedegree"         "advanceddegree"        "creditcardutilization" "debtconsolidation"     "referral"              "log_income"            "computer"              "log_totalbalance"      "ccbalancetoincome"

1.2 5-year loans

Sample after 2017.

1.2.1 Full list of variables

x <- loans[loan_term==60 & loanyear>=2017,c("ficoby100","income","age","dti","noofaccounts","noinq6months","crhistorydays","totalbalance","yearssincegraduation","education","debtconsolidation","monthsatjob","employed_hourly","computer","loanaggrigator","default_probability","loanyear","loan_amount","referral","primary_income_verified","num_delinquencies_past_2_years_at_pricing","mtgpmttoincome","noof90ormoretradesever","noof30ormoretrades","creditcardutilization","ccbalancetoincome","noofopencreditcardtrades","noofopenautotrades","mtgbalancepct","credithistorytoage")] #,"home_ownership_type","employment_status"
x <- x[complete.cases(x)]

x[,education:=as.factor(education)]
x[,loanyear:=as.factor(loanyear)]
x[,age2:=age*age]

setnames(x,
         c("income","noofaccounts","noinq6months","crhistorydays","totalbalance","monthsatjob","loan_amount"),
         paste0("log_",c("income","noofaccounts","noinq6months","crhistorydays","totalbalance","monthsatjob","loan_amount")))

x[,log_income:=log(1+log_income)]
x[,log_noofaccounts:=log(1+log_noofaccounts)]
x[,log_noinq6months:=log(1+log_noinq6months)]
x[,log_totalbalance:=log(6+log_totalbalance)]
x[,log_crhistorydays:=log(1+log_crhistorydays)]
x[,log_monthsatjob:=log(14+log_monthsatjob)]
x[,log_loan_amount:=log(1+log_loan_amount)]
x[,primary_income_verified:=ifelse(primary_income_verified==T,1,0)]
x[,referral:=ifelse(referral==T,1,0)]

x[,collegedegree:=ifelse(education==1,1,0)]
x[,advanceddegree:=ifelse(education==2,1,0)]
x[,yr2018:=ifelse(loanyear==2018,1,0)]
x[,yr2019:=ifelse(loanyear==2019,1,0)]

y <- x$default_probability

x[,c("default_probability","education","loanyear"):=list(NULL)]

str(x)

## Classes 'data.table' and 'data.frame':   195964 obs. of  32 variables:
##  $ ficoby100                                : num  6.79 6.86 6.71 7.17 7.16 6.98 6.41 6.6 7.23 6.99 ...
##  $ log_income                               : num  11.8 11.2 11 11.8 10.9 ...
##  $ age                                      : int  30 27 27 34 31 29 31 35 32 29 ...
##  $ dti                                      : num  19.8 9.9 13.3 17.6 36 ...
##  $ log_noofaccounts                         : num  3.93 3.18 2.71 3.33 3.14 ...
##  $ log_noinq6months                         : num  1.609 0 0 0 0.693 ...
##  $ log_crhistorydays                        : num  8.34 8.12 8.13 8.69 8.44 ...
##  $ log_totalbalance                         : num  11.7 11.7 10.8 11.5 10.8 ...
##  $ yearssincegraduation                     : num  6 5 5 12 9 7 9 12 11 6 ...
##  $ debtconsolidation                        : num  1 0 1 1 1 1 1 1 1 1 ...
##  $ log_monthsatjob                          : num  3 2.94 3.81 3.64 3.5 ...
##  $ employed_hourly                          : num  0 0 0 0 0 1 0 0 0 0 ...
##  $ computer                                 : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ loanaggrigator                           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ log_loan_amount                          : num  10.13 8.73 8.99 9.16 10.53 ...
##  $ referral                                 : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ primary_income_verified                  : num  1 0 1 1 1 1 1 0 1 1 ...
##  $ num_delinquencies_past_2_years_at_pricing: int  0 1 0 0 0 0 2 0 0 0 ...
##  $ mtgpmttoincome                           : num  0 0 0 0 0 ...
##  $ noof90ormoretradesever                   : int  2 0 3 0 0 0 0 0 0 0 ...
##  $ noof30ormoretrades                       : int  0 1 0 0 0 0 1 0 0 0 ...
##  $ creditcardutilization                    : int  74 42 93 56 48 46 94 95 60 100 ...
##  $ ccbalancetoincome                        : num  0.1355 0.0265 0.115 0.1924 0.4677 ...
##  $ noofopencreditcardtrades                 : int  8 3 3 8 8 7 5 8 6 2 ...
##  $ noofopenautotrades                       : int  2 1 1 -1 0 1 1 1 1 1 ...
##  $ mtgbalancepct                            : num  0 0 0 0 0 ...
##  $ credithistorytoage                       : num  0.384 0.341 0.345 0.481 0.409 ...
##  $ age2                                     : int  900 729 729 1156 961 841 961 1225 1024 841 ...
##  $ collegedegree                            : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ advanceddegree                           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ yr2018                                   : num  0 1 0 1 1 0 0 0 0 0 ...
##  $ yr2019                                   : num  0 0 0 0 0 0 0 1 1 0 ...
##  - attr(*, ".internal.selfref")=<externalptr>

1.2.2 Contribution of each variable