loans <- read_fst(paste0(path,"loan_data_fico_prob_Oct272020.fst"),as.data.table = T)
loans[,credithistorycat:=ifelse(individual_credit_history_days<120*12,"lt10","gt10")]
loans[,year_term:=paste(loanyear,loan_term)]
loans[,zip_year:=paste(zip,loanyear)]
loans[,ficoby100:=cr_fico/100]
setnames(loans,
c("annual_income.x","borrower_age","dti_at_pricing","number_of_accounts_at_pricing","g098_number_or_inquiries_in_last_6_months","individual_credit_history_days","months_at_job_as_of_approval"),
c("income","age","dti","noofaccounts","noinq6months","crhistorydays","monthsatjob"))
loans[,stateyear:=paste0(state,loanyear)]
loans[,mtgbalancepct:=(totalbalance-totalbalanceminusmortgage)/totalbalance]
loans[,mtgpmttoincome:=mortgage_payment_at_pricing/income]
loans[,credithistorytoage:=crhistorydays/(365*age)]
loans[,ccbalancetoincome:=totalcreditcardbalance/income]
Objective: identify the most important variables that predict Upstart Score
R Package documentation here: https://topepo.github.io/caret/recursive-feature-elimination.html
Sample after 2017.
x <- loans[loan_term==36 & loanyear>=2017,c("ficoby100","income","age","dti","noofaccounts","noinq6months","crhistorydays","totalbalance","yearssincegraduation","education","debtconsolidation","monthsatjob","employed_hourly","computer","loanaggrigator","default_probability","loanyear","loan_amount","referral","primary_income_verified","num_delinquencies_past_2_years_at_pricing","mtgpmttoincome","noof90ormoretradesever","noof30ormoretrades","creditcardutilization","ccbalancetoincome","noofopencreditcardtrades","noofopenautotrades","mtgbalancepct","credithistorytoage")] #,"home_ownership_type","employment_status"
x <- x[complete.cases(x)]
x[,education:=as.factor(education)]
x[,loanyear:=as.factor(loanyear)]
x[,age2:=age*age]
setnames(x,
c("income","noofaccounts","noinq6months","crhistorydays","totalbalance","monthsatjob","loan_amount","num_delinquencies_past_2_years_at_pricing"),
paste0("log_",c("income","noofaccounts","noinq6months","crhistorydays","totalbalance","monthsatjob","loan_amount","delq2yrs")))
x[,log_income:=log(1+log_income)]
x[,log_noofaccounts:=log(1+log_noofaccounts)]
x[,log_noinq6months:=log(1+log_noinq6months)]
x[,log_totalbalance:=log(6+log_totalbalance)]
x[,log_crhistorydays:=log(1+log_crhistorydays)]
x[,log_monthsatjob:=log(14+log_monthsatjob)]
x[,log_loan_amount:=log(1+log_loan_amount)]
x[,primary_income_verified:=ifelse(primary_income_verified==T,1,0)]
x[,referral:=ifelse(referral==T,1,0)]
x[,collegedegree:=ifelse(education==1,1,0)]
x[,advanceddegree:=ifelse(education==2,1,0)]
x[,yr2018:=ifelse(loanyear==2018,1,0)]
x[,yr2019:=ifelse(loanyear==2019,1,0)]
y <- x$default_probability
x[,c("default_probability","education","loanyear"):=list(NULL)]
x <- as.data.frame(x)
str(x)
## 'data.frame': 116262 obs. of 32 variables:
## $ ficoby100 : num 6.7 6.3 7.42 6.99 6.5 7.31 6.54 6.43 6.45 6.99 ...
## $ log_income : num 10.8 10.9 11.5 11 10.6 ...
## $ age : int 28 37 34 29 36 29 36 34 29 32 ...
## $ dti : num 21.2 11.4 8.5 19 23.2 ...
## $ log_noofaccounts : num 3.22 3.18 3.04 2.71 2.48 ...
## $ log_noinq6months : num 0 0.693 0 0.693 0 ...
## $ log_crhistorydays : num 8.22 8.82 8.74 8.35 8.37 ...
## $ log_totalbalance : num 11.9 11.3 12.3 13.4 10.3 ...
## $ yearssincegraduation : num 2 9 6 7 6 6 13 4 8 8 ...
## $ debtconsolidation : num 1 1 1 1 1 1 1 1 1 1 ...
## $ log_monthsatjob : num 3.22 3.26 3.04 3.81 3.78 ...
## $ employed_hourly : num 0 0 0 0 1 0 0 0 0 0 ...
## $ computer : num 0 0 0 0 0 0 0 0 0 0 ...
## $ loanaggrigator : num 0 0 0 0 0 0 0 0 0 0 ...
## $ log_loan_amount : num 9.62 8.94 9.33 9.68 8.85 ...
## $ referral : num 0 0 0 0 0 0 0 0 0 0 ...
## $ primary_income_verified : num 1 1 1 1 1 0 1 1 0 1 ...
## $ log_delq2yrs : int 1 1 0 2 1 0 0 1 0 1 ...
## $ mtgpmttoincome : num 0 0 0.0121 0.0687 0 ...
## $ noof90ormoretradesever : int 2 6 0 0 0 0 0 7 9 0 ...
## $ noof30ormoretrades : int 0 3 0 2 1 0 0 0 0 0 ...
## $ creditcardutilization : int 71 86 101 87 99 98 92 66 100 65 ...
## $ ccbalancetoincome : num 0.152 0.279 0.117 0.264 0.156 ...
## $ noofopencreditcardtrades: int 3 4 1 3 3 2 3 10 5 5 ...
## $ noofopenautotrades : int 1 0 1 1 1 1 0 1 -1 -1 ...
## $ mtgbalancepct : num 0 0 0.788 0.942 0 ...
## $ credithistorytoage : num 0.363 0.499 0.505 0.4 0.329 ...
## $ age2 : int 784 1369 1156 841 1296 841 1296 1156 841 1024 ...
## $ collegedegree : num 0 1 1 1 1 1 1 1 1 1 ...
## $ advanceddegree : num 1 0 0 0 0 0 0 0 0 0 ...
## $ yr2018 : num 1 1 1 0 0 0 1 0 1 0 ...
## $ yr2019 : num 0 0 0 0 0 1 0 0 0 1 ...
RMSE imporvement is only marginal after 15 variables.
x <- x[,1:30]
normalization <- preProcess(x)
x <- as.data.frame(predict(normalization, x))
subsets <- c(1:ncol(x))
set.seed(10)
ctrl <- rfeControl(functions = lmFuncs,
method = "repeatedcv",
repeats = 5,
verbose = FALSE)
lmProfile <- rfe(x, y,
sizes = subsets,
rfeControl = ctrl)
plotdata <- data.table(lmProfile$results)
ggplot(plotdata,aes(x=Variables,y=RMSE))+geom_line(color="dodgerblue4",size=1)+geom_point(color="dodgerblue4",size=2)+theme_minimal()+
labs(x="Variables",y="RMSE (Repeated Cross-Validation)")+geom_vline(xintercept = 15,color="darkred")
Here are the variables selected in the order of importance.
threeyearpred <- predictors(lmProfile)[1:15]
print(threeyearpred)
## [1] "age" "ficoby100" "dti" "age2" "log_crhistorydays" "log_noinq6months" "collegedegree" "advanceddegree" "creditcardutilization" "debtconsolidation" "referral" "log_income" "computer" "log_totalbalance" "ccbalancetoincome"
Sample after 2017.
x <- loans[loan_term==60 & loanyear>=2017,c("ficoby100","income","age","dti","noofaccounts","noinq6months","crhistorydays","totalbalance","yearssincegraduation","education","debtconsolidation","monthsatjob","employed_hourly","computer","loanaggrigator","default_probability","loanyear","loan_amount","referral","primary_income_verified","num_delinquencies_past_2_years_at_pricing","mtgpmttoincome","noof90ormoretradesever","noof30ormoretrades","creditcardutilization","ccbalancetoincome","noofopencreditcardtrades","noofopenautotrades","mtgbalancepct","credithistorytoage")] #,"home_ownership_type","employment_status"
x <- x[complete.cases(x)]
x[,education:=as.factor(education)]
x[,loanyear:=as.factor(loanyear)]
x[,age2:=age*age]
setnames(x,
c("income","noofaccounts","noinq6months","crhistorydays","totalbalance","monthsatjob","loan_amount"),
paste0("log_",c("income","noofaccounts","noinq6months","crhistorydays","totalbalance","monthsatjob","loan_amount")))
x[,log_income:=log(1+log_income)]
x[,log_noofaccounts:=log(1+log_noofaccounts)]
x[,log_noinq6months:=log(1+log_noinq6months)]
x[,log_totalbalance:=log(6+log_totalbalance)]
x[,log_crhistorydays:=log(1+log_crhistorydays)]
x[,log_monthsatjob:=log(14+log_monthsatjob)]
x[,log_loan_amount:=log(1+log_loan_amount)]
x[,primary_income_verified:=ifelse(primary_income_verified==T,1,0)]
x[,referral:=ifelse(referral==T,1,0)]
x[,collegedegree:=ifelse(education==1,1,0)]
x[,advanceddegree:=ifelse(education==2,1,0)]
x[,yr2018:=ifelse(loanyear==2018,1,0)]
x[,yr2019:=ifelse(loanyear==2019,1,0)]
y <- x$default_probability
x[,c("default_probability","education","loanyear"):=list(NULL)]
str(x)
## Classes 'data.table' and 'data.frame': 195964 obs. of 32 variables:
## $ ficoby100 : num 6.79 6.86 6.71 7.17 7.16 6.98 6.41 6.6 7.23 6.99 ...
## $ log_income : num 11.8 11.2 11 11.8 10.9 ...
## $ age : int 30 27 27 34 31 29 31 35 32 29 ...
## $ dti : num 19.8 9.9 13.3 17.6 36 ...
## $ log_noofaccounts : num 3.93 3.18 2.71 3.33 3.14 ...
## $ log_noinq6months : num 1.609 0 0 0 0.693 ...
## $ log_crhistorydays : num 8.34 8.12 8.13 8.69 8.44 ...
## $ log_totalbalance : num 11.7 11.7 10.8 11.5 10.8 ...
## $ yearssincegraduation : num 6 5 5 12 9 7 9 12 11 6 ...
## $ debtconsolidation : num 1 0 1 1 1 1 1 1 1 1 ...
## $ log_monthsatjob : num 3 2.94 3.81 3.64 3.5 ...
## $ employed_hourly : num 0 0 0 0 0 1 0 0 0 0 ...
## $ computer : num 0 0 0 0 0 0 0 0 0 0 ...
## $ loanaggrigator : num 0 0 0 0 0 0 0 0 0 0 ...
## $ log_loan_amount : num 10.13 8.73 8.99 9.16 10.53 ...
## $ referral : num 0 0 0 0 0 0 0 0 0 0 ...
## $ primary_income_verified : num 1 0 1 1 1 1 1 0 1 1 ...
## $ num_delinquencies_past_2_years_at_pricing: int 0 1 0 0 0 0 2 0 0 0 ...
## $ mtgpmttoincome : num 0 0 0 0 0 ...
## $ noof90ormoretradesever : int 2 0 3 0 0 0 0 0 0 0 ...
## $ noof30ormoretrades : int 0 1 0 0 0 0 1 0 0 0 ...
## $ creditcardutilization : int 74 42 93 56 48 46 94 95 60 100 ...
## $ ccbalancetoincome : num 0.1355 0.0265 0.115 0.1924 0.4677 ...
## $ noofopencreditcardtrades : int 8 3 3 8 8 7 5 8 6 2 ...
## $ noofopenautotrades : int 2 1 1 -1 0 1 1 1 1 1 ...
## $ mtgbalancepct : num 0 0 0 0 0 ...
## $ credithistorytoage : num 0.384 0.341 0.345 0.481 0.409 ...
## $ age2 : int 900 729 729 1156 961 841 961 1225 1024 841 ...
## $ collegedegree : num 1 1 1 1 1 1 1 1 1 1 ...
## $ advanceddegree : num 0 0 0 0 0 0 0 0 0 0 ...
## $ yr2018 : num 0 1 0 1 1 0 0 0 0 0 ...
## $ yr2019 : num 0 0 0 0 0 0 0 1 1 0 ...
## - attr(*, ".internal.selfref")=<externalptr>
RMSE imporvement is only marginal after 15 variables.
normalization <- preProcess(x)
x <- as.data.frame(predict(normalization, x))
subsets <- c(1:ncol(x))
set.seed(10)
ctrl <- rfeControl(functions = lmFuncs,
method = "repeatedcv",
repeats = 5,
verbose = FALSE)
lmProfile <- rfe(x, y,
sizes = subsets,
rfeControl = ctrl)
plotdata <- data.table(lmProfile$results)
ggplot(plotdata,aes(x=Variables,y=RMSE))+geom_line(color="dodgerblue4",size=1)+geom_point(color="dodgerblue4",size=2)+theme_minimal()+
labs(x="Variables",y="RMSE (Repeated Cross-Validation)")+geom_vline(xintercept = 15,color="darkred")
Here are the variables selected in the order of importance.
fiveyearpred <- predictors(lmProfile)[1:15]
print(fiveyearpred)
## [1] "age" "ficoby100" "dti" "age2" "log_crhistorydays" "log_noinq6months" "yr2019" "collegedegree" "advanceddegree" "yr2018" "creditcardutilization" "debtconsolidation" "ccbalancetoincome" "noofopenautotrades" "log_totalbalance"
Uses union of all top 15 variables selected above
Fixed effects: zip * year
SE Cluster: zip code
x <- copy(loans)
x[,education:=as.factor(education)]
x[,age2:=age*age]
setnames(x,
c("income","noofaccounts","noinq6months","crhistorydays","totalbalance","monthsatjob","loan_amount"),
paste0("log_",c("income","noofaccounts","noinq6months","crhistorydays","totalbalance","monthsatjob","loan_amount")))
x[,log_income:=log(1+log_income)]
x[,log_noofaccounts:=log(1+log_noofaccounts)]
x[,log_noinq6months:=log(1+log_noinq6months)]
x[,log_totalbalance:=log(6+log_totalbalance)]
x[,log_crhistorydays:=log(1+log_crhistorydays)]
x[,log_monthsatjob:=log(14+log_monthsatjob)]
x[,log_loan_amount:=log(1+log_loan_amount)]
x[,primary_income_verified:=ifelse(primary_income_verified==T,1,0)]
x[,referral:=ifelse(referral==T,1,0)]
x[,collegedegree:=ifelse(education==1,1,0)]
x[,advanceddegree:=ifelse(education==2,1,0)]
x[,yr2018:=ifelse(loanyear==2018,1,0)]
x[,yr2019:=ifelse(loanyear==2019,1,0)]
controls <- unique(threeyearpred,fiveyearpred)
controls <- controls[!controls %in% c("yr2019","yr2018")]
controls <- paste(controls,collapse = "+")
olsformula <- as.formula(paste0("default_probability~",controls,"|zip_year|0|zip"))
regs <- list()
regs[[1]] <- felm(olsformula,data=x[loan_term==36 & loanyear>=2017])
regs[[2]] <- felm(olsformula,data=x[loan_term==60 & loanyear>=2017])
.printtable(regs,column.labels = c("3 year loans","5 year loans"))
##
## ==================================================
## Dependent variable:
## ----------------------------
## 3 year loans 5 year loans
## (1) (2)
## --------------------------------------------------
## age 0.002*** 0.004***
## (0.0001) (0.0002)
## ficoby100 -0.067*** -0.123***
## (0.001) (0.001)
## dti 0.002*** 0.003***
## (0.00003) (0.00003)
## age2 -0.00002*** -0.00003***
## (0.00000) (0.00000)
## log_crhistorydays -0.016*** -0.029***
## (0.0004) (0.001)
## log_noinq6months 0.031*** 0.051***
## (0.0005) (0.001)
## collegedegree -0.028*** -0.041***
## (0.001) (0.001)
## advanceddegree -0.032*** -0.046***
## (0.001) (0.001)
## creditcardutilization 0.0003*** 0.0005***
## (0.00001) (0.00001)
## debtconsolidation -0.023*** -0.039***
## (0.001) (0.001)
## referral 0.005*** 0.005***
## (0.001) (0.001)
## log_income -0.013*** -0.012***
## (0.001) (0.001)
## computer -0.006*** -0.008***
## (0.0005) (0.001)
## log_totalbalance -0.003*** -0.007***
## (0.0002) (0.0003)
## ccbalancetoincome -0.001 -0.036***
## (0.002) (0.002)
## --------------------------------------------------
##
## Observations 119,697 201,088
## Adjusted R2 0.323 0.309
## ==================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
##