library(tidyverse)
library(rpart)
library(faraway)
library(maptree) #Loading packages
library(tree)
The data used in this analysis contains a real account of data on customers of a credit card company in Sioux Falls, SD. It has 18 predictors and a target variable labeled Bad which indicates a customer did not pay his/her bill and is now seriously delinquent or in default. Bad is in two levels (0 & 1) with 0 representing a customer that is not in default and 1 denoting a customer that is in default. The predictors were collected at time t and the response variable was collected at time t+6. My goal for this study was to build two types of models, logistic regression and MARS and compare their predictive power in terms of predicting the likelihood of a customer defaulting on a credit card account.
#getwd()
setwd("D:/R-STUDIO/STAT 515/MARS")
df_mars<-read.csv(file = "customerretentionMARS.csv",na.strings = "NULL") #Reading the data into R
DT::datatable(df_mars,caption = "Customer Retension Dataset")
SmartEDA::ExpData(df_mars,2) #Computing count and percentage of missing values in each variable
## Index Variable_Name Variable_Type Sample_n Missing_Count
## 1 1 DebtDimId integer 6237 0
## 2 2 Months_On_Book integer 6237 0
## 3 3 Credit_Limit character 6237 0
## 4 4 Opening_Balance character 6237 0
## 5 5 Ending_Balance character 6237 0
## 6 6 Over_limit_Amount integer 6237 0
## 7 7 Actual_Min_Pay_Due integer 6237 0
## 8 8 Total_Min_Pay_Due integer 6237 0
## 9 9 Net_Payments_During_Cycle character 6237 0
## 10 10 Net_Purchases_During_Cycle integer 6237 0
## 11 11 Net_Cash_Advances_During_Cycle integer 6237 0
## 12 12 Net_Premier_Fees_Billed_During_C integer 6237 0
## 13 13 Net_Behavior_Fees_Billed_During integer 6237 0
## 14 14 Net_Concessions_Billed_During_Cy integer 6237 0
## 15 15 Quarterly_Fico_Score integer 6237 0
## 16 16 Behavior_Score integer 6237 0
## 17 17 Good_Customer_Score integer 5918 319
## 18 18 Utility numeric 6237 0
## 19 19 Bad integer 6237 0
## Per_of_Missing No_of_distinct_values
## 1 0.000 6230
## 2 0.000 143
## 3 0.000 39
## 4 0.000 722
## 5 0.000 723
## 6 0.000 92
## 7 0.000 38
## 8 0.000 116
## 9 0.000 404
## 10 0.000 407
## 11 0.000 51
## 12 0.000 71
## 13 0.000 46
## 14 0.000 37
## 15 0.000 358
## 16 0.000 138
## 17 0.051 384
## 18 0.000 5678
## 19 0.000 2
Some of the predictor variables had values separated by strings. The strings in such values were removed and the variables converted to numeric forms. The predictor variable, Good_Customer_Score had 319 NA’s. Rows with such observations were removed from the data. Some duplicated rows in the data were also removed. The final dataset after the processing stage had 5909 rows with 18 columns.
df_mars$Credit_Limit<-as.numeric(gsub(",", "", df_mars$Credit_Limit))#Removing the string "," from the values in the variable Credit_Limit
df_mars$Opening_Balance<-as.numeric(gsub(",", "", df_mars$Opening_Balance))#Removing the string "," from the values in the variable Opening_Balance
df_mars$Ending_Balance<-as.numeric(gsub(",", "", df_mars$Ending_Balance)) #Removing the string "," from the values in the variable Ending_Balance
df_mars$Net_Payments_During_Cycle<-as.numeric(df_mars$Net_Payments_During_Cycle)#The variable Net_Payments_During_Cycle is character, converting it to numeric
dplyr::glimpse(df_mars[which(duplicated(df_mars$DebtDimId)),]) #Checking for duplicate rows in the data using customer Id. Seven records are duplicated in the data
## Rows: 7
## Columns: 19
## $ DebtDimId <int> 1695646, 1695646, 1695646, 1827155, 1~
## $ Months_On_Book <int> 119, 119, 119, 122, 122, 122, 7
## $ Credit_Limit <dbl> 250, 250, 250, 350, 350, 350, 250
## $ Opening_Balance <dbl> -70, -70, -70, 276, 276, 276, 187
## $ Ending_Balance <dbl> -89, -89, -89, 345, 345, 345, 77
## $ Over_limit_Amount <int> 0, 0, 0, 0, 0, 0, 0
## $ Actual_Min_Pay_Due <int> 0, 0, 0, 25, 25, 25, 25
## $ Total_Min_Pay_Due <int> 0, 0, 0, 25, 25, 25, 25
## $ Net_Payments_During_Cycle <dbl> 25, 25, 25, 100, 100, 100, 158
## $ Net_Purchases_During_Cycle <int> 0, 0, 0, 113, 113, 113, 70
## $ Net_Cash_Advances_During_Cycle <int> 0, 0, 0, 0, 0, 0, 0
## $ Net_Premier_Fees_Billed_During_C <int> 6, 6, 6, 51, 51, 51, 7
## $ Net_Behavior_Fees_Billed_During <int> 0, 0, 0, 5, 5, 5, 0
## $ Net_Concessions_Billed_During_Cy <int> 0, 0, 0, 0, 0, 0, 29
## $ Quarterly_Fico_Score <int> 678, 678, 678, 682, 682, 682, 418
## $ Behavior_Score <int> 677, 677, 677, 656, 656, 656, 594
## $ Good_Customer_Score <int> 996, 996, 996, 701, 701, 701, 832
## $ Utility <dbl> -0.3560000, -0.3560000, -0.3560000, 0~
## $ Bad <int> 0, 0, 0, 0, 0, 0, 1
df_mars<-df_mars[which(!duplicated(df_mars$DebtDimId)),] #Removing duplicated rows from the data
summary(df_mars[,-1]) #Computing summary statistics on the data
## Months_On_Book Credit_Limit Opening_Balance Ending_Balance
## Min. : 6.00 Min. : 200.0 Min. :-320.0 Min. :-559.0
## 1st Qu.: 12.00 1st Qu.: 250.0 1st Qu.: 170.0 1st Qu.: 162.2
## Median : 22.00 Median : 300.0 Median : 251.0 Median : 240.0
## Mean : 31.99 Mean : 338.1 Mean : 249.3 Mean : 240.2
## 3rd Qu.: 43.00 3rd Qu.: 350.0 3rd Qu.: 323.0 3rd Qu.: 312.0
## Max. :164.00 Max. :2500.0 Max. :1893.0 Max. :1969.0
##
## Over_limit_Amount Actual_Min_Pay_Due Total_Min_Pay_Due
## Min. : 0.000 Min. : 0.00 Min. : 0.00
## 1st Qu.: 0.000 1st Qu.:20.00 1st Qu.: 20.00
## Median : 0.000 Median :25.00 Median : 25.00
## Mean : 3.028 Mean :21.85 Mean : 24.88
## 3rd Qu.: 0.000 3rd Qu.:25.00 3rd Qu.: 25.00
## Max. :192.000 Max. :60.00 Max. :212.00
##
## Net_Payments_During_Cycle Net_Purchases_During_Cycle
## Min. :-117.00 Min. :-176.00
## 1st Qu.: 25.00 1st Qu.: 0.00
## Median : 50.00 Median : 15.00
## Mean : 74.40 Mean : 47.93
## 3rd Qu.: 91.25 3rd Qu.: 59.00
## Max. : 985.00 Max. : 775.00
## NA's :2
## Net_Cash_Advances_During_Cycle Net_Premier_Fees_Billed_During_C
## Min. : 0.000 Min. :-11.00
## 1st Qu.: 0.000 1st Qu.: 6.00
## Median : 0.000 Median : 7.00
## Mean : 1.161 Mean : 12.23
## 3rd Qu.: 0.000 3rd Qu.: 9.00
## Max. :411.000 Max. :159.00
##
## Net_Behavior_Fees_Billed_During Net_Concessions_Billed_During_Cy
## Min. :-30.000 Min. :-28.0000
## 1st Qu.: 2.000 1st Qu.: 0.0000
## Median : 4.000 Median : 0.0000
## Mean : 3.592 Mean : 0.8372
## 3rd Qu.: 5.000 3rd Qu.: 0.0000
## Max. : 63.000 Max. :100.0000
##
## Quarterly_Fico_Score Behavior_Score Good_Customer_Score Utility
## Min. : 0.0 Min. :558.0 Min. : 0.0 Min. :-2.2376
## 1st Qu.:543.0 1st Qu.:644.0 1st Qu.: 707.0 1st Qu.: 0.5460
## Median :597.0 Median :661.0 Median : 738.0 Median : 0.8364
## Mean :587.4 Mean :659.6 Mean : 769.7 Mean : 0.7143
## 3rd Qu.:642.0 3rd Qu.:677.0 3rd Qu.: 811.0 3rd Qu.: 0.9524
## Max. :810.0 Max. :721.0 Max. :1000.0 Max. : 1.6396
## NA's :319
## Bad
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.1186
## 3rd Qu.:0.0000
## Max. :1.0000
##
Row_na<-apply(df_mars,1,function(x){any(is.na(x))})
df_mars<-df_mars[!Row_na,] #Removing rows with NA's from the dataset
summary(df_mars[,-1]) #Checking if all NA's are removed
## Months_On_Book Credit_Limit Opening_Balance Ending_Balance
## Min. : 6.0 Min. : 200.0 Min. :-320 Min. :-513.0
## 1st Qu.: 12.0 1st Qu.: 250.0 1st Qu.: 174 1st Qu.: 165.0
## Median : 22.0 Median : 300.0 Median : 252 Median : 242.0
## Mean : 32.1 Mean : 338.6 Mean : 252 Mean : 241.7
## 3rd Qu.: 43.0 3rd Qu.: 350.0 3rd Qu.: 324 3rd Qu.: 313.0
## Max. :164.0 Max. :2500.0 Max. :1893 Max. :1969.0
## Over_limit_Amount Actual_Min_Pay_Due Total_Min_Pay_Due
## Min. : 0.000 Min. : 0.00 Min. : 0.00
## 1st Qu.: 0.000 1st Qu.:20.00 1st Qu.: 20.00
## Median : 0.000 Median :25.00 Median : 25.00
## Mean : 3.097 Mean :21.88 Mean : 24.97
## 3rd Qu.: 0.000 3rd Qu.:25.00 3rd Qu.: 25.00
## Max. :192.000 Max. :60.00 Max. :212.00
## Net_Payments_During_Cycle Net_Purchases_During_Cycle
## Min. :-117.00 Min. :-176.00
## 1st Qu.: 25.00 1st Qu.: 0.00
## Median : 50.00 Median : 14.00
## Mean : 73.78 Mean : 47.31
## 3rd Qu.: 90.00 3rd Qu.: 59.00
## Max. : 985.00 Max. : 775.00
## Net_Cash_Advances_During_Cycle Net_Premier_Fees_Billed_During_C
## Min. : 0.000 Min. : 0.00
## 1st Qu.: 0.000 1st Qu.: 6.00
## Median : 0.000 Median : 7.00
## Mean : 1.135 Mean : 12.32
## 3rd Qu.: 0.000 3rd Qu.: 9.00
## Max. :411.000 Max. :159.00
## Net_Behavior_Fees_Billed_During Net_Concessions_Billed_During_Cy
## Min. :-27.000 Min. :-28.0000
## 1st Qu.: 2.000 1st Qu.: 0.0000
## Median : 4.000 Median : 0.0000
## Mean : 3.632 Mean : 0.8387
## 3rd Qu.: 5.000 3rd Qu.: 0.0000
## Max. : 63.000 Max. :100.0000
## Quarterly_Fico_Score Behavior_Score Good_Customer_Score Utility
## Min. : 0.0 Min. :580.0 Min. : 0.0 Min. :-1.7085
## 1st Qu.:544.0 1st Qu.:644.0 1st Qu.: 707.0 1st Qu.: 0.5544
## Median :598.0 Median :661.0 Median : 738.0 Median : 0.8392
## Mean :588.4 Mean :659.7 Mean : 769.7 Mean : 0.7174
## 3rd Qu.:642.0 3rd Qu.:677.0 3rd Qu.: 811.0 3rd Qu.: 0.9533
## Max. :810.0 Max. :721.0 Max. :1000.0 Max. : 1.6396
## Bad
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.1188
## 3rd Qu.:0.0000
## Max. :1.0000
df_log<-df_mars #Duplicating the dataset for the logistic regression
corr<-round(cor(df_mars[,-c(1,19)]),1)#Removing the DebtDimId and the target variable from the dataset and computing correlation plot on them
ggcorrplot::ggcorrplot(corr, hc.order = TRUE, outline.col = "white")
Figure 1: Correlation plot on the predictors
Figure 1 shows that there is strong correlation among some of the predictor variables. Variables with correlation absolute value greater than 0.75 were removed from the data.
df_mars<-df_mars[,-1]# Removing the the info variable "DebtDimId" from the data
df<-df_mars[,!names(df_mars) %in% 'Bad']
Matrix<- cor(df) #Computing correlation matrix on the input variables
Matrix[upper.tri(Matrix)]<-0
diag(Matrix)<-0 #Setting the upper triangle and perfectly correlated variables to 0
df_new<-df[,!apply(Matrix,2,function(x) any(x > abs(0.75)))]#Removing any input variable with correlation absolute value greater than 0.75
Histogram was generated to visually inspect the distribution of the variables. In Figure 2, most of the variables are not normally distributed. The distribution of Behavior_Score and Quarterly_Fico_Score appears normally distributed.
plotHist <- function(columns,bin,colours){#Exploratory analysis on the data set
par(mfrow = c(3,5)) #Histogram plots to visualize the distribution of the 15 variables in the data set.
for (i in columns) {
hist(df_new[,i], main = paste("Histogram of ", names(df_new)[i]),
nclass = bin, las = 1, col = colours,
xlab = paste(names(df_new)[i]))
}
}
plotHist(c(1:15), rep(5,15), "brown")
Figure 2: Histogram of the variables in the dataset
It is observed in 3 that the proportion of the amount of loan requested by customers likely to default (Bad loan) and those who did not default (Good loan) is equal. However, interest rate varies among these two groups, with that of customers likely to default being higher than that of customers who did not default. Revolving credit balance divided by total revolving credit limit for both bad and good loan is equal with that of bad loan being slightly high. Number of months of loan payment for a loan do not affect whether a customer will default or not as the distribution is equal among the two groups of customers. Current credit policy and months since a customer has had a loan is the same among the two groups which confirms what was observed in the summary statistics. Also, delinquencies in the last 2 years and number of public records for both groups are the same. However, extreme observations appear in that of the good customers. Monthly income of both groups appears to be equal both there are number of good customers with higher monthly income compared to the others.
df_new$Bad<-df_mars$Bad
df_new %>%
select(Bad, 1:15) %>% #Boxplots to visualize the distribution of the 15 variables in the data set.
gather(Measure, Value, -Bad) %>%
ggplot(aes(x = factor(Bad)
, y = Value)) +
geom_boxplot() + theme_bw()+stat_boxplot(geom='errorbar',
linetype=1, width=0.5)+
facet_wrap(~Measure
, scales = "free_y")
Figure 3: Boxplot of the input variables Vs the target
The nature of MARS features breaks predictor into two distinct groups and models linear relationships between the predictor and the outcome in each group. Due to its robustness in terms of handling input variables, datasets used for MARS models needs little pre-processing such that transformation and filtering of input variables are not necessary. Even though relationship among variables such as their strong correlation with one another, do not significantly affect the model’s performance, they can complicate the model’s interpretation. To circumvent this shortcoming, correlation matrix was computed on the predictors and those with correlation absolute value greater than 0.75 were removed from the data. Total of 15 out of 18 variables entered the model. However, the model thinned the predictors and retained only 6 of them for the prediction.
Data splitting
The MARS model was built with the data containing the predictors and the target variable. The data set was split randomly at 60% (3546 observations) as the train data and 40% (2363) as the test data.
set.seed(551)
df_new$Bad<-df_mars$Bad
target<-df_new$Bad
predictors<-df_new[,-16]
trainRows<-caret::createDataPartition(target,p=.60,list = FALSE) #Randomly spliiting the dataset into 40% test data and 60% train data
trainPredictors<-predictors[trainRows,]
trainTarget<-target[trainRows]
testPredictors<-predictors[-trainRows,]
testTarget<-target[-trainRows]
The MARS model was built on the train data using the earth package, and the model performance evaluated on both the train and test data.The performance of the model on both the train and test data was statistically significant as observed by its low RMSE value on both the train (0.3004240) and test data (0.3060463). This was also true when their MAE (0.1797160 and 0.1856793 for the train and test data, respectively) were computed. Even though its low Mean Absolute Error rate do not tell the model’s performance, it suggests that the MARS model is great at prediction. This was confirmed by computing KS and ROC curve and generating the area under each curve (auc). As shown in Fig.3, the model’s performance rate on both the train and test data appears statistically significant as the value of the Area under their ROC curve (0.78 and 0.77 for the train and test data, respectively) is close to 1. Their computed AUROC curves are also comparable, indicating the model do not overfit the test data.
marsfit<-earth::earth(trainPredictors,trainTarget,glm=list(family=binomial),degree=1)#Performing the MARS model using the earth package.
summary(marsfit) #Using the summary() to generate more extensive output from the model
## Call: earth(x=trainPredictors, y=trainTarget, glm=list(family=binomial),
## degree=1)
##
## GLM coefficients
## trainTarget
## (Intercept) 0.235852552
## h(384-Ending_Balance) 0.002469206
## h(18-Net_Purchases_During_Cycle) 0.017216300
## h(Net_Concessions_Billed_During_Cy-11) -0.023442083
## h(Quarterly_Fico_Score-501) -0.005847568
## h(Quarterly_Fico_Score-645) 0.001532795
## h(640-Behavior_Score) 0.027278473
## h(Behavior_Score-640) -0.031616337
## h(627-Good_Customer_Score) -0.052380625
## h(Good_Customer_Score-627) -0.061356810
## h(Good_Customer_Score-644) 0.048863162
## h(Good_Customer_Score-701) 0.007931057
##
## GLM (family binomial, link logit):
## nulldev df dev df devratio AIC iters converged
## 2580.15 3545 2159.06 3534 0.163 2183 9 1
##
## Earth selected 12 of 18 terms, and 6 of 15 predictors
## Termination condition: RSq changed by less than 0.001 at 18 terms
## Importance: Behavior_Score, Quarterly_Fico_Score, Good_Customer_Score, ...
## Number of terms at each degree of interaction: 1 11 (additive model)
## Earth GCV 0.09151828 RSS 320.3276 GRSq 0.1240041 RSq 0.1348431
marsTest_pred<-predict(marsfit,newdata=testPredictors,type="response")#Validating the model on the test data
testTarget_testPred<-data.frame(obs=testTarget,
pred=marsTest_pred)
testTarget_testPred %>%
rename(
pred=trainTarget,
) %>%
caret::defaultSummary()
## RMSE Rsquared MAE
## 0.3060463 0.1100184 0.1856793
Table 1 shows the input variables selected by the earth function during the model building as significant in the prediction. Their overall importance is scaled between 0 and 100.
sel_vars<-caret::varImp(marsfit) #Input variables selected by the MARS model
kableExtra::kable(sel_vars,caption = "Table.1: Earth variable importance") %>%
kableExtra::kable_classic(html_font="Times New Roman")
| Overall | |
|---|---|
| Behavior_Score | 100.000000 |
| Quarterly_Fico_Score | 38.509440 |
| Good_Customer_Score | 30.692947 |
| Ending_Balance | 15.862359 |
| Net_Purchases_During_Cycle | 10.860900 |
| Net_Concessions_Billed_During_Cy | 1.652936 |
marsTrain_pred<-predict(marsfit,newdata=trainPredictors,type="response")#Validating the model on the train data
trainTarget_trainPred<-data.frame(obs=trainTarget,
pred=marsTrain_pred)
trainTarget_trainPred %>%
rename(
pred=trainTarget,
) %>%
caret::defaultSummary()
## RMSE Rsquared MAE
## 0.3004240 0.1357135 0.1797160
df_marsTrain_pred<-tibble(trainTarget,marsTrain_pred)#Generating the scoring data for the MARS model on the train data
sort_df_marsTrain_pred<-(df_marsTrain_pred[order(df_marsTrain_pred[,2],decreasing = T),])
DT::datatable(sort_df_marsTrain_pred,filter = "top",caption="Table.2: Predicted verses Actual values in the train Data (MARS Model)")
df_marsTest_pred<-tibble(testTarget,marsTest_pred)#Generating the scoring data for the MARS model on the test data
sort_mars_Test_pred<-(df_marsTest_pred[order(df_marsTest_pred[,2],decreasing = T),])
DT::datatable(sort_mars_Test_pred,filter = "top",caption="Table.3: Predicted verses Actual values in the test Data (MARS Model)")
#write.csv(sort_mars_target_pred,"MARS_Scoring.csv")
if(require(ROCR)){trainpred<-prediction(sort_df_marsTrain_pred$marsTrain_pred,sort_df_marsTrain_pred$trainTarget)}
if(require('ROCR')){trainperf<-performance(trainpred,"tpr","fpr")}#Generate a plot of True Positive Rate vs False Positive Rate from the MARS model on the train data
marsTrain_auc<-performance(trainpred, measure = "auc") #Computes the area under the ROC curve
print(paste(marsTrain_auc@y.values,"AUC ON TRAIN DATA"))
## [1] "0.783695883983792 AUC ON TRAIN DATA"
if(require(ROCR)){testpred<-prediction(sort_mars_Test_pred$marsTest_pred,sort_mars_Test_pred$testTarget)}
if(require('ROCR')){testperf<-performance(testpred,"tpr","fpr")}#Generate a plot of True Positive Rate vs False Positive Rate from the MARS model on the test data
mars_test_auc<-performance(testpred, measure = "auc") #Computes the area under the ROC curve
print(paste(mars_test_auc@y.values,"AUC ON TEST DATA"))
## [1] "0.768553545928894 AUC ON TEST DATA"
The performance of the model on both the train and test data was statistically significant as observed by its low RMSE value on both the train (0.3004240) and test data (0.3060463). This was also true when their MAE (0.1797160 and 0.1856793 for the train and test data, respectively) were computed. Even though its low Mean Absolute Error rate do not tell the model’s performance, it suggests that the MARS model is great at prediction. ROC curves for both the train and test data together with the area under each curve (auc) were computed. As shown in figure 4, the model’s performance rate on both the train and test data appears statistically significant as the value of the Area under their ROC curve (0.78 and 0.77 for the train and test data, respectively) is close to 1. Their computed AUROC curves are comparable, indicating the model do not overfit the test data.
plot(unlist(slot(trainperf,"x.values")),unlist(slot(trainperf,"y.values")),ylab="True Positive Rate",
xlab="False Positive Rate",type="l",col="green", main="Receiver Operating Characteristic (ROC) Plot on the MARS Model")
points(unlist(slot(testperf,"x.values")),unlist(slot(testperf,"y.values")),ylab="True Positive Rate",xlab="False Positive Rate",type="l",col="blue")
abline(a=0,b=1,col="red")
legend("topleft", legend=c("TRAIN_AUC","Test_AUC","Baseline_AUC"),
col=c("green","blue","red"), lty=1:3, cex=0.6)
legend("bottomright", legend=c("Train_AUC=0.78","Test_AUC=0.77"),
col=c("green","blue"), lty=1:2, cex=0.6)
Figure 4: Receiver Operating Characteristic (ROC) Plot on the MARS Model
if(require(gains)){Gains<-gains(testTarget,marsTest_pred)}#Computing gains for the model prediction on the test data
Gains
## Depth Cume Cume Pct Mean
## of Cume Mean Mean of Total Lift Cume Model
## File N N Resp Resp Resp Index Lift Score
## -------------------------------------------------------------------------
## 10 236 236 0.36 0.36 29.8% 298 298 0.40
## 20 236 472 0.22 0.29 47.9% 181 240 0.23
## 30 236 708 0.22 0.26 66.3% 185 221 0.16
## 40 237 945 0.13 0.23 77.0% 106 192 0.12
## 50 236 1181 0.07 0.20 82.6% 57 165 0.09
## 60 236 1417 0.07 0.18 88.3% 57 147 0.07
## 70 237 1654 0.05 0.16 92.6% 42 132 0.05
## 80 236 1890 0.06 0.14 97.2% 46 121 0.04
## 90 236 2126 0.02 0.13 98.9% 18 110 0.02
## 100 237 2363 0.01 0.12 100.0% 11 100 0.01
From figure 5 and the Gains table, it is observed that Depth of file of 70 which has cumulative observation of 1654 out of the 2363 has significant cumulative percentage of 92.60% out of the total response. This implies that in case of predicting customers with the probability of defaulting, if we target the first 70% of the customers (1653) from the predictions made by the model, approximately 92.60% percent of them will default. Thus, approximately 1548 out of the 1654 customers will not pay their bills.
par(mfrow=c(1,4))#A plot of the Mean Response and the Deciles
plot(Gains,ylim = c(0,0.1),xlim=c(0,100))
barplot(Gains$mean.resp, names.arg = Gains$depth, xlab = "Percentile", #A barplot of the Means Response and the Deciles
ylab = "Mean Response", main = "Decile-wise lift chart")
barplot(Gains$cume.pct.of.total, names.arg = Gains$depth, xlab = "Percentile", #A barplot of the cummulative percentage of the total response vs the deciles
ylab = "Cum(%_Total_Response)", main = "Decile-wise lift chart")
plot(Gains$lift, names.arg = Gains$depth, xlab = "Depth(Bucket)", #A barplot of the cummulative percentage of the total response vs the deciles
ylab = "Lift Index",type="l", main = "MARS Lift Plot") #A plot of Lift Index vs Depth of File
Figure 5: Plots on the MARS Gains table
set.seed(551)
rfmodel<-randomForest::randomForest(df_log[,2:18], df_log[,19],ntree=1000,importance=TRUE)
kableExtra::kable(randomForest::importance(rfmodel),caption="Table.4: A comparison of variable importance magnitude for differing input variables") %>%
#Selecting predictors significant in the prediction using randomForest
kableExtra::kable_classic(html_font="Times New Roman")
| %IncMSE | IncNodePurity | |
|---|---|---|
| Months_On_Book | 23.569085 | 37.839826 |
| Credit_Limit | 24.873484 | 15.817112 |
| Opening_Balance | 35.407424 | 47.427643 |
| Ending_Balance | 40.620424 | 43.189630 |
| Over_limit_Amount | 15.989290 | 9.422559 |
| Actual_Min_Pay_Due | 11.309938 | 4.639619 |
| Total_Min_Pay_Due | 21.416481 | 14.117881 |
| Net_Payments_During_Cycle | 23.722479 | 35.765979 |
| Net_Purchases_During_Cycle | 21.688843 | 31.309011 |
| Net_Cash_Advances_During_Cycle | 4.466423 | 1.568489 |
| Net_Premier_Fees_Billed_During_C | 13.142265 | 22.726754 |
| Net_Behavior_Fees_Billed_During | 28.743256 | 17.102198 |
| Net_Concessions_Billed_During_Cy | 8.345644 | 4.139720 |
| Quarterly_Fico_Score | 15.871172 | 65.530764 |
| Behavior_Score | 59.088187 | 81.038866 |
| Good_Customer_Score | 40.058068 | 63.997880 |
| Utility | 30.982161 | 49.795784 |
Random Forest was used to select significant input variables for the logistic regression. Table 2 shows the order of importance of the input variables from greatest impact to least impact, from top to bottom. %IncMSE is the increase in Mean Squared Error of the predictions as a given input variable is randomly permuted. That is, %IncMSE associated with a given variable is an indication of the percent decrease of the model’s accuracy as that given variable is left out. The higher the value, the higher the importance of the associated variable. IncNodePurity measures variable importance based on Gini impurity index used for the calculating the splits in trees. The higher the value of mean decrease accuracy or mean decrease gini score, the higher the importance of the variable to the model. Since IncNodePurity is often biased, %IncMSE was used as the criterion in the feature selection. The top 5 variables Figure 6 with MSE greater than 35% were selected for the model building process.
feat_imp_df <- randomForest::importance(rfmodel) %>%
data.frame() %>%
mutate(feature = row.names(.))
# plot dataframe
A<-ggplot2::ggplot(feat_imp_df, aes(x = reorder(feature,X.IncMSE),
y =X.IncMSE)) +
geom_bar(stat='identity') +
coord_flip() +
theme_classic() +
labs(
x = "Feature",
y = "Importance",
title = "Feature Importance: <X.IncMSE>"
)
# plot dataframe
B<-ggplot2::ggplot(feat_imp_df, aes(x = reorder(feature,IncNodePurity),
y = IncNodePurity)) +
geom_bar(stat='identity') +
coord_flip() +
theme_classic() +
labs(
x = "Feature",
y = "Importance",
title = "Feature Importance: <IncNodePurity>"
)
gridExtra::grid.arrange(A,B, ncol = 2)
Figure 6: Random Forest Feature Selection
Data Splitting
The logistic model was built with the data containing the 5 input variables and the target variable, Bad. The data set was split randomly at 60% (3546 observations) as the train data and 40% (2363) as the test data.
df_subset<-select(df_log,DebtDimId,Behavior_Score,Ending_Balance,Good_Customer_Score,Opening_Balance,Utility,Bad)
bin_vars<-Rprofet::BinProfet(dat=df_subset,'DebtDimId',varcol = c(2:6),target = 'Bad')#Bucketing the predictors with the function BinProfet before model building
Prior to splitting the data, the 5 input variables were binned into levels. Only Utility was re-bucket by smoothening using the function WOE_custom() from Rprofet package. Plots of the binned input variables are attached.
bin_plots<-function(x,y){
Rprofet::WOEplotter(dat=bin_vars,var = x,target=y)
}
A=bin_plots('Behavior_Score_Bins','Bad')
B=bin_plots('Good_Customer_Score_Bins','Bad')
C=bin_plots('Opening_Balance_Bins','Bad')
D=bin_plots('Ending_Balance_Bins','Bad')
E=bin_plots('Utility_Bins','Bad')#Generating plots for the binned predictors to check if there is the need to prune any before they enter the model
#Pruning or smoothening the binned predictors
bin_vars$Utility_Bins<-Rprofet::WOE_custom(df_subset,'Utility', target='Bad', breaks=c(-Inf,0,.1,.50,.7,.90,Inf), right_bracket = F, color = "turquoise4")
bin_vars$Behavior_Score_Bins<-Rprofet::WOE_custom(df_subset,'Behavior_Score', target='Bad', breaks=c(550,635,650,660,670,685,Inf), right_bracket = F, color = "turquoise4")
bin_vars$Ending_Balance_Bins<-Rprofet::WOE_custom(df_subset,'Ending_Balance', target='Bad', breaks=c(-Inf,70,180,240,280,350,Inf), right_bracket = F, color = "turquoise4")
bin_vars$Good_Customer_Score_Bins<-Rprofet::WOE_custom(df_subset,'Good_Customer_Score', target='Bad', breaks=c(0,700,720,745,790,900,Inf), right_bracket = F, color = "turquoise4")
bin_vars$Opening_Balance_Bins<-Rprofet::WOE_custom(df_subset,'Opening_Balance', target='Bad', breaks=c(-Inf,100,200,250,300,350,Inf), right_bracket = F, color = "turquoise4")
step_model=lm(Bad ~ Behavior_Score_Bins + Ending_Balance_Bins + Good_Customer_Score_Bins +
Opening_Balance_Bins + Utility_Bins,data=bin_vars[,-1])
Stepwise_model=step(step_model,direction="both",test="F")#Stepwise selection was performed on the variables to select those that appear significant at 0.5 significant level
## Start: AIC=-13922
## Bad ~ Behavior_Score_Bins + Ending_Balance_Bins + Good_Customer_Score_Bins +
## Opening_Balance_Bins + Utility_Bins
##
## Df Sum of Sq RSS AIC F value
## - Utility_Bins 5 0.2697 555.49 -13929 0.5716
## - Opening_Balance_Bins 5 0.4614 555.68 -13927 0.9778
## <none> 555.22 -13922
## - Ending_Balance_Bins 5 1.0431 556.26 -13921 2.2105
## - Good_Customer_Score_Bins 5 4.7441 559.96 -13882 10.0535
## - Behavior_Score_Bins 5 23.6068 578.83 -13686 50.0265
## Pr(>F)
## - Utility_Bins 0.7218
## - Opening_Balance_Bins 0.4297
## <none>
## - Ending_Balance_Bins 0.0505 .
## - Good_Customer_Score_Bins 0.000000001343 ***
## - Behavior_Score_Bins < 0.00000000000000022 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=-13929.13
## Bad ~ Behavior_Score_Bins + Ending_Balance_Bins + Good_Customer_Score_Bins +
## Opening_Balance_Bins
##
## Df Sum of Sq RSS AIC F value
## - Opening_Balance_Bins 5 0.4395 555.93 -13934 0.9318
## <none> 555.49 -13929
## - Ending_Balance_Bins 5 1.2967 556.79 -13925 2.7489
## + Utility_Bins 5 0.2697 555.22 -13922 0.5716
## - Good_Customer_Score_Bins 5 5.4130 560.90 -13882 11.4752
## - Behavior_Score_Bins 5 25.8498 581.34 -13670 54.7997
## Pr(>F)
## - Opening_Balance_Bins 0.4590
## <none>
## - Ending_Balance_Bins 0.0174 *
## + Utility_Bins 0.7218
## - Good_Customer_Score_Bins 0.00000000004793 ***
## - Behavior_Score_Bins < 0.00000000000000022 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=-13934.45
## Bad ~ Behavior_Score_Bins + Ending_Balance_Bins + Good_Customer_Score_Bins
##
## Df Sum of Sq RSS AIC F value
## <none> 555.93 -13934
## - Ending_Balance_Bins 5 1.4278 557.36 -13929 3.0270
## + Opening_Balance_Bins 5 0.4395 555.49 -13929 0.9318
## + Utility_Bins 5 0.2479 555.68 -13927 0.5253
## - Good_Customer_Score_Bins 5 5.5367 561.47 -13886 11.7382
## - Behavior_Score_Bins 5 28.1638 584.09 -13652 59.7088
## Pr(>F)
## <none>
## - Ending_Balance_Bins 0.009863 **
## + Opening_Balance_Bins 0.458997
## + Utility_Bins 0.757315
## - Good_Customer_Score_Bins 0.00000000002582 ***
## - Behavior_Score_Bins < 0.00000000000000022 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#The variable Opening_Balance_Bins and Utility_Bins appears insignificant at 0.5 sig.level. These removed from the final data used for building the model
The logistic regression model was built on the train data using the glm () function, and the model performance evaluated on both the train and test data. The performance of the model on both the train and test data was statistically significant as observed by its low RMSE value on both the train (0.3054019) and test data (0.30960460). This was also true when their MAE (0.1860793 and 0.19063831 for the train and test data, respectively) were computed.
logit_target<-bin_vars$Bad#Setting the target variable aside from the data
logit_predictors<-bin_vars[,-c(1:2,6:7)]#Preparing the binned predictors for the logistic model by removing
#the DebtDimId, the target "Bad" and the predictors that appeared insignificant at 0.5 sig. level
set.seed(551) #Randomly splitting the data into 40% test data and 60% train data
trainingRows<-caret::createDataPartition(logit_target,p=.6,list=F)
trainX<-logit_predictors[trainingRows,]
trainY<-logit_target[trainingRows]
testX<-logit_predictors[-trainingRows,]
testY<-logit_target[-trainingRows]
logit_Fit<-glm(trainY ~.,data=trainX,family = "binomial") #Logistic model with the function glm performed on the train data
summary(logit_Fit)#summary statistics from the logistic model
##
## Call:
## glm(formula = trainY ~ ., family = "binomial", data = trainX)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.3415 -0.5178 -0.3511 -0.2115 3.0473
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) 0.3778 0.2702 1.398
## Behavior_Score_Bins[635,650) -0.6532 0.1472 -4.437
## Behavior_Score_Bins[650,660) -1.0784 0.1730 -6.233
## Behavior_Score_Bins[660,670) -1.5599 0.1984 -7.863
## Behavior_Score_Bins[670,685) -1.9431 0.2189 -8.877
## Behavior_Score_Bins[685,Inf) -2.6495 0.3412 -7.766
## Ending_Balance_Bins[180,240) -0.6941 0.2742 -2.531
## Ending_Balance_Bins[240,280) -0.8244 0.2811 -2.933
## Ending_Balance_Bins[280,350) -0.9433 0.2800 -3.368
## Ending_Balance_Bins[350, Inf) -1.1714 0.2919 -4.013
## Ending_Balance_Bins[70,180) -0.6118 0.2915 -2.099
## Good_Customer_Score_Bins[700,720) -0.5745 0.1574 -3.650
## Good_Customer_Score_Bins[720,745) -0.4948 0.1685 -2.937
## Good_Customer_Score_Bins[745,790) -0.5731 0.1911 -2.998
## Good_Customer_Score_Bins[790,900) -1.1900 0.2510 -4.741
## Good_Customer_Score_Bins[900,Inf) -1.7409 0.3426 -5.081
## Pr(>|z|)
## (Intercept) 0.162063
## Behavior_Score_Bins[635,650) 0.00000912797146830 ***
## Behavior_Score_Bins[650,660) 0.00000000045840882 ***
## Behavior_Score_Bins[660,670) 0.00000000000000376 ***
## Behavior_Score_Bins[670,685) < 0.0000000000000002 ***
## Behavior_Score_Bins[685,Inf) 0.00000000000000813 ***
## Ending_Balance_Bins[180,240) 0.011373 *
## Ending_Balance_Bins[240,280) 0.003356 **
## Ending_Balance_Bins[280,350) 0.000756 ***
## Ending_Balance_Bins[350, Inf) 0.00005988904370151 ***
## Ending_Balance_Bins[70,180) 0.035807 *
## Good_Customer_Score_Bins[700,720) 0.000262 ***
## Good_Customer_Score_Bins[720,745) 0.003316 **
## Good_Customer_Score_Bins[745,790) 0.002715 **
## Good_Customer_Score_Bins[790,900) 0.00000213006429703 ***
## Good_Customer_Score_Bins[900,Inf) 0.00000037451860666 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2580.2 on 3545 degrees of freedom
## Residual deviance: 2225.4 on 3530 degrees of freedom
## AIC: 2257.4
##
## Number of Fisher Scoring iterations: 6
The summary statistics of the logistic model shows how the predictors are associated with the target variable. From the output above, it appears that an increase in customers behavior score, customers ending balance and good customers score is associated with a decrease in the chance that a customer will default on his/her account.
logitTrain_pred<-predict(logit_Fit,trainX,type = "response")#validating the model with the train data
df_logitTrain_pred<-data.frame(obs=trainY,
pred=logitTrain_pred)
caret::defaultSummary(df_logitTrain_pred)#The Root Means Square Error (RMSE) and Mean Absolute Error associated with the model
## RMSE Rsquared MAE
## 0.3054019 0.1067739 0.1860793
logit_pred<-predict(logit_Fit,testX,type = "response")#validating the model with the test data
logit_values<-data.frame(obs=testY,
pred=logit_pred)#The Root Means Square Error (RMSE) and Mean Absolute Error associated with the model
caret::defaultSummary(logit_values)
## RMSE Rsquared MAE
## 0.30960460 0.08939299 0.19063831
df_LogisticTrain_pred<-tibble(trainY,logitTrain_pred)#Generating the scoring data for the logistic model on the train data
sort_df_LogisticTrain_pred<-(df_LogisticTrain_pred[order(df_LogisticTrain_pred[,2],decreasing = T),])
DT::datatable(df_marsTrain_pred,filter = "top",caption="Table.2: Predicted verses Actual values in the train Data (Logistic Model)")
logit_results<-tibble(testY,logit_pred)#Generating the scoring data for the logistic model
sort_logit_pred<-(logit_results[order(logit_results[,2],decreasing = T),])
DT::datatable(logit_results,filter = "top",caption="Table.1: Predicted verses Actual values in the test data (Logistic Model)")
if(require('ROCR')){logitTrain_prediction<-prediction(sort_df_LogisticTrain_pred$logitTrain_pred,sort_df_LogisticTrain_pred$trainY)}
if(require('ROCR')){logitTrain_perf<-performance(logitTrain_prediction,"tpr","fpr")}
if(require(ROCR)){logit_prediction<-prediction(sort_logit_pred$logit_pred,sort_logit_pred$testY)}
if(require('ROCR')){logit_perf<-performance(logit_prediction,"tpr","fpr")}
logitTrain_auc<-performance(logitTrain_prediction, measure = "auc") #Computes the area under the ROC curve
print(paste(logitTrain_auc@y.values,"LOGISTIC TRAIN AUC"))
## [1] "0.767847622094263 LOGISTIC TRAIN AUC"
logitTest_auc<-performance(logit_prediction, measure = "auc") #Computes the area under the ROC curve
print(paste(logitTest_auc@y.values,"LOGISTIC TEST AUC"))
## [1] "0.748930717296989 LOGISTIC TEST AUC"
plot(unlist(slot(logitTrain_perf,"x.values")),unlist(slot(logitTrain_perf,"y.values")),
ylab="True Positive Rate",xlab="False Positive Rate",type="l",col="green",main="Receiver Operating Characteristic (ROC) Plot on the Logistic Model")
points(unlist(slot(logit_perf,"x.values")),unlist(slot(logit_perf,"y.values")),
ylab="True Positive Rate",xlab="False Positive Rate",type="l",col="blue")
abline(a=0,b=1,col="red")
legend("topleft", legend=c("Train_AUC","Test_AUC","Baseline_AUC"),
col=c("green","blue", "red"), lty=1:3, cex=0.6)
legend("bottomright", legend=c("LogitTrain_AUC=0.77","LogitTest_AUC=0.75"),
col=c("green","blue"), lty=1:2, cex=0.6)
Figure 7: Receiver Operating Characteristic (ROC) Plot on the Logistic Model
if(require(gains)){logit_Gains<-gains(testY,logit_pred)}#Computing gains for the model prediction on the test data
logit_Gains
## Depth Cume Cume Pct Mean
## of Cume Mean Mean of Total Lift Cume Model
## File N N Resp Resp Resp Index Lift Score
## -------------------------------------------------------------------------
## 10 236 236 0.32 0.32 27.0% 270 270 0.37
## 20 240 476 0.25 0.28 47.9% 206 238 0.25
## 31 259 735 0.19 0.25 65.6% 162 211 0.17
## 40 220 955 0.09 0.21 72.7% 76 180 0.12
## 50 230 1185 0.11 0.19 81.6% 91 163 0.10
## 60 232 1417 0.06 0.17 86.9% 54 145 0.07
## 70 241 1658 0.06 0.16 92.2% 52 131 0.05
## 81 252 1910 0.04 0.14 96.1% 37 119 0.04
## 92 261 2171 0.03 0.13 99.3% 29 108 0.02
## 100 192 2363 0.01 0.12 100.0% 9 100 0.01
From figure @ref(fig:Logit_Gains-Table-Plot) and the Gains table, it is observed that Depth of file of even 70 which has cumulative observation of 1658 out of the 2363 has significant cumulative percentage of 92.2% out of the total response. This implies that in case of predicting customers with the probability of defaulting, if we target the first 70% of the customers (1658) from the predictions made by the model, approximately 92.2% percent of them will default. Thus, approximately 1529 out of the 1658 customers will default on their account.
par(mfrow=c(1,4))#A plot of the Mean Response and the Deciles
plot(logit_Gains,ylim = c(0,0.1),xlim=c(0,100))
barplot(logit_Gains$mean.resp, names.arg = logit_Gains$depth, xlab = "Percentile", #A barplot of the Means Response and the Deciles
ylab = "Mean Response", main = "Decile-wise lift chart")
barplot(logit_Gains$cume.pct.of.total, names.arg = logit_Gains$depth, xlab = "Percentile", #A barplot of the cummulative percentage of the total response vs the deciles
ylab = "Cum(%_Total_Response)", main = "Decile-wise lift chart")
plot(logit_Gains$lift, names.arg = logit_Gains$depth, xlab = "Depth(Bucket)", #A barplot of the cummulative percentage of the total response vs the deciles
ylab = "Lift Index",type="l", main = "Logistic Lift Plot") #A plot of Lift Index vs Depth of File
(#fig:Logit_Gains-Table-Plot)Plots on the logistic Gains table
plot(unlist(slot(testperf, 'x.values')),unlist(slot(testperf,'y.values')),ylab="True Positive Rate",xlab="False Positive Rate",type="l",col="blue",main="Comparing AUROC for Logistics and MARS Model")
points(unlist(slot(logit_perf,"x.values")),unlist(slot(logit_perf,"y.values")),type="l",col="red")
abline(a=0,b=1,col="brown")
legend("topleft", legend=c("MARS Model", "Logistic Model","Random Model"),
col=c("blue","red","brown"), lty=1:3, cex=0.6)
legend("bottomright", legend=c("MARS Model=0.77", "Logistic Model=0.75"),
col=c("blue", "red"), lty=1:2, cex=0.6) #Comparing the the predictive power of MARS Model and the Logistic
Figure 8: AUROC curves for the MARS and Logistic model
#Model by computing their AUROC curves
Both the MARS model and the logistic model appear great in predicting customers that will default on their account. However, the MARS model performs fairly better than the logistic model.