Installing packages and loading the data

Taking a look at the variables. MobileNo_Avl_Flag looks useless because it contains only 1’s. Employment.Type will be split up with one-hot encoding because it contains three different categories (blank, Salaried, and Self employed). There are also a few date variables, and the CNS score description variable could be removed. Some columns are stored as strings and need to be converted to numbers, for example the dates. The summary also lets us distinguish the binary flag variables from the ID variables.

attach(train_data)
summary(train_data)
##     UniqueID      disbursed_amount   asset_cost           ltv       
##  Min.   :417428   Min.   : 13320   Min.   :  37000   Min.   :13.50  
##  1st Qu.:476558   1st Qu.: 47145   1st Qu.:  65720   1st Qu.:68.92  
##  Median :535949   Median : 53803   Median :  70960   Median :76.85  
##  Mean   :535783   Mean   : 54365   Mean   :  75854   Mean   :74.77  
##  3rd Qu.:594874   3rd Qu.: 60350   3rd Qu.:  79183   3rd Qu.:83.69  
##  Max.   :671084   Max.   :990572   Max.   :1628992   Max.   :95.00  
##    branch_id       supplier_id    manufacturer_id  Current_pincode_ID
##  Min.   :  1.00   Min.   :10524   Min.   : 45.00   Min.   :   1      
##  1st Qu.: 14.00   1st Qu.:16535   1st Qu.: 48.00   1st Qu.:1511      
##  Median : 61.00   Median :20328   Median : 86.00   Median :2967      
##  Mean   : 72.95   Mean   :19634   Mean   : 69.07   Mean   :3393      
##  3rd Qu.:130.00   3rd Qu.:23000   3rd Qu.: 86.00   3rd Qu.:5667      
##  Max.   :261.00   Max.   :24803   Max.   :156.00   Max.   :7345      
##  Date.of.Birth      Employment.Type    DisbursalDate         State_ID     
##  Length:174865      Length:174865      Length:174865      Min.   : 1.000  
##  Class :character   Class :character   Class :character   1st Qu.: 4.000  
##  Mode  :character   Mode  :character   Mode  :character   Median : 6.000  
##                                                           Mean   : 7.266  
##                                                           3rd Qu.:10.000  
##                                                           Max.   :22.000  
##  Employee_code_ID MobileNo_Avl_Flag  Aadhar_flag        PAN_flag      
##  Min.   :   1     Min.   :1         Min.   :0.0000   Min.   :0.00000  
##  1st Qu.: 713     1st Qu.:1         1st Qu.:1.0000   1st Qu.:0.00000  
##  Median :1451     Median :1         Median :1.0000   Median :0.00000  
##  Mean   :1550     Mean   :1         Mean   :0.8403   Mean   :0.07591  
##  3rd Qu.:2365     3rd Qu.:1         3rd Qu.:1.0000   3rd Qu.:0.00000  
##  Max.   :3795     Max.   :1         Max.   :1.0000   Max.   :1.00000  
##   VoterID_flag     Driving_flag     Passport_flag      PERFORM_CNS.SCORE
##  Min.   :0.0000   Min.   :0.00000   Min.   :0.000000   Min.   :  0.0    
##  1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.000000   1st Qu.:  0.0    
##  Median :0.0000   Median :0.00000   Median :0.000000   Median :  0.0    
##  Mean   :0.1449   Mean   :0.02337   Mean   :0.002139   Mean   :289.1    
##  3rd Qu.:0.0000   3rd Qu.:0.00000   3rd Qu.:0.000000   3rd Qu.:678.0    
##  Max.   :1.0000   Max.   :1.00000   Max.   :1.000000   Max.   :890.0    
##  PERFORM_CNS.SCORE.DESCRIPTION PRI.NO.OF.ACCTS   PRI.ACTIVE.ACCTS 
##  Length:174865                 Min.   :  0.000   Min.   :  0.000  
##  Class :character              1st Qu.:  0.000   1st Qu.:  0.000  
##  Mode  :character              Median :  0.000   Median :  0.000  
##                                Mean   :  2.436   Mean   :  1.036  
##                                3rd Qu.:  3.000   3rd Qu.:  1.000  
##                                Max.   :453.000   Max.   :144.000  
##  PRI.OVERDUE.ACCTS PRI.CURRENT.BALANCE PRI.SANCTIONED.AMOUNT
##  Min.   : 0.000    Min.   :-6678296    Min.   :0.000e+00    
##  1st Qu.: 0.000    1st Qu.:       0    1st Qu.:0.000e+00    
##  Median : 0.000    Median :       0    Median :0.000e+00    
##  Mean   : 0.156    Mean   :  164886    Mean   :2.184e+05    
##  3rd Qu.: 0.000    3rd Qu.:   34904    3rd Qu.:6.200e+04    
##  Max.   :25.000    Max.   :96524920    Max.   :1.000e+09    
##  PRI.DISBURSED.AMOUNT SEC.NO.OF.ACCTS    SEC.ACTIVE.ACCTS   SEC.OVERDUE.ACCTS 
##  Min.   :0.000e+00    Min.   : 0.00000   Min.   : 0.00000   Min.   :0.000000  
##  1st Qu.:0.000e+00    1st Qu.: 0.00000   1st Qu.: 0.00000   1st Qu.:0.000000  
##  Median :0.000e+00    Median : 0.00000   Median : 0.00000   Median :0.000000  
##  Mean   :2.179e+05    Mean   : 0.05925   Mean   : 0.02743   Mean   :0.007211  
##  3rd Qu.:6.021e+04    3rd Qu.: 0.00000   3rd Qu.: 0.00000   3rd Qu.:0.000000  
##  Max.   :1.000e+09    Max.   :52.00000   Max.   :36.00000   Max.   :8.000000  
##  SEC.CURRENT.BALANCE SEC.SANCTIONED.AMOUNT SEC.DISBURSED.AMOUNT
##  Min.   : -239782    Min.   :       0      Min.   :       0    
##  1st Qu.:       0    1st Qu.:       0      1st Qu.:       0    
##  Median :       0    Median :       0      Median :       0    
##  Mean   :    5311    Mean   :    7242      Mean   :    7131    
##  3rd Qu.:       0    3rd Qu.:       0      3rd Qu.:       0    
##  Max.   :29560540    Max.   :30000000      Max.   :30000000    
##  PRIMARY.INSTAL.AMT SEC.INSTAL.AMT    NEW.ACCTS.IN.LAST.SIX.MONTHS
##  Min.   :       0   Min.   :      0   Min.   : 0.0000             
##  1st Qu.:       0   1st Qu.:      0   1st Qu.: 0.0000             
##  Median :       0   Median :      0   Median : 0.0000             
##  Mean   :   13477   Mean   :    307   Mean   : 0.3796             
##  3rd Qu.:    1989   3rd Qu.:      0   3rd Qu.: 0.0000             
##  Max.   :25642806   Max.   :4170901   Max.   :35.0000             
##  DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS AVERAGE.ACCT.AGE   CREDIT.HISTORY.LENGTH
##  Min.   : 0.00000                    Length:174865      Length:174865        
##  1st Qu.: 0.00000                    Class :character   Class :character     
##  Median : 0.00000                    Mode  :character   Mode  :character     
##  Mean   : 0.09672                                                            
##  3rd Qu.: 0.00000                                                            
##  Max.   :20.00000                                                            
##  NO.OF_INQUIRIES   loan_default  
##  Min.   : 0.000   Min.   :0.000  
##  1st Qu.: 0.000   1st Qu.:0.000  
##  Median : 0.000   Median :0.000  
##  Mean   : 0.206   Mean   :0.218  
##  3rd Qu.: 0.000   3rd Qu.:0.000  
##  Max.   :28.000   Max.   :1.000
table(loan_default)
## loan_default
##      0      1 
## 136748  38117
table(Employment.Type)
## Employment.Type
##                    Salaried Self employed 
##          5679         73441         95745
table(MobileNo_Avl_Flag)
## MobileNo_Avl_Flag
##      1 
## 174865
detach(train_data)

In order to convert the date variables to numeric, I first looked at the structure of those columns. For credit history length and average account age, I looped over the strings, extracted the year and month numbers, and converted each value into a total number of months. A sketch of this preprocessing is shown after the column listing below.

##  [1] "UniqueID"                            "disbursed_amount"                   
##  [3] "asset_cost"                          "ltv"                                
##  [5] "branch_id"                           "supplier_id"                        
##  [7] "manufacturer_id"                     "Current_pincode_ID"                 
##  [9] "Date.of.Birth"                       "Employment.Type"                    
## [11] "DisbursalDate"                       "State_ID"                           
## [13] "Employee_code_ID"                    "MobileNo_Avl_Flag"                  
## [15] "Aadhar_flag"                         "PAN_flag"                           
## [17] "VoterID_flag"                        "Driving_flag"                       
## [19] "Passport_flag"                       "PERFORM_CNS.SCORE"                  
## [21] "PERFORM_CNS.SCORE.DESCRIPTION"       "PRI.NO.OF.ACCTS"                    
## [23] "PRI.ACTIVE.ACCTS"                    "PRI.OVERDUE.ACCTS"                  
## [25] "PRI.CURRENT.BALANCE"                 "PRI.SANCTIONED.AMOUNT"              
## [27] "PRI.DISBURSED.AMOUNT"                "SEC.NO.OF.ACCTS"                    
## [29] "SEC.ACTIVE.ACCTS"                    "SEC.OVERDUE.ACCTS"                  
## [31] "SEC.CURRENT.BALANCE"                 "SEC.SANCTIONED.AMOUNT"              
## [33] "SEC.DISBURSED.AMOUNT"                "PRIMARY.INSTAL.AMT"                 
## [35] "SEC.INSTAL.AMT"                      "NEW.ACCTS.IN.LAST.SIX.MONTHS"       
## [37] "DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS" "AVERAGE.ACCT.AGE"                   
## [39] "CREDIT.HISTORY.LENGTH"               "NO.OF_INQUIRIES"                    
## [41] "loan_default"                        "Employment.TypeSalaried"            
## [43] "Employment.TypeSelf.employed"
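
Below is a minimal sketch of that preprocessing (not necessarily the exact code used), assuming the dates are stored as dd-mm-yy strings, the account-age fields look like “1yrs 11mon”, and Employment.Type takes the values blank, “Salaried”, and “Self employed”. It uses vectorized string handling in place of the for loop; the same steps would be applied to test_data.

# sketch of the preprocessing described above (repeat for test_data)
to_months = function(x) {
  # "1yrs 11mon" -> 1 * 12 + 11 = 23
  yrs = as.numeric(sub("yrs.*", "", x))
  mon = as.numeric(sub("mon", "", sub(".*yrs ", "", x)))
  12 * yrs + mon
}
train_data$AVERAGE.ACCT.AGE      = to_months(train_data$AVERAGE.ACCT.AGE)
train_data$CREDIT.HISTORY.LENGTH = to_months(train_data$CREDIT.HISTORY.LENGTH)

# dates -> days since 1970-01-01 (assumed dd-mm-yy format)
train_data$Date.of.Birth = as.numeric(as.Date(train_data$Date.of.Birth, format = "%d-%m-%y"))
train_data$DisbursalDate = as.numeric(as.Date(train_data$DisbursalDate, format = "%d-%m-%y"))

# one-hot encode Employment.Type; the blank level is the implicit baseline
train_data$Employment.TypeSalaried      = as.numeric(train_data$Employment.Type == "Salaried")
train_data$Employment.TypeSelf.employed = as.numeric(train_data$Employment.Type == "Self employed")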

Now I remove variables that are unlikely to be useful, judging by intuition.

I removed all the IDs except State_ID and UniqueID. It makes sense for some states to have higher loan default rates than others because of factors like income, number of cars, gas prices, and road infrastructure.

I also removed Employment.Type itself, since it is replaced by the one-hot encoded columns created above.

I also removed Employee_code_ID, because I don’t think it will help predict loan default unless it indirectly encodes the type of employment, and thus income.

Lastly, I removed the CNS score description because it is just a text description of the CNS score and does not add information.

# drop branch_id, supplier_id, manufacturer_id, Current_pincode_ID, Employment.Type,
# Employee_code_ID, MobileNo_Avl_Flag and PERFORM_CNS.SCORE.DESCRIPTION by column index
trash = c(5, 6, 7, 8, 10, 13, 14, 21)
train = train_data[-trash]
test = test_data[-trash]
names(train)
##  [1] "UniqueID"                            "disbursed_amount"                   
##  [3] "asset_cost"                          "ltv"                                
##  [5] "Date.of.Birth"                       "DisbursalDate"                      
##  [7] "State_ID"                            "Aadhar_flag"                        
##  [9] "PAN_flag"                            "VoterID_flag"                       
## [11] "Driving_flag"                        "Passport_flag"                      
## [13] "PERFORM_CNS.SCORE"                   "PRI.NO.OF.ACCTS"                    
## [15] "PRI.ACTIVE.ACCTS"                    "PRI.OVERDUE.ACCTS"                  
## [17] "PRI.CURRENT.BALANCE"                 "PRI.SANCTIONED.AMOUNT"              
## [19] "PRI.DISBURSED.AMOUNT"                "SEC.NO.OF.ACCTS"                    
## [21] "SEC.ACTIVE.ACCTS"                    "SEC.OVERDUE.ACCTS"                  
## [23] "SEC.CURRENT.BALANCE"                 "SEC.SANCTIONED.AMOUNT"              
## [25] "SEC.DISBURSED.AMOUNT"                "PRIMARY.INSTAL.AMT"                 
## [27] "SEC.INSTAL.AMT"                      "NEW.ACCTS.IN.LAST.SIX.MONTHS"       
## [29] "DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS" "AVERAGE.ACCT.AGE"                   
## [31] "CREDIT.HISTORY.LENGTH"               "NO.OF_INQUIRIES"                    
## [33] "loan_default"                        "Employment.TypeSalaried"            
## [35] "Employment.TypeSelf.employed"

I explored the data a little, and here are some plots that give some intuition about the structure of the dataset. The LTV plot seems to indicate that there are ‘sections’ of loan-to-value, with a peak at the end of each section. The credit history length plot looks roughly normally distributed, apart from a large spike at 0.

In fact, for most variables the most common value is 0. Intuitively this feels like a problem: if most of the predictor values are 0, and most of the default labels are also 0, the model will not be able to learn much from a large part of the data. Some resampling is needed; instead of removing all the 0s, I could do some bootstrapping.
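
As a quick check of that impression, one could tabulate the share of zeros per column; a small sketch (assuming all columns are numeric at this point):

# share of exact zeros in each column, largest first
zero_share = sapply(train, function(x) mean(x == 0, na.rm = TRUE))
round(sort(zero_share, decreasing = TRUE), 2)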

# patterns where the data seems split into sections
plot1 = ggplot(train, aes(x = ltv)) + geom_bar()
plot2 = ggplot(train, aes(x = CREDIT.HISTORY.LENGTH)) + geom_bar()
plot3 = ggplot(train, aes(x = DisbursalDate)) + geom_bar()
plot4 = ggplot(train, aes(x = PERFORM_CNS.SCORE)) + geom_bar() 

# patterns where there is a decreasing trend with wider bins
plot5 = ggplot(train, aes(x = PRI.ACTIVE.ACCTS)) + geom_bar()
plot6 = ggplot(train, aes(x = DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS)) + geom_bar()
plot7 = ggplot(train, aes(x = NO.OF_INQUIRIES)) + geom_bar()
plot8 = ggplot(train, aes(x = NEW.ACCTS.IN.LAST.SIX.MONTHS)) + geom_bar()
plot9 = ggplot(train, aes(x = SEC.OVERDUE.ACCTS)) + geom_bar()
plot10 = ggplot(train, aes(x = SEC.ACTIVE.ACCTS)) + geom_bar()
plot11 = ggplot(train, aes(x = SEC.NO.OF.ACCTS)) + geom_bar()
plot12 = ggplot(train, aes(x = PRI.OVERDUE.ACCTS)) + geom_bar()

ggarrange(plot1, plot2, plot3, plot4 ,ncol = 2, nrow = 2)

ggarrange(plot5, plot6, plot7, plot8, plot9, plot10, plot11, plot12 ,ncol = 2, nrow = 4)

Next I took a look at the correlations among the cleaned variables. There were a few pairs with very high correlation.

Since PRI sanctioned amount and PRI disbursed amount are correlated, we remove the sanctioned amount: the disbursed amount is what was actually paid out to the borrower, whereas the sanctioned amount only describes how much the borrower was allowed to take.

PRI number of accounts and PRI active accounts are also correlated, so one of the pair is dropped; the code below removes PRI.NO.OF.ACCTS and keeps PRI.ACTIVE.ACCTS, the count of accounts currently active.

Since SEC current balance, SEC sanctioned amount, and SEC disbursed amount are very highly correlated, we remove the sanctioned amount and the current balance and keep the disbursed amount, for the same reason as with the PRI variables.

Credit history length and average account age are also highly correlated, so we drop one of them; we keep credit history length because it seems the more informative of the two.

apply(train, 2, function(x) any(is.na(x)))
##                            UniqueID                    disbursed_amount 
##                               FALSE                               FALSE 
##                          asset_cost                                 ltv 
##                               FALSE                               FALSE 
##                       Date.of.Birth                       DisbursalDate 
##                               FALSE                               FALSE 
##                            State_ID                         Aadhar_flag 
##                               FALSE                               FALSE 
##                            PAN_flag                        VoterID_flag 
##                               FALSE                               FALSE 
##                        Driving_flag                       Passport_flag 
##                               FALSE                               FALSE 
##                   PERFORM_CNS.SCORE                     PRI.NO.OF.ACCTS 
##                               FALSE                               FALSE 
##                    PRI.ACTIVE.ACCTS                   PRI.OVERDUE.ACCTS 
##                               FALSE                               FALSE 
##                 PRI.CURRENT.BALANCE               PRI.SANCTIONED.AMOUNT 
##                               FALSE                               FALSE 
##                PRI.DISBURSED.AMOUNT                     SEC.NO.OF.ACCTS 
##                               FALSE                               FALSE 
##                    SEC.ACTIVE.ACCTS                   SEC.OVERDUE.ACCTS 
##                               FALSE                               FALSE 
##                 SEC.CURRENT.BALANCE               SEC.SANCTIONED.AMOUNT 
##                               FALSE                               FALSE 
##                SEC.DISBURSED.AMOUNT                  PRIMARY.INSTAL.AMT 
##                               FALSE                               FALSE 
##                      SEC.INSTAL.AMT        NEW.ACCTS.IN.LAST.SIX.MONTHS 
##                               FALSE                               FALSE 
## DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS                    AVERAGE.ACCT.AGE 
##                               FALSE                               FALSE 
##               CREDIT.HISTORY.LENGTH                     NO.OF_INQUIRIES 
##                               FALSE                               FALSE 
##                        loan_default             Employment.TypeSalaried 
##                               FALSE                               FALSE 
##        Employment.TypeSelf.employed 
##                               FALSE
cormat = cor(train)
# relabel rows and columns with their indices so the corrplot stays readable
colnames(cormat) = c(1:dim(cormat)[1])
rownames(cormat) = c(1:dim(cormat)[2])
corrplot(cormat, is.corr = FALSE, method = 'circle')
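
To see which variable pairs are behind the largest circles, a short helper like the following (a sketch, with an arbitrary 0.8 cutoff, recomputing the correlation matrix so the variable names are kept) lists the highly correlated pairs by name:

# list variable pairs with |correlation| above 0.8 (threshold chosen arbitrarily)
cm = cor(train)
high = which(abs(cm) > 0.8 & upper.tri(cm), arr.ind = TRUE)
data.frame(var1 = rownames(cm)[high[, 1]],
           var2 = colnames(cm)[high[, 2]],
           cor  = round(cm[high], 2))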

# PRI no accts, PRI sanctioned amount, SEC current balance, SEC sanctioned amount, AVG Acct Age
trash = c(14, 18, 23, 24, 30)
train = train[-trash]
test = test[-trash]

cormat = cor(train)
colnames(cormat) = c(1:dim(cormat)[1])
rownames(cormat) = c(1:dim(cormat)[2])
corrplot(cormat, is.corr = FALSE, method = 'circle')

names(train)
##  [1] "UniqueID"                            "disbursed_amount"                   
##  [3] "asset_cost"                          "ltv"                                
##  [5] "Date.of.Birth"                       "DisbursalDate"                      
##  [7] "State_ID"                            "Aadhar_flag"                        
##  [9] "PAN_flag"                            "VoterID_flag"                       
## [11] "Driving_flag"                        "Passport_flag"                      
## [13] "PERFORM_CNS.SCORE"                   "PRI.ACTIVE.ACCTS"                   
## [15] "PRI.OVERDUE.ACCTS"                   "PRI.CURRENT.BALANCE"                
## [17] "PRI.DISBURSED.AMOUNT"                "SEC.NO.OF.ACCTS"                    
## [19] "SEC.ACTIVE.ACCTS"                    "SEC.OVERDUE.ACCTS"                  
## [21] "SEC.DISBURSED.AMOUNT"                "PRIMARY.INSTAL.AMT"                 
## [23] "SEC.INSTAL.AMT"                      "NEW.ACCTS.IN.LAST.SIX.MONTHS"       
## [25] "DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS" "CREDIT.HISTORY.LENGTH"              
## [27] "NO.OF_INQUIRIES"                     "loan_default"                       
## [29] "Employment.TypeSalaried"             "Employment.TypeSelf.employed"

I then performed resampling because, as previously mentioned, the ratio of defaults to non-defaults is too small and most of the predictor values are 0. For the resampling I used bootstrapping, which samples the minority class (defaults) with replacement until it matches the size of the majority class; this keeps the distribution of the minority data while giving its observations more weight.

train_majority = train[train$loan_default == 0,]
train_minority = train[train$loan_default == 1,]

# Here are the 0s and 1s for train data BEFORE resampling
dim(train_majority)
## [1] 136748     30
dim(train_minority)
## [1] 38117    30
train_minority_upsampled = train_minority[sample(nrow(train_minority), dim(train_majority)[1], replace = TRUE), ]
final_train = rbind(train_majority, train_minority_upsampled)

# Here are the 0s and 1s for train data AFTER resampling
dim(train_majority)
## [1] 136748     30
dim(train_minority_upsampled)
## [1] 136748     30
# The final training set now has following dimensions
dim(final_train)
## [1] 273496     30

Splitting the resampled data into a training set and a validation set. The intended split was 80/20, but note that the 0.8 cutoff below is applied to the majority-class count, so the training set ends up with about 109k rows (roughly 40% of the resampled data) and the rest is used for validation.

valcut = 0.8
# the sample size is valcut * nrow(train_majority), not valcut * nrow(final_train)
set = sample(nrow(final_train), dim(train_majority)[1]*valcut, replace = FALSE)
train_set = final_train[set, ]
val_set = final_train[-set, ]

First I fit a logistic regression model with the remaining variables and got an F1 score of about 0.61. An interesting observation is that VoterID_flag was not a significant predictor, so whether a borrower supplied a voter ID as identification does not seem to help predict loan defaults. The secondary-account counts (number of accounts, active, and overdue) and the number of new accounts in the last six months were not useful either. Lastly, Employment.TypeSalaried is not significant while Employment.TypeSelf.employed is; perhaps the borrowers who describe themselves as ‘self employed’ are the ones with thinner or less stable incomes.

Then I refit the logistic regression after removing the variables that were not significant. The results were roughly the same.

logistic.model = glm(formula = train_set$loan_default ~ ., family = binomial, data = train_set)
logistic.probs = predict(logistic.model, newdata = val_set, type="response")
logistic.preds = rep(0, dim(val_set)[1])
logistic.preds[logistic.probs > 0.5] = 1
confusionMatrix(as.factor(logistic.preds), as.factor(val_set$loan_default), mode = "everything", positive="1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 45418 30351
##          1 36546 51783
##                                         
##                Accuracy : 0.5923        
##                  95% CI : (0.59, 0.5947)
##     No Information Rate : 0.5005        
##     P-Value [Acc > NIR] : < 2.2e-16     
##                                         
##                   Kappa : 0.1846        
##                                         
##  Mcnemar's Test P-Value : < 2.2e-16     
##                                         
##             Sensitivity : 0.6305        
##             Specificity : 0.5541        
##          Pos Pred Value : 0.5863        
##          Neg Pred Value : 0.5994        
##               Precision : 0.5863        
##                  Recall : 0.6305        
##                      F1 : 0.6076        
##              Prevalence : 0.5005        
##          Detection Rate : 0.3156        
##    Detection Prevalence : 0.5383        
##       Balanced Accuracy : 0.5923        
##                                         
##        'Positive' Class : 1             
## 
summary(logistic.model)
## 
## Call:
## glm(formula = train_set$loan_default ~ ., family = binomial, 
##     data = train_set)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.1284  -1.1522  -0.5027   1.1241   2.4996  
## 
## Coefficients:
##                                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                          5.202e+04  5.501e+03   9.456  < 2e-16 ***
## UniqueID                             4.265e-06  3.344e-07  12.752  < 2e-16 ***
## disbursed_amount                    -1.036e-05  2.724e-06  -3.804 0.000142 ***
## asset_cost                           1.211e-05  1.814e-06   6.674 2.49e-11 ***
## ltv                                  3.702e-02  2.150e-03  17.221  < 2e-16 ***
## Date.of.Birth                        9.793e-07  6.855e-08  14.287  < 2e-16 ***
## DisbursalDate                       -2.579e-03  2.726e-04  -9.460  < 2e-16 ***
## State_ID                             2.495e-02  1.413e-03  17.657  < 2e-16 ***
## Aadhar_flag                         -2.475e-01  4.868e-02  -5.084 3.69e-07 ***
## PAN_flag                            -7.623e-02  2.434e-02  -3.131 0.001742 ** 
## VoterID_flag                         3.114e-02  4.790e-02   0.650 0.515681    
## Driving_flag                        -2.147e-01  5.777e-02  -3.717 0.000202 ***
## Passport_flag                       -5.542e-01  1.495e-01  -3.708 0.000209 ***
## PERFORM_CNS.SCORE                   -2.231e-04  2.332e-05  -9.564  < 2e-16 ***
## PRI.ACTIVE.ACCTS                    -8.152e-02  7.095e-03 -11.489  < 2e-16 ***
## PRI.OVERDUE.ACCTS                    2.826e-01  1.546e-02  18.281  < 2e-16 ***
## PRI.CURRENT.BALANCE                 -7.006e-08  1.061e-08  -6.601 4.09e-11 ***
## PRI.DISBURSED.AMOUNT                 2.659e-09  2.016e-09   1.319 0.187180    
## SEC.NO.OF.ACCTS                      4.658e-03  2.084e-02   0.223 0.823147    
## SEC.ACTIVE.ACCTS                    -1.865e-02  4.186e-02  -0.446 0.655912    
## SEC.OVERDUE.ACCTS                    3.453e-02  7.020e-02   0.492 0.622851    
## SEC.DISBURSED.AMOUNT                -4.612e-08  4.223e-08  -1.092 0.274796    
## PRIMARY.INSTAL.AMT                  -1.250e-08  4.901e-08  -0.255 0.798658    
## SEC.INSTAL.AMT                       8.779e-07  5.353e-07   1.640 0.101009    
## NEW.ACCTS.IN.LAST.SIX.MONTHS        -8.369e-03  1.107e-02  -0.756 0.449835    
## DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS  2.891e-01  1.986e-02  14.559  < 2e-16 ***
## CREDIT.HISTORY.LENGTH               -2.959e-03  3.246e-04  -9.118  < 2e-16 ***
## NO.OF_INQUIRIES                      1.678e-01  9.563e-03  17.549  < 2e-16 ***
## Employment.TypeSalaried             -1.065e-02  3.596e-02  -0.296 0.767116    
## Employment.TypeSelf.employed         1.737e-01  3.592e-02   4.835 1.33e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 151658  on 109397  degrees of freedom
## Residual deviance: 146176  on 109368  degrees of freedom
## AIC: 146236
## 
## Number of Fisher Scoring iterations: 4
logroc = roc(val_set$loan_default, logistic.preds)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(logroc, main = 'Logistic Regression' )
text(0, 0.5, paste("AUC = ", auc(logroc)[1], sep = "") )
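
For reference, the F1 value reported by confusionMatrix is just the harmonic mean of precision and recall, and can be reproduced directly from the predictions; a quick sketch:

tp = sum(logistic.preds == 1 & val_set$loan_default == 1)
fp = sum(logistic.preds == 1 & val_set$loan_default == 0)
fn = sum(logistic.preds == 0 & val_set$loan_default == 1)
precision = tp / (tp + fp)                       # ~0.586
recall    = tp / (tp + fn)                       # ~0.630
2 * precision * recall / (precision + recall)    # ~0.61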

# Drop the variables that were not significant in the first model:
# VoterID_flag, PRI.DISBURSED.AMOUNT, SEC.NO.OF.ACCTS, SEC.ACTIVE.ACCTS,
# SEC.OVERDUE.ACCTS, NEW.ACCTS.IN.LAST.SIX.MONTHS, Employment.TypeSalaried
trash_var = c(10, 17, 18, 19, 20, 24, 29)
train_set2 = train_set[-trash_var]
final_test = test[-trash_var]
names(train_set2)
##  [1] "UniqueID"                            "disbursed_amount"                   
##  [3] "asset_cost"                          "ltv"                                
##  [5] "Date.of.Birth"                       "DisbursalDate"                      
##  [7] "State_ID"                            "Aadhar_flag"                        
##  [9] "PAN_flag"                            "Driving_flag"                       
## [11] "Passport_flag"                       "PERFORM_CNS.SCORE"                  
## [13] "PRI.ACTIVE.ACCTS"                    "PRI.OVERDUE.ACCTS"                  
## [15] "PRI.CURRENT.BALANCE"                 "SEC.DISBURSED.AMOUNT"               
## [17] "PRIMARY.INSTAL.AMT"                  "SEC.INSTAL.AMT"                     
## [19] "DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS" "CREDIT.HISTORY.LENGTH"              
## [21] "NO.OF_INQUIRIES"                     "loan_default"                       
## [23] "Employment.TypeSelf.employed"
# Refitting a new model after removing those variables
logistic.model2 = glm(formula = train_set2$loan_default ~ ., family = binomial, data = train_set2)
logistic.probs2 = predict(logistic.model2, newdata = val_set, type="response")
logistic.preds2 = rep(0, dim(val_set)[1])
logistic.preds2[logistic.probs2 > 0.5] = 1
confusionMatrix(as.factor(logistic.preds2), as.factor(val_set$loan_default), mode = "everything", positive="1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 45428 30347
##          1 36536 51787
##                                         
##                Accuracy : 0.5924        
##                  95% CI : (0.59, 0.5948)
##     No Information Rate : 0.5005        
##     P-Value [Acc > NIR] : < 2.2e-16     
##                                         
##                   Kappa : 0.1848        
##                                         
##  Mcnemar's Test P-Value : < 2.2e-16     
##                                         
##             Sensitivity : 0.6305        
##             Specificity : 0.5542        
##          Pos Pred Value : 0.5863        
##          Neg Pred Value : 0.5995        
##               Precision : 0.5863        
##                  Recall : 0.6305        
##                      F1 : 0.6076        
##              Prevalence : 0.5005        
##          Detection Rate : 0.3156        
##    Detection Prevalence : 0.5382        
##       Balanced Accuracy : 0.5924        
##                                         
##        'Positive' Class : 1             
## 
summary(logistic.model2)
## 
## Call:
## glm(formula = train_set2$loan_default ~ ., family = binomial, 
##     data = train_set2)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.1300  -1.1525  -0.5028   1.1237   2.4894  
## 
## Coefficients:
##                                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                          5.198e+04  5.501e+03   9.450  < 2e-16 ***
## UniqueID                             4.262e-06  3.344e-07  12.747  < 2e-16 ***
## disbursed_amount                    -1.037e-05  2.722e-06  -3.810 0.000139 ***
## asset_cost                           1.212e-05  1.814e-06   6.684 2.33e-11 ***
## ltv                                  3.701e-02  2.148e-03  17.225  < 2e-16 ***
## Date.of.Birth                        9.801e-07  6.752e-08  14.515  < 2e-16 ***
## DisbursalDate                       -2.577e-03  2.726e-04  -9.454  < 2e-16 ***
## State_ID                             2.494e-02  1.412e-03  17.667  < 2e-16 ***
## Aadhar_flag                         -2.769e-01  1.833e-02 -15.104  < 2e-16 ***
## PAN_flag                            -7.780e-02  2.427e-02  -3.205 0.001350 ** 
## Driving_flag                        -2.392e-01  4.446e-02  -5.379 7.47e-08 ***
## Passport_flag                       -5.770e-01  1.451e-01  -3.975 7.03e-05 ***
## PERFORM_CNS.SCORE                   -2.237e-04  2.329e-05  -9.605  < 2e-16 ***
## PRI.ACTIVE.ACCTS                    -8.533e-02  5.041e-03 -16.926  < 2e-16 ***
## PRI.OVERDUE.ACCTS                    2.844e-01  1.525e-02  18.647  < 2e-16 ***
## PRI.CURRENT.BALANCE                 -6.617e-08  1.029e-08  -6.429 1.28e-10 ***
## SEC.DISBURSED.AMOUNT                -5.022e-08  3.970e-08  -1.265 0.205798    
## PRIMARY.INSTAL.AMT                  -1.261e-08  4.909e-08  -0.257 0.797316    
## SEC.INSTAL.AMT                       8.854e-07  5.183e-07   1.708 0.087571 .  
## DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS  2.900e-01  1.973e-02  14.699  < 2e-16 ***
## CREDIT.HISTORY.LENGTH               -2.899e-03  3.130e-04  -9.261  < 2e-16 ***
## NO.OF_INQUIRIES                      1.664e-01  9.393e-03  17.717  < 2e-16 ***
## Employment.TypeSelf.employed         1.837e-01  1.279e-02  14.357  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 151658  on 109397  degrees of freedom
## Residual deviance: 146180  on 109375  degrees of freedom
## AIC: 146226
## 
## Number of Fisher Scoring iterations: 4
logroc2 = roc(val_set$loan_default, logistic.preds2)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(logroc2, main = 'Logistic Regression2' )
text(0, 0.5, paste("AUC = ", auc(logroc2)[1], sep = "") )

Here I tried a decision tree model. It predicted slightly better, with an F1 of about 0.61, which made me wonder whether a random forest would do better still, since it combines many trees in a more robust way. That said, the AUC was lower, at about 0.55. Judging from the plot, the ‘wedge’ point, the point where the angle of the ROC curve changes, is shifted towards higher specificity.

I chose a decision tree to get an intuitive sense of which variables were useful: trees classify by repeatedly finding the variable that best splits the data into purer branches, which is close to how humans reason about classification. Because loan_default was left numeric, tree() actually fit a regression tree, so the predictions are leaf-level default rates that I threshold at 0.5. According to the summary, the only variable used in tree construction was ltv. More could be done here, but it is a good start; a random forest should be able to make use of more variables.

tree.model = tree(train_set2$loan_default~., train_set, split = c("deviance", "gini"), method = 'class')
tree.probs = predict(tree.model, val_set)
tree.preds = rep(0, dim(val_set)[1])
tree.preds[tree.probs > 0.5] = 1
confusionMatrix(as.factor(tree.preds), as.factor(val_set$loan_default), mode = "everything", positive="1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 32185 23809
##          1 49779 58325
##                                          
##                Accuracy : 0.5516         
##                  95% CI : (0.5492, 0.554)
##     No Information Rate : 0.5005         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.1028         
##                                          
##  Mcnemar's Test P-Value : < 2.2e-16      
##                                          
##             Sensitivity : 0.7101         
##             Specificity : 0.3927         
##          Pos Pred Value : 0.5395         
##          Neg Pred Value : 0.5748         
##               Precision : 0.5395         
##                  Recall : 0.7101         
##                      F1 : 0.6132         
##              Prevalence : 0.5005         
##          Detection Rate : 0.3554         
##    Detection Prevalence : 0.6588         
##       Balanced Accuracy : 0.5514         
##                                          
##        'Positive' Class : 1              
## 
summary(tree.model)
## 
## Regression tree:
## tree(formula = train_set2$loan_default ~ ., data = train_set, 
##     method = "class", split = c("deviance", "gini"))
## Variables actually used in tree construction:
## [1] "ltv"
## Number of terminal nodes:  2 
## Residual mean deviance:  0.247 = 27020 / 109400 
## Distribution of residuals:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -0.5389 -0.5389 -0.4224  0.0000  0.4611  0.5776
treeroc = roc(val_set$loan_default, tree.preds)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(treeroc, main = 'Decision Tree' )
text(0, 0.5, paste("AUC = ", auc(treeroc)[1], sep = "") )

# in-sample predictions on the training set, for comparison
tree.probs2 = predict(tree.model)
tree.preds2 = rep(0, dim(train_set)[1])
tree.preds2[tree.probs2 > 0.5] = 1
treeroc2 = roc(train_set$loan_default, tree.preds2)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(treeroc2, main = 'Decision Tree2' )
text(0, 0.5, paste("AUC = ", auc(treeroc2)[1], sep = "") )
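
To see the single split directly, the fitted tree can also be drawn with the base plotting methods for tree objects; a quick sketch:

plot(tree.model)
text(tree.model, pretty = 0)   # should show one split on ltv and two terminal nodes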

I also fit a random forest model, hoping that its robustness could take better advantage of the resampled data. The F1 score hovered around 0.81. Given how much better the random forest did, the results seem to depend largely on how much data the model can exploit, so I also tried a bagging-style forest by raising mtry to 22, close to the full number of predictors, so that almost every variable is considered at each split. When I increased the number of trees from 25 to 40 the results improved a little; adding even more trees might keep improving the F1 by a diminishing amount, but it would take too long to run.

The bagging-style forest improved the results slightly further (by less than one percentage point of F1). The initial ratio of defaults to non-defaults was simply too small; after resampling and fitting a (bagged) random forest, the model was able to absorb much more of the information in the default cases and learn the predictors better.

forest.model = randomForest(as.factor(train_set$loan_default) ~., data = train_set, ntree = 40)
forest.preds = predict(forest.model, val_set)
confusionMatrix(forest.preds, as.factor(val_set$loan_default), mode = "everything", positive="1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 61283 12608
##          1 20681 69526
##                                           
##                Accuracy : 0.7971          
##                  95% CI : (0.7952, 0.7991)
##     No Information Rate : 0.5005          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.5942          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.8465          
##             Specificity : 0.7477          
##          Pos Pred Value : 0.7707          
##          Neg Pred Value : 0.8294          
##               Precision : 0.7707          
##                  Recall : 0.8465          
##                      F1 : 0.8068          
##              Prevalence : 0.5005          
##          Detection Rate : 0.4237          
##    Detection Prevalence : 0.5497          
##       Balanced Accuracy : 0.7971          
##                                           
##        'Positive' Class : 1               
## 
forestroc = roc(val_set$loan_default, as.numeric(forest.preds ))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(forestroc, main = 'Random Forest' )
text(0, 0.5, paste("AUC = ", auc(forestroc)[1], sep = "") )

plot(forest.model)
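
To check which predictors the forest actually relies on (something the single-split tree could not show), the randomForest importance measures can be inspected; a sketch:

importance(forest.model)                # mean decrease in Gini per predictor
varImpPlot(forest.model, n.var = 15, main = 'Random Forest variable importance')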

forest.model2 = randomForest(as.factor(train_set$loan_default) ~., data = train_set, mtry = 22, ntree = 40)
forest.preds2 = predict(forest.model2, val_set)
confusionMatrix(forest.preds2, as.factor(val_set$loan_default), mode = "everything", positive="1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 63987 12830
##          1 17977 69304
##                                           
##                Accuracy : 0.8123          
##                  95% CI : (0.8104, 0.8142)
##     No Information Rate : 0.5005          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.6245          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.8438          
##             Specificity : 0.7807          
##          Pos Pred Value : 0.7940          
##          Neg Pred Value : 0.8330          
##               Precision : 0.7940          
##                  Recall : 0.8438          
##                      F1 : 0.8182          
##              Prevalence : 0.5005          
##          Detection Rate : 0.4223          
##    Detection Prevalence : 0.5319          
##       Balanced Accuracy : 0.8122          
##                                           
##        'Positive' Class : 1               
## 
forestroc2 = roc(val_set$loan_default, as.numeric(forest.preds2))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(forestroc2, main = 'Random Forest2' )
text(0, 0.5, paste("AUC = ", auc(forestroc2)[1], sep = "") )

plot(forest.model2)

Finally, we refit the best model on the full resampled training data, predict on the test data, and save the results to a CSV file. The best model we obtained was the bagging-style random forest (mtry = 22), which reached an F1 score of about 0.82 on the validation set.

forest.model_final = randomForest(as.factor(final_train$loan_default) ~., data = final_train, mtry = 22, ntree = 40)
forest.preds_final = predict(forest.model_final, test)
# save the predicted labels together with the test UniqueIDs
write.csv(data.frame(UniqueID = test$UniqueID, loan_default = forest.preds_final),
          "prediction.csv", row.names = FALSE)