Introduction

This R Markdown document describes the steps for creating a credit risk prediction model using logistic regression. We start by loading the necessary libraries and the dataset, followed by data preprocessing and model training and evaluation. Lastly, we focus on visualizing our results.

Data loading and Summary

# Load the required package

library(caTools)
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## The legacy packages maptools, rgdal, and rgeos, underpinning the sp package,
## which was just loaded, will retire in October 2023.
## Please refer to R-spatial evolution reports for details, especially
## https://r-spatial.org/r/2023/05/15/evolution4.html.
## It may be desirable to make the sf package available;
## package maintainers should consider adding sf to Suggests:.
## The sp package is now running under evolution status 2
##      (status 2 uses the sf package in place of rgdal)
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep
library(mice)
## 
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
## 
##     filter
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
library(ggplot2)
library(corrplot)
## corrplot 0.92 loaded
library(caret)
## Loading required package: lattice
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following object is masked from 'package:colorspace':
## 
##     coords
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
data <- read.csv("/Users/zihualai/Desktop/credit risk/credit_risk_dataset.csv")

#Summary of the data

str(data)
## 'data.frame':    32581 obs. of  12 variables:
##  $ person_age                : int  22 21 25 23 24 21 26 24 24 21 ...
##  $ person_income             : int  59000 9600 9600 65500 54400 9900 77100 78956 83000 10000 ...
##  $ person_home_ownership     : chr  "RENT" "OWN" "MORTGAGE" "RENT" ...
##  $ person_emp_length         : num  123 5 1 4 8 2 8 5 8 6 ...
##  $ loan_intent               : chr  "PERSONAL" "EDUCATION" "MEDICAL" "MEDICAL" ...
##  $ loan_grade                : chr  "D" "B" "C" "C" ...
##  $ loan_amnt                 : int  35000 1000 5500 35000 35000 2500 35000 35000 35000 1600 ...
##  $ loan_int_rate             : num  16 11.1 12.9 15.2 14.3 ...
##  $ loan_status               : int  1 0 1 1 1 1 1 1 1 1 ...
##  $ loan_percent_income       : num  0.59 0.1 0.57 0.53 0.55 0.25 0.45 0.44 0.42 0.16 ...
##  $ cb_person_default_on_file : chr  "Y" "N" "N" "N" ...
##  $ cb_person_cred_hist_length: int  3 2 3 2 4 2 3 4 2 3 ...
summary(data)
##    person_age     person_income     person_home_ownership person_emp_length
##  Min.   : 20.00   Min.   :   4000   Length:32581          Min.   :  0.00   
##  1st Qu.: 23.00   1st Qu.:  38500   Class :character      1st Qu.:  2.00   
##  Median : 26.00   Median :  55000   Mode  :character      Median :  4.00   
##  Mean   : 27.73   Mean   :  66075                         Mean   :  4.79   
##  3rd Qu.: 30.00   3rd Qu.:  79200                         3rd Qu.:  7.00   
##  Max.   :144.00   Max.   :6000000                         Max.   :123.00   
##                                                           NA's   :895      
##  loan_intent         loan_grade          loan_amnt     loan_int_rate  
##  Length:32581       Length:32581       Min.   :  500   Min.   : 5.42  
##  Class :character   Class :character   1st Qu.: 5000   1st Qu.: 7.90  
##  Mode  :character   Mode  :character   Median : 8000   Median :10.99  
##                                        Mean   : 9589   Mean   :11.01  
##                                        3rd Qu.:12200   3rd Qu.:13.47  
##                                        Max.   :35000   Max.   :23.22  
##                                                        NA's   :3116   
##   loan_status     loan_percent_income cb_person_default_on_file
##  Min.   :0.0000   Min.   :0.0000      Length:32581             
##  1st Qu.:0.0000   1st Qu.:0.0900      Class :character         
##  Median :0.0000   Median :0.1500      Mode  :character         
##  Mean   :0.2182   Mean   :0.1702                               
##  3rd Qu.:0.0000   3rd Qu.:0.2300                               
##  Max.   :1.0000   Max.   :0.8300                               
##                                                                
##  cb_person_cred_hist_length
##  Min.   : 2.000            
##  1st Qu.: 3.000            
##  Median : 4.000            
##  Mean   : 5.804            
##  3rd Qu.: 8.000            
##  Max.   :30.000            
## 

Handling Missing Data

# Visualzie missing data

matrixplot(data)

## 
## Click in a column to sort by the corresponding variable.
## To regain use of the VIM GUI and the R console, click outside the plot region.
## Impute missing values
data$person_emp_length[is.na(data$person_emp_length)] <- mean(data$person_emp_length, na.rm = TRUE)
data$loan_int_rate[is.na(data$loan_int_rate)] <- mean(data$loan_int_rate, na.rm = TRUE)

#Dealing with Outliers

##Boxplot for age
boxplot(data$person_age)

## Remove outliers
data <- data[data$person_age <= 65 & data$person_age >= 18,]

Feature Selection

## Correlation analysis
cor_data <- data[, sapply(data, is.numeric)]
cor_matrix <- cor(cor_data)

Model trainning and evaluation

## Data Splitting
set.seed(123)
splitIndex <- createDataPartition(data$loan_status, p=0.7, list=FALSE)
trainData <- data[splitIndex,]
testData <- data[-splitIndex,]

## Model Training

classifier <- glm(loan_status ~ ., data=trainData, family=binomial)
summary(classifier)
## 
## Call:
## glm(formula = loan_status ~ ., family = binomial, data = trainData)
## 
## Coefficients:
##                              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                -4.283e+00  2.069e-01 -20.696  < 2e-16 ***
## person_age                 -3.977e-04  7.077e-03  -0.056 0.955187    
## person_income               2.070e-06  5.770e-07   3.588 0.000333 ***
## person_home_ownershipOTHER  4.477e-01  3.319e-01   1.349 0.177329    
## person_home_ownershipOWN   -1.649e+00  1.186e-01 -13.907  < 2e-16 ***
## person_home_ownershipRENT   8.441e-01  4.776e-02  17.672  < 2e-16 ***
## person_emp_length          -5.773e-03  5.667e-03  -1.019 0.308383    
## loan_intentEDUCATION       -8.950e-01  6.839e-02 -13.087  < 2e-16 ***
## loan_intentHOMEIMPROVEMENT -1.216e-02  7.577e-02  -0.160 0.872528    
## loan_intentMEDICAL         -1.556e-01  6.411e-02  -2.427 0.015206 *  
## loan_intentPERSONAL        -6.650e-01  6.940e-02  -9.582  < 2e-16 ***
## loan_intentVENTURE         -1.116e+00  7.347e-02 -15.183  < 2e-16 ***
## loan_gradeB                 1.666e-01  7.544e-02   2.209 0.027179 *  
## loan_gradeC                 3.588e-01  1.076e-01   3.335 0.000854 ***
## loan_gradeD                 2.458e+00  1.314e-01  18.702  < 2e-16 ***
## loan_gradeE                 2.718e+00  1.715e-01  15.848  < 2e-16 ***
## loan_gradeF                 2.923e+00  2.486e-01  11.760  < 2e-16 ***
## loan_gradeG                 6.444e+00  1.047e+00   6.156 7.47e-10 ***
## loan_amnt                  -1.094e-04  5.273e-06 -20.751  < 2e-16 ***
## loan_int_rate               6.756e-02  1.529e-02   4.419 9.94e-06 ***
## loan_percent_income         1.362e+01  3.083e-01  44.186  < 2e-16 ***
## cb_person_default_on_fileY -2.368e-02  5.967e-02  -0.397 0.691498    
## cb_person_cred_hist_length -5.247e-03  1.063e-02  -0.493 0.621744    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 23891  on 22782  degrees of freedom
## Residual deviance: 15496  on 22760  degrees of freedom
## AIC: 15542
## 
## Number of Fisher Scoring iterations: 6
## Model evaluation

pred <- predict(classifier, newdata=testData, type="response")
roc_obj <- roc(testData$loan_status, pred)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

Visulizaiton

## ROC Curve
plot(roc_obj, print.auc=TRUE)

## Feature Importance

# Install and load the randomForest package

library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
# Fit the Random Forest model
rf_model <- randomForest(loan_status ~ ., data=trainData, ntree=100)
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values.  Are you sure you want to do regression?
rf_model
## 
## Call:
##  randomForest(formula = loan_status ~ ., data = trainData, ntree = 100) 
##                Type of random forest: regression
##                      Number of trees: 100
## No. of variables tried at each split: 3
## 
##           Mean of squared residuals: 0.06189702
##                     % Var explained: 63.69
# Extract and plot feature importance
importance <- importance(rf_model)
imp_plot <- varImpPlot(rf_model)

print(imp_plot)
##                            IncNodePurity
## person_age                     139.74988
## person_income                  533.55966
## person_home_ownership          382.12128
## person_emp_length              217.88718
## loan_intent                    262.90475
## loan_grade                     441.10415
## loan_amnt                      258.35769
## loan_int_rate                  376.47473
## loan_percent_income            845.11577
## cb_person_default_on_file       42.18434
## cb_person_cred_hist_length     109.52154

Class distribution

# Count the number of each class
class_count <- table(data$loan_status)

# Create a bar plot for class distribution
barplot(class_count, main="Class Distribution", xlab="Class", ylab="Frequency", col=c("blue", "red"))