This R Markdown document describes the steps for creating a credit risk prediction model using logistic regression. We start by loading the necessary libraries and the dataset, followed by data preprocessing and model training and evaluation. Lastly, we focus on visualizing our results.
# Load the required package
library(caTools)
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## The legacy packages maptools, rgdal, and rgeos, underpinning the sp package,
## which was just loaded, will retire in October 2023.
## Please refer to R-spatial evolution reports for details, especially
## https://r-spatial.org/r/2023/05/15/evolution4.html.
## It may be desirable to make the sf package available;
## package maintainers should consider adding sf to Suggests:.
## The sp package is now running under evolution status 2
## (status 2 uses the sf package in place of rgdal)
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
library(mice)
##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(ggplot2)
library(corrplot)
## corrplot 0.92 loaded
library(caret)
## Loading required package: lattice
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following object is masked from 'package:colorspace':
##
## coords
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
data <- read.csv("/Users/zihualai/Desktop/credit risk/credit_risk_dataset.csv")
#Summary of the data
str(data)
## 'data.frame': 32581 obs. of 12 variables:
## $ person_age : int 22 21 25 23 24 21 26 24 24 21 ...
## $ person_income : int 59000 9600 9600 65500 54400 9900 77100 78956 83000 10000 ...
## $ person_home_ownership : chr "RENT" "OWN" "MORTGAGE" "RENT" ...
## $ person_emp_length : num 123 5 1 4 8 2 8 5 8 6 ...
## $ loan_intent : chr "PERSONAL" "EDUCATION" "MEDICAL" "MEDICAL" ...
## $ loan_grade : chr "D" "B" "C" "C" ...
## $ loan_amnt : int 35000 1000 5500 35000 35000 2500 35000 35000 35000 1600 ...
## $ loan_int_rate : num 16 11.1 12.9 15.2 14.3 ...
## $ loan_status : int 1 0 1 1 1 1 1 1 1 1 ...
## $ loan_percent_income : num 0.59 0.1 0.57 0.53 0.55 0.25 0.45 0.44 0.42 0.16 ...
## $ cb_person_default_on_file : chr "Y" "N" "N" "N" ...
## $ cb_person_cred_hist_length: int 3 2 3 2 4 2 3 4 2 3 ...
summary(data)
## person_age person_income person_home_ownership person_emp_length
## Min. : 20.00 Min. : 4000 Length:32581 Min. : 0.00
## 1st Qu.: 23.00 1st Qu.: 38500 Class :character 1st Qu.: 2.00
## Median : 26.00 Median : 55000 Mode :character Median : 4.00
## Mean : 27.73 Mean : 66075 Mean : 4.79
## 3rd Qu.: 30.00 3rd Qu.: 79200 3rd Qu.: 7.00
## Max. :144.00 Max. :6000000 Max. :123.00
## NA's :895
## loan_intent loan_grade loan_amnt loan_int_rate
## Length:32581 Length:32581 Min. : 500 Min. : 5.42
## Class :character Class :character 1st Qu.: 5000 1st Qu.: 7.90
## Mode :character Mode :character Median : 8000 Median :10.99
## Mean : 9589 Mean :11.01
## 3rd Qu.:12200 3rd Qu.:13.47
## Max. :35000 Max. :23.22
## NA's :3116
## loan_status loan_percent_income cb_person_default_on_file
## Min. :0.0000 Min. :0.0000 Length:32581
## 1st Qu.:0.0000 1st Qu.:0.0900 Class :character
## Median :0.0000 Median :0.1500 Mode :character
## Mean :0.2182 Mean :0.1702
## 3rd Qu.:0.0000 3rd Qu.:0.2300
## Max. :1.0000 Max. :0.8300
##
## cb_person_cred_hist_length
## Min. : 2.000
## 1st Qu.: 3.000
## Median : 4.000
## Mean : 5.804
## 3rd Qu.: 8.000
## Max. :30.000
##
# Visualzie missing data
matrixplot(data)
##
## Click in a column to sort by the corresponding variable.
## To regain use of the VIM GUI and the R console, click outside the plot region.
## Impute missing values
data$person_emp_length[is.na(data$person_emp_length)] <- mean(data$person_emp_length, na.rm = TRUE)
data$loan_int_rate[is.na(data$loan_int_rate)] <- mean(data$loan_int_rate, na.rm = TRUE)
#Dealing with Outliers
##Boxplot for age
boxplot(data$person_age)
## Remove outliers
data <- data[data$person_age <= 65 & data$person_age >= 18,]
## Correlation analysis
cor_data <- data[, sapply(data, is.numeric)]
cor_matrix <- cor(cor_data)
## Data Splitting
set.seed(123)
splitIndex <- createDataPartition(data$loan_status, p=0.7, list=FALSE)
trainData <- data[splitIndex,]
testData <- data[-splitIndex,]
## Model Training
classifier <- glm(loan_status ~ ., data=trainData, family=binomial)
summary(classifier)
##
## Call:
## glm(formula = loan_status ~ ., family = binomial, data = trainData)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.283e+00 2.069e-01 -20.696 < 2e-16 ***
## person_age -3.977e-04 7.077e-03 -0.056 0.955187
## person_income 2.070e-06 5.770e-07 3.588 0.000333 ***
## person_home_ownershipOTHER 4.477e-01 3.319e-01 1.349 0.177329
## person_home_ownershipOWN -1.649e+00 1.186e-01 -13.907 < 2e-16 ***
## person_home_ownershipRENT 8.441e-01 4.776e-02 17.672 < 2e-16 ***
## person_emp_length -5.773e-03 5.667e-03 -1.019 0.308383
## loan_intentEDUCATION -8.950e-01 6.839e-02 -13.087 < 2e-16 ***
## loan_intentHOMEIMPROVEMENT -1.216e-02 7.577e-02 -0.160 0.872528
## loan_intentMEDICAL -1.556e-01 6.411e-02 -2.427 0.015206 *
## loan_intentPERSONAL -6.650e-01 6.940e-02 -9.582 < 2e-16 ***
## loan_intentVENTURE -1.116e+00 7.347e-02 -15.183 < 2e-16 ***
## loan_gradeB 1.666e-01 7.544e-02 2.209 0.027179 *
## loan_gradeC 3.588e-01 1.076e-01 3.335 0.000854 ***
## loan_gradeD 2.458e+00 1.314e-01 18.702 < 2e-16 ***
## loan_gradeE 2.718e+00 1.715e-01 15.848 < 2e-16 ***
## loan_gradeF 2.923e+00 2.486e-01 11.760 < 2e-16 ***
## loan_gradeG 6.444e+00 1.047e+00 6.156 7.47e-10 ***
## loan_amnt -1.094e-04 5.273e-06 -20.751 < 2e-16 ***
## loan_int_rate 6.756e-02 1.529e-02 4.419 9.94e-06 ***
## loan_percent_income 1.362e+01 3.083e-01 44.186 < 2e-16 ***
## cb_person_default_on_fileY -2.368e-02 5.967e-02 -0.397 0.691498
## cb_person_cred_hist_length -5.247e-03 1.063e-02 -0.493 0.621744
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 23891 on 22782 degrees of freedom
## Residual deviance: 15496 on 22760 degrees of freedom
## AIC: 15542
##
## Number of Fisher Scoring iterations: 6
## Model evaluation
pred <- predict(classifier, newdata=testData, type="response")
roc_obj <- roc(testData$loan_status, pred)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## ROC Curve
plot(roc_obj, print.auc=TRUE)
## Feature Importance
# Install and load the randomForest package
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
# Fit the Random Forest model
rf_model <- randomForest(loan_status ~ ., data=trainData, ntree=100)
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?
rf_model
##
## Call:
## randomForest(formula = loan_status ~ ., data = trainData, ntree = 100)
## Type of random forest: regression
## Number of trees: 100
## No. of variables tried at each split: 3
##
## Mean of squared residuals: 0.06189702
## % Var explained: 63.69
# Extract and plot feature importance
importance <- importance(rf_model)
imp_plot <- varImpPlot(rf_model)
print(imp_plot)
## IncNodePurity
## person_age 139.74988
## person_income 533.55966
## person_home_ownership 382.12128
## person_emp_length 217.88718
## loan_intent 262.90475
## loan_grade 441.10415
## loan_amnt 258.35769
## loan_int_rate 376.47473
## loan_percent_income 845.11577
## cb_person_default_on_file 42.18434
## cb_person_cred_hist_length 109.52154
# Count the number of each class
class_count <- table(data$loan_status)
# Create a bar plot for class distribution
barplot(class_count, main="Class Distribution", xlab="Class", ylab="Frequency", col=c("blue", "red"))