This R Markdown document describes the steps for creating a credit risk prediction model using logistic regression. We start by loading the necessary libraries and the dataset, followed by data preprocessing and model training and evaluation. Lastly, we focus on visualizing our results.
# Load the required package
library(colorspace)
library(grid)
library(caTools)
library(VIM)
## The legacy packages maptools, rgdal, and rgeos, underpinning the sp package,
## which was just loaded, will retire in October 2023.
## Please refer to R-spatial evolution reports for details, especially
## https://r-spatial.org/r/2023/05/15/evolution4.html.
## It may be desirable to make the sf package available;
## package maintainers should consider adding sf to Suggests:.
## The sp package is now running under evolution status 2
## (status 2 uses the sf package in place of rgdal)
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
library(mice)
##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(ggplot2)
library(corrplot)
## corrplot 0.92 loaded
library(caret)
## Loading required package: lattice
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following object is masked from 'package:colorspace':
##
## coords
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
data <- read.csv("/Users/zihualai/Desktop/credit risk/credit_risk_dataset.csv")
str(data)
## 'data.frame': 32581 obs. of 12 variables:
## $ person_age : int 22 21 25 23 24 21 26 24 24 21 ...
## $ person_income : int 59000 9600 9600 65500 54400 9900 77100 78956 83000 10000 ...
## $ person_home_ownership : chr "RENT" "OWN" "MORTGAGE" "RENT" ...
## $ person_emp_length : num 123 5 1 4 8 2 8 5 8 6 ...
## $ loan_intent : chr "PERSONAL" "EDUCATION" "MEDICAL" "MEDICAL" ...
## $ loan_grade : chr "D" "B" "C" "C" ...
## $ loan_amnt : int 35000 1000 5500 35000 35000 2500 35000 35000 35000 1600 ...
## $ loan_int_rate : num 16 11.1 12.9 15.2 14.3 ...
## $ loan_status : int 1 0 1 1 1 1 1 1 1 1 ...
## $ loan_percent_income : num 0.59 0.1 0.57 0.53 0.55 0.25 0.45 0.44 0.42 0.16 ...
## $ cb_person_default_on_file : chr "Y" "N" "N" "N" ...
## $ cb_person_cred_hist_length: int 3 2 3 2 4 2 3 4 2 3 ...
summary(data)
## person_age person_income person_home_ownership person_emp_length
## Min. : 20.00 Min. : 4000 Length:32581 Min. : 0.00
## 1st Qu.: 23.00 1st Qu.: 38500 Class :character 1st Qu.: 2.00
## Median : 26.00 Median : 55000 Mode :character Median : 4.00
## Mean : 27.73 Mean : 66075 Mean : 4.79
## 3rd Qu.: 30.00 3rd Qu.: 79200 3rd Qu.: 7.00
## Max. :144.00 Max. :6000000 Max. :123.00
## NA's :895
## loan_intent loan_grade loan_amnt loan_int_rate
## Length:32581 Length:32581 Min. : 500 Min. : 5.42
## Class :character Class :character 1st Qu.: 5000 1st Qu.: 7.90
## Mode :character Mode :character Median : 8000 Median :10.99
## Mean : 9589 Mean :11.01
## 3rd Qu.:12200 3rd Qu.:13.47
## Max. :35000 Max. :23.22
## NA's :3116
## loan_status loan_percent_income cb_person_default_on_file
## Min. :0.0000 Min. :0.0000 Length:32581
## 1st Qu.:0.0000 1st Qu.:0.0900 Class :character
## Median :0.0000 Median :0.1500 Mode :character
## Mean :0.2182 Mean :0.1702
## 3rd Qu.:0.0000 3rd Qu.:0.2300
## Max. :1.0000 Max. :0.8300
##
## cb_person_cred_hist_length
## Min. : 2.000
## 1st Qu.: 3.000
## Median : 4.000
## Mean : 5.804
## 3rd Qu.: 8.000
## Max. :30.000
##
# Visualzie missing data
matrixplot(data)
##
## Click in a column to sort by the corresponding variable.
## To regain use of the VIM GUI and the R console, click outside the plot region.
# We noted that columns `person_emp_length` and `loan_int_rate` had missing data. We imputed these missing values with the respective means of the columns.
# Impute missing values
data$person_emp_length[is.na(data$person_emp_length)] <- mean(data$person_emp_length, na.rm = TRUE)
data$loan_int_rate[is.na(data$loan_int_rate)] <- mean(data$loan_int_rate, na.rm = TRUE)
##Dealing with Outliers
#Age was the primary concern where outliers were identified. We set an acceptable range (18-65 years) for age and filtered out records that were outside this range.
#Boxplot for age to visualize outliers
boxplot(data$person_age, main= "Age Distribution", ylab = "Age")
## Remove outliers
data <- data[data$person_age <= 65 & data$person_age >= 18,]
# We visualized correlations among the numeric variables. This helps in identifying multicollinearity which might be a concern for certain models.
# Correlation analysis for numeric variable
cor_data <- data[, sapply(data, is.numeric)]
cor_matrix <- cor(cor_data)
corrplot(cor_matrix, method = "circle", type = "upper")
## Remove the STRONG correlation of person_age and
cb_person_cred_hist_length
# Check the correlation coefficient
cor_coeff <- cor(data$person_age, data$cb_person_cred_hist_length)
cat("Correlation between person_age and cb_person_cred_hist_length:", cor_coeff, "\n")
## Correlation between person_age and cb_person_cred_hist_length: 0.8774069
# If the correlation is above a threshold 0.8, drop one of the columns
threshold <- 0.8
if (abs(cor_coeff) > threshold) {
data$cb_person_cred_hist_length <- NULL
cat("Dropped cb_person_cred_hist_length due to high correlation.\n")
}
## Dropped cb_person_cred_hist_length due to high correlation.
# Data Splitting
set.seed(123)
splitIndex <- createDataPartition(data$loan_status, p=0.7, list=FALSE)
trainData <- data[splitIndex,]
testData <- data[-splitIndex,]
# Model Training
classifier <- glm(loan_status ~ ., data=trainData, family=binomial)
summary(classifier)
##
## Call:
## glm(formula = loan_status ~ ., family = binomial, data = trainData)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.227e+00 1.739e-01 -24.312 < 2e-16 ***
## person_age -3.445e-03 3.464e-03 -0.995 0.319905
## person_income 2.069e-06 5.770e-07 3.586 0.000336 ***
## person_home_ownershipOTHER 4.489e-01 3.319e-01 1.353 0.176180
## person_home_ownershipOWN -1.650e+00 1.186e-01 -13.912 < 2e-16 ***
## person_home_ownershipRENT 8.439e-01 4.776e-02 17.670 < 2e-16 ***
## person_emp_length -5.729e-03 5.666e-03 -1.011 0.312026
## loan_intentEDUCATION -8.954e-01 6.838e-02 -13.094 < 2e-16 ***
## loan_intentHOMEIMPROVEMENT -1.178e-02 7.577e-02 -0.155 0.876471
## loan_intentMEDICAL -1.558e-01 6.411e-02 -2.430 0.015103 *
## loan_intentPERSONAL -6.655e-01 6.939e-02 -9.590 < 2e-16 ***
## loan_intentVENTURE -1.116e+00 7.347e-02 -15.187 < 2e-16 ***
## loan_gradeB 1.666e-01 7.544e-02 2.208 0.027238 *
## loan_gradeC 3.591e-01 1.076e-01 3.337 0.000846 ***
## loan_gradeD 2.458e+00 1.314e-01 18.709 < 2e-16 ***
## loan_gradeE 2.718e+00 1.715e-01 15.855 < 2e-16 ***
## loan_gradeF 2.924e+00 2.486e-01 11.766 < 2e-16 ***
## loan_gradeG 6.440e+00 1.047e+00 6.154 7.58e-10 ***
## loan_amnt -1.094e-04 5.273e-06 -20.746 < 2e-16 ***
## loan_int_rate 6.745e-02 1.529e-02 4.412 1.03e-05 ***
## loan_percent_income 1.362e+01 3.083e-01 44.184 < 2e-16 ***
## cb_person_default_on_fileY -2.342e-02 5.967e-02 -0.393 0.694646
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 23891 on 22782 degrees of freedom
## Residual deviance: 15496 on 22761 degrees of freedom
## AIC: 15540
##
## Number of Fisher Scoring iterations: 6
# Model evaluation
pred <- predict(classifier, newdata=testData, type="response")
roc_obj <- roc(testData$loan_status, pred)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_obj, print.auc=TRUE, main="ROC Curve for Loan Status Prediction")
## AUC = 0.87 here’s an 87% probability that the model will rank the
defaulter higher in terms of credit risk than the non-defaulte ##Feature
importance using Random Forest
#Fit the Random Forest model for feature importance
rf_model <- randomForest(loan_status ~ ., data=trainData, ntree=100)
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?
# Extract and plot feature importance
importance <- importance(rf_model)
varImpPlot(rf_model, main="Feature Importance based on Random Forest")
## Using a Random Forest model, we derived the importance of each
feature in predicting the loan status. Features with higher importance
can be focused upon in future analyses ## I found some of the variables
have high scores which are over 400 . They are loan_int_rate;
person_home_ownership; loan_grade; person_income;
laon_percent_income
##Visualizing Distribution of Numeric Variables
numeric_vars <- c("loan_int_rate", "person_income", "loan_percent_income")
for (var in numeric_vars) {
p <- ggplot(data, aes(.data[[var]])) +
geom_histogram(fill = "blue", color = "black", bins = 30) +
theme_minimal() +
ggtitle(paste("Distribution of", var))
print(p)
}
# Visualizing Distribution of Categorical Variables
categorical_vars <- c("person_home_ownership", "loan_grade")
for (var in categorical_vars) {
p<-ggplot(data, aes(.data[[var]])) +
geom_bar(fill = "blue", color = "black") +
theme_minimal() +
ggtitle(paste("Distribution of", var))
print(p)
}
# Plotting against the target variable loan_status
ggplot(data, aes(x=loan_int_rate, fill=factor(loan_status))) +
geom_histogram(position="identity", alpha=0.5, bins=30) +
theme_minimal() +
ggtitle("Distribution of loan_int_rate vs loan_status")
## when loan_int_rate is higher than 15%, almost no one can stand it ##
Checking correlations
cor_matrix <- cor(data[, c("loan_int_rate", "person_income", "loan_percent_income")])
print(cor_matrix)
## loan_int_rate person_income loan_percent_income
## loan_int_rate 1.0000000000 -0.0009710371 0.1148879
## person_income -0.0009710371 1.0000000000 -0.2945190
## loan_percent_income 0.1148878736 -0.2945189861 1.0000000
##loan_int_rate and person_income have a very weak negative correlation (almost zero). ##loan_int_rate and loan_percent_income have a weak positive correlation. ##person_income and loan_percent_income have a moderate negative correlation. ## So I can safely include all 3 in my model
# Create a bar plot for class distribution
class_count <- table(data$loan_status)
barplot(class_count, main="Class Distribution of Loan Status", xlab="Class", ylab="Frequency", col=c("blue", "red"), legend=rownames(class_count))
## Then this table tells us the information about business understanding
in a group
```