This is my project for prediction Credit Risk Scoring
Dataset from kaggle Credit Risk Dataset
library(knitr)
library(dplyr)
Attaching package: ‘dplyr’
The following objects are masked from ‘package:stats’:
filter, lag
The following objects are masked from ‘package:base’:
intersect, setdiff, setequal, union
library(data.table)
Registered S3 method overwritten by 'data.table':
method from
print.data.table
data.table 1.16.2 using 4 threads (see ?getDTthreads). Latest news: r-datatable.com
Attaching package: ‘data.table’
The following objects are masked from ‘package:dplyr’:
between, first, last
library(corrplot)
Warning: package ‘corrplot’ was built under R version 4.4.3corrplot 0.95 loaded
library(ggplot2)
Learn more about the underlying theory at https://ggplot2-book.org/
library(reshape2)
Warning: package ‘reshape2’ was built under R version 4.4.3
Attaching package: ‘reshape2’
The following objects are masked from ‘package:data.table’:
dcast, melt
library(e1071)
Detailed data description of Credit Risk dataset:
df <- read.csv("Dataset/credit_risk_dataset.csv")
df %>% head(10) %>% data.table()
str(df)
'data.frame': 32581 obs. of 12 variables:
$ person_age : int 22 21 25 23 24 21 26 24 24 21 ...
$ person_income : int 59000 9600 9600 65500 54400 9900 77100 78956 83000 10000 ...
$ person_home_ownership : chr "RENT" "OWN" "MORTGAGE" "RENT" ...
$ person_emp_length : num 123 5 1 4 8 2 8 5 8 6 ...
$ loan_intent : chr "PERSONAL" "EDUCATION" "MEDICAL" "MEDICAL" ...
$ loan_grade : chr "D" "B" "C" "C" ...
$ loan_amnt : int 35000 1000 5500 35000 35000 2500 35000 35000 35000 1600 ...
$ loan_int_rate : num 16 11.1 12.9 15.2 14.3 ...
$ loan_status : int 1 0 1 1 1 1 1 1 1 1 ...
$ loan_percent_income : num 0.59 0.1 0.57 0.53 0.55 0.25 0.45 0.44 0.42 0.16 ...
$ cb_person_default_on_file : chr "Y" "N" "N" "N" ...
$ cb_person_cred_hist_length: int 3 2 3 2 4 2 3 4 2 3 ...
# Home ownership
df %>%
count(person_home_ownership, sort = TRUE)
set_levels_person_home_ownership <- unique(df$person_home_ownership)
# Home ownership
df %>%
count(loan_intent, sort = TRUE)
set_levels_loan_intent <- unique(df$loan_intent)
# Loan Grade
df %>%
count(loan_grade, sort = TRUE)
set_levels_loan_grade <- sort(unique(df$loan_grade))
# Historical default
df %>%
count(cb_person_default_on_file , sort = TRUE)
set_levels_cb_person_default_on_file <- c("N", "Y")
df <- df %>%
mutate(
person_emp_length = ifelse(is.na(person_emp_length), 0, person_emp_length),
loan_int_rate = ifelse(is.na(loan_int_rate), 0, person_emp_length),
person_home_ownership = factor(person_home_ownership, levels = set_levels_person_home_ownership),
person_home_ownership_int = as.integer(person_home_ownership),
loan_intent = factor(loan_intent, levels = set_levels_loan_intent),
loan_intent_int = as.integer(loan_intent),
loan_grade = factor(loan_grade, levels = set_levels_loan_grade),
loan_grade_int = as.integer(loan_grade),
cb_person_default_on_file = factor(cb_person_default_on_file, levels = set_levels_cb_person_default_on_file),
cb_person_default_on_file_int = as.integer(cb_person_default_on_file)-1
)
df %>% head()
# Compute the correlation matrix
cor_matrix <- cor(df %>% select_if(is.numeric))
# Convert the matrix to a format suitable for ggplot
cor_df <- melt(cor_matrix)
# Plot the heatmap with correlation values and rotated axis labels
ggplot(cor_df, aes(Var1, Var2, fill = value)) +
geom_tile() +
geom_text(aes(label = round(value, 2)), color = "black", size = 4) + # Show correlation numbers
scale_fill_gradient2(low = "blue", mid = "white", high = "red", midpoint = 0) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) + # Rotate x-axis labels
labs(title = "Heatmap of Correlation Matrix", fill = "Correlation")
NA
NA
2 variable are indicate have strong correlation - cb_person_cred_hist_length (strong with person_age) - loan_int_rate (strong with person_employe_length)
#List Exclude Variable
list_exclude <- c("cb_person_cred_hist_length", "loan_int_rate")
# Compute the correlation matrix
cor_matrix <- cor(df %>% select_if(is.numeric) %>% select(-list_exclude))
Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
Please use `all_of()` or `any_of()` instead.
# Was:
data %>% select(list_exclude)
# Now:
data %>% select(all_of(list_exclude))
See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
# Convert the matrix to a format suitable for ggplot
cor_df <- melt(cor_matrix)
# Plot the heatmap with correlation values and rotated axis labels
ggplot(cor_df, aes(Var1, Var2, fill = value)) +
geom_tile() +
geom_text(aes(label = round(value, 2)), color = "black", size = 4) + # Show correlation numbers
scale_fill_gradient2(low = "blue", mid = "white", high = "red", midpoint = 0) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) + # Rotate x-axis labels
labs(title = "Heatmap of Correlation Matrix", fill = "Correlation")
df_clean <- df %>%
select_if(is.numeric) %>%
select(-list_exclude)
str(df_clean)
'data.frame': 32581 obs. of 10 variables:
$ person_age : int 22 21 25 23 24 21 26 24 24 21 ...
$ person_income : int 59000 9600 9600 65500 54400 9900 77100 78956 83000 10000 ...
$ person_emp_length : num 123 5 1 4 8 2 8 5 8 6 ...
$ loan_amnt : int 35000 1000 5500 35000 35000 2500 35000 35000 35000 1600 ...
$ loan_status : int 1 0 1 1 1 1 1 1 1 1 ...
$ loan_percent_income : num 0.59 0.1 0.57 0.53 0.55 0.25 0.45 0.44 0.42 0.16 ...
$ person_home_ownership_int : int 1 2 3 1 1 2 1 1 1 2 ...
$ loan_intent_int : int 1 2 3 3 3 4 2 3 1 4 ...
$ loan_grade_int : int 4 2 3 3 3 1 2 2 1 4 ...
$ cb_person_default_on_file_int: num 1 0 0 0 1 0 0 0 0 0 ...
library(caTools)
Warning: package ‘caTools’ was built under R version 4.4.3
set.seed(123)
split <- sample.split(df_clean$loan_status, SplitRatio = 0.7)
df_train <- subset(df_clean, split == TRUE)
df_test <- subset(df_clean, split == FALSE)
df_train %>%
head()
df_test %>%
head()
# Train the Naive Bayes model
model <- naiveBayes(loan_status ~ ., data = df_train)
# Make predictions
predictions <- predict(model, df_test)
# View results
table(df_test$loan_status, predictions)
predictions
0 1
0 6610 1032
1 830 1302
# Accuracy
accuracy <- sum(predictions == df_test$loan_status) / nrow(df_test)
print(paste("Accuracy:", round(accuracy * 100, 2), "%"))
[1] "Accuracy: 80.95 %"
# Evaluate accuracy
results <- sapply(models, function(model) {
predictions <- predict(model, df_train)
mean(predictions == df_train$loan_status) * 100 # Accuracy percentage
})
# Evaluate accuracy
results <- sapply(models, function(model) {
predictions <- predict(model, df_train)
mean(predictions == df_train$loan_status) * 100 # Accuracy percentage
})
print(results)
naive_bayes logistic_regression decision_tree random_forest
81.71176 0.00000 11.38685 0.00000
svm
0.00000
Best of model : Naive Bayes