# Read the semicolon-separated credit-risk file; the first line holds the
# column names and "." is the decimal mark.
mydata <- read.table(
  file = "./credit_risk_dataset.csv",
  header = TRUE,
  sep = ";",
  dec = "."
)
# Show the first rows to check that the file was parsed as expected.
head(mydata)
## person_age person_income loan_intent loan_amnt loan_status
## 1 22 59000 PERSONAL 35000 1
## 2 21 9600 EDUCATION 1000 0
## 3 25 9600 MEDICAL 5500 1
## 4 23 65500 MEDICAL 35000 1
## 5 24 54400 MEDICAL 35000 1
## 6 21 9900 VENTURE 2500 1
## loan_percent_income
## 1 0.59
## 2 0.10
## 3 0.57
## 4 0.53
## 5 0.55
## 6 0.25
General:
Variables:
person_age
person_income
loan_intent
loan_amnt
loan_status
loan_percent_income
# Preview the first six rows of the data before any recoding.
head(mydata, n = 6L)
## person_age person_income loan_intent loan_amnt loan_status
## 1 22 59000 PERSONAL 35000 1
## 2 21 9600 EDUCATION 1000 0
## 3 25 9600 MEDICAL 5500 1
## 4 23 65500 MEDICAL 35000 1
## 5 24 54400 MEDICAL 35000 1
## 6 21 9900 VENTURE 2500 1
## loan_percent_income
## 1 0.59
## 2 0.10
## 3 0.57
## 4 0.53
## 5 0.55
## 6 0.25
# Recode loan_status from its 0/1 codes into a labelled factor
# (0 -> "non-default", 1 -> "default").
mydata[["loan_status"]] <- factor(
  mydata[["loan_status"]],
  levels = c(0, 1),
  labels = c("non-default", "default")
)
One individual had an implausible recorded age of 144 years; this value is corrected below.
# Replace the implausible recorded age of 144 years with 27.
# The record is located by its value and the column by its name, instead of
# by a fixed position (row 82, column 1), so the correction still targets the
# right cell if the rows or columns of the input file are reordered.
# which() drops NA comparisons, so missing ages are left untouched.
mydata$person_age[which(mydata$person_age == 144)] <- 27
# Give the age column a shorter display name.
names(mydata)[names(mydata) == "person_age"] <- "Age"
# Confirm the recoded status labels and the renamed column.
head(mydata)
## Age person_income loan_intent loan_amnt loan_status loan_percent_income
## 1 22 59000 PERSONAL 35000 default 0.59
## 2 21 9600 EDUCATION 1000 non-default 0.10
## 3 25 9600 MEDICAL 5500 default 0.57
## 4 23 65500 MEDICAL 35000 default 0.53
## 5 24 54400 MEDICAL 35000 default 0.55
## 6 21 9900 VENTURE 2500 default 0.25
library(pastecs)
# Descriptive statistics for the numeric columns, rounded to one decimal.
# Columns are selected by type rather than by position, so the categorical
# columns (loan_intent, loan_status) are excluded even if the column order
# of the data changes.
round(
  stat.desc(mydata[, vapply(mydata, is.numeric, logical(1)), drop = FALSE]),
  1
)
## Age person_income loan_amnt loan_percent_income
## nbr.val 150.0 150.0 150.0 150.0
## nbr.null 0.0 0.0 0.0 0.0
## nbr.na 0.0 0.0 0.0 0.0
## min 21.0 9600.0 1000.0 0.0
## max 27.0 500000.0 35000.0 0.6
## range 6.0 490400.0 34000.0 0.6
## sum 3564.0 16784761.0 3110300.0 42.6
## median 24.0 77550.0 25000.0 0.3
## mean 23.8 111898.4 20735.3 0.3
## SE.mean 0.1 7996.0 950.7 0.0
## CI.mean.0.95 0.3 15800.2 1878.5 0.0
## var 2.7 9590395264.4 135565110.6 0.0
## std.dev 1.7 97930.6 11643.2 0.2
## coef.var 0.1 0.9 0.6 0.6
The table above shows descriptive statistics for the numeric variables in the dataset; the categorical variables are excluded.
Mode - reason for loan
library(modeest)
# Most frequent value (mode) of the loan intent column.
mlv(mydata[["loan_intent"]])
## [1] "EDUCATION"
Standard deviation
# Sample standard deviation of income.
sd(mydata[["person_income"]])
## [1] 97930.56
# Disable scientific notation BEFORE drawing: the option only affects output
# produced after it is set, so it must precede hist() for the income axis to
# be labelled with plain numbers rather than values such as 1e+05.
options(scipen = 999)
# Histogram of income across all records.
hist(mydata$person_income,
     main = "Distribution of Income",
     xlab = "Income",
     ylab = "Amount",
     col = "green")
Explanation:
The histogram shows the distribution of applicants' annual income in the dataset.
A small number of individuals earn between $300,000 and $500,000 — these outliers give the distribution a right skew, which is common for income data.
Number of Loans by Intent
library(ggplot2)
# Bar chart of the number of loans recorded for each loan intent.
ggplot(data = mydata, mapping = aes(x = loan_intent)) +
  geom_bar() +
  labs(
    title = "Number of Loans by Intent",
    x = "Loan Intent",
    y = "Frequency"
  ) +
  theme_minimal()
Explanation:
Education loans are the most common, followed by venture and debt consolidation.
The plot is a bar chart.
Loan Amount by Loan Intent
library(ggplot2)
# Box plots of the loan amount, one box per loan intent.
ggplot(data = mydata, mapping = aes(x = loan_intent, y = loan_amnt)) +
  geom_boxplot(fill = "lightgray", color = "black") +
  labs(
    title = "Loan Amount by Loan Intent",
    x = "Loan Intent",
    y = "Loan Amount"
  ) +
  theme_minimal()
Explanation:
A boxplot displays the distribution of loan amounts for each loan
intent.
Personal loans tend to have the highest median loan amount, although the differences are very small.
All categories are in a similar range; only “Home Improvements” seems to have strong outliers.