library(tidyverse)
library(miscset)
library(ggthemr)
library(recipes)
library(plotly)
library(GGally)
library(tidyquant)
library(ggrepel)
library(parsnip)
library(skimr)
german_data <- read.csv("~/Documents/german_credit_data.csv")
german_data %>% head()
## X Age Sex Job Housing Saving.accounts Checking.account Credit.amount
## 1 0 67 male 2 own <NA> little 1169
## 2 1 22 female 2 own little moderate 5951
## 3 2 49 male 1 own little <NA> 2096
## 4 3 45 male 2 free little little 7882
## 5 4 53 male 2 free little little 4870
## 6 5 35 male 1 free <NA> <NA> 9055
## Duration Purpose Risk
## 1 6 radio/TV good
## 2 48 radio/TV bad
## 3 12 education good
## 4 42 furniture/equipment good
## 5 24 car bad
## 6 36 education good
german_data %>% glimpse()
## Observations: 1,000
## Variables: 11
## $ X <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15…
## $ Age <int> 67, 22, 49, 45, 53, 35, 53, 35, 61, 28, 25, 24, 22, …
## $ Sex <fct> male, female, male, male, male, male, male, male, ma…
## $ Job <int> 2, 2, 1, 2, 2, 1, 2, 3, 1, 3, 2, 2, 2, 1, 2, 1, 2, 2…
## $ Housing <fct> own, own, own, free, free, free, own, rent, own, own…
## $ Saving.accounts <fct> NA, little, little, little, little, NA, quite rich, …
## $ Checking.account <fct> little, moderate, NA, little, little, NA, NA, modera…
## $ Credit.amount <int> 1169, 5951, 2096, 7882, 4870, 9055, 2835, 6948, 3059…
## $ Duration <int> 6, 48, 12, 42, 24, 36, 24, 36, 12, 30, 12, 48, 12, 2…
## $ Purpose <fct> radio/TV, radio/TV, education, furniture/equipment, …
## $ Risk <fct> good, bad, good, good, bad, good, good, good, good, …
german_data %>% map(~ sum(is.na(.)))
## $X
## [1] 0
##
## $Age
## [1] 0
##
## $Sex
## [1] 0
##
## $Job
## [1] 0
##
## $Housing
## [1] 0
##
## $Saving.accounts
## [1] 183
##
## $Checking.account
## [1] 394
##
## $Credit.amount
## [1] 0
##
## $Duration
## [1] 0
##
## $Purpose
## [1] 0
##
## $Risk
## [1] 0
german_credit <- german_data %>%
select(Age:Risk)
german_credit %>% summary()
## Age Sex Job Housing Saving.accounts
## Min. :19.00 female:310 Min. :0.000 free:108 little :603
## 1st Qu.:27.00 male :690 1st Qu.:2.000 own :713 moderate :103
## Median :33.00 Median :2.000 rent:179 quite rich: 63
## Mean :35.55 Mean :1.904 rich : 48
## 3rd Qu.:42.00 3rd Qu.:2.000 NA's :183
## Max. :75.00 Max. :3.000
##
## Checking.account Credit.amount Duration Purpose
## little :274 Min. : 250 Min. : 4.0 car :337
## moderate:269 1st Qu.: 1366 1st Qu.:12.0 radio/TV :280
## rich : 63 Median : 2320 Median :18.0 furniture/equipment:181
## NA's :394 Mean : 3271 Mean :20.9 business : 97
## 3rd Qu.: 3972 3rd Qu.:24.0 education : 59
## Max. :18424 Max. :72.0 repairs : 22
## (Other) : 24
## Risk
## bad :300
## good:700
##
##
##
##
##
Most of people did credit for car
Convert variable value from numeric into character
german_credit[german_credit$Job == 0,]$Job <- "unskilled0"
german_credit[german_credit$Job == 1,]$Job <- "unskilled1"
german_credit[german_credit$Job == 2,]$Job <- "skilled"
german_credit[german_credit$Job == 3,]$Job <- "highskilled"
german_credit$Job <- as.factor(german_credit$Job)
str(german_credit)
## 'data.frame': 1000 obs. of 10 variables:
## $ Age : int 67 22 49 45 53 35 53 35 61 28 ...
## $ Sex : Factor w/ 2 levels "female","male": 2 1 2 2 2 2 2 2 2 2 ...
## $ Job : Factor w/ 4 levels "highskilled",..: 2 2 4 2 2 4 2 1 4 1 ...
## $ Housing : Factor w/ 3 levels "free","own","rent": 2 2 2 1 1 1 2 3 2 2 ...
## $ Saving.accounts : Factor w/ 4 levels "little","moderate",..: NA 1 1 1 1 NA 3 1 4 1 ...
## $ Checking.account: Factor w/ 3 levels "little","moderate",..: 1 2 NA 1 1 NA NA 2 NA 2 ...
## $ Credit.amount : int 1169 5951 2096 7882 4870 9055 2835 6948 3059 5234 ...
## $ Duration : int 6 48 12 42 24 36 24 36 12 30 ...
## $ Purpose : Factor w/ 8 levels "business","car",..: 6 6 4 5 2 4 5 2 6 2 ...
## $ Risk : Factor w/ 2 levels "bad","good": 2 1 2 2 1 2 2 2 2 1 ...
ggthemr("grape", type="outer")
## Warning: New theme missing the following elements: axis.ticks.length.x,
## axis.ticks.length.x.top, axis.ticks.length.x.bottom, axis.ticks.length.y,
## axis.ticks.length.y.left, axis.ticks.length.y.right
ggplotGrid(ncol=2,
lapply(c("Sex", "Housing", "Job", "Saving.accounts", "Checking.account"),
function(col){
ggplot(german_credit,aes_string(col)) +
geom_bar(aes(fill=Risk),position="dodge")
}))
ggthemr("grape", type = "outer")
## Warning: New theme missing the following elements: axis.ticks.length.x,
## axis.ticks.length.x.top, axis.ticks.length.x.bottom, axis.ticks.length.y,
## axis.ticks.length.y.left, axis.ticks.length.y.right
ggplot(german_credit, aes(x = Purpose, fill = Risk)) +
geom_bar(position = "dodge")
ggthemr_reset()
ggplotGrid(ncol=1,
lapply(c("Age", "Credit.amount", "Duration"),
function(col){
ggplot(german_credit,aes_string(col)) +
geom_density(aes(fill=Risk), bins=30, alpha = 0.5)
}))
## Warning: Ignoring unknown parameters: bins
## Warning: Ignoring unknown parameters: bins
## Warning: Ignoring unknown parameters: bins
ggthemr("flat", type="outer")
## Warning: New theme missing the following elements: axis.ticks.length.x,
## axis.ticks.length.x.top, axis.ticks.length.x.bottom, axis.ticks.length.y,
## axis.ticks.length.y.left, axis.ticks.length.y.right
saving_accountplot <- german_data %>%
select(Saving.accounts, Purpose) %>%
filter(!is.na(Saving.accounts)) %>%
group_by(Saving.accounts, Purpose) %>%
arrange(desc(Purpose)) %>%
ggplot(aes(x = Saving.accounts, fill = Purpose))+
geom_bar(position = "dodge", alpha = 0.7)
saving_accountplot
ggthemr("flat", type = "outer")
## Warning: New theme missing the following elements: axis.ticks.length.x,
## axis.ticks.length.x.top, axis.ticks.length.x.bottom, axis.ticks.length.y,
## axis.ticks.length.y.left, axis.ticks.length.y.right
purpose_by_age2 <- german_credit %>%
select(Purpose, Age) %>%
group_by(Purpose) %>%
ggplot(aes(x = Purpose, y = Age, fill = Purpose))+
geom_boxplot(alpha = 0.7)
purpose_by_age2
saving_by_credit_amo <- german_credit %>%
select(Saving.accounts, Credit.amount, Risk) %>%
filter(!is.na(Saving.accounts)) %>%
group_by(Saving.accounts, Risk) %>%
ggplot(aes(x = Saving.accounts, y = Credit.amount, fill = Risk)) +
geom_boxplot(alpha=0.7)+
geom_jitter(alpha=0.4)
saving_by_credit_amo
summary(german_credit$Age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 19.00 27.00 33.00 35.55 42.00 75.00
x <- german_credit$Age
agect <- cut(x, 4)
head(agect)
## [1] (61,75.1] (18.9,33] (47,61] (33,47] (47,61] (33,47]
## Levels: (18.9,33] (33,47] (47,61] (61,75.1]
lab <- cut(x, 4, labels = c("Young", "Adult", "Senior", "Elder"))
german_credit$Age_group <- lab
head(german_credit)
## Age Sex Job Housing Saving.accounts Checking.account Credit.amount
## 1 67 male skilled own <NA> little 1169
## 2 22 female skilled own little moderate 5951
## 3 49 male unskilled1 own little <NA> 2096
## 4 45 male skilled free little little 7882
## 5 53 male skilled free little little 4870
## 6 35 male unskilled1 free <NA> <NA> 9055
## Duration Purpose Risk Age_group
## 1 6 radio/TV good Elder
## 2 48 radio/TV bad Young
## 3 12 education good Senior
## 4 42 furniture/equipment good Adult
## 5 24 car bad Senior
## 6 36 education good Adult
x <- german_credit$Duration
ct <- cut(x,4)
head(ct)
## [1] (3.93,21] (38,55] (3.93,21] (38,55] (21,38] (21,38]
## Levels: (3.93,21] (21,38] (38,55] (55,72.1]
lab <- cut(x,4, labels = c("Short","Medium","Long","Longer"))
german_credit$Duration_group <- lab
summary(german_data$Credit.amount)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 250 1366 2320 3271 3972 18424
x <- german_credit$Credit.amount
lab <- cut(x, 4, labels = c("Low","Medium","High","Higher"))
german_credit$Credit_amount_group <- lab
head(german_credit)
## Age Sex Job Housing Saving.accounts Checking.account Credit.amount
## 1 67 male skilled own <NA> little 1169
## 2 22 female skilled own little moderate 5951
## 3 49 male unskilled1 own little <NA> 2096
## 4 45 male skilled free little little 7882
## 5 53 male skilled free little little 4870
## 6 35 male unskilled1 free <NA> <NA> 9055
## Duration Purpose Risk Age_group Duration_group
## 1 6 radio/TV good Elder Short
## 2 48 radio/TV bad Young Long
## 3 12 education good Senior Short
## 4 42 furniture/equipment good Adult Long
## 5 24 car bad Senior Medium
## 6 36 education good Adult Medium
## Credit_amount_group
## 1 Low
## 2 Medium
## 3 Low
## 4 Medium
## 5 Medium
## 6 Medium
german_clean <- german_credit %>%
select(-Age, -Credit.amount, -Duration) %>%
na.omit(german_credit)
ggplot(german_clean, aes(x = Risk, fill = Risk)) +
geom_bar(position = "dodge") +
theme_bw()
head(german_clean)
## Sex Job Housing Saving.accounts Checking.account
## 2 female skilled own little moderate
## 4 male skilled free little little
## 5 male skilled free little little
## 8 male highskilled rent little moderate
## 10 male highskilled own little moderate
## 11 female skilled rent little moderate
## Purpose Risk Age_group Duration_group Credit_amount_group
## 2 radio/TV bad Young Long Medium
## 4 furniture/equipment good Adult Long Medium
## 5 car bad Senior Medium Medium
## 8 car good Adult Medium Medium
## 10 car bad Young Medium Medium
## 11 car bad Young Short Low
recipe_obj <- recipe(~ ., data = german_clean) %>%
step_dummy(all_nominal(), one_hot = TRUE, naming = partial(dummy_names, sep = "__")) %>%
prep()
data_transformed_tbl <- german_clean %>%
bake(recipe_obj, new_data = .)
correlation_tbl <- data_transformed_tbl %>%
cor(y = data_transformed_tbl$Risk__good) %>%
as_tibble(rownames = "feature") %>%
rename(Risk__good = V1) %>%
separate(feature, into = c("feature", "bin"), sep = "__") %>%
filter(!is.na(Risk__good)) %>%
filter(!str_detect(feature, "Risk")) %>%
arrange(abs(Risk__good) %>% desc()) %>%
mutate(feature = as_factor(feature) %>% fct_rev())
## Warning: `as_tibble.matrix()` requires a matrix with column names or a `.name_repair` argument. Using compatibility `.name_repair`.
## This warning is displayed once per session.
#ggthemr_reset()
ggthemr("chalk", type = "outer")
## Warning: New theme missing the following elements: axis.ticks.length.x,
## axis.ticks.length.x.top, axis.ticks.length.x.bottom, axis.ticks.length.y,
## axis.ticks.length.y.left, axis.ticks.length.y.right
correlation_tbl %>%
ggplot(aes(Risk__good, y = feature, text = bin)) +
# Geometries
geom_vline(xintercept = 0, linetype = 2, color = "red") +
geom_point(color = "#2c3e50") +
geom_text_repel(aes(label = bin), size = 3.2, color = "#2c3e50") +
# Formatting
expand_limits(x = c(-0.4, 0.4)) +
theme_bw()+
labs(title = "Credit Risk Analysis",
subtitle = "Features correlation to Risk",
y = "", x = "Correlation to Risk good") +
theme(plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5))
library(randomForest)
library(caret)
library(e1071)
german_model <- na.omit(german_credit) %>%
select(-Credit_amount_group, - Duration_group, -Age_group)
head(german_model)
## Age Sex Job Housing Saving.accounts Checking.account
## 2 22 female skilled own little moderate
## 4 45 male skilled free little little
## 5 53 male skilled free little little
## 8 35 male highskilled rent little moderate
## 10 28 male highskilled own little moderate
## 11 25 female skilled rent little moderate
## Credit.amount Duration Purpose Risk
## 2 5951 48 radio/TV bad
## 4 7882 42 furniture/equipment good
## 5 4870 24 car bad
## 8 6948 36 car good
## 10 5234 30 car bad
## 11 1295 12 car bad
german_before <- german_model %>%
rename(Credit_amount = Credit.amount,
Checking_account = Checking.account,
Saving_accounts = Saving.accounts)
recipe_german <- recipe(~., data = german_before) %>%
step_dummy(Sex,Job,Housing,Saving_accounts,Checking_account,Purpose,
one_hot = TRUE,
naming = partial(dummy_names, sep = "_")) %>%
step_scale(Credit_amount, Age, Duration) %>%
prep()
german_enginered <- german_before %>%
bake(recipe_german, new_data = .)
library(caTools)
set.seed(123)
sample <- sample.split(german_enginered, SplitRatio = 0.8)
train <- subset(german_enginered, sample == T)
## Warning: Length of logical index must be 1 or 522, not 28
test <- subset(german_enginered, sample == F)
## Warning: Length of logical index must be 1 or 522, not 28
modelRF <- randomForest(Risk~., data=train, ntree = 500, importance=TRUE)
modelRF
##
## Call:
## randomForest(formula = Risk ~ ., data = train, ntree = 500, importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 5
##
## OOB estimate of error rate: 37.8%
## Confusion matrix:
## bad good class.error
## bad 81 99 0.5500000
## good 56 174 0.2434783
modelRFP <- predict(modelRF, test)
confusionMatrix(modelRFP, test$Risk)
## Confusion Matrix and Statistics
##
## Reference
## Prediction bad good
## bad 25 14
## good 26 47
##
## Accuracy : 0.6429
## 95% CI : (0.5468, 0.7312)
## No Information Rate : 0.5446
## P-Value [Acc > NIR] : 0.02246
##
## Kappa : 0.2658
##
## Mcnemar's Test P-Value : 0.08199
##
## Sensitivity : 0.4902
## Specificity : 0.7705
## Pos Pred Value : 0.6410
## Neg Pred Value : 0.6438
## Prevalence : 0.4554
## Detection Rate : 0.2232
## Detection Prevalence : 0.3482
## Balanced Accuracy : 0.6303
##
## 'Positive' Class : bad
##
xgboost_caret <- caret::train(
Risk ~.,
data = train,
method = "xgbTree",
preProcess = c("scale", "center"),
trControl = trainControl(
method = "repeatedcv",
number = 5,
repeats = 3,
savePredictions = TRUE,
verboseIter = FALSE))
confusionMatrix(predict(xgboost_caret, test), as.factor(test$Risk))
## Confusion Matrix and Statistics
##
## Reference
## Prediction bad good
## bad 21 13
## good 30 48
##
## Accuracy : 0.6161
## 95% CI : (0.5194, 0.7064)
## No Information Rate : 0.5446
## P-Value [Acc > NIR] : 0.07678
##
## Kappa : 0.2042
##
## Mcnemar's Test P-Value : 0.01469
##
## Sensitivity : 0.4118
## Specificity : 0.7869
## Pos Pred Value : 0.6176
## Neg Pred Value : 0.6154
## Prevalence : 0.4554
## Detection Rate : 0.1875
## Detection Prevalence : 0.3036
## Balanced Accuracy : 0.5993
##
## 'Positive' Class : bad
##
importance <- varImp(xgboost_caret, scale = TRUE)
plot(importance)
## # A tibble: 112 x 2
## .pred_class Risk
## <fct> <fct>
## 1 good bad
## 2 good good
## 3 bad bad
## 4 good good
## 5 good good
## 6 bad bad
## 7 good good
## 8 good bad
## 9 bad bad
## 10 bad bad
## # … with 102 more rows
## # A tibble: 1 x 2
## .metric .estimate
## <chr> <dbl>
## 1 accuracy 0.598
Credit amount has high positif correlation to risk good, it means that the lower credit amount borrowed the lower risk of default will be.
Duration also has positif correlation and the second importance from xgboost importance plot, people that borrowed with short duration have high probility to not default or have bad loan.
Age The youngest the borrower the higher probability to have bad loan or bad risk.