library(tidyverse)
library(miscset)
library(ggthemr)
library(recipes)
library(plotly)
library(GGally)
library(tidyquant)
library(ggrepel)
library(parsnip)
library(skimr)
german_data <- read.csv("~/Documents/german_credit_data.csv")
german_data %>% head()
##   X Age    Sex Job Housing Saving.accounts Checking.account Credit.amount
## 1 0  67   male   2     own            <NA>           little          1169
## 2 1  22 female   2     own          little         moderate          5951
## 3 2  49   male   1     own          little             <NA>          2096
## 4 3  45   male   2    free          little           little          7882
## 5 4  53   male   2    free          little           little          4870
## 6 5  35   male   1    free            <NA>             <NA>          9055
##   Duration             Purpose Risk
## 1        6            radio/TV good
## 2       48            radio/TV  bad
## 3       12           education good
## 4       42 furniture/equipment good
## 5       24                 car  bad
## 6       36           education good
german_data %>% glimpse()
## Observations: 1,000
## Variables: 11
## $ X                <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15…
## $ Age              <int> 67, 22, 49, 45, 53, 35, 53, 35, 61, 28, 25, 24, 22, …
## $ Sex              <fct> male, female, male, male, male, male, male, male, ma…
## $ Job              <int> 2, 2, 1, 2, 2, 1, 2, 3, 1, 3, 2, 2, 2, 1, 2, 1, 2, 2…
## $ Housing          <fct> own, own, own, free, free, free, own, rent, own, own…
## $ Saving.accounts  <fct> NA, little, little, little, little, NA, quite rich, …
## $ Checking.account <fct> little, moderate, NA, little, little, NA, NA, modera…
## $ Credit.amount    <int> 1169, 5951, 2096, 7882, 4870, 9055, 2835, 6948, 3059…
## $ Duration         <int> 6, 48, 12, 42, 24, 36, 24, 36, 12, 30, 12, 48, 12, 2…
## $ Purpose          <fct> radio/TV, radio/TV, education, furniture/equipment, …
## $ Risk             <fct> good, bad, good, good, bad, good, good, good, good, …
german_data %>% map(~ sum(is.na(.)))
## $X
## [1] 0
## 
## $Age
## [1] 0
## 
## $Sex
## [1] 0
## 
## $Job
## [1] 0
## 
## $Housing
## [1] 0
## 
## $Saving.accounts
## [1] 183
## 
## $Checking.account
## [1] 394
## 
## $Credit.amount
## [1] 0
## 
## $Duration
## [1] 0
## 
## $Purpose
## [1] 0
## 
## $Risk
## [1] 0

Data Preprosesing

german_credit <- german_data %>% 
  select(Age:Risk)
german_credit %>% summary()
##       Age            Sex           Job        Housing      Saving.accounts
##  Min.   :19.00   female:310   Min.   :0.000   free:108   little    :603   
##  1st Qu.:27.00   male  :690   1st Qu.:2.000   own :713   moderate  :103   
##  Median :33.00                Median :2.000   rent:179   quite rich: 63   
##  Mean   :35.55                Mean   :1.904              rich      : 48   
##  3rd Qu.:42.00                3rd Qu.:2.000              NA's      :183   
##  Max.   :75.00                Max.   :3.000                               
##                                                                           
##  Checking.account Credit.amount      Duration                   Purpose   
##  little  :274     Min.   :  250   Min.   : 4.0   car                :337  
##  moderate:269     1st Qu.: 1366   1st Qu.:12.0   radio/TV           :280  
##  rich    : 63     Median : 2320   Median :18.0   furniture/equipment:181  
##  NA's    :394     Mean   : 3271   Mean   :20.9   business           : 97  
##                   3rd Qu.: 3972   3rd Qu.:24.0   education          : 59  
##                   Max.   :18424   Max.   :72.0   repairs            : 22  
##                                                  (Other)            : 24  
##    Risk    
##  bad :300  
##  good:700  
##            
##            
##            
##            
## 

Summary

  • Average of Credit Amount is 3271
  • Average 0f Age is 35
  • There are a lot of male than female
  • Most of people did credit for car

  • Convert variable value from numeric into character

german_credit[german_credit$Job == 0,]$Job <- "unskilled0"
german_credit[german_credit$Job == 1,]$Job <- "unskilled1"
german_credit[german_credit$Job == 2,]$Job <- "skilled"
german_credit[german_credit$Job == 3,]$Job <- "highskilled"
  • Convert variable “Job” into factor
german_credit$Job <- as.factor(german_credit$Job)
  • Checking the structure of the data
str(german_credit)
## 'data.frame':    1000 obs. of  10 variables:
##  $ Age             : int  67 22 49 45 53 35 53 35 61 28 ...
##  $ Sex             : Factor w/ 2 levels "female","male": 2 1 2 2 2 2 2 2 2 2 ...
##  $ Job             : Factor w/ 4 levels "highskilled",..: 2 2 4 2 2 4 2 1 4 1 ...
##  $ Housing         : Factor w/ 3 levels "free","own","rent": 2 2 2 1 1 1 2 3 2 2 ...
##  $ Saving.accounts : Factor w/ 4 levels "little","moderate",..: NA 1 1 1 1 NA 3 1 4 1 ...
##  $ Checking.account: Factor w/ 3 levels "little","moderate",..: 1 2 NA 1 1 NA NA 2 NA 2 ...
##  $ Credit.amount   : int  1169 5951 2096 7882 4870 9055 2835 6948 3059 5234 ...
##  $ Duration        : int  6 48 12 42 24 36 24 36 12 30 ...
##  $ Purpose         : Factor w/ 8 levels "business","car",..: 6 6 4 5 2 4 5 2 6 2 ...
##  $ Risk            : Factor w/ 2 levels "bad","good": 2 1 2 2 1 2 2 2 2 1 ...

Visualize categorical variable

ggthemr("grape", type="outer")
## Warning: New theme missing the following elements: axis.ticks.length.x,
## axis.ticks.length.x.top, axis.ticks.length.x.bottom, axis.ticks.length.y,
## axis.ticks.length.y.left, axis.ticks.length.y.right
ggplotGrid(ncol=2,
lapply(c("Sex", "Housing", "Job", "Saving.accounts", "Checking.account"),
       function(col){
         ggplot(german_credit,aes_string(col)) +
           geom_bar(aes(fill=Risk),position="dodge")
       }))

Insight

  • Male is dominating the observation.
ggthemr("grape", type = "outer")
## Warning: New theme missing the following elements: axis.ticks.length.x,
## axis.ticks.length.x.top, axis.ticks.length.x.bottom, axis.ticks.length.y,
## axis.ticks.length.y.left, axis.ticks.length.y.right
ggplot(german_credit, aes(x = Purpose, fill = Risk)) +
  geom_bar(position = "dodge")

ggthemr_reset()

ggplotGrid(ncol=1,
lapply(c("Age", "Credit.amount", "Duration"),
       function(col){
         ggplot(german_credit,aes_string(col)) + 
           geom_density(aes(fill=Risk), bins=30, alpha = 0.5)
       }))
## Warning: Ignoring unknown parameters: bins

## Warning: Ignoring unknown parameters: bins

## Warning: Ignoring unknown parameters: bins

ggthemr("flat", type="outer")
## Warning: New theme missing the following elements: axis.ticks.length.x,
## axis.ticks.length.x.top, axis.ticks.length.x.bottom, axis.ticks.length.y,
## axis.ticks.length.y.left, axis.ticks.length.y.right
saving_accountplot <- german_data %>%
  select(Saving.accounts, Purpose) %>% 
  filter(!is.na(Saving.accounts)) %>%
  group_by(Saving.accounts, Purpose) %>% 
  arrange(desc(Purpose)) %>% 
  ggplot(aes(x = Saving.accounts, fill = Purpose))+
  geom_bar(position = "dodge", alpha = 0.7)

saving_accountplot

ggthemr("flat", type = "outer")
## Warning: New theme missing the following elements: axis.ticks.length.x,
## axis.ticks.length.x.top, axis.ticks.length.x.bottom, axis.ticks.length.y,
## axis.ticks.length.y.left, axis.ticks.length.y.right
purpose_by_age2 <- german_credit %>% 
  select(Purpose, Age) %>%
  group_by(Purpose) %>% 
  ggplot(aes(x = Purpose, y = Age, fill = Purpose))+
  geom_boxplot(alpha = 0.7)

purpose_by_age2

saving_by_credit_amo <- german_credit %>% 
  select(Saving.accounts, Credit.amount, Risk) %>%
  filter(!is.na(Saving.accounts)) %>% 
  group_by(Saving.accounts, Risk) %>% 
  ggplot(aes(x = Saving.accounts, y = Credit.amount, fill = Risk)) +
    geom_boxplot(alpha=0.7)+
    geom_jitter(alpha=0.4)

saving_by_credit_amo

summary(german_credit$Age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   19.00   27.00   33.00   35.55   42.00   75.00
x <- german_credit$Age

agect <- cut(x, 4)
head(agect)
## [1] (61,75.1] (18.9,33] (47,61]   (33,47]   (47,61]   (33,47]  
## Levels: (18.9,33] (33,47] (47,61] (61,75.1]
lab <- cut(x, 4, labels = c("Young", "Adult", "Senior", "Elder"))

german_credit$Age_group <- lab
head(german_credit)
##   Age    Sex        Job Housing Saving.accounts Checking.account Credit.amount
## 1  67   male    skilled     own            <NA>           little          1169
## 2  22 female    skilled     own          little         moderate          5951
## 3  49   male unskilled1     own          little             <NA>          2096
## 4  45   male    skilled    free          little           little          7882
## 5  53   male    skilled    free          little           little          4870
## 6  35   male unskilled1    free            <NA>             <NA>          9055
##   Duration             Purpose Risk Age_group
## 1        6            radio/TV good     Elder
## 2       48            radio/TV  bad     Young
## 3       12           education good    Senior
## 4       42 furniture/equipment good     Adult
## 5       24                 car  bad    Senior
## 6       36           education good     Adult
x <- german_credit$Duration

ct <- cut(x,4)
head(ct)
## [1] (3.93,21] (38,55]   (3.93,21] (38,55]   (21,38]   (21,38]  
## Levels: (3.93,21] (21,38] (38,55] (55,72.1]
lab <- cut(x,4, labels = c("Short","Medium","Long","Longer"))

german_credit$Duration_group <- lab
summary(german_data$Credit.amount)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     250    1366    2320    3271    3972   18424
x <- german_credit$Credit.amount

lab <- cut(x, 4, labels = c("Low","Medium","High","Higher"))

german_credit$Credit_amount_group <- lab
head(german_credit)
##   Age    Sex        Job Housing Saving.accounts Checking.account Credit.amount
## 1  67   male    skilled     own            <NA>           little          1169
## 2  22 female    skilled     own          little         moderate          5951
## 3  49   male unskilled1     own          little             <NA>          2096
## 4  45   male    skilled    free          little           little          7882
## 5  53   male    skilled    free          little           little          4870
## 6  35   male unskilled1    free            <NA>             <NA>          9055
##   Duration             Purpose Risk Age_group Duration_group
## 1        6            radio/TV good     Elder          Short
## 2       48            radio/TV  bad     Young           Long
## 3       12           education good    Senior          Short
## 4       42 furniture/equipment good     Adult           Long
## 5       24                 car  bad    Senior         Medium
## 6       36           education good     Adult         Medium
##   Credit_amount_group
## 1                 Low
## 2              Medium
## 3                 Low
## 4              Medium
## 5              Medium
## 6              Medium

Correlation analysis

  • Preparation for analysis
german_clean <- german_credit %>% 
  select(-Age, -Credit.amount, -Duration) %>% 
  na.omit(german_credit)
ggplot(german_clean, aes(x = Risk, fill = Risk)) +
  geom_bar(position = "dodge") +
  theme_bw()

head(german_clean)
##       Sex         Job Housing Saving.accounts Checking.account
## 2  female     skilled     own          little         moderate
## 4    male     skilled    free          little           little
## 5    male     skilled    free          little           little
## 8    male highskilled    rent          little         moderate
## 10   male highskilled     own          little         moderate
## 11 female     skilled    rent          little         moderate
##                Purpose Risk Age_group Duration_group Credit_amount_group
## 2             radio/TV  bad     Young           Long              Medium
## 4  furniture/equipment good     Adult           Long              Medium
## 5                  car  bad    Senior         Medium              Medium
## 8                  car good     Adult         Medium              Medium
## 10                 car  bad     Young         Medium              Medium
## 11                 car  bad     Young          Short                 Low
recipe_obj <- recipe(~ ., data = german_clean) %>%
    step_dummy(all_nominal(), one_hot = TRUE, naming = partial(dummy_names, sep = "__")) %>%
    prep()
data_transformed_tbl <- german_clean %>%
    bake(recipe_obj, new_data = .) 
correlation_tbl <- data_transformed_tbl %>%
    cor(y = data_transformed_tbl$Risk__good) %>%
    as_tibble(rownames = "feature") %>%
    rename(Risk__good = V1) %>%
    separate(feature, into = c("feature", "bin"), sep = "__") %>%
    filter(!is.na(Risk__good)) %>%
    filter(!str_detect(feature, "Risk")) %>%
    arrange(abs(Risk__good) %>% desc()) %>%
    mutate(feature = as_factor(feature) %>% fct_rev())
## Warning: `as_tibble.matrix()` requires a matrix with column names or a `.name_repair` argument. Using compatibility `.name_repair`.
## This warning is displayed once per session.
#ggthemr_reset()
ggthemr("chalk", type = "outer")
## Warning: New theme missing the following elements: axis.ticks.length.x,
## axis.ticks.length.x.top, axis.ticks.length.x.bottom, axis.ticks.length.y,
## axis.ticks.length.y.left, axis.ticks.length.y.right
correlation_tbl %>%
    
    ggplot(aes(Risk__good, y = feature, text = bin)) +
    
    # Geometries
    geom_vline(xintercept = 0, linetype = 2, color = "red") +
    geom_point(color = "#2c3e50") +
    geom_text_repel(aes(label = bin), size = 3.2, color = "#2c3e50") +
    
    # Formatting
    expand_limits(x = c(-0.4, 0.4)) +
    theme_bw()+
    labs(title = "Credit Risk Analysis",
         subtitle = "Features correlation to Risk",
         y = "", x = "Correlation to Risk good") +
    theme(plot.title = element_text(hjust = 0.5), 
          plot.subtitle = element_text(hjust = 0.5))

Model building

Import library that’s gonna be used

library(randomForest)
library(caret)
library(e1071)

Remove missing data

german_model <- na.omit(german_credit) %>% 
  select(-Credit_amount_group, - Duration_group, -Age_group)
head(german_model)
##    Age    Sex         Job Housing Saving.accounts Checking.account
## 2   22 female     skilled     own          little         moderate
## 4   45   male     skilled    free          little           little
## 5   53   male     skilled    free          little           little
## 8   35   male highskilled    rent          little         moderate
## 10  28   male highskilled     own          little         moderate
## 11  25 female     skilled    rent          little         moderate
##    Credit.amount Duration             Purpose Risk
## 2           5951       48            radio/TV  bad
## 4           7882       42 furniture/equipment good
## 5           4870       24                 car  bad
## 8           6948       36                 car good
## 10          5234       30                 car  bad
## 11          1295       12                 car  bad

Feature enginering

  • Convert categorical features with one hot encoding
  • Scaling numerical data
german_before <- german_model %>% 
  rename(Credit_amount = Credit.amount,
         Checking_account = Checking.account,
         Saving_accounts = Saving.accounts)

recipe_german <- recipe(~., data = german_before) %>% 
  step_dummy(Sex,Job,Housing,Saving_accounts,Checking_account,Purpose, 
             one_hot = TRUE, 
             naming = partial(dummy_names, sep = "_")) %>% 
  step_scale(Credit_amount, Age, Duration) %>%
  prep()

german_enginered <- german_before %>% 
  bake(recipe_german, new_data = .)

Data splitting

library(caTools)

set.seed(123)
sample <- sample.split(german_enginered, SplitRatio = 0.8)
train <- subset(german_enginered, sample == T)
## Warning: Length of logical index must be 1 or 522, not 28
test <- subset(german_enginered, sample == F)
## Warning: Length of logical index must be 1 or 522, not 28
modelRF <- randomForest(Risk~., data=train, ntree = 500, importance=TRUE)
modelRF
## 
## Call:
##  randomForest(formula = Risk ~ ., data = train, ntree = 500, importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 5
## 
##         OOB estimate of  error rate: 37.8%
## Confusion matrix:
##      bad good class.error
## bad   81   99   0.5500000
## good  56  174   0.2434783
modelRFP <- predict(modelRF, test)
confusionMatrix(modelRFP, test$Risk)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction bad good
##       bad   25   14
##       good  26   47
##                                           
##                Accuracy : 0.6429          
##                  95% CI : (0.5468, 0.7312)
##     No Information Rate : 0.5446          
##     P-Value [Acc > NIR] : 0.02246         
##                                           
##                   Kappa : 0.2658          
##                                           
##  Mcnemar's Test P-Value : 0.08199         
##                                           
##             Sensitivity : 0.4902          
##             Specificity : 0.7705          
##          Pos Pred Value : 0.6410          
##          Neg Pred Value : 0.6438          
##              Prevalence : 0.4554          
##          Detection Rate : 0.2232          
##    Detection Prevalence : 0.3482          
##       Balanced Accuracy : 0.6303          
##                                           
##        'Positive' Class : bad             
## 
xgboost_caret <- caret::train(
  Risk ~.,
  data = train,
  method = "xgbTree",
  preProcess = c("scale", "center"),
  trControl = trainControl(
    method = "repeatedcv", 
    number = 5, 
    repeats = 3, 
    savePredictions = TRUE, 
    verboseIter = FALSE))
confusionMatrix(predict(xgboost_caret, test), as.factor(test$Risk))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction bad good
##       bad   21   13
##       good  30   48
##                                           
##                Accuracy : 0.6161          
##                  95% CI : (0.5194, 0.7064)
##     No Information Rate : 0.5446          
##     P-Value [Acc > NIR] : 0.07678         
##                                           
##                   Kappa : 0.2042          
##                                           
##  Mcnemar's Test P-Value : 0.01469         
##                                           
##             Sensitivity : 0.4118          
##             Specificity : 0.7869          
##          Pos Pred Value : 0.6176          
##          Neg Pred Value : 0.6154          
##              Prevalence : 0.4554          
##          Detection Rate : 0.1875          
##    Detection Prevalence : 0.3036          
##       Balanced Accuracy : 0.5993          
##                                           
##        'Positive' Class : bad             
## 
importance <- varImp(xgboost_caret, scale = TRUE)
plot(importance)

## # A tibble: 112 x 2
##    .pred_class Risk 
##    <fct>       <fct>
##  1 good        bad  
##  2 good        good 
##  3 bad         bad  
##  4 good        good 
##  5 good        good 
##  6 bad         bad  
##  7 good        good 
##  8 good        bad  
##  9 bad         bad  
## 10 bad         bad  
## # … with 102 more rows
## # A tibble: 1 x 2
##   .metric  .estimate
##   <chr>        <dbl>
## 1 accuracy     0.598

Result

  • Credit amount has high positif correlation to risk good, it means that the lower credit amount borrowed the lower risk of default will be.

  • Duration also has positif correlation and the second importance from xgboost importance plot, people that borrowed with short duration have high probility to not default or have bad loan.

  • Age The youngest the borrower the higher probability to have bad loan or bad risk.

Summary

  • From the analysis above we can combine the features importance to reduce the probability of default and make better strategy