Description

In this project we build a model to predict whether an applicant is a ‘good’ or a ‘bad’ customer. Unlike many other classification tasks, the definition of ‘good’ and ‘bad’ is not given in advance.

Dataset link: Here

1. Data Extraction

Import Libraries
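The only package needed before modeling is ggplot2, which produces the bar charts in the exploratory section (party is loaded later, in the modeling step). A minimal sketch of the imports:

library(ggplot2)  # plotting; party is loaded in the modeling section below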

Read the credit approval dataset

final_df <- read.csv("clean_dataset.csv")

str(final_df)
## 'data.frame':    690 obs. of  16 variables:
##  $ Gender        : int  1 0 0 1 1 1 1 0 1 1 ...
##  $ Age           : num  30.8 58.7 24.5 27.8 20.2 ...
##  $ Debt          : num  0 4.46 0.5 1.54 5.62 ...
##  $ Married       : int  1 1 1 1 1 1 1 1 0 0 ...
##  $ BankCustomer  : int  1 1 1 1 1 1 1 1 0 0 ...
##  $ Industry      : chr  "Industrials" "Materials" "Materials" "Industrials" ...
##  $ Ethnicity     : chr  "White" "Black" "Black" "White" ...
##  $ YearsEmployed : num  1.25 3.04 1.5 3.75 1.71 ...
##  $ PriorDefault  : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Employed      : int  1 1 0 1 0 0 0 0 0 0 ...
##  $ CreditScore   : int  1 6 0 5 0 0 0 0 0 0 ...
##  $ DriversLicense: int  0 0 0 1 0 1 1 0 0 1 ...
##  $ Citizen       : chr  "ByBirth" "ByBirth" "ByBirth" "ByBirth" ...
##  $ ZipCode       : int  202 43 280 100 120 360 164 80 180 52 ...
##  $ Income        : int  0 560 824 3 0 0 31285 1349 314 1442 ...
##  $ Approved      : int  1 1 1 1 1 1 1 1 1 1 ...
unique(final_df$ZipCode)
##   [1]  202   43  280  100  120  360  164   80  180   52  128  260    0  320  396
##  [16]   96  200  300  145  500  168  434  583   30  240   70  455  311  216  491
##  [31]  400  239  160  711  250  520  515  420  980  443  140   94  368  288  928
##  [46]  188  112  171  268  167   75  152  176  329  212  410  274  375  408  350
##  [61]  204   40  181  399  440   93   60  395  393   21   29  102  431  370   24
##  [76]   20  129  510  195  144  380   49   50  381  150  117   56  211  230  156
##  [91]   22  228  519  253  487  220   88   73  121  470  136  132  292  154  272
## [106]  340  108  720  450  232  170 1160  411  460  348  480  640  372  276  221
## [121]  352  141  178  600  550 2000  225  210  110  356   45   62   92  174   17
## [136]   86  454  254   28  263  333  312  290  371   99  252  760  560  130  523
## [151]  680  163  208  383  330  422  840  432   32  186  303  349  224  369   76
## [166]  231  309  416  465  256

The dataset contains 690 rows and 16 columns. The target variable is Approved. Note that ZipCode takes 170 distinct values, which is why it is treated as an identifier-like column and dropped during preprocessing.

2. Exploratory Data Analysis

Plot Distribution of Approval by Gender (Bar Charts)

ggplot(data = final_df, 
       aes(x = factor(Gender), fill = factor(Approved))) +
  geom_bar(position = "fill") +
  labs(y = "Rate", x = "Gender", fill = "Approved") +
  ggtitle("Gender vs Credit Approval")

ggplot(data = final_df, 
       aes(x = factor(Gender), fill = factor(Approved))) +
  geom_bar() +
  labs(y = "Customer Count", x = "Gender", fill = "Approved") +
  ggtitle("Gender vs Credit Approval") +
  scale_x_discrete(breaks = c("0", "1"),
                   labels = c("Female", "Male"))

Female applicants have a higher proportion of approvals than male applicants, but the difference between the two rates does not appear to be substantial.
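As a quick numeric check of this impression, the approval rate can be tabulated per gender with a row-wise proportion table (a small sketch using the same final_df columns; 0 = Female, 1 = Male per the plot labels):

# Approval rate within each gender (rows are Gender, proportions sum to 1 per row)
round(prop.table(table(final_df$Gender, final_df$Approved), margin = 1), 3)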

Approval rates also differ sharply between applicants who previously defaulted and those who did not; we assume the bank does not readily approve credit for people who have already defaulted.

There likewise seems to be a clear relationship between whether a person is employed and the credit approval rate. Both effects can be verified with the cross-tabulations below.
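The same prop.table pattern used for gender confirms both observations (a sketch; PriorDefault and Employed are the 0/1 columns listed in the str output above):

# Approval rate by prior-default status and by employment status
round(prop.table(table(final_df$PriorDefault, final_df$Approved), margin = 1), 3)
round(prop.table(table(final_df$Employed, final_df$Approved), margin = 1), 3)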

3. Data Preprocessing

3.1 Data Cleaning

# Drop the categorical / identifier-style columns that are not used in this model
final_df$Industry <- NULL
final_df$Ethnicity <- NULL
final_df$Citizen <- NULL
final_df$ZipCode <- NULL

# Handle missing data: complete.cases() flags rows that contain no NA values
idx <- complete.cases(final_df)
idx[1:20]
##  [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE
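All of the checked rows are complete, so nothing needs to be removed. If any rows did contain missing values, they could be dropped with the same mask (a minimal sketch, not part of the original pipeline):

# Keep only complete rows (a no-op when every row is already complete)
final_df <- final_df[complete.cases(final_df), ]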

3.2 Class Balance and Train-Test Split

table(final_df$Approved)
## 
##   0   1 
## 383 307
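The classes are only mildly imbalanced: 383 rejected versus 307 approved, roughly 55% against 45%. A proportion table makes this explicit:

# Share of each Approved class (0/1) in the full dataset
round(prop.table(table(final_df$Approved)), 3)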
dim(final_df)
## [1] 690  12
row <- nrow(final_df)
row
## [1] 690
# Approved must be a factor so that ctree() below fits a classification tree
# and predict() returns class labels for the confusion matrix
final_df$Approved <- as.factor(final_df$Approved)

# 80/20 train-test split (calling set.seed() first would make it reproducible)
train_idx <- sample(row, 0.8 * row)

training_data <- final_df[train_idx, ]
testing_data  <- final_df[-train_idx, ]

4. Modeling

Creating the Decision Tree

library(party)
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sandwich
dt <- ctree(formula = Approved~.,
            data = training_data)
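party provides a plot method for fitted ctree objects, so the learned splits can be inspected visually (a short sketch, not part of the original output):

# Visualize the fitted conditional inference tree
plot(dt)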

5. Evaluation

actual <- testing_data$Approved

pred.dt <- predict(dt, testing_data)
performance <- function(prediction, actual, nama_model){
  # Confusion matrix: rows are the actual class, columns the predicted class
  cm <- table(actual, prediction,
              dnn = c("Actual", "Prediction"))  # dnn sets the dimension names
  
  TP <- cm[2, 2]  # actual 1, predicted 1
  TN <- cm[1, 1]  # actual 0, predicted 0
  FN <- cm[2, 1]  # actual 1, predicted 0
  FP <- cm[1, 2]  # actual 0, predicted 1
  
  accuracy <- (TP + TN) / (TP + TN + FP + FN)
  precision <- TP / (TP + FP)
  recall <- TP / (TP + FN)
  f1_score <- (2*precision*recall) / (precision + recall)
  
  result <- paste("Model : ", nama_model,
                  "\nAccuracy : ", round(accuracy, 3),
                  "\nPrecision : ", round(precision, 3),
                  "\nRecall : ", round(recall, 3),
                  "\nF1 Score : ", round(f1_score, 3))
  
  cat(result)
}
performance(pred.dt, actual, "Decision Tree")
## Model :  Decision Tree 
## Accuracy :  0.855 
## Precision :  0.808 
## Recall :  0.908 
## F1 Score :  0.855
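For a cross-check, the same confusion-matrix statistics can also be produced with caret's confusionMatrix (this assumes the caret package is installed and treats class "1", approved, as the positive class):

library(caret)
confusionMatrix(data = as.factor(pred.dt),
                reference = as.factor(actual),
                positive = "1")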