setwd("/Users/lekhana/Harrisburgh University/2018_05to08_Aram_MachineLearning/Laboratory #2-20180728")
credit <- read.csv("creditData.csv")
str(credit)
## 'data.frame': 1000 obs. of 21 variables:
## $ Creditability : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Account.Balance : int 1 1 2 1 1 1 1 1 4 2 ...
## $ Duration.of.Credit..month. : int 18 9 12 12 12 10 8 6 18 24 ...
## $ Payment.Status.of.Previous.Credit: int 4 4 2 4 4 4 4 4 4 2 ...
## $ Purpose : int 2 0 9 0 0 0 0 0 3 3 ...
## $ Credit.Amount : int 1049 2799 841 2122 2171 2241 3398 1361 1098 3758 ...
## $ Value.Savings.Stocks : int 1 1 2 1 1 1 1 1 1 3 ...
## $ Length.of.current.employment : int 2 3 4 3 3 2 4 2 1 1 ...
## $ Instalment.per.cent : int 4 2 2 3 4 1 1 2 4 1 ...
## $ Sex...Marital.Status : int 2 3 2 3 3 3 3 3 2 2 ...
## $ Guarantors : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Duration.in.Current.address : int 4 2 4 2 4 3 4 4 4 4 ...
## $ Most.valuable.available.asset : int 2 1 1 1 2 1 1 1 3 4 ...
## $ Age..years. : int 21 36 23 39 38 48 39 40 65 23 ...
## $ Concurrent.Credits : int 3 3 3 3 1 3 3 3 3 3 ...
## $ Type.of.apartment : int 1 1 1 1 2 1 2 2 2 1 ...
## $ No.of.Credits.at.this.Bank : int 1 2 1 2 2 2 2 1 2 1 ...
## $ Occupation : int 3 3 2 2 2 2 2 2 1 1 ...
## $ No.of.dependents : int 1 2 1 2 1 2 1 2 1 1 ...
## $ Telephone : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Foreign.Worker : int 1 1 1 2 2 2 2 2 1 1 ...
summary(credit)
## Creditability Account.Balance Duration.of.Credit..month.
## Min. :0.0 Min. :1.000 Min. : 4.0
## 1st Qu.:0.0 1st Qu.:1.000 1st Qu.:12.0
## Median :1.0 Median :2.000 Median :18.0
## Mean :0.7 Mean :2.577 Mean :20.9
## 3rd Qu.:1.0 3rd Qu.:4.000 3rd Qu.:24.0
## Max. :1.0 Max. :4.000 Max. :72.0
## Payment.Status.of.Previous.Credit Purpose Credit.Amount
## Min. :0.000 Min. : 0.000 Min. : 250
## 1st Qu.:2.000 1st Qu.: 1.000 1st Qu.: 1366
## Median :2.000 Median : 2.000 Median : 2320
## Mean :2.545 Mean : 2.828 Mean : 3271
## 3rd Qu.:4.000 3rd Qu.: 3.000 3rd Qu.: 3972
## Max. :4.000 Max. :10.000 Max. :18424
## Value.Savings.Stocks Length.of.current.employment Instalment.per.cent
## Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:1.000 1st Qu.:3.000 1st Qu.:2.000
## Median :1.000 Median :3.000 Median :3.000
## Mean :2.105 Mean :3.384 Mean :2.973
## 3rd Qu.:3.000 3rd Qu.:5.000 3rd Qu.:4.000
## Max. :5.000 Max. :5.000 Max. :4.000
## Sex...Marital.Status Guarantors Duration.in.Current.address
## Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:1.000 1st Qu.:2.000
## Median :3.000 Median :1.000 Median :3.000
## Mean :2.682 Mean :1.145 Mean :2.845
## 3rd Qu.:3.000 3rd Qu.:1.000 3rd Qu.:4.000
## Max. :4.000 Max. :3.000 Max. :4.000
## Most.valuable.available.asset Age..years. Concurrent.Credits
## Min. :1.000 Min. :19.00 Min. :1.000
## 1st Qu.:1.000 1st Qu.:27.00 1st Qu.:3.000
## Median :2.000 Median :33.00 Median :3.000
## Mean :2.358 Mean :35.54 Mean :2.675
## 3rd Qu.:3.000 3rd Qu.:42.00 3rd Qu.:3.000
## Max. :4.000 Max. :75.00 Max. :3.000
## Type.of.apartment No.of.Credits.at.this.Bank Occupation
## Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:1.000 1st Qu.:3.000
## Median :2.000 Median :1.000 Median :3.000
## Mean :1.928 Mean :1.407 Mean :2.904
## 3rd Qu.:2.000 3rd Qu.:2.000 3rd Qu.:3.000
## Max. :3.000 Max. :4.000 Max. :4.000
## No.of.dependents Telephone Foreign.Worker
## Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.000
## Median :1.000 Median :1.000 Median :1.000
## Mean :1.155 Mean :1.404 Mean :1.037
## 3rd Qu.:1.000 3rd Qu.:2.000 3rd Qu.:1.000
## Max. :2.000 Max. :2.000 Max. :2.000
Data Pre processing: We need to split the dataset and we have to randomize it. The original dataset might be set by a specific order.
credit$Creditability <- as.factor(credit$Creditability)
sum(is.na(credit))
## [1] 0
set.seed(12345)
credit_rand <- credit[order(runif(1000)), ]
summary(credit$Credit.Amount)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 250 1366 2320 3271 3972 18424
summary(credit_rand$Credit.Amount)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 250 1366 2320 3271 3972 18424
Split data to 75% train and 25% test
credit_train <- credit_rand[1:750, ]
credit_test <- credit_rand[751:1000, ]
prop.table(table(credit_train$Creditability))
##
## 0 1
## 0.3146667 0.6853333
prop.table(table(credit_test$Creditability))
##
## 0 1
## 0.256 0.744
Model Training: We are using Naive Bayes here.
library(naivebayes)
naive_model <- naive_bayes(Creditability ~ ., data= credit_train)
naive_model
## ===================== Naive Bayes =====================
## Call:
## naive_bayes.formula(formula = Creditability ~ ., data = credit_train)
##
## A priori probabilities:
##
## 0 1
## 0.3146667 0.6853333
##
## Tables:
##
## Account.Balance 0 1
## mean 1.923729 2.793774
## sd 1.036826 1.252008
##
##
## Duration.of.Credit..month. 0 1
## mean 24.46610 19.20039
## sd 13.82208 11.13433
##
##
## Payment.Status.of.Previous.Credit 0 1
## mean 2.161017 2.665370
## sd 1.071649 1.045219
##
##
## Purpose 0 1
## mean 2.927966 2.803502
## sd 2.944722 2.633253
##
##
## Credit.Amount 0 1
## mean 3964.195 2984.177
## sd 3597.093 2379.685
##
## # ... and 15 more tables
Evaluating the model performance:
# Evaluate the model
(conf_nat <- table(predict(naive_model, credit_test), credit_test$Creditability))
##
## 0 1
## 0 42 35
## 1 22 151
(Accuracy <- sum(diag(conf_nat))/sum(conf_nat)*100)
## [1] 77.2
The columns here are about 193 predictions which are correct and 77% are chances that is predicted correctly.