ANLY 530 Lab 2

setwd("/Users/lekhana/Harrisburgh University/2018_05to08_Aram_MachineLearning/Laboratory #2-20180728")
credit <- read.csv("creditData.csv")
str(credit)

## 'data.frame':    1000 obs. of  21 variables:
##  $ Creditability                    : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Account.Balance                  : int  1 1 2 1 1 1 1 1 4 2 ...
##  $ Duration.of.Credit..month.       : int  18 9 12 12 12 10 8 6 18 24 ...
##  $ Payment.Status.of.Previous.Credit: int  4 4 2 4 4 4 4 4 4 2 ...
##  $ Purpose                          : int  2 0 9 0 0 0 0 0 3 3 ...
##  $ Credit.Amount                    : int  1049 2799 841 2122 2171 2241 3398 1361 1098 3758 ...
##  $ Value.Savings.Stocks             : int  1 1 2 1 1 1 1 1 1 3 ...
##  $ Length.of.current.employment     : int  2 3 4 3 3 2 4 2 1 1 ...
##  $ Instalment.per.cent              : int  4 2 2 3 4 1 1 2 4 1 ...
##  $ Sex...Marital.Status             : int  2 3 2 3 3 3 3 3 2 2 ...
##  $ Guarantors                       : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Duration.in.Current.address      : int  4 2 4 2 4 3 4 4 4 4 ...
##  $ Most.valuable.available.asset    : int  2 1 1 1 2 1 1 1 3 4 ...
##  $ Age..years.                      : int  21 36 23 39 38 48 39 40 65 23 ...
##  $ Concurrent.Credits               : int  3 3 3 3 1 3 3 3 3 3 ...
##  $ Type.of.apartment                : int  1 1 1 1 2 1 2 2 2 1 ...
##  $ No.of.Credits.at.this.Bank       : int  1 2 1 2 2 2 2 1 2 1 ...
##  $ Occupation                       : int  3 3 2 2 2 2 2 2 1 1 ...
##  $ No.of.dependents                 : int  1 2 1 2 1 2 1 2 1 1 ...
##  $ Telephone                        : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Foreign.Worker                   : int  1 1 1 2 2 2 2 2 1 1 ...

summary(credit)

##  Creditability Account.Balance Duration.of.Credit..month.
##  Min.   :0.0   Min.   :1.000   Min.   : 4.0              
##  1st Qu.:0.0   1st Qu.:1.000   1st Qu.:12.0              
##  Median :1.0   Median :2.000   Median :18.0              
##  Mean   :0.7   Mean   :2.577   Mean   :20.9              
##  3rd Qu.:1.0   3rd Qu.:4.000   3rd Qu.:24.0              
##  Max.   :1.0   Max.   :4.000   Max.   :72.0              
##  Payment.Status.of.Previous.Credit    Purpose       Credit.Amount  
##  Min.   :0.000                     Min.   : 0.000   Min.   :  250  
##  1st Qu.:2.000                     1st Qu.: 1.000   1st Qu.: 1366  
##  Median :2.000                     Median : 2.000   Median : 2320  
##  Mean   :2.545                     Mean   : 2.828   Mean   : 3271  
##  3rd Qu.:4.000                     3rd Qu.: 3.000   3rd Qu.: 3972  
##  Max.   :4.000                     Max.   :10.000   Max.   :18424  
##  Value.Savings.Stocks Length.of.current.employment Instalment.per.cent
##  Min.   :1.000        Min.   :1.000                Min.   :1.000      
##  1st Qu.:1.000        1st Qu.:3.000                1st Qu.:2.000      
##  Median :1.000        Median :3.000                Median :3.000      
##  Mean   :2.105        Mean   :3.384                Mean   :2.973      
##  3rd Qu.:3.000        3rd Qu.:5.000                3rd Qu.:4.000      
##  Max.   :5.000        Max.   :5.000                Max.   :4.000      
##  Sex...Marital.Status   Guarantors    Duration.in.Current.address
##  Min.   :1.000        Min.   :1.000   Min.   :1.000              
##  1st Qu.:2.000        1st Qu.:1.000   1st Qu.:2.000              
##  Median :3.000        Median :1.000   Median :3.000              
##  Mean   :2.682        Mean   :1.145   Mean   :2.845              
##  3rd Qu.:3.000        3rd Qu.:1.000   3rd Qu.:4.000              
##  Max.   :4.000        Max.   :3.000   Max.   :4.000              
##  Most.valuable.available.asset  Age..years.    Concurrent.Credits
##  Min.   :1.000                 Min.   :19.00   Min.   :1.000     
##  1st Qu.:1.000                 1st Qu.:27.00   1st Qu.:3.000     
##  Median :2.000                 Median :33.00   Median :3.000     
##  Mean   :2.358                 Mean   :35.54   Mean   :2.675     
##  3rd Qu.:3.000                 3rd Qu.:42.00   3rd Qu.:3.000     
##  Max.   :4.000                 Max.   :75.00   Max.   :3.000     
##  Type.of.apartment No.of.Credits.at.this.Bank   Occupation   
##  Min.   :1.000     Min.   :1.000              Min.   :1.000  
##  1st Qu.:2.000     1st Qu.:1.000              1st Qu.:3.000  
##  Median :2.000     Median :1.000              Median :3.000  
##  Mean   :1.928     Mean   :1.407              Mean   :2.904  
##  3rd Qu.:2.000     3rd Qu.:2.000              3rd Qu.:3.000  
##  Max.   :3.000     Max.   :4.000              Max.   :4.000  
##  No.of.dependents   Telephone     Foreign.Worker 
##  Min.   :1.000    Min.   :1.000   Min.   :1.000  
##  1st Qu.:1.000    1st Qu.:1.000   1st Qu.:1.000  
##  Median :1.000    Median :1.000   Median :1.000  
##  Mean   :1.155    Mean   :1.404   Mean   :1.037  
##  3rd Qu.:1.000    3rd Qu.:2.000   3rd Qu.:1.000  
##  Max.   :2.000    Max.   :2.000   Max.   :2.000

Data Pre processing: We need to split the dataset and we have to randomize it. The original dataset might be set by a specific order.

credit$Creditability <- as.factor(credit$Creditability)
sum(is.na(credit))

## [1] 0

set.seed(12345)
credit_rand <- credit[order(runif(1000)), ]
summary(credit$Credit.Amount)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     250    1366    2320    3271    3972   18424

summary(credit_rand$Credit.Amount)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     250    1366    2320    3271    3972   18424

Split data to 75% train and 25% test

credit_train <- credit_rand[1:750, ]
credit_test <- credit_rand[751:1000, ]

prop.table(table(credit_train$Creditability))

## 
##         0         1 
## 0.3146667 0.6853333

prop.table(table(credit_test$Creditability))

## 
##     0     1 
## 0.256 0.744

Model Training: We are using Naive Bayes here.

library(naivebayes)
naive_model <- naive_bayes(Creditability ~ ., data= credit_train)
naive_model

## ===================== Naive Bayes ===================== 
## Call: 
## naive_bayes.formula(formula = Creditability ~ ., data = credit_train)
## 
## A priori probabilities: 
## 
##         0         1 
## 0.3146667 0.6853333 
## 
## Tables: 
##                
## Account.Balance        0        1
##            mean 1.923729 2.793774
##            sd   1.036826 1.252008
## 
##                           
## Duration.of.Credit..month.        0        1
##                       mean 24.46610 19.20039
##                       sd   13.82208 11.13433
## 
##                                  
## Payment.Status.of.Previous.Credit        0        1
##                              mean 2.161017 2.665370
##                              sd   1.071649 1.045219
## 
##        
## Purpose        0        1
##    mean 2.927966 2.803502
##    sd   2.944722 2.633253
## 
##              
## Credit.Amount        0        1
##          mean 3964.195 2984.177
##          sd   3597.093 2379.685
## 
## # ... and 15 more tables

Evaluating the model performance:

# Evaluate the model
(conf_nat <- table(predict(naive_model, credit_test), credit_test$Creditability))

##    
##       0   1
##   0  42  35
##   1  22 151

(Accuracy <- sum(diag(conf_nat))/sum(conf_nat)*100)

## [1] 77.2

The columns here are about 193 predictions which are correct and 77% are chances that is predicted correctly.