R Notebook

Load packages and datasets

library(FSelector)
library(datasets)
library(class)
library(readr)
library(plyr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(base)
library(party)

## Loading required package: grid

## Loading required package: mvtnorm

## Loading required package: modeltools

## Loading required package: stats4

## 
## Attaching package: 'modeltools'

## The following object is masked from 'package:plyr':
## 
##     empty

## Loading required package: strucchange

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

## Loading required package: sandwich

library(rpart)
library(rpart.plot)
library(caret)

## Loading required package: lattice

## Loading required package: ggplot2

library(lattice)

Point R at the project directory, read the data table, and copy it to a new name so the original stays untouched

# Build the full path to the CSV instead of calling setwd(), so that reading
# the data does not change the session's working directory as a side effect.
project_dir <- "/Users/carolyn.khalil/Desktop/R-Tutorial"
bankcredit <- read.csv(
  file.path(project_dir, "R-tutorial", "Data", "BankCred.csv")
)
# Work on a copy (BCE) so the table as read from disk stays available.
BCE <- bankcredit

Recode 1 to Y and 0 to N in default column, then print columns in BCE

# Relabel the target column: 1 becomes "Y", 0 becomes "N".
# The first assignment turns the column into character, so the second
# comparison is evaluated against the already-converted column.
is_default <- BCE$default == 1
BCE[is_default, "default"] <- "Y"
is_non_default <- BCE$default == 0
BCE[is_non_default, "default"] <- "N"
# List the column names of the recoded table.
names(BCE)

##  [1] "default"                    "account_check_status"      
##  [3] "duration_in_month"          "credit_history"            
##  [5] "purpose"                    "credit_amount"             
##  [7] "savings"                    "present_emp_since"         
##  [9] "installment_as_income_perc" "personal_status_sex"       
## [11] "other_debtors"              "present_res_since"         
## [13] "property"                   "age"                       
## [15] "other_installment_plans"    "housing"                   
## [17] "credits_this_bank"          "job"                       
## [19] "people_under_maintenance"   "telephone"                 
## [21] "foreign_worker"

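Calculate information gain for the default column, then remove every column with an attribute importance of 0 (the code below also drops duration_in_month and credit_amount, even though their importance is non-zero).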

# Information gain (in bits, log base 2) of every attribute with respect to
# the default column.
defaultIG <- information.gain(
  formula = default ~ .,
  data = BCE,
  unit = "log2"
)
print(defaultIG)

##                            attr_importance
## account_check_status           0.094738842
## duration_in_month              0.023329147
## credit_history                 0.043617799
## purpose                        0.024893540
## credit_amount                  0.018708664
## savings                        0.028114675
## present_emp_since              0.013102323
## installment_as_income_perc     0.000000000
## personal_status_sex            0.006810550
## other_debtors                  0.004797021
## present_res_since              0.000000000
## property                       0.016985186
## age                            0.000000000
## other_installment_plans        0.008875070
## housing                        0.012753186
## credits_this_bank              0.000000000
## job                            0.001337357
## people_under_maintenance       0.000000000
## telephone                      0.000963660
## foreign_worker                 0.005822991

# Reduced table: BCE without the seven columns listed below.
BDEE <- select(
  BCE,
  -c(
    duration_in_month,
    credit_amount,
    installment_as_income_perc,
    present_res_since,
    age,
    people_under_maintenance,
    credits_this_bank
  )
)
# print(BDEE)

Confirm the result by recalculating information gain for the default column on the reduced table

# Recompute information gain (log base 2) on the reduced table BDEE.
defaultIG2 <- information.gain(
  formula = default ~ .,
  data = BDEE,
  unit = "log2"
)
print(defaultIG2)

##                         attr_importance
## account_check_status        0.094738842
## credit_history              0.043617799
## purpose                     0.024893540
## savings                     0.028114675
## present_emp_since           0.013102323
## personal_status_sex         0.006810550
## other_debtors               0.004797021
## property                    0.016985186
## other_installment_plans     0.008875070
## housing                     0.012753186
## job                         0.001337357
## telephone                   0.000963660
## foreign_worker              0.005822991

Isolate the default column together with the four attributes that have the highest information gain

# Keep only the target column and the four predictors used for modelling.
isolate <- BCE %>%
  select(
    default,
    account_check_status,
    credit_history,
    purpose,
    savings
  )
# print(isolate)

Convert default column from character to factor

# Store the target as a factor so it can be modelled as a class label.
isolate[["default"]] <- factor(isolate[["default"]])

Split isolate for the decision tree: randomly sample one third of the rows as the test set (row numbers stored in test.index). The table train is isolate without those rows, and test contains only those rows.

# Hold out one third of the rows as a test set; the rest is used for training.
set.seed(10)  # fixed seed so the random split is reproducible
# seq_len() stays correct for an empty table, unlike 1:nrow(), which would
# yield c(1, 0).
index <- seq_len(nrow(isolate))
# Make the rounding down to a whole number of rows explicit instead of
# handing sample() a fractional size.
test.index <- sample(index, size = floor(length(index) / 3))
# Pick the training rows with a logical mask: negative indexing with an
# empty test.index would return zero rows instead of all of them.
train <- isolate[!(index %in% test.index), ]
test <- isolate[test.index, ]
print(test$default)

##   [1] N Y N N N N Y N N Y Y N N N N Y N N Y N N N Y N Y N N Y Y N N Y N N N N N
##  [38] N N N N N N Y Y N N N N N N Y Y N N N N Y N N N Y Y N N N N N N N N N Y N
##  [75] N Y N N N Y N Y N Y N Y N N N N N N N N N N N N N N Y N Y N Y N N N N N Y
## [112] N Y Y N N N N N N Y N N Y N Y Y N N Y Y N N Y N N N N Y N N Y Y Y N N N Y
## [149] N N N Y N Y N N N Y Y N N N N N N N N N Y N Y N N N N N Y N N N N N Y N Y
## [186] Y N Y N N N Y N N Y N N N N N Y N N Y N Y N N N Y Y Y N Y N N N N Y N N Y
## [223] N N N N N N N Y N N N Y N N Y N Y Y N N Y N N N Y N N N N N N N Y N N N Y
## [260] Y N Y Y N N N N N N Y N N N N N N N N N N N N N Y N N N N N Y N Y Y Y Y Y
## [297] N N Y N Y Y N N N N N N N Y N Y N N N N N N N Y N Y Y Y N N N N N Y N N Y
## Levels: N Y

Conditional inference tree fitted on the full isolate table

# Fit a party::ctree model on the complete isolate table (not only the
# training subset) and draw it.
tree3_formula <- default ~ account_check_status + credit_history +
  purpose + savings
tree3 <- ctree(tree3_formula, data = isolate)
plot(tree3)

Build the model

# Fit an rpart tree for default on the training rows, using every other
# column of train as a predictor, then print the fitted splits.
treeFit <- rpart(
  formula = default ~ .,
  data = train
)
print(treeFit)

## n= 667 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##   1) root 667 205 N (0.6926537 0.3073463)  
##     2) account_check_status=>= 200 DM / salary assignments for at least 1 year,no checking account 313  40 N (0.8722045 0.1277955) *
##     3) account_check_status=< 0 DM,0 <= ... < 200 DM 354 165 N (0.5338983 0.4661017)  
##       6) credit_history=critical account/ other credits existing (not at this bank),delay in paying off in the past 115  36 N (0.6869565 0.3130435) *
##       7) credit_history=all credits at this bank paid back duly,existing credits paid back duly till now,no credits taken/ all credits paid back duly 239 110 Y (0.4602510 0.5397490)  
##        14) savings=.. >= 1000 DM ,500 <= ... < 1000 DM ,unknown/ no savings account 53  19 N (0.6415094 0.3584906)  
##          28) purpose=business,car (used),domestic appliances,radio/television,repairs,retraining 33   8 N (0.7575758 0.2424242) *
##          29) purpose=(vacation - does not exist?),car (new),furniture/equipment 20   9 Y (0.4500000 0.5500000)  
##            58) account_check_status=0 <= ... < 200 DM 9   3 N (0.6666667 0.3333333) *
##            59) account_check_status=< 0 DM 11   3 Y (0.2727273 0.7272727) *
##        15) savings=... < 100 DM,100 <= ... < 500 DM 186  76 Y (0.4086022 0.5913978)  
##          30) credit_history=existing credits paid back duly till now 153  69 Y (0.4509804 0.5490196)  
##            60) purpose=car (used),domestic appliances,education,furniture/equipment,radio/television 96  46 N (0.5208333 0.4791667)  
##             120) savings=... < 100 DM 84  37 N (0.5595238 0.4404762)  
##               240) account_check_status=0 <= ... < 200 DM 36  14 N (0.6111111 0.3888889)  
##                 480) purpose=domestic appliances,education 27   7 N (0.7407407 0.2592593) *
##                 481) purpose=car (used),radio/television 9   2 Y (0.2222222 0.7777778) *
##               241) account_check_status=< 0 DM 48  23 N (0.5208333 0.4791667)  
##                 482) purpose=car (used),furniture/equipment 8   1 N (0.8750000 0.1250000) *
##                 483) purpose=domestic appliances,education,radio/television 40  18 Y (0.4500000 0.5500000) *
##             121) savings=100 <= ... < 500 DM 12   3 Y (0.2500000 0.7500000) *
##            61) purpose=(vacation - does not exist?),business,car (new),repairs,retraining 57  19 Y (0.3333333 0.6666667) *
##          31) credit_history=all credits at this bank paid back duly,no credits taken/ all credits paid back duly 33   7 Y (0.2121212 0.7878788) *

# Draw the fitted rpart tree with coloured node boxes.
# NOTE(review): four colours are supplied for more than four nodes, so they
# are presumably recycled across nodes rather than tied to the predicted
# class - confirm this is the intended colouring.
node_colours <- c("red", "green", "blue", "yellow")
rpart.plot(treeFit, box.col = node_colours)

Predict the default class for the test rows using treeFit

# Predicted class label (N / Y) for every held-out row.
Prediction <- predict(
  treeFit,
  newdata = test,
  type = "class"
)
# Plot the predicted labels.
plot(Prediction)

Print the confusion matrix to check the accuracy and other statistics

Check that Prediction and the test$default column have the same factor levels and the same length

# The predicted and the reference factor must share the same levels, in the
# same order, before they are compared.
identical(levels(test$default), levels(Prediction))

## [1] TRUE

# Number of predictions produced for the test set.
length(Prediction)

## [1] 333

# Number of reference labels in the test set; should equal the number of
# predictions.
length(test$default)

## [1] 333

# Show the reference labels of the held-out rows.
print(test[["default"]])

##   [1] N Y N N N N Y N N Y Y N N N N Y N N Y N N N Y N Y N N Y Y N N Y N N N N N
##  [38] N N N N N N Y Y N N N N N N Y Y N N N N Y N N N Y Y N N N N N N N N N Y N
##  [75] N Y N N N Y N Y N Y N Y N N N N N N N N N N N N N N Y N Y N Y N N N N N Y
## [112] N Y Y N N N N N N Y N N Y N Y Y N N Y Y N N Y N N N N Y N N Y Y Y N N N Y
## [149] N N N Y N Y N N N Y Y N N N N N N N N N Y N Y N N N N N Y N N N N N Y N Y
## [186] Y N Y N N N Y N N Y N N N N N Y N N Y N Y N N N Y Y Y N Y N N N N Y N N Y
## [223] N N N N N N N Y N N N Y N N Y N Y Y N N Y N N N Y N N N N N N N Y N N N Y
## [260] Y N Y Y N N N N N N Y N N N N N N N N N N N N N Y N N N N N Y N Y Y Y Y Y
## [297] N N Y N Y Y N N N N N N N Y N Y N N N N N N N Y N Y Y Y N N N N N Y N N Y
## Levels: N Y

Prepare confusion matrix

# Cross-tabulate predicted against actual labels and report accuracy
# statistics. No positive class is specified, so the statistics are reported
# with "N" as the 'Positive' class.
confusionMatrix(
  data = Prediction,
  reference = test$default
)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   N   Y
##          N 191  50
##          Y  47  45
##                                          
##                Accuracy : 0.7087         
##                  95% CI : (0.6567, 0.757)
##     No Information Rate : 0.7147         
##     P-Value [Acc > NIR] : 0.6222         
##                                          
##                   Kappa : 0.2789         
##                                          
##  Mcnemar's Test P-Value : 0.8391         
##                                          
##             Sensitivity : 0.8025         
##             Specificity : 0.4737         
##          Pos Pred Value : 0.7925         
##          Neg Pred Value : 0.4891         
##              Prevalence : 0.7147         
##          Detection Rate : 0.5736         
##    Detection Prevalence : 0.7237         
##       Balanced Accuracy : 0.6381         
##                                          
##        'Positive' Class : N              
##