Load packages and datasets
library(FSelector)
library(datasets)
library(class)
library(readr)
library(plyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(base)
library(party)
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
##
## Attaching package: 'modeltools'
## The following object is masked from 'package:plyr':
##
## empty
## Loading required package: strucchange
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
library(rpart)
library(rpart.plot)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(lattice)
Set the working directory, read the data table, and copy it under a new name so the original stays unchanged
# Point the session at the tutorial folder; the relative path below is
# resolved against this directory.
setwd(dir = "/Users/carolyn.khalil/Desktop/R-Tutorial")
# Read the bank credit CSV into a data frame.
# NOTE(review): with the working directory above this resolves to
# .../R-Tutorial/R-tutorial/Data/BankCred.csv -- confirm the nested folder
# (and its capitalisation) is intended.
bankcredit <- read.csv(file = "R-tutorial/Data/BankCred.csv")
# Work on a copy named BCE so `bankcredit` keeps the data exactly as read.
BCE <- bankcredit
Recode 1 to Y and 0 to N in the default column, then print the column names of BCE
# Recode the `default` column: 1 becomes "Y", 0 becomes "N".
# Both comparisons run against the untouched original codes. The previous
# two-step form compared 0 against a column that the first assignment had
# already turned into character. Any other value is kept as its character
# representation, and `%in%` never yields NA, so missing values do not abort
# the assignment.
BCE$default <- local({
  codes <- BCE$default
  labels <- as.character(codes)
  labels[codes %in% 1] <- "Y"
  labels[codes %in% 0] <- "N"
  labels
})
# List the column names of the recoded table.
names(BCE)
## [1] "default" "account_check_status"
## [3] "duration_in_month" "credit_history"
## [5] "purpose" "credit_amount"
## [7] "savings" "present_emp_since"
## [9] "installment_as_income_perc" "personal_status_sex"
## [11] "other_debtors" "present_res_since"
## [13] "property" "age"
## [15] "other_installment_plans" "housing"
## [17] "credits_this_bank" "job"
## [19] "people_under_maintenance" "telephone"
## [21] "foreign_worker"
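Calculate the information gain of every attribute with respect to the default column, then remove all columns with an attribute importance of 0. The code below also removes duration_in_month and credit_amount, even though their importance is not 0.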
# Information gain (in bits, unit = "log2") of every other column of BCE with
# respect to `default`.
defaultIG <- information.gain(
  formula = default ~ .,
  data = BCE,
  unit = "log2"
)
print(defaultIG)
## attr_importance
## account_check_status 0.094738842
## duration_in_month 0.023329147
## credit_history 0.043617799
## purpose 0.024893540
## credit_amount 0.018708664
## savings 0.028114675
## present_emp_since 0.013102323
## installment_as_income_perc 0.000000000
## personal_status_sex 0.006810550
## other_debtors 0.004797021
## present_res_since 0.000000000
## property 0.016985186
## age 0.000000000
## other_installment_plans 0.008875070
## housing 0.012753186
## credits_this_bank 0.000000000
## job 0.001337357
## people_under_maintenance 0.000000000
## telephone 0.000963660
## foreign_worker 0.005822991
# Reduced table BDEE: BCE without the seven columns listed below. The
# remaining columns keep their original order.
BDEE <- BCE[
  ,
  !(names(BCE) %in% c(
    "duration_in_month",
    "credit_amount",
    "installment_as_income_perc",
    "present_res_since",
    "age",
    "people_under_maintenance",
    "credits_this_bank"
  )),
  drop = FALSE
]
#print(BDEE)
Confirm the IG by recalculating the information gain for the default column on the reduced table
# Same information-gain calculation as before, now on the reduced table BDEE.
defaultIG2 <- information.gain(
  formula = default ~ .,
  data = BDEE,
  unit = "log2"
)
print(defaultIG2)
## attr_importance
## account_check_status 0.094738842
## credit_history 0.043617799
## purpose 0.024893540
## savings 0.028114675
## present_emp_since 0.013102323
## personal_status_sex 0.006810550
## other_debtors 0.004797021
## property 0.016985186
## other_installment_plans 0.008875070
## housing 0.012753186
## job 0.001337357
## telephone 0.000963660
## foreign_worker 0.005822991
Isolate the default column together with the 4 columns that have the highest information gain
# Keep only the target column and four predictors, taken from BCE in the
# order listed.
isolate <- BCE[
  ,
  c(
    "default",
    "account_check_status",
    "credit_history",
    "purpose",
    "savings"
  ),
  drop = FALSE
]
#print(isolate)
Convert default column from character to factor
# Store the target column as a factor.
isolate[["default"]] <- factor(isolate[["default"]])
Prepare the decision-tree data from isolate: randomly pick one third of the rows and store their positions in test.index. The new table train is isolate without that third, and test holds the sampled rows.
# Reproducible hold-out split: one third of the rows of `isolate` go to
# `test`, the remaining rows go to `train`.
set.seed(10)
# seq_len() rather than 1:nrow(), so an empty table yields an empty index
# instead of c(1, 0).
index <- seq_len(nrow(isolate))
# Draw row positions directly. Integer division states the test-set size
# explicitly instead of passing a fractional `size` to sample().
test.index <- sample.int(length(index), size = length(index) %/% 3L)
# Positive selection of the complement: unlike isolate[-test.index, ], this
# still keeps every row when test.index is empty.
train <- isolate[setdiff(index, test.index), ]
test <- isolate[test.index, ]
print(test$default)
## [1] N Y N N N N Y N N Y Y N N N N Y N N Y N N N Y N Y N N Y Y N N Y N N N N N
## [38] N N N N N N Y Y N N N N N N Y Y N N N N Y N N N Y Y N N N N N N N N N Y N
## [75] N Y N N N Y N Y N Y N Y N N N N N N N N N N N N N N Y N Y N Y N N N N N Y
## [112] N Y Y N N N N N N Y N N Y N Y Y N N Y Y N N Y N N N N Y N N Y Y Y N N N Y
## [149] N N N Y N Y N N N Y Y N N N N N N N N N Y N Y N N N N N Y N N N N N Y N Y
## [186] Y N Y N N N Y N N Y N N N N N Y N N Y N Y N N N Y Y Y N Y N N N N Y N N Y
## [223] N N N N N N N Y N N N Y N N Y N Y Y N N Y N N N Y N N N N N N N Y N N N Y
## [260] Y N Y Y N N N N N N Y N N N N N N N N N N N N N Y N N N N N Y N Y Y Y Y Y
## [297] N N Y N Y Y N N N N N N N Y N Y N N N N N N N Y N Y Y Y N N N N N Y N N Y
## Levels: N Y
Conditional inference tree fitted on the full isolate table
# Fit a ctree on all rows of `isolate`, with the four predictors named
# explicitly, then plot it.
tree3 <- ctree(
  formula = default ~
    account_check_status + credit_history + purpose + savings,
  data = isolate
)
plot(tree3)
Build the model
# Fit an rpart tree for `default` on the training rows, using every other
# column of `train` as a predictor, and print the resulting splits.
treeFit <- rpart(
  formula = default ~ .,
  data = train
)
print(treeFit)
## n= 667
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 667 205 N (0.6926537 0.3073463)
## 2) account_check_status=>= 200 DM / salary assignments for at least 1 year,no checking account 313 40 N (0.8722045 0.1277955) *
## 3) account_check_status=< 0 DM,0 <= ... < 200 DM 354 165 N (0.5338983 0.4661017)
## 6) credit_history=critical account/ other credits existing (not at this bank),delay in paying off in the past 115 36 N (0.6869565 0.3130435) *
## 7) credit_history=all credits at this bank paid back duly,existing credits paid back duly till now,no credits taken/ all credits paid back duly 239 110 Y (0.4602510 0.5397490)
## 14) savings=.. >= 1000 DM ,500 <= ... < 1000 DM ,unknown/ no savings account 53 19 N (0.6415094 0.3584906)
## 28) purpose=business,car (used),domestic appliances,radio/television,repairs,retraining 33 8 N (0.7575758 0.2424242) *
## 29) purpose=(vacation - does not exist?),car (new),furniture/equipment 20 9 Y (0.4500000 0.5500000)
## 58) account_check_status=0 <= ... < 200 DM 9 3 N (0.6666667 0.3333333) *
## 59) account_check_status=< 0 DM 11 3 Y (0.2727273 0.7272727) *
## 15) savings=... < 100 DM,100 <= ... < 500 DM 186 76 Y (0.4086022 0.5913978)
## 30) credit_history=existing credits paid back duly till now 153 69 Y (0.4509804 0.5490196)
## 60) purpose=car (used),domestic appliances,education,furniture/equipment,radio/television 96 46 N (0.5208333 0.4791667)
## 120) savings=... < 100 DM 84 37 N (0.5595238 0.4404762)
## 240) account_check_status=0 <= ... < 200 DM 36 14 N (0.6111111 0.3888889)
## 480) purpose=domestic appliances,education 27 7 N (0.7407407 0.2592593) *
## 481) purpose=car (used),radio/television 9 2 Y (0.2222222 0.7777778) *
## 241) account_check_status=< 0 DM 48 23 N (0.5208333 0.4791667)
## 482) purpose=car (used),furniture/equipment 8 1 N (0.8750000 0.1250000) *
## 483) purpose=domestic appliances,education,radio/television 40 18 Y (0.4500000 0.5500000) *
## 121) savings=100 <= ... < 500 DM 12 3 Y (0.2500000 0.7500000) *
## 61) purpose=(vacation - does not exist?),business,car (new),repairs,retraining 57 19 Y (0.3333333 0.6666667) *
## 31) credit_history=all credits at this bank paid back duly,no credits taken/ all credits paid back duly 33 7 Y (0.2121212 0.7878788) *
# Plot the fitted rpart tree, passing four box colours as one vector.
rpart.plot(
  treeFit,
  box.col = c("red", "green", "blue", "yellow")
)
Predict the default class for the test rows using treeFit
# Predicted class label for every row of the hold-out table.
Prediction <- predict(
  object = treeFit,
  newdata = test,
  type = "class"
)
# Plot the predicted labels.
plot(Prediction)
Print the confusion matrix to check the accuracy and other statistics
Check if Prediction and the test$default column have the same levels and the same size
# Predicted and reference labels must share exactly the same factor levels.
identical(levels(Prediction), levels(test[["default"]]))
## [1] TRUE
# Number of predictions.
length(Prediction)
## [1] 333
# Number of reference labels.
length(test[["default"]])
## [1] 333
# Reference labels of the hold-out rows.
print(test[["default"]])
## [1] N Y N N N N Y N N Y Y N N N N Y N N Y N N N Y N Y N N Y Y N N Y N N N N N
## [38] N N N N N N Y Y N N N N N N Y Y N N N N Y N N N Y Y N N N N N N N N N Y N
## [75] N Y N N N Y N Y N Y N Y N N N N N N N N N N N N N N Y N Y N Y N N N N N Y
## [112] N Y Y N N N N N N Y N N Y N Y Y N N Y Y N N Y N N N N Y N N Y Y Y N N N Y
## [149] N N N Y N Y N N N Y Y N N N N N N N N N Y N Y N N N N N Y N N N N N Y N Y
## [186] Y N Y N N N Y N N Y N N N N N Y N N Y N Y N N N Y Y Y N Y N N N N Y N N Y
## [223] N N N N N N N Y N N N Y N N Y N Y Y N N Y N N N Y N N N N N N N Y N N N Y
## [260] Y N Y Y N N N N N N Y N N N N N N N N N N N N N Y N N N N N Y N Y Y Y Y Y
## [297] N N Y N Y Y N N N N N N N Y N Y N N N N N N N Y N Y Y Y N N N N N Y N N Y
## Levels: N Y
Prepare confusion matrix
# Cross-tabulate predicted against reference labels and report the summary
# statistics. No `positive` argument is given; the output below reports 'N'
# as the positive class.
confusionMatrix(
  data = Prediction,
  reference = test$default
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction N Y
## N 191 50
## Y 47 45
##
## Accuracy : 0.7087
## 95% CI : (0.6567, 0.757)
## No Information Rate : 0.7147
## P-Value [Acc > NIR] : 0.6222
##
## Kappa : 0.2789
##
## Mcnemar's Test P-Value : 0.8391
##
## Sensitivity : 0.8025
## Specificity : 0.4737
## Pos Pred Value : 0.7925
## Neg Pred Value : 0.4891
## Prevalence : 0.7147
## Detection Rate : 0.5736
## Detection Prevalence : 0.7237
## Balanced Accuracy : 0.6381
##
## 'Positive' Class : N
##