Decision Tree (CART, via rpart) Classifier Tutorial

  1. Install and load the packages
# Install every package this tutorial loads with library() below, including
# mlbench (the source of the sample data), which would otherwise be missing
# in a fresh R installation. The CRAN mirror is addressed over HTTPS so the
# packages are not downloaded over an unencrypted connection.
install.packages("rpart", repos = "https://cran.rstudio.com/")
## 
## The downloaded binary packages are in
##  /var/folders/3z/jqczpc_95yq_sbgl2665kg2c0000gq/T//Rtmpaaw3sP/downloaded_packages
install.packages("rpart.plot", repos = "https://cran.rstudio.com/")
## 
## The downloaded binary packages are in
##  /var/folders/3z/jqczpc_95yq_sbgl2665kg2c0000gq/T//Rtmpaaw3sP/downloaded_packages
install.packages("caret", repos = "https://cran.rstudio.com/")
## 
## The downloaded binary packages are in
##  /var/folders/3z/jqczpc_95yq_sbgl2665kg2c0000gq/T//Rtmpaaw3sP/downloaded_packages
install.packages("mlbench", repos = "https://cran.rstudio.com/")
library(lattice)
library(ggplot2)
library(caret)
# The packages above (lattice, ggplot2, caret) are used for preprocessing
library(rpart)
library(rpart.plot)
# The packages above (rpart, rpart.plot) fit and plot the decision tree
library(mlbench)
# The package above (mlbench) provides the sample data set

2. Read data

# Load the PimaIndiansDiabetes2 data set shipped with mlbench and bind it to
# the shorter name `data`, which the rest of the script uses.
data("PimaIndiansDiabetes2", package = "mlbench")
data <- PimaIndiansDiabetes2
# Peek at the first rows: eight numeric predictors plus the `diabetes` label.
head(x = data)
##   pregnant glucose pressure triceps insulin mass pedigree age diabetes
## 1        6     148       72      35      NA 33.6    0.627  50      pos
## 2        1      85       66      29      NA 26.6    0.351  31      neg
## 3        8     183       64      NA      NA 23.3    0.672  32      pos
## 4        1      89       66      23      94 28.1    0.167  21      neg
## 5        0     137       40      35     168 43.1    2.288  33      pos
## 6        5     116       74      NA      NA 25.6    0.201  30      neg
# Per-column summary; the NA counts show which predictors need imputation.
summary(object = data)
##     pregnant         glucose         pressure         triceps     
##  Min.   : 0.000   Min.   : 44.0   Min.   : 24.00   Min.   : 7.00  
##  1st Qu.: 1.000   1st Qu.: 99.0   1st Qu.: 64.00   1st Qu.:22.00  
##  Median : 3.000   Median :117.0   Median : 72.00   Median :29.00  
##  Mean   : 3.845   Mean   :121.7   Mean   : 72.41   Mean   :29.15  
##  3rd Qu.: 6.000   3rd Qu.:141.0   3rd Qu.: 80.00   3rd Qu.:36.00  
##  Max.   :17.000   Max.   :199.0   Max.   :122.00   Max.   :99.00  
##                   NA's   :5       NA's   :35       NA's   :227    
##     insulin            mass          pedigree           age       
##  Min.   : 14.00   Min.   :18.20   Min.   :0.0780   Min.   :21.00  
##  1st Qu.: 76.25   1st Qu.:27.50   1st Qu.:0.2437   1st Qu.:24.00  
##  Median :125.00   Median :32.30   Median :0.3725   Median :29.00  
##  Mean   :155.55   Mean   :32.46   Mean   :0.4719   Mean   :33.24  
##  3rd Qu.:190.00   3rd Qu.:36.60   3rd Qu.:0.6262   3rd Qu.:41.00  
##  Max.   :846.00   Max.   :67.10   Max.   :2.4200   Max.   :81.00  
##  NA's   :374      NA's   :11                                      
##  diabetes 
##  neg:500  
##  pos:268  
##           
##           
##           
##           
## 

3. Preprocess data

# Work on the eight predictor columns only; column 9 (`diabetes`) is the
# class label and is re-attached once preprocessing is finished.
predictors <- data[, -9]

# Step 1 - standardize: center and scale every predictor.
preProValues <- preProcess(predictors, method = c("center", "scale"))
scaleddata <- predict(preProValues, predictors)

# Step 2 - Yeo-Johnson power transformation (pushes each predictor toward a
# more normal-looking distribution).
preProcbox <- preProcess(scaleddata, method = "YeoJohnson")
boxdata <- predict(preProcbox, scaleddata)

# Step 3 - fill in the missing values using caret's bagged-tree imputation.
preProcimp <- preProcess(boxdata, method = "bagImpute")
procdata <- predict(preProcimp, boxdata)

# Re-attach the label under the column name `class`, then inspect the result.
procdata[["class"]] <- data[["diabetes"]]
head(procdata)
##     pregnant    glucose    pressure     triceps     insulin       mass
## 1  0.5284016  0.7595155 -0.03275471  0.52823166  0.12783441  0.1613449
## 2 -1.0902050 -1.4250227 -0.52420088 -0.01466832 -1.20274425 -0.9327976
## 3  0.8956985  1.5845945 -0.69028150 -0.78108015  0.03740067 -1.5205042
## 4 -1.0902050 -1.2509263 -0.52420088 -0.62258605 -0.67266546 -0.6791257
## 5 -1.5823833  0.4627602 -2.74133178  0.52823166  0.09906180  1.3218244
## 6  0.3065886 -0.1924995  0.12832774 -0.83700141 -0.56409029 -1.1067781
##     pedigree         age class
## 1  0.3807017  0.90154325   pos
## 2 -0.4347451 -0.20794297   neg
## 3  0.4674736 -0.11085497   pos
## 4 -1.3678689 -1.55542743   neg
## 5  1.7918753 -0.02068449   pos
## 6 -1.1705718 -0.31192824   neg
summary(procdata)
##     pregnant          glucose           pressure           triceps       
##  Min.   :-1.5824   Min.   :-3.4150   Min.   :-4.14946   Min.   :-2.4794  
##  1st Qu.:-1.0902   1st Qu.:-0.8339   1st Qu.:-0.69028   1st Qu.:-0.7462  
##  Median :-0.2734   Median :-0.1578   Median :-0.03275   Median :-0.1115  
##  Mean   :-0.2452   Mean   :-0.1315   Mean   :-0.02782   Mean   :-0.1174  
##  3rd Qu.: 0.5284   3rd Qu.: 0.5530   3rd Qu.: 0.60405   3rd Qu.: 0.5282  
##  Max.   : 1.9793   Max.   : 1.9208   Max.   : 3.77277   Max.   : 4.9183  
##     insulin             mass             pedigree            age         
##  Min.   :-2.0260   Min.   :-2.49746   Min.   :-1.9381   Min.   :-1.5554  
##  1st Qu.:-0.8153   1st Qu.:-0.77951   1st Qu.:-0.9385   1st Qu.:-1.0797  
##  Median :-0.2642   Median :-0.02281   Median :-0.3472   Median :-0.4228  
##  Mean   :-0.3049   Mean   :-0.09971   Mean   :-0.2983   Mean   :-0.3063  
##  3rd Qu.: 0.1426   3rd Qu.: 0.55689   3rd Qu.: 0.3792   3rd Qu.: 0.5118  
##  Max.   : 1.7013   Max.   : 3.63423   Max.   : 1.8466   Max.   : 1.6746  
##  class    
##  neg:500  
##  pos:268  
##           
##           
##           
## 
# Box plots of each standardized predictor, split by the diabetes label.
featurePlot(x = scaleddata, y = data[["diabetes"]], plot = "box")

4. Decision tree

# rpart's built-in cross-validation assigns observations to folds at random,
# so the xerror/xstd columns of the CP table change from run to run. Fixing
# the seed makes the table repeatable for the same input data.
set.seed(123)

# Grow a deliberately large tree: cp = 0 removes the complexity-parameter
# stopping rule. The other rpart.control() defaults still apply, so growth is
# not completely unlimited.
rpartModel <- rpart(class ~ ., data = procdata,
                    control = rpart.control(cp = 0))
# Plot the full (unpruned) tree.
rpart.plot(rpartModel)

# Plot cross-validated error against the complexity parameter / tree size.
plotcp(rpartModel)

# Show the complexity-parameter table that the pruning step works from.
rpartModel$cptable
##             CP nsplit rel error    xerror       xstd
## 1  0.250000000      0 1.0000000 1.0000000 0.04928752
## 2  0.100746269      1 0.7500000 0.8134328 0.04662235
## 3  0.017723881      2 0.6492537 0.6902985 0.04421857
## 4  0.016169154      6 0.5783582 0.6791045 0.04397128
## 5  0.011194030      9 0.5298507 0.6828358 0.04405428
## 6  0.009328358     11 0.5074627 0.7126866 0.04469811
## 7  0.007462687     17 0.4440299 0.7201493 0.04485359
## 8  0.005597015     19 0.4291045 0.7126866 0.04469811
## 9  0.003731343     22 0.4104478 0.7238806 0.04493052
## 10 0.002487562     23 0.4067164 0.7014925 0.04446083
## 11 0.001865672     26 0.3992537 0.6902985 0.04421857
## 12 0.000000000     28 0.3955224 0.6940299 0.04429988
# Choose the pruning level from the CP table: take the row with the smallest
# (cross-validated error + one standard error) and use its CP value.
cptable <- as.data.frame(rpartModel$cptable)
cptable$errsd <- cptable$xerror + cptable$xstd
cpvalue <- cptable[which.min(cptable$errsd), "CP"]

# Prune with the CP value selected above rather than a number copied by hand
# from one particular run, so the pruned tree always matches the current table.
# NOTE: the console output recorded further down in this document was produced
# with cp = 0.007462687; re-running the script will show results for `cpvalue`.
pruneModel <- prune(rpartModel, cp = cpvalue)
# Plot the pruned tree.
rpart.plot(pruneModel)

# Predict class labels for the same data the tree was fitted on and
# cross-tabulate them against the true labels (rows = predicted).
pre <- predict(pruneModel, procdata, type = "class")
pretable <- table(pre, procdata$class)
pretable
##      
## pre   neg pos
##   neg 432  51
##   pos  68 217
# Confusion matrix shown above: rows are predicted classes, columns are the true classes (computed on the training data)
# Training-set accuracy: correctly classified cases (the diagonal of the
# confusion matrix) divided by all cases. The variable keeps its original
# spelling so any later code that refers to it still works.
n_correct <- sum(diag(pretable))
n_total <- sum(pretable)
accurary <- n_correct / n_total
accurary
## [1] 0.8450521
# The value above is the accuracy on the data the tree was fitted to.
varImp(object = pruneModel)
##            Overall
## age       94.58974
## glucose  106.90657
## insulin  100.42287
## mass      93.06814
## pedigree  34.17301
## pregnant  18.21234
## pressure  14.25917
## triceps   58.80070
# Show the importance of each feature