# Install only the packages that are not already available.
# mlbench is listed as well because the data set used by this script is read
# from it. requireNamespace() checks availability without attaching anything,
# so re-running the script does not download the packages again.
required_pkgs <- c("rpart", "rpart.plot", "caret", "mlbench")
is_installed <- vapply(
  required_pkgs, requireNamespace, logical(1),
  quietly = TRUE
)
missing_pkgs <- required_pkgs[!is_installed]
if (length(missing_pkgs) > 0) {
  # https instead of http so the download is not sent in clear text.
  install.packages(missing_pkgs, repos = "https://cran.rstudio.com/")
}
library(lattice)
library(ggplot2)
library(caret)
# The packages above (lattice, ggplot2, caret) are loaded for preprocessing
library(rpart)
library(rpart.plot)
# The packages above (rpart, rpart.plot) are loaded for the decision tree
library(mlbench)
# The package above (mlbench) provides the sample data set
# 2. Read data ----
# Load the PimaIndiansDiabetes2 data set from mlbench and keep a working copy
# under the name `data`, which the rest of the script refers to.
utils::data(list = "PimaIndiansDiabetes2", package = "mlbench")
data <- PimaIndiansDiabetes2

# First six rows of the raw data.
head(data, n = 6L)
## pregnant glucose pressure triceps insulin mass pedigree age diabetes
## 1 6 148 72 35 NA 33.6 0.627 50 pos
## 2 1 85 66 29 NA 26.6 0.351 31 neg
## 3 8 183 64 NA NA 23.3 0.672 32 pos
## 4 1 89 66 23 94 28.1 0.167 21 neg
## 5 0 137 40 35 168 43.1 2.288 33 pos
## 6 5 116 74 NA NA 25.6 0.201 30 neg

# Column-wise summary; the NA's rows show how many values are missing.
summary(data)
## pregnant glucose pressure triceps
## Min. : 0.000 Min. : 44.0 Min. : 24.00 Min. : 7.00
## 1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 64.00 1st Qu.:22.00
## Median : 3.000 Median :117.0 Median : 72.00 Median :29.00
## Mean : 3.845 Mean :121.7 Mean : 72.41 Mean :29.15
## 3rd Qu.: 6.000 3rd Qu.:141.0 3rd Qu.: 80.00 3rd Qu.:36.00
## Max. :17.000 Max. :199.0 Max. :122.00 Max. :99.00
## NA's :5 NA's :35 NA's :227
## insulin mass pedigree age
## Min. : 14.00 Min. :18.20 Min. :0.0780 Min. :21.00
## 1st Qu.: 76.25 1st Qu.:27.50 1st Qu.:0.2437 1st Qu.:24.00
## Median :125.00 Median :32.30 Median :0.3725 Median :29.00
## Mean :155.55 Mean :32.46 Mean :0.4719 Mean :33.24
## 3rd Qu.:190.00 3rd Qu.:36.60 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :846.00 Max. :67.10 Max. :2.4200 Max. :81.00
## NA's :374 NA's :11
## diabetes
## neg:500
## pos:268
##
##
##
##
##
# 3. Preprocess data ----
# Preprocess the predictors. The outcome column `diabetes` is excluded from
# every transformation and re-attached at the end as `class`.
# It is selected by name rather than by position (column 9) so the code stays
# correct if the column order changes.
predictors <- data[, names(data) != "diabetes"]

# Step 1: centre and scale each predictor.
preProValues <- preProcess(predictors, method = c("center", "scale"))
scaleddata <- predict(preProValues, predictors)

# Step 2: Yeo-Johnson transformation of the scaled values (towards a normal
# distribution).
preProcbox <- preProcess(scaleddata, method = c("YeoJohnson"))
boxdata <- predict(preProcbox, scaleddata)

# Step 3: fill in missing values with bagged-tree imputation. Bagging draws
# bootstrap samples, so the seed is fixed to make the imputed values
# repeatable from run to run.
set.seed(123)
preProcimp <- preProcess(boxdata, method = "bagImpute")
procdata <- predict(preProcimp, boxdata)

# Re-attach the outcome under the name `class`.
procdata$class <- data[["diabetes"]]

# NOTE: the output recorded below comes from a run made before the seed was
# fixed; cells that were NA in the raw data (and are imputed here) may differ
# on a rerun.
head(procdata)
## pregnant glucose pressure triceps insulin mass
## 1 0.5284016 0.7595155 -0.03275471 0.52823166 0.12783441 0.1613449
## 2 -1.0902050 -1.4250227 -0.52420088 -0.01466832 -1.20274425 -0.9327976
## 3 0.8956985 1.5845945 -0.69028150 -0.78108015 0.03740067 -1.5205042
## 4 -1.0902050 -1.2509263 -0.52420088 -0.62258605 -0.67266546 -0.6791257
## 5 -1.5823833 0.4627602 -2.74133178 0.52823166 0.09906180 1.3218244
## 6 0.3065886 -0.1924995 0.12832774 -0.83700141 -0.56409029 -1.1067781
## pedigree age class
## 1 0.3807017 0.90154325 pos
## 2 -0.4347451 -0.20794297 neg
## 3 0.4674736 -0.11085497 pos
## 4 -1.3678689 -1.55542743 neg
## 5 1.7918753 -0.02068449 pos
## 6 -1.1705718 -0.31192824 neg
summary(procdata)
## pregnant glucose pressure triceps
## Min. :-1.5824 Min. :-3.4150 Min. :-4.14946 Min. :-2.4794
## 1st Qu.:-1.0902 1st Qu.:-0.8339 1st Qu.:-0.69028 1st Qu.:-0.7462
## Median :-0.2734 Median :-0.1578 Median :-0.03275 Median :-0.1115
## Mean :-0.2452 Mean :-0.1315 Mean :-0.02782 Mean :-0.1174
## 3rd Qu.: 0.5284 3rd Qu.: 0.5530 3rd Qu.: 0.60405 3rd Qu.: 0.5282
## Max. : 1.9793 Max. : 1.9208 Max. : 3.77277 Max. : 4.9183
## insulin mass pedigree age
## Min. :-2.0260 Min. :-2.49746 Min. :-1.9381 Min. :-1.5554
## 1st Qu.:-0.8153 1st Qu.:-0.77951 1st Qu.:-0.9385 1st Qu.:-1.0797
## Median :-0.2642 Median :-0.02281 Median :-0.3472 Median :-0.4228
## Mean :-0.3049 Mean :-0.09971 Mean :-0.2983 Mean :-0.3063
## 3rd Qu.: 0.1426 3rd Qu.: 0.55689 3rd Qu.: 0.3792 3rd Qu.: 0.5118
## Max. : 1.7013 Max. : 3.63423 Max. : 1.8466 Max. : 1.6746
## class
## neg:500
## pos:268
##
##
##
##

# Box plots of each scaled predictor (before the Yeo-Johnson and imputation
# steps), split by diabetes status.
featurePlot(x = scaleddata, y = data[["diabetes"]], plot = "box")
# 4. Decision tree ----
# Grow a classification tree with cp = 0, i.e. without a complexity penalty
# limiting the splits.
# rpart's built-in cross-validation assigns rows to folds at random; fixing
# the seed makes the xerror / xstd columns of the cp table repeatable.
set.seed(123)
rpartModel <- rpart(
  class ~ .,
  data = procdata,
  method = "class",
  control = rpart.control(cp = 0)
)

# Plot the full tree.
rpart.plot(rpartModel)

# Plot cross-validated error against cp / tree size.
plotcp(rpartModel)

# Complexity table. The output recorded below is from an earlier, unseeded
# run; xerror and xstd are cross-validation estimates and will differ on a
# rerun.
rpartModel$cptable
## CP nsplit rel error xerror xstd
## 1 0.250000000 0 1.0000000 1.0000000 0.04928752
## 2 0.100746269 1 0.7500000 0.8134328 0.04662235
## 3 0.017723881 2 0.6492537 0.6902985 0.04421857
## 4 0.016169154 6 0.5783582 0.6791045 0.04397128
## 5 0.011194030 9 0.5298507 0.6828358 0.04405428
## 6 0.009328358 11 0.5074627 0.7126866 0.04469811
## 7 0.007462687 17 0.4440299 0.7201493 0.04485359
## 8 0.005597015 19 0.4291045 0.7126866 0.04469811
## 9 0.003731343 22 0.4104478 0.7238806 0.04493052
## 10 0.002487562 23 0.4067164 0.7014925 0.04446083
## 11 0.001865672 26 0.3992537 0.6902985 0.04421857
## 12 0.000000000 28 0.3955224 0.6940299 0.04429988

# Pick the cp whose cross-validated error plus one standard error is lowest.
cptable <- as.data.frame(rpartModel$cptable)
cptable$errsd <- cptable$xerror + cptable$xstd
cpvalue <- cptable[which.min(cptable$errsd), "CP"]

# Prune at the cp selected above rather than at a value copied from one
# particular run's table.
pruneModel <- prune(rpartModel, cp = cpvalue)

# Plot the pruned tree.
rpart.plot(pruneModel)

# Confusion matrix: predicted class (rows) against actual class (columns).
# NOTE: the outputs recorded from here on come from an earlier run that
# pruned at the literal cp = 0.007462687, so they will not match a rerun
# exactly.
pre <- predict(pruneModel, procdata, type = "class")
pretable <- table(pre, procdata$class)
pretable
##
## pre neg pos
## neg 432 51
## pos 68 217

# Share of rows classified correctly. It is measured on the same rows the
# tree was fitted to, so it is an optimistic estimate.
accuracy <- sum(diag(pretable)) / sum(pretable)
# `accurary` is the original (misspelled) name, kept so that any code still
# referring to it keeps working.
accurary <- accuracy
accuracy
## [1] 0.8450521

# Importance of each feature in the pruned tree.
varImp(pruneModel)
## Overall
## age 94.58974
## glucose 106.90657
## insulin 100.42287
## mass 93.06814
## pedigree 34.17301
## pregnant 18.21234
## pressure 14.25917
## triceps 58.80070