library(rpart) #classification and regression trees
library(partykit) #treeplots
## Loading required package: grid
library(randomForest) #random forests
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
library(gbm) #gradient boosting
## Loading required package: survival
## Loading required package: lattice
## Loading required package: splines
## Loading required package: parallel
## Loaded gbm 2.1.3
library(caret) #tune hyper-parameters
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
##
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
##
## cluster
library(mlbench)
data(PimaIndiansDiabetes)
dim(PimaIndiansDiabetes)
## [1] 768 9
levels(PimaIndiansDiabetes$diabetes)
## [1] "neg" "pos"
head(PimaIndiansDiabetes)
## pregnant glucose pressure triceps insulin mass pedigree age diabetes
## 1 6 148 72 35 0 33.6 0.627 50 pos
## 2 1 85 66 29 0 26.6 0.351 31 neg
## 3 8 183 64 0 0 23.3 0.672 32 pos
## 4 1 89 66 23 94 28.1 0.167 21 neg
## 5 0 137 40 35 168 43.1 2.288 33 pos
## 6 5 116 74 0 0 25.6 0.201 30 neg
str(PimaIndiansDiabetes)
## 'data.frame': 768 obs. of 9 variables:
## $ pregnant: num 6 1 8 1 0 5 3 10 2 8 ...
## $ glucose : num 148 85 183 89 137 116 78 115 197 125 ...
## $ pressure: num 72 66 64 66 40 74 50 0 70 96 ...
## $ triceps : num 35 29 0 23 35 0 32 0 45 0 ...
## $ insulin : num 0 0 0 94 168 0 88 0 543 0 ...
## $ mass : num 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
## $ pedigree: num 0.627 0.351 0.672 0.167 2.288 ...
## $ age : num 50 31 32 21 33 30 26 29 53 54 ...
## $ diabetes: Factor w/ 2 levels "neg","pos": 2 1 2 1 2 1 2 1 2 2 ...
summary(PimaIndiansDiabetes)
## pregnant glucose pressure triceps
## Min. : 0.000 Min. : 0.0 Min. : 0.00 Min. : 0.00
## 1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 62.00 1st Qu.: 0.00
## Median : 3.000 Median :117.0 Median : 72.00 Median :23.00
## Mean : 3.845 Mean :120.9 Mean : 69.11 Mean :20.54
## 3rd Qu.: 6.000 3rd Qu.:140.2 3rd Qu.: 80.00 3rd Qu.:32.00
## Max. :17.000 Max. :199.0 Max. :122.00 Max. :99.00
## insulin mass pedigree age
## Min. : 0.0 Min. : 0.00 Min. :0.0780 Min. :21.00
## 1st Qu.: 0.0 1st Qu.:27.30 1st Qu.:0.2437 1st Qu.:24.00
## Median : 30.5 Median :32.00 Median :0.3725 Median :29.00
## Mean : 79.8 Mean :31.99 Mean :0.4719 Mean :33.24
## 3rd Qu.:127.2 3rd Qu.:36.60 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :846.0 Max. :67.10 Max. :2.4200 Max. :81.00
## diabetes
## neg:500
## pos:268
##
##
##
##
set.seed(123) #set the seed so the random split is reproducible
ind = sample(2, nrow(PimaIndiansDiabetes), replace=TRUE, prob=c(0.5, 0.5))
train = PimaIndiansDiabetes[ind==1,] #the training data set
test = PimaIndiansDiabetes[ind==2,] #the test data set
str(test) #confirm it worked
## 'data.frame': 394 obs. of 9 variables:
## $ pregnant: num 6 8 5 8 10 5 0 7 1 13 ...
## $ glucose : num 148 183 116 125 168 166 118 107 103 145 ...
## $ pressure: num 72 64 74 96 74 72 84 74 30 82 ...
## $ triceps : num 35 0 0 0 0 19 47 0 38 19 ...
## $ insulin : num 0 0 0 0 0 175 230 0 83 110 ...
## $ mass : num 33.6 23.3 25.6 0 38 25.8 45.8 29.6 43.3 22.2 ...
## $ pedigree: num 0.627 0.672 0.201 0.232 0.537 0.587 0.551 0.254 0.183 0.245 ...
## $ age : num 50 32 30 54 34 51 31 31 33 57 ...
## $ diabetes: Factor w/ 2 levels "neg","pos": 2 2 1 2 2 2 2 2 1 1 ...
table(train$diabetes)
##
## neg pos
## 239 135
table(test$diabetes)
##
## neg pos
## 261 133
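# An alternative, stratified split (sketch): caret::createDataPartition keeps the
# neg/pos proportions the same in both halves, unlike the simple sample() call
# above. part.idx, train.strat, and test.strat are new names used only for this
# comparison and are not reused later.
set.seed(123)
part.idx = createDataPartition(PimaIndiansDiabetes$diabetes, p=0.5, list=FALSE)
train.strat = PimaIndiansDiabetes[part.idx, ]
test.strat = PimaIndiansDiabetes[-part.idx, ]
table(train.strat$diabetes) #class balance preserved in the training half
table(test.strat$diabetes) #and in the test half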
set.seed(123)
rf.pima = randomForest(diabetes~., data=train) #fit with default settings
print(rf.pima)
##
## Call:
## randomForest(formula = diabetes ~ ., data = train)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 25.67%
## Confusion matrix:
## neg pos class.error
## neg 203 36 0.1506276
## pos 60 75 0.4444444
plot(rf.pima) #error rates versus number of trees

which.min(rf.pima$err.rate[,1]) #number of trees with the lowest OOB error
## [1] 97
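# For reference, the OOB error at that tree count can be read off the same
# err.rate matrix (sketch; column 1 holds the OOB rate):
min(rf.pima$err.rate[,1])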
rf.pima.2 = randomForest(diabetes~., data=train, ntree=97) #refit with the optimal tree count
print(rf.pima.2)
##
## Call:
## randomForest(formula = diabetes ~ ., data = train, ntree = 97)
## Type of random forest: classification
## Number of trees: 97
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 25.94%
## Confusion matrix:
## neg pos class.error
## neg 202 37 0.1548117
## pos 60 75 0.4444444
(202+75)/374 #OOB accuracy: correct predictions over the 374 training observations
## [1] 0.7406417
varImpPlot(rf.pima.2) #variable importance (mean decrease in Gini)

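# The OOB estimate can be checked against the held-out test set (sketch;
# predict() on a randomForest object returns the predicted class by default,
# and rf.test.pred is a new name introduced here):
rf.test.pred = predict(rf.pima.2, newdata=test)
table(rf.test.pred, test$diabetes) #test-set confusion matrix
mean(rf.test.pred == test$diabetes) #test-set accuracy
confusionMatrix(rf.test.pred, test$diabetes) #caret version with sensitivity/specificity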