library(rpart) #classification and regression trees
library(partykit) #tree plots
## Loading required package: grid
library(randomForest) #random forests
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
library(gbm) #gradient boosting
## Loading required package: survival
## Loading required package: lattice
## Loading required package: splines
## Loading required package: parallel
## Loaded gbm 2.1.3
library(caret) #tune hyper-parameters
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
## 
##     margin
## 
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
## 
##     cluster
library(mlbench)
data(PimaIndiansDiabetes)
dim(PimaIndiansDiabetes)
## [1] 768   9
levels(PimaIndiansDiabetes$diabetes)
## [1] "neg" "pos"
head(PimaIndiansDiabetes)
##   pregnant glucose pressure triceps insulin mass pedigree age diabetes
## 1        6     148       72      35       0 33.6    0.627  50      pos
## 2        1      85       66      29       0 26.6    0.351  31      neg
## 3        8     183       64       0       0 23.3    0.672  32      pos
## 4        1      89       66      23      94 28.1    0.167  21      neg
## 5        0     137       40      35     168 43.1    2.288  33      pos
## 6        5     116       74       0       0 25.6    0.201  30      neg
str(PimaIndiansDiabetes)
## 'data.frame':    768 obs. of  9 variables:
##  $ pregnant: num  6 1 8 1 0 5 3 10 2 8 ...
##  $ glucose : num  148 85 183 89 137 116 78 115 197 125 ...
##  $ pressure: num  72 66 64 66 40 74 50 0 70 96 ...
##  $ triceps : num  35 29 0 23 35 0 32 0 45 0 ...
##  $ insulin : num  0 0 0 94 168 0 88 0 543 0 ...
##  $ mass    : num  33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
##  $ pedigree: num  0.627 0.351 0.672 0.167 2.288 ...
##  $ age     : num  50 31 32 21 33 30 26 29 53 54 ...
##  $ diabetes: Factor w/ 2 levels "neg","pos": 2 1 2 1 2 1 2 1 2 2 ...
summary(PimaIndiansDiabetes)
##     pregnant         glucose         pressure         triceps     
##  Min.   : 0.000   Min.   :  0.0   Min.   :  0.00   Min.   : 0.00  
##  1st Qu.: 1.000   1st Qu.: 99.0   1st Qu.: 62.00   1st Qu.: 0.00  
##  Median : 3.000   Median :117.0   Median : 72.00   Median :23.00  
##  Mean   : 3.845   Mean   :120.9   Mean   : 69.11   Mean   :20.54  
##  3rd Qu.: 6.000   3rd Qu.:140.2   3rd Qu.: 80.00   3rd Qu.:32.00  
##  Max.   :17.000   Max.   :199.0   Max.   :122.00   Max.   :99.00  
##     insulin           mass          pedigree           age       
##  Min.   :  0.0   Min.   : 0.00   Min.   :0.0780   Min.   :21.00  
##  1st Qu.:  0.0   1st Qu.:27.30   1st Qu.:0.2437   1st Qu.:24.00  
##  Median : 30.5   Median :32.00   Median :0.3725   Median :29.00  
##  Mean   : 79.8   Mean   :31.99   Mean   :0.4719   Mean   :33.24  
##  3rd Qu.:127.2   3rd Qu.:36.60   3rd Qu.:0.6262   3rd Qu.:41.00  
##  Max.   :846.0   Max.   :67.10   Max.   :2.4200   Max.   :81.00  
##  diabetes 
##  neg:500  
##  pos:268  
##           
##           
##           
## 
set.seed(123) #set the random seed so the split is reproducible
ind = sample(2, nrow(PimaIndiansDiabetes), replace=TRUE, prob=c(0.5, 0.5))
train = PimaIndiansDiabetes[ind==1,] #the training data set
test = PimaIndiansDiabetes[ind==2,] #the test data set
str(test) #confirm it worked
## 'data.frame':    394 obs. of  9 variables:
##  $ pregnant: num  6 8 5 8 10 5 0 7 1 13 ...
##  $ glucose : num  148 183 116 125 168 166 118 107 103 145 ...
##  $ pressure: num  72 64 74 96 74 72 84 74 30 82 ...
##  $ triceps : num  35 0 0 0 0 19 47 0 38 19 ...
##  $ insulin : num  0 0 0 0 0 175 230 0 83 110 ...
##  $ mass    : num  33.6 23.3 25.6 0 38 25.8 45.8 29.6 43.3 22.2 ...
##  $ pedigree: num  0.627 0.672 0.201 0.232 0.537 0.587 0.551 0.254 0.183 0.245 ...
##  $ age     : num  50 32 30 54 34 51 31 31 33 57 ...
##  $ diabetes: Factor w/ 2 levels "neg","pos": 2 2 1 2 2 2 2 2 1 1 ...
table(train$diabetes)
## 
## neg pos 
## 239 135
table(test$diabetes)
## 
## neg pos 
## 261 133
set.seed(123)
rf.pima = randomForest(diabetes~., data=train) #default settings: 500 trees, mtry = floor(sqrt(8)) = 2
print(rf.pima)
## 
## Call:
##  randomForest(formula = diabetes ~ ., data = train) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 25.67%
## Confusion matrix:
##     neg pos class.error
## neg 203  36   0.1506276
## pos  60  75   0.4444444
plot(rf.pima) #error rates as a function of the number of trees

which.min(rf.pima$err.rate[,1]) #number of trees with the lowest OOB error
## [1] 97
rf.pima.2 = randomForest(diabetes~., data=train, ntree=97) #refit with the OOB-optimal number of trees
print(rf.pima.2)
## 
## Call:
##  randomForest(formula = diabetes ~ ., data = train, ntree = 97) 
##                Type of random forest: classification
##                      Number of trees: 97
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 25.94%
## Confusion matrix:
##     neg pos class.error
## neg 202  37   0.1548117
## pos  60  75   0.4444444
(202+75)/374 #OOB accuracy: correct classifications over the 374 training observations
## [1] 0.7406417
varImpPlot(rf.pima.2) #variable importance