Today we’re going to make a brief comparison between random forest classifiers and linear models.
First we load in our randomForest library, choose a dataset, partition training/test sets and go to town.
library(caret)
library(mice)
library(randomForest)
# Load the `boys` data set and keep only the fully observed rows.
data(boys)
DF = data.frame(boys)
DF = DF[complete.cases(DF), ]
# Binary target: is the boy older than the median age of the complete cases?
condition = median(DF$age)
DF$oldGuy = as.factor(ifelse(DF$age > condition, 'yes', 'no'))
# Drop age (the target is derived from it, so keeping it would leak the answer)
# together with gen, phb, tv and reg; hgt, wgt, bmi and hc remain as predictors.
DF[, c('age', 'gen', 'phb', 'tv', 'reg')] = NULL
# training/test set: p = .6 puts about 60% of the rows in the training set.
# Fix the RNG seed so the split and the forest below can be reproduced.
set.seed(42)
trainE = createDataPartition(DF$oldGuy, p = .6, list = FALSE)
trainingData = DF[trainE, ]
testingData = DF[-trainE, ]
# Run random Forest: oldGuy is a factor, so this grows a classification forest;
# importance = TRUE stores the per-variable importance measures.
rfm = randomForest(data = trainingData, oldGuy ~ ., na.action = na.omit, importance = TRUE)
# Run generalized linear model. The outcome is recoded to 0/1 first; no family
# is given, so glm() uses its default gaussian family, i.e. an ordinary linear
# model fitted to the 0/1 outcome (not a logistic regression).
trainingData$oldGuy = ifelse(trainingData$oldGuy == 'yes', 1, 0)
gm = glm(data = trainingData, formula = oldGuy ~ hgt + wgt + bmi + hc)
# Show the fitted forest: call, OOB error estimate and confusion matrix.
print(rfm)
##
## Call:
## randomForest(formula = oldGuy ~ ., data = trainingData, importance = TRUE, na.action = na.omit)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 14.81%
## Confusion matrix:
## no yes class.error
## no 57 11 0.1617647
## yes 9 58 0.1343284
# Show the fitted linear model: call, coefficients and deviance summary.
print(gm)
##
## Call: glm(formula = oldGuy ~ hgt + wgt + bmi + hc, data = trainingData)
##
## Coefficients:
## (Intercept) hgt wgt bmi hc
## -2.681763 0.015303 0.008769 -0.018460 0.009599
##
## Degrees of Freedom: 134 Total (i.e. Null); 130 Residual
## Null Deviance: 33.75
## Residual Deviance: 12.23 AIC: 70.92
# Per-variable importance measures stored by the forest (importance = TRUE).
print(importance(rfm))
## no yes MeanDecreaseAccuracy MeanDecreaseGini
## hgt 20.4442318 30.579285 34.635223 31.209065
## wgt 11.0026542 18.150830 20.852348 19.975646
## bmi 0.3590969 5.741561 5.091777 5.277815
## hc 0.4396475 9.122534 7.737677 10.522955
# Cross-tabulate the true test labels (rows) against the forest's predictions (columns).
table(
  actual = testingData$oldGuy,
  predicted = predict(rfm, testingData)
)
## predicted
## actual no yes
## no 38 6
## yes 3 41
# Evaluate the linear model on the held-out data.
# Recode the test labels to 0/1 so they match the coding used to fit `gm`.
testingData$oldGuy = ifelse(testingData$oldGuy == 'yes', 1, 0)
# `gm` returns fitted values on the 0/1 scale; cut them at 0.5 to get a class.
pred = predict(gm, newdata = testingData)
pred = ifelse(pred < .5, 0, 1)
# confusionMatrix() expects factors with identical levels; passing bare numeric
# vectors is rejected by current caret releases. Fixing the levels to c(0, 1)
# also keeps the table 2x2 even if one class is never predicted.
theMatrix = confusionMatrix(
  data = factor(pred, levels = c(0, 1)),
  reference = factor(testingData$oldGuy, levels = c(0, 1)),
  positive = '1'
)
theMatrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 38 1
## 1 6 43
##
## Accuracy : 0.9205
## 95% CI : (0.843, 0.9674)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8409
## Mcnemar's Test P-Value : 0.1306
##
## Sensitivity : 0.9773
## Specificity : 0.8636
## Pos Pred Value : 0.8776
## Neg Pred Value : 0.9744
## Prevalence : 0.5000
## Detection Rate : 0.4886
## Detection Prevalence : 0.5568
## Balanced Accuracy : 0.9205
##
## 'Positive' Class : 1
##
Source : ‘http://blog.yhat.com/posts/10-R-packages-I-wish-I-knew-about-earlier.html’