Example Iris data

data(iris)
library(ggplot2)
library(caret)
## Loading required package: lattice
inTrain<- createDataPartition(y=iris$Species,p=0.7,list=F)
training<- iris[inTrain,]
testing<- iris[-inTrain,]
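
As a quick sanity check (not part of the original output), the 70/30 stratified split should leave about 105 rows for training and 45 for testing, matching the "105 samples" reported by train() below:

dim(training); dim(testing)   # roughly 105 and 45 rows
table(training$Species)       # createDataPartition stratifies by Species, ~35 of each class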

Random forests

modFit<- train(Species~.,data=training,method="rf",prox=TRUE)
## Loading required package: randomForest
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
modFit
## Random Forest 
## 
## 105 samples
##   4 predictor
##   3 classes: 'setosa', 'versicolor', 'virginica' 
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## 
## Summary of sample sizes: 105, 105, 105, 105, 105, 105, ... 
## 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa      Accuracy SD  Kappa SD  
##   2     0.9688479  0.9522425  0.02867577   0.04401324
##   3     0.9644136  0.9455036  0.03271878   0.05012127
##   4     0.9661158  0.9480755  0.03229235   0.04952524
## 
## Accuracy was used to select the optimal model using  the largest value.
## The final value used for the model was mtry = 2.
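
By default train() tunes mtry over a small grid using 25 bootstrap resamples, as shown above. As a sketch (not the author's original code), the resampling scheme and tuning grid can be controlled with trainControl() and tuneGrid, and plot() on the fitted object shows accuracy across the mtry values that were tried:

ctrl<- trainControl(method="cv",number=5)   # 5-fold cross-validation instead of 25 bootstraps
modFitCV<- train(Species~.,data=training,method="rf",
                 trControl=ctrl,tuneGrid=data.frame(mtry=2),prox=TRUE)
plot(modFit)   # accuracy vs. mtry for the original bootstrap tuning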

Getting a single tree

getTree(modFit$finalModel,k=2) # extract the 2nd tree from the fitted forest
##    left daughter right daughter split var split point status prediction
## 1              2              3         4        0.80      1          0
## 2              0              0         0        0.00     -1          1
## 3              4              5         1        5.95      1          0
## 4              0              0         0        0.00     -1          2
## 5              6              7         2        2.60      1          0
## 6              8              9         3        4.95      1          0
## 7             10             11         3        4.75      1          0
## 8              0              0         0        0.00     -1          2
## 9              0              0         0        0.00     -1          3
## 10             0              0         0        0.00     -1          2
## 11            12             13         3        5.00      1          0
## 12            14             15         4        1.65      1          0
## 13             0              0         0        0.00     -1          3
## 14             0              0         0        0.00     -1          2
## 15             0              0         0        0.00     -1          3
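
In this table, "split var" is the column index of the predictor used at each split, "split point" is its threshold, status -1 marks a terminal node, and "prediction" indexes the class levels (0 for internal nodes). Passing labelVar=TRUE to getTree() prints variable names and class labels instead of the numeric codes:

getTree(modFit$finalModel,k=2,labelVar=TRUE)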

Class “centers”

irisP<- classCenter(training[,c(3,4)],training$Species,modFit$finalModel$prox)
# Plot the center point of each class
irisP<- as.data.frame(irisP)
irisP$Species<- rownames(irisP)
q<- qplot(Petal.Width,Petal.Length,col=Species,data=training)
q+geom_point(aes(x=Petal.Width,y=Petal.Length,col=Species),size=5,shape=4,data=irisP)
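
The proximity matrix used by classCenter() is available only because prox=TRUE was passed to train(). As a sketch (rfDirect and irisP2 are new names, not from the original analysis), the same centers could be computed from a randomForest fit directly:

rfDirect<- randomForest(Species~.,data=training,proximity=TRUE)
irisP2<- classCenter(training[,c(3,4)],training$Species,rfDirect$proximity)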

Predicting new values

pred<- predict(modFit,testing)
table(pred,testing$Species) # 3 samples are misclassified: 2 versicolor and 1 virginica
##             
## pred         setosa versicolor virginica
##   setosa         15          0         0
##   versicolor      0         13         1
##   virginica       0          2        14
testing$predRight<- pred==testing$Species 
qplot(Petal.Width,Petal.Length,col=predRight,data=testing,main="newdata Predictions")

In this plot we can see in what range of Petal.Width and Petal.Length the misclassifications happen.
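
Beyond the raw table, caret's confusionMatrix() reports overall accuracy, kappa and per-class sensitivity/specificity (a sketch; the exact numbers depend on the random partition):

confusionMatrix(pred,testing$Species)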