data(iris)
library(ggplot2)
library(caret)
## Loading required package: lattice
# Fix the RNG seed so the random split and the forest are reproducible.
# NOTE: the console output recorded in this file came from an unseeded run,
# so the exact numbers printed on a rerun will differ from the transcript.
set.seed(1234)
# Put 70% of the rows in the training set (sampled on Species) and keep the
# rest for testing. list = FALSE returns row indices as a matrix, not a list.
inTrain <- createDataPartition(y = iris$Species, p = 0.7, list = FALSE)
training <- iris[inTrain, ]
testing <- iris[-inTrain, ]
# Random forest on all remaining columns as predictors. `proximity` is spelled
# out in full (instead of the partially matched `prox`) so the proximity
# matrix needed later by classCenter() is kept on the final model.
modFit <- train(Species ~ ., data = training, method = "rf", proximity = TRUE)
## Loading required package: randomForest
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
# Print the fitted model: resampling results per mtry and the value selected
modFit
## Random Forest
##
## 105 samples
## 4 predictor
## 3 classes: 'setosa', 'versicolor', 'virginica'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
##
## Summary of sample sizes: 105, 105, 105, 105, 105, 105, ...
##
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa Accuracy SD Kappa SD
## 2 0.9688479 0.9522425 0.02867577 0.04401324
## 3 0.9644136 0.9455036 0.03271878 0.05012127
## 4 0.9661158 0.9480755 0.03229235 0.04952524
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
# Show the node table of the 2nd single tree in the forest. The call goes
# through the randomForest namespace so it does not depend on train() having
# attached the package to the search path as a side effect.
randomForest::getTree(modFit$finalModel, k = 2)
## left daughter right daughter split var split point status prediction
## 1 2 3 4 0.80 1 0
## 2 0 0 0 0.00 -1 1
## 3 4 5 1 5.95 1 0
## 4 0 0 0 0.00 -1 2
## 5 6 7 2 2.60 1 0
## 6 8 9 3 4.95 1 0
## 7 10 11 3 4.75 1 0
## 8 0 0 0 0.00 -1 2
## 9 0 0 0 0.00 -1 3
## 10 0 0 0 0.00 -1 2
## 11 12 13 3 5.00 1 0
## 12 14 15 4 1.65 1 0
## 13 0 0 0 0.00 -1 3
## 14 0 0 0 0.00 -1 2
## 15 0 0 0 0.00 -1 3
# Per-class centers of the two petal measurements, computed from the forest's
# proximity matrix. Columns are selected by name rather than by position, and
# `proximity` is spelled out instead of relying on `$` partial matching.
irisP <- randomForest::classCenter(
  training[, c("Petal.Length", "Petal.Width")],
  training$Species,
  modFit$finalModel$proximity
)
# Turn the centers into a data frame whose Species column holds the class
# labels (taken from the row names) so it can be used as a plot layer.
irisP <- as.data.frame(irisP)
irisP$Species <- rownames(irisP)
# Plot the training points colored by species, then overlay the center point
# of each class as a large cross. ggplot() replaces the deprecated qplot().
q <- ggplot(
  training,
  aes(x = Petal.Width, y = Petal.Length, colour = Species)
) +
  geom_point()
q + geom_point(
  aes(x = Petal.Width, y = Petal.Length, colour = Species),
  size = 5, shape = 4, data = irisP
)
# Predicting new values: classify the held-out rows with the fitted model
pred <- predict(modFit, newdata = testing)
# Cross-tabulate predictions (rows) against the true species (columns).
# In the run recorded below, 3 test flowers are misclassified: 2 versicolor
# predicted as virginica and 1 virginica predicted as versicolor.
table(pred, testing$Species)
##
## pred setosa versicolor virginica
## setosa 15 0 0
## versicolor 0 13 1
## virginica 0 2 14
# Flag each test row: TRUE if the prediction matches the true species
testing$predRight <- pred == testing$Species
# Scatter of the test set colored by whether the prediction was right.
# ggplot() replaces the deprecated qplot(); title and aesthetics are unchanged.
ggplot(testing, aes(x = Petal.Width, y = Petal.Length, colour = predRight)) +
  geom_point() +
  ggtitle("newdata Predictions")
# This plot shows in which range of petal width and length the
# misclassifications occur.