Libraries

require(car)
## Loading required package: car
## Warning: package 'car' was built under R version 4.3.2
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.3.2
require(caret)
## Loading required package: caret
## Warning: package 'caret' was built under R version 4.3.2
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.3.3
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 4.3.2
require(ggplot2)
require(randomForest)
## Loading required package: randomForest
## Warning: package 'randomForest' was built under R version 4.3.2
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin

Plot

# Petal width against petal length. Colour is mapped to Species at the plot
# level, so both the points and the linear fits are split by species.
ggplot(iris, aes(Petal.Length, Petal.Width, colour = Species)) +
  geom_point() +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'

Another Plot

# Same axes as the previous plot, but colour and size are mapped only inside
# geom_point(), so geom_smooth() draws a single linear fit through all rows.
ggplot(iris, aes(Petal.Length, Petal.Width)) +
  geom_point(aes(colour = Species, size = Sepal.Width)) +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'

Train Test Split

# Stratified hold-out: draw 10 row indices without replacement from each
# block of 50 consecutive iris rows (1-50, 51-100, 101-150), giving a 30-row
# test set and a 120-row training set. The seed fixes the draw.
set.seed(42)
p1 <- sample(1:50, size = 10, replace = FALSE)
p2 <- sample(51:100, size = 10, replace = FALSE)
p3 <- sample(101:150, size = 10, replace = FALSE)

held_out <- c(p1, p2, p3)
test <- iris[held_out, ]
train <- iris[-held_out, ]

Hyperparameter Tuning (Grid Search Scored on the Forest's Out-of-Bag Confusion Matrix)

# Grid search over randomForest hyperparameters. Each candidate is scored by
# the accuracy read from the fitted forest's own `confusion` matrix (per the
# randomForest documentation this is based on out-of-bag data), so no separate
# validation set is held out here. The best row is kept in `best` and its
# accuracy in `acc`.
# NOTE(review): the `##` output recorded below predates the `replace` fix;
# re-knit to refresh it.
mtry <- seq(2, 4)
ntree <- c(50, 100, 500, 900)
# Real logicals, not the strings 'F'/'T': expand.grid() converts strings to a
# factor, and the factor's integer codes (1 and 2) are both truthy, so every
# forest was being grown WITH replacement regardless of the grid value.
replace <- c(FALSE, TRUE)
nodesize <- c(1, 2, 3, 4)
mygrid <- expand.grid(
  mtry = mtry,
  ntree = ntree,
  replace = replace,
  nodesize = nodesize
)
acc <- 0
for (j in seq_len(nrow(mygrid))) {
  temprf <- randomForest(
    Species ~ .,
    data = train,
    mtry = mygrid$mtry[j],
    ntree = mygrid$ntree[j],
    replace = mygrid$replace[j],
    nodesize = mygrid$nodesize[j]
  )
  # `confusion` carries an extra class.error column; diag() only reads the
  # class-by-class cells, i.e. the correctly classified counts.
  tempacc <- sum(diag(temprf$confusion)) / nrow(train)
  # Strict `>` keeps the earliest grid row when several candidates tie.
  if (tempacc > acc) {
    acc <- tempacc
    best <- mygrid[j, ]
  }
}
print(best)
##   mtry ntree replace nodesize
## 4    2   100       F        1
print(acc)
## [1] 0.9666667

Use Tuned Hyperparameters in Final Model

# Fit the final forest with the hyperparameters selected by the grid search
# (`best`) rather than re-typing them: the hand-typed ntree = 50 did not match
# the selected row. xtest/ytest make randomForest score the held-out rows in
# the same call.
# NOTE(review): the `##` output recorded in this section predates this change;
# re-knit to refresh it.
myrf <- randomForest(
  Species ~ .,
  data = train,
  xtest = test[, 1:4],
  ytest = test[, 5],
  mtry = best$mtry,
  ntree = best$ntree,
  # as.character() first, so that both a logical grid column and an 'F'/'T'
  # factor column end up as a genuine TRUE/FALSE.
  replace = as.logical(as.character(best$replace)),
  nodesize = best$nodesize
)
print('Training Set Confusion Matrix')
## [1] "Training Set Confusion Matrix"
# caret::confusionMatrix() expects the predictions as `data` and the true
# labels as `reference`; they were passed the other way round, which swaps
# the table labels and the per-class sensitivity / predictive values.
# Per the randomForest documentation, `predicted` holds out-of-bag predictions
# for the training rows, so this is an OOB estimate rather than a resubstitution fit.
confusionMatrix(
  data = myrf$predicted,
  reference = train$Species,
  dnn = c("Prediction", "Reference")
)
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   setosa versicolor virginica
##   setosa         40          0         0
##   versicolor      0         37         3
##   virginica       0          3        37
## 
## Overall Statistics
##                                           
##                Accuracy : 0.95            
##                  95% CI : (0.8943, 0.9814)
##     No Information Rate : 0.3333          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.925           
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            0.9250           0.9250
## Specificity                 1.0000            0.9625           0.9625
## Pos Pred Value              1.0000            0.9250           0.9250
## Neg Pred Value              1.0000            0.9625           0.9625
## Prevalence                  0.3333            0.3333           0.3333
## Detection Rate              0.3333            0.3083           0.3083
## Detection Prevalence        0.3333            0.3333           0.3333
## Balanced Accuracy           1.0000            0.9438           0.9438
print('Test Set Confusion Matrix')
## [1] "Test Set Confusion Matrix"
# Predictions go in `data`, true labels in `reference`; the original call had
# them reversed, so rows labelled "Prediction" were actually the true species.
# The outer parentheses print the result as well as storing it in `mycm`.
# NOTE(review): the `##` table recorded below predates this fix (it is the
# transpose of the corrected one); re-knit to refresh it.
(mycm <- confusionMatrix(
  data = myrf$test$predicted,
  reference = test$Species,
  dnn = c("Prediction", "Reference")
))
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   setosa versicolor virginica
##   setosa         10          0         0
##   versicolor      0         10         0
##   virginica       0          2         8
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9333          
##                  95% CI : (0.7793, 0.9918)
##     No Information Rate : 0.4             
##     P-Value [Acc > NIR] : 1.181e-09       
##                                           
##                   Kappa : 0.9             
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            0.8333           1.0000
## Specificity                 1.0000            1.0000           0.9091
## Pos Pred Value              1.0000            1.0000           0.8000
## Neg Pred Value              1.0000            0.9000           1.0000
## Prevalence                  0.3333            0.4000           0.2667
## Detection Rate              0.3333            0.3333           0.2667
## Detection Prevalence        0.3333            0.3333           0.3333
## Balanced Accuracy           1.0000            0.9167           0.9545

Plot Confusion Matrix

# Heat-map of the test-set confusion matrix.
# Predictions are passed as `data` and the true labels as `reference` (the
# original call had them reversed, which mislabelled both axes).
mycm <- confusionMatrix(
  data = myrf$test$predicted,
  reference = test$Species,
  dnn = c("Prediction", "Reference")
)
plt <- as.data.frame(mycm$table)
# Reverse the Prediction levels so the matching cells run from the top-left
# to the bottom-right of the plot.
plt$Prediction <- factor(plt$Prediction, levels = rev(levels(plt$Prediction)))
# Build the axis labels from the factor levels themselves; positional
# hard-coded labels would silently mislabel if the level order ever changed.
cap_first <- function(x) paste0(toupper(substring(x, 1, 1)), substring(x, 2))
ggplot(plt, aes(Reference, Prediction, fill = Freq)) +
  geom_tile() +
  geom_text(aes(label = Freq)) +
  scale_fill_gradient(low = "white", high = "#009194") +
  labs(x = "Reference", y = "Prediction") +
  scale_x_discrete(labels = cap_first) +
  scale_y_discrete(labels = cap_first)

# Importances

# Bar chart of the final forest's variable importances, rounded to two
# decimals and ordered from most to least important.
myimp <- as.data.frame(round(myrf$importance, 2))
myimp$Variable <- row.names(myimp)
# Two names are supplied, so this relies on `importance` having one column.
colnames(myimp) <- c('Importances', 'Variable')
myimp <- myimp[order(myimp$Importances, decreasing = TRUE), ]
p <- ggplot(
  myimp,
  aes(x = reorder(Variable, -Importances), y = Importances, fill = Variable)
) +
  geom_col() +
  geom_text(aes(label = Importances), vjust = -0.2)
p