require(car)
## Loading required package: car
## Warning: package 'car' was built under R version 4.3.2
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.3.2
require(caret)
## Loading required package: caret
## Warning: package 'caret' was built under R version 4.3.2
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.3.3
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 4.3.2
require(ggplot2)
require(randomForest)
## Loading required package: randomForest
## Warning: package 'randomForest' was built under R version 4.3.2
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
# Petal width against petal length. Species is mapped to color at the plot
# level, so both the points and the linear smoother inherit it.
ggplot(iris, aes(Petal.Length, Petal.Width, color = Species)) +
  geom_point() +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'
# Same axes as the previous plot, but color (Species) and point size
# (Sepal.Width) are mapped only inside geom_point(), so the linear smoother
# sees just x and y and is fitted over all rows together.
ggplot(iris, aes(Petal.Length, Petal.Width)) +
  geom_point(aes(color = Species, size = Sepal.Width)) +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'
# Hold-out split: draw 10 row indices, without replacement, from each of the
# three 50-row index ranges of iris (1-50, 51-100, 101-150). The 30 drawn
# rows form `test`; the remaining 120 rows form `train`.
set.seed(42)
p1 <- sample(1:50, 10)
p2 <- sample(51:100, 10)
p3 <- sample(101:150, 10)

test <- rbind(iris[p1, ], iris[p2, ], iris[p3, ])
train <- iris[-c(p1, p2, p3), ]
# Grid search over random-forest hyper-parameters. Each combination is fitted
# on `train` and scored by the accuracy derived from the confusion matrix
# that randomForest() stores on the fit (its out-of-bag estimate).
mtry <- 2:4
ntree <- c(50, 100, 500, 900)
# Must be logical. Character "F"/"T" are turned into a factor by
# expand.grid(), and both factor codes (1 and 2) are non-zero, so every fit
# would be treated as sampling with replacement.
replace <- c(FALSE, TRUE)
nodesize <- c(1, 2, 3, 4)
mygrid <- expand.grid(
  mtry = mtry,
  ntree = ntree,
  replace = replace,
  nodesize = nodesize
)

acc <- 0     # best accuracy seen so far
best <- NULL # grid row that produced `acc`; stays NULL if no fit beats 0
for (j in seq_len(nrow(mygrid))) {
  temprf <- randomForest(
    Species ~ .,
    data = train,
    mtry = mygrid$mtry[j],
    ntree = mygrid$ntree[j],
    replace = mygrid$replace[j],
    nodesize = mygrid$nodesize[j]
  )
  # `confusion` carries a trailing class.error column; diag() only reads the
  # [k, k] cells, i.e. the correctly classified counts per class.
  tempacc <- sum(diag(temprf$confusion)) / nrow(train)
  # Strict '>' keeps the first grid row that reaches a given accuracy.
  if (tempacc > acc) {
    acc <- tempacc
    best <- mygrid[j, ]
  }
}
print(best)
print(acc)
# NOTE: the output previously recorded here (grid row 4: mtry = 2,
# ntree = 100, replace = F, nodesize = 1; accuracy 0.9666667) came from a run
# where `replace` was a character/factor column. Re-run to refresh it.
# Fit the final forest on the training split. Supplying xtest/ytest makes
# randomForest() also score the held-out rows; those results are read from
# myrf$test further down.
# NOTE(review): ntree = 50 is hard-coded here, while the grid-search output
# recorded above reports ntree = 100 as the winner. Confirm which is intended.
myrf <- randomForest(
  Species ~ .,
  data = train,
  xtest = test[, 1:4],
  ytest = test[, 5],
  mtry = 2,
  ntree = 50,
  replace = FALSE,
  nodesize = 1
)
print('Training Set Confusion Matrix')
# caret::confusionMatrix() takes the predictions as `data` and the true
# classes as `reference`. The arguments were previously swapped, which
# exchanges sensitivity with positive predictive value (and specificity with
# negative predictive value) and reports the prevalence of the predictions
# instead of the truth.
confusionMatrix(
  data = myrf$predicted,
  reference = train$Species,
  dnn = c("Prediction", "Reference")
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 40 0 0
## versicolor 0 37 3
## virginica 0 3 37
##
## Overall Statistics
##
## Accuracy : 0.95
## 95% CI : (0.8943, 0.9814)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.925
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.9250 0.9250
## Specificity 1.0000 0.9625 0.9625
## Pos Pred Value 1.0000 0.9250 0.9250
## Neg Pred Value 1.0000 0.9625 0.9625
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3083 0.3083
## Detection Prevalence 0.3333 0.3333 0.3333
## Balanced Accuracy 1.0000 0.9438 0.9438
print('Test Set Confusion Matrix')
# Predictions go in `data`, true labels in `reference` (the arguments were
# previously swapped). The outer parentheses print the result as well as
# assigning it to `mycm`.
(mycm <- confusionMatrix(
  data = myrf$test$predicted,
  reference = test$Species,
  dnn = c("Prediction", "Reference")
))
# NOTE: the output recorded below was produced with the arguments swapped, so
# its table is transposed and its per-class statistics and prevalences
# describe the predictions, not the true classes (the real test split is
# 10/10/10). Re-run to refresh it.
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 10 0 0
## versicolor 0 10 0
## virginica 0 2 8
##
## Overall Statistics
##
## Accuracy : 0.9333
## 95% CI : (0.7793, 0.9918)
## No Information Rate : 0.4
## P-Value [Acc > NIR] : 1.181e-09
##
## Kappa : 0.9
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.8333 1.0000
## Specificity 1.0000 1.0000 0.9091
## Pos Pred Value 1.0000 1.0000 0.8000
## Neg Pred Value 1.0000 0.9000 1.0000
## Prevalence 0.3333 0.4000 0.2667
## Detection Rate 0.3333 0.3333 0.2667
## Detection Prevalence 0.3333 0.3333 0.3333
## Balanced Accuracy 1.0000 0.9167 0.9545
# Heat map of the test-set confusion matrix.
# Predictions are passed as `data` and true labels as `reference`. They were
# previously swapped, which exchanged the Prediction and Reference axes.
mycm <- confusionMatrix(
  data = myrf$test$predicted,
  reference = test$Species,
  dnn = c("Prediction", "Reference")
)
# Long format: one row per (Prediction, Reference) cell with its count in Freq.
plt <- as.data.frame(mycm$table)
# Reverse the Prediction levels so the first class is drawn at the top of the
# y axis and the diagonal runs from top-left to bottom-right.
plt$Prediction <- factor(plt$Prediction, levels = rev(levels(plt$Prediction)))
ggplot(plt, aes(Reference, Prediction, fill = Freq)) +
  geom_tile() +
  geom_text(aes(label = Freq)) +
  scale_fill_gradient(low = "white", high = "#009194") +
  labs(x = "Reference", y = "Prediction") +
  # Axis labels are hard-coded; the y labels follow the reversed level order.
  scale_x_discrete(labels = c("Setosa", "Versicolor", "Virginica")) +
  scale_y_discrete(labels = c("Virginica", "Versicolor", "Setosa"))
# Variable importances of the fitted forest, shown as a bar chart with the
# tallest bar first.
# Round the importance matrix to two decimals and carry the predictor names
# (its row names) along as an ordinary column.
myimp <- as.data.frame(round(myrf$importance, 2))
myimp$Variable <- rownames(myimp)
names(myimp) <- c("Importances", "Variable")
myimp <- myimp[order(myimp$Importances, decreasing = TRUE), ]

# reorder() on the negated importance orders the x axis from largest to
# smallest; vjust = -0.2 lifts each value label just above its bar.
p <- ggplot(
  myimp,
  aes(reorder(Variable, -Importances), Importances, fill = Variable)
) +
  geom_col() +
  geom_text(aes(label = Importances), vjust = -0.2)
p