library(randomForest)
## Warning: package 'randomForest' was built under R version 4.2.2
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
library(mlbench)
## Warning: package 'mlbench' was built under R version 4.2.1
library(RCurl)
## Warning: package 'RCurl' was built under R version 4.2.2
library(caret)
## Warning: package 'caret' was built under R version 4.2.2
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.2.2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
## Loading required package: lattice
library(rpart)
# Rebuild the LG data set. The statement that originally loaded LG is not part
# of this file, but the str() listing below prints all 10 observations of every
# column, so the data frame can be reconstructed exactly from it.
LG <- data.frame(
  Left  = c(1L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 1L, 0L),
  Right = c(45L, 0L, 92L, 18L, 26L, 48L, 41L, 52L, 64L, 80L),
  Up    = c(24L, 26L, 32L, 41L, 80L, 76L, 92L, 39L, 46L, 50L),
  Down  = c(100L, 69L, 46L, 24L, 0L, 32L, 86L, 71L, 65L, 48L)
)
str(LG)
## 'data.frame': 10 obs. of 4 variables:
## $ Left : int 1 0 1 0 0 0 0 1 1 0
## $ Right: int 45 0 92 18 26 48 41 52 64 80
## $ Up : int 24 26 32 41 80 76 92 39 46 50
## $ Down : int 100 69 46 24 0 32 86 71 65 48
# The response must be a factor for randomForest() to fit a classification
# forest (the printed model below reports "classification" with classes 0/1);
# an integer response would be treated as a regression target.
LG$Left <- factor(LG$Left)
#Create a Random Forest Model
# Seed the RNG so the bootstrap samples and per-split variable draws repeat.
set.seed(18)
# Default forest: Left is modelled from all remaining columns (Right, Up, Down).
rf <- randomForest(Left ~ ., data = LG)
rf
##
## Call:
## randomForest(formula = Left ~ ., data = LG)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 1
##
## OOB estimate of error rate: 70%
## Confusion matrix:
## 0 1 class.error
## 0 3 3 0.5
## 1 4 0 1.0
# Second forest on LG: 500 trees, two candidate predictors sampled per split.
rf_2 <- randomForest(
  Left ~ .,
  data = LG,
  mtry = 2,
  ntree = 500
)
rf_2
##
## Call:
## randomForest(formula = Left ~ ., data = LG, mtry = 2, ntree = 500)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 50%
## Confusion matrix:
## 0 1 class.error
## 0 4 2 0.3333333
## 1 3 1 0.7500000
# Third forest on LG: 500 trees, all three predictors offered at every split.
rf_3 <- randomForest(
  Left ~ .,
  data = LG,
  mtry = 3,
  ntree = 500
)
rf_3
##
## Call:
## randomForest(formula = Left ~ ., data = LG, mtry = 3, ntree = 500)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 60%
## Confusion matrix:
## 0 1 class.error
## 0 3 3 0.50
## 1 3 1 0.75
# Estimate the accuracy of a random forest with mtry fixed at 3 using
# 3-fold cross-validation through caret.
control <- trainControl(method = "cv", number = 3)
grid_rf <- expand.grid(mtry = 3)
cv_rf1 <- train(
  Left ~ .,
  data = LG,
  method = "rf",
  importance = TRUE,
  trControl = control,
  tuneGrid = grid_rf
)
cv_rf1
## Random Forest
##
## 10 samples
## 3 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (3 fold)
## Summary of sample sizes: 7, 7, 6
## Resampling results:
##
## Accuracy Kappa
## 0.6944444 0.5
##
## Tuning parameter 'mtry' was held constant at a value of 3
# Same 3-fold cross-validation (reusing `control`), now with mtry fixed at 2
# and accuracy named explicitly as the selection metric.
grid_rf <- expand.grid(mtry = 2)
cv_rf2 <- train(
  Left ~ .,
  data = LG,
  method = "rf",
  importance = TRUE,
  trControl = control,
  metric = "Accuracy",
  tuneGrid = grid_rf
)
cv_rf2
## Random Forest
##
## 10 samples
## 3 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (3 fold)
## Summary of sample sizes: 7, 7, 6
## Resampling results:
##
## Accuracy Kappa
## 0.4722222 -3.700743e-17
##
## Tuning parameter 'mtry' was held constant at a value of 2
# Tune mtry on LG with 6-fold cross-validation repeated 5 times.
# The control object gets its own name instead of `trainControl`, which would
# shadow caret's trainControl() function. The seed is fixed immediately before
# train() so the fold assignment (and therefore the reported accuracy) repeats.
set.seed(18)
ctrl_repeated <- trainControl(method = "repeatedcv", number = 6, repeats = 5)
cv_rf3 <- train(
  Left ~ .,
  data = LG,
  method = "rf",
  metric = "Accuracy",
  trControl = ctrl_repeated,
  ntree = 500
)
## note: only 2 unique complexity parameters in default grid. Truncating the grid to 2 .
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
# NOTE(review): the warning above presumably comes from the very small hold-out
# folds (10 rows split 6 ways), where a metric cannot always be computed.
cv_rf3
## Random Forest
##
## 10 samples
## 3 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (6 fold, repeated 5 times)
## Summary of sample sizes: 8, 8, 8, 8, 9, 9, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.5833333 0.04347826
## 3 0.6666667 0.09523810
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 3.
Several values for the number of folds and the number of repetitions were tried; the best accuracy was obtained with 4 folds and 10 repetitions. (Note that the run shown above uses 6 folds and 5 repetitions.)
#Making predictions
# Score the same rows the model was trained on, then cross-tabulate the
# predicted class (rows) against the observed Left value (columns).
pred <- predict(cv_rf3, newdata = LG)
table(pred, LG[["Left"]])
##
## pred 0 1
## 0 6 0
## 1 0 4
Of course, since the same dataset was used for both training and testing, all predictions are correct. This is resubstitution accuracy and says nothing about how the model performs on unseen data.
library(caret)
# Load the Pima Indians Diabetes data set and keep it under a shorter name.
data("PimaIndiansDiabetes")
PIDD <- PimaIndiansDiabetes
# Optional export of the data, left disabled:
# write.csv(PIDD, "C:/Users/Ghemri/Documents/AI4OPT/pima.csv", row.names=FALSE)
# Inspect column types; `diabetes` is the two-level response (neg/pos).
str(PIDD)
## 'data.frame': 768 obs. of 9 variables:
## $ pregnant: num 6 1 8 1 0 5 3 10 2 8 ...
## $ glucose : num 148 85 183 89 137 116 78 115 197 125 ...
## $ pressure: num 72 66 64 66 40 74 50 0 70 96 ...
## $ triceps : num 35 29 0 23 35 0 32 0 45 0 ...
## $ insulin : num 0 0 0 94 168 0 88 0 543 0 ...
## $ mass : num 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
## $ pedigree: num 0.627 0.351 0.672 0.167 2.288 ...
## $ age : num 50 31 32 21 33 30 26 29 53 54 ...
## $ diabetes: Factor w/ 2 levels "neg","pos": 2 1 2 1 2 1 2 1 2 2 ...
# Define an 80%/20% train/test split of the dataset.
# The seed is set BEFORE createDataPartition(): the partition is drawn at
# random, so without a seed here dataTrain/dataTest (and every metric computed
# from them) differ from run to run.
set.seed(18)
trainIndex <- createDataPartition(PIDD$diabetes, p = 0.80, list = FALSE)
dataTrain <- PIDD[trainIndex, ]
dataTest <- PIDD[-trainIndex, ]
## Creating Forest Tree
# Re-seed so the forest itself is reproducible independently of the split.
set.seed(18)
# Baseline forest with randomForest() defaults, diabetes ~ all other columns.
modelRF <- randomForest(diabetes ~ ., data = dataTrain)
print(modelRF)
##
## Call:
## randomForest(formula = diabetes ~ ., data = dataTrain)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 21.79%
## Confusion matrix:
## neg pos class.error
## neg 348 52 0.1300000
## pos 82 133 0.3813953
# Forest on the diabetes training set: 500 trees, four candidate predictors
# per split. Note that this reuses (overwrites) the name rf_2.
rf_2 <- randomForest(
  diabetes ~ .,
  data = dataTrain,
  mtry = 4,
  ntree = 500
)
rf_2
##
## Call:
## randomForest(formula = diabetes ~ ., data = dataTrain, mtry = 4, ntree = 500)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 4
##
## OOB estimate of error rate: 21.63%
## Confusion matrix:
## neg pos class.error
## neg 347 53 0.132500
## pos 80 135 0.372093
# Forest on the diabetes training set: 500 trees, three candidate predictors
# per split. Note that this reuses (overwrites) the name rf_3.
rf_3 <- randomForest(
  diabetes ~ .,
  data = dataTrain,
  mtry = 3,
  ntree = 500
)
rf_3
##
## Call:
## randomForest(formula = diabetes ~ ., data = dataTrain, mtry = 3, ntree = 500)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 22.11%
## Confusion matrix:
## neg pos class.error
## neg 348 52 0.1300000
## pos 84 131 0.3906977
# Larger forest on the diabetes training set: 1000 trees, five candidate
# predictors per split.
rf_4 <- randomForest(
  diabetes ~ .,
  data = dataTrain,
  mtry = 5,
  ntree = 1000
)
rf_4
##
## Call:
## randomForest(formula = diabetes ~ ., data = dataTrain, mtry = 5, ntree = 1000)
## Type of random forest: classification
## Number of trees: 1000
## No. of variables tried at each split: 5
##
## OOB estimate of error rate: 22.76%
## Confusion matrix:
## neg pos class.error
## neg 344 56 0.1400000
## pos 84 131 0.3906977
With mtry=4 and ntree=500, the OOB error rate was 21.63%, i.e. an accuracy of 78.37%. With mtry=3 and ntree=500, the error rate was 22.11%, i.e. an accuracy of 77.89%. I then increased the number of trees to 1000 and mtry to 5, but the accuracy decreased (error rate 22.76%).
#Making predictions
# Classify the held-out rows with the mtry = 4 forest.
pred <- predict(rf_2, dataTest)
# confusionMatrix() expects the predictions as `data` and the true labels as
# `reference`. Passing them the other way round swaps the roles, so that
# sensitivity/specificity and the predictive values are reported for the wrong
# quantity. The arguments are named here to make the roles explicit.
# The result is stored as `cm_rf2` rather than `table`, to avoid masking
# base::table().
# NOTE(review): the positive class defaults to the first factor level ("neg");
# pass positive = "pos" if diabetes should be treated as the positive class.
cm_rf2 <- confusionMatrix(data = pred, reference = dataTest$diabetes)
print(cm_rf2)
## Confusion Matrix and Statistics
##
## Reference
## Prediction neg pos
## neg 83 17
## pos 23 30
##
## Accuracy : 0.7386
## 95% CI : (0.6615, 0.8062)
## No Information Rate : 0.6928
## P-Value [Acc > NIR] : 0.1265
##
## Kappa : 0.4069
##
## Mcnemar's Test P-Value : 0.4292
##
## Sensitivity : 0.7830
## Specificity : 0.6383
## Pos Pred Value : 0.8300
## Neg Pred Value : 0.5660
## Prevalence : 0.6928
## Detection Rate : 0.5425
## Detection Prevalence : 0.6536
## Balanced Accuracy : 0.7107
##
## 'Positive' Class : neg
##
#Note: The accuracy of the model on the test dataset is 73.9%
#Using another model
# Tune mtry over {3, 4, 5} with 10-fold cross-validation repeated 3 times.
# The control object is named `ctrl_cv10x3` so it does not shadow caret's
# trainControl() function, and the seed is fixed right before train() so the
# resampling folds (and the selected mtry) are reproducible.
set.seed(18)
ctrl_cv10x3 <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
grid <- expand.grid(mtry = c(3, 4, 5))
tmodel <- train(
  diabetes ~ .,
  data = dataTrain,
  method = "rf",
  tuneGrid = grid,
  trControl = ctrl_cv10x3
)
print(tmodel)
## Random Forest
##
## 615 samples
## 8 predictor
## 2 classes: 'neg', 'pos'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 554, 554, 554, 554, 553, 553, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 3 0.7826547 0.5040030
## 4 0.7864974 0.5159318
## 5 0.7827604 0.5051838
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 4.
# Tune mtry over {3, 4, 5, 6} with 10-fold cross-validation repeated 5 times.
# The control object is named `ctrl_cv10x5` so it does not shadow caret's
# trainControl() function, and the seed is fixed right before train() so the
# resampling folds (and the selected mtry) are reproducible.
set.seed(18)
ctrl_cv10x5 <- trainControl(method = "repeatedcv", number = 10, repeats = 5)
grid <- expand.grid(mtry = c(3, 4, 5, 6))
tmodel_2 <- train(
  diabetes ~ .,
  data = dataTrain,
  method = "rf",
  tuneGrid = grid,
  trControl = ctrl_cv10x5
)
tmodel_2
## Random Forest
##
## 615 samples
## 8 predictor
## 2 classes: 'neg', 'pos'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 554, 554, 553, 553, 553, 554, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 3 0.7818033 0.5016447
## 4 0.7791909 0.4966946
## 5 0.7788683 0.4971861
## 6 0.7746378 0.4877061
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 3.
# Plot the tuning results stored in tmodel_2 (uses caret's plot method for
# objects returned by train()).
plot(tmodel_2)
#Note: In the run shown above this model reaches a cross-validated accuracy of about 78.2% for mtry=3 (an earlier run gave 76.2%); the value keeps changing from run to run.
# Repeat the previous tuning run (same grid, same 10-fold x 5 resampling) to
# see how much the result depends on the random fold assignment.
# A different, explicit seed is used on purpose: the run is reproducible, yet
# it draws different folds than a run seeded with 18 would.
# The control object is named `ctrl_cv10x5_rerun` so it does not shadow
# caret's trainControl() function.
set.seed(19)
ctrl_cv10x5_rerun <- trainControl(method = "repeatedcv", number = 10, repeats = 5)
grid <- expand.grid(mtry = c(3, 4, 5, 6))
tmodel_22 <- train(
  diabetes ~ .,
  data = dataTrain,
  method = "rf",
  tuneGrid = grid,
  trControl = ctrl_cv10x5_rerun
)
tmodel_22
## Random Forest
##
## 615 samples
## 8 predictor
## 2 classes: 'neg', 'pos'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 554, 553, 553, 553, 554, 554, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 3 0.7752406 0.4890137
## 4 0.7791856 0.4992016
## 5 0.7729667 0.4862320
## 6 0.7752353 0.4910127
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 4.
# Plot the tuning results of the repeated run.
plot(tmodel_22)
# Classify the held-out rows with the caret-tuned model.
# NOTE(review): predictions come from tmodel_2 although tmodel_22 was just
# fitted and plotted; confirm that tmodel_2 is the intended model.
pred2 <- predict(tmodel_2, dataTest)
# confusionMatrix() expects the predictions as `data` and the true labels as
# `reference`; passing them the other way round swaps the roles, so the
# class-wise statistics describe the wrong quantity. The result is stored as
# `cm_tmodel_2` rather than `table`, to avoid masking base::table().
cm_tmodel_2 <- confusionMatrix(data = pred2, reference = dataTest$diabetes)
print(cm_tmodel_2)
## Confusion Matrix and Statistics
##
## Reference
## Prediction neg pos
## neg 83 17
## pos 22 31
##
## Accuracy : 0.7451
## 95% CI : (0.6684, 0.812)
## No Information Rate : 0.6863
## P-Value [Acc > NIR] : 0.06736
##
## Kappa : 0.4243
##
## Mcnemar's Test P-Value : 0.52184
##
## Sensitivity : 0.7905
## Specificity : 0.6458
## Pos Pred Value : 0.8300
## Neg Pred Value : 0.5849
## Prevalence : 0.6863
## Detection Rate : 0.5425
## Detection Prevalence : 0.6536
## Balanced Accuracy : 0.7182
##
## 'Positive' Class : neg
##
The model performance changes from run to run unless the random seed is fixed before the data split and before each resampling call. The prediction accuracy is in the mid-70% range, and Kappa has ranged from about 0.40 to 0.50.