This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.

Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Ctrl+Shift+Enter.

LOAD THE NECESSARY LIBRARIES

# For manipulating the datasets
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.0.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(readr)
library(readxl)

# For plotting correlation matrix
library(ggcorrplot)
## Warning: package 'ggcorrplot' was built under R version 4.0.3
## Loading required package: ggplot2
# Machine Learning library
library(caret)
## Warning: package 'caret' was built under R version 4.0.3
## Loading required package: lattice
# Multi-core support: record how many CPU cores this machine reports.
# numCores is not referenced anywhere else in the visible part of this notebook.
library(parallel)
numCores <- parallel::detectCores()

GET THE DATA

Load the datasets

# Numerical dataset (Excel workbook)
dataset_num <- read_excel(path = "rice.xlsx")

# Categorical dataset (CSV, read with base R so column names are made
# syntactic, e.g. VEIL.TYPE)
dataset_cat <- read.csv(file = "mushrooms.csv")

# Mixed numerical/categorical dataset (Excel workbook)
dataset_mix <- read_excel(path = "bank.xlsx")

CLEAN, PREPARE & MANIPULATE THE DATA

# Count how many rows fall under each value of VEIL.TYPE
dataset_cat %>%
  group_by(VEIL.TYPE) %>%
  summarise(total = n())
## `summarise()` ungrouping output (override with `.groups` argument)
# VEIL.TYPE only has one value, so it is removed
dataset_cat <- select(dataset_cat, -VEIL.TYPE)

# Count how many rows fall under each value of STALK.ROOT
dataset_cat %>%
  group_by(STALK.ROOT) %>%
  summarise(total = n())
## `summarise()` ungrouping output (override with `.groups` argument)
# STALK.ROOT has missing values, so it is removed
dataset_cat <- select(dataset_cat, -STALK.ROOT)

SELECT THE DATASET

# Choose which of the loaded datasets the rest of the notebook works on.
# Exactly one assignment is active; the alternatives stay commented out.
# NOTE(review): the later steps refer to a column named CLASS -- confirm that
# dataset_num / dataset_mix have such a column before switching to them.
#dataset <- dataset_num
dataset <- dataset_cat
#dataset <- dataset_mix

# Display the selected dataset
dataset

TRAIN THE MODEL

Split the data into training and test sets

# Fix the RNG seed so the random split (and any resampling drawn later from
# the same random stream) can be reproduced from run to run.
set.seed(123)

# Keep 80% of the rows for training and hold out the rest for testing.
# The partition is drawn on the outcome column CLASS; list = FALSE makes
# createDataPartition() return row indices usable directly for subsetting.
trainIndex <- createDataPartition(dataset$CLASS, p = 0.80, list = FALSE)
data_train <- dataset[trainIndex, ]
data_test <- dataset[-trainIndex, ]

Train model

# Resampling setup for train(): plain 5-fold cross-validation.
# `repeats` is not passed: with method = "cv" it has no meaning and only made
# caret emit a warning (it applies to repeated CV methods).
fitControl <- trainControl(
  method = "cv",
  number = 5,
  # summaryFunction = twoClassSummary,  # disabled; would need classProbs = TRUE
  verboseIter = TRUE,   # log every fold / tuning value while fitting
  classProbs = FALSE,   # class probabilities are not computed
  # NOTE(review): allowParallel only helps when a parallel backend has been
  # registered; none is registered in the visible part of this notebook.
  allowParallel = TRUE
)
# Model specification: predict CLASS from every other column of the data
train_formula <- CLASS ~ .

# Fit a random forest through caret, resampled as described by fitControl.
# No tuning grid is supplied, so caret picks the candidate mtry values itself.
# Options that are currently disabled:
#   tuneLength = 9, tuneGrid = svmGrid, preProcess = c("scale", "center"),
#   metric = "ROC", weights = model_weights
rfFitupsam <- train(
  train_formula,
  data = data_train,
  method = "rf",
  trControl = fitControl
)
## + Fold1: mtry= 2 
## - Fold1: mtry= 2 
## + Fold1: mtry=46 
## - Fold1: mtry=46 
## + Fold1: mtry=91 
## - Fold1: mtry=91 
## + Fold2: mtry= 2 
## - Fold2: mtry= 2 
## + Fold2: mtry=46 
## - Fold2: mtry=46 
## + Fold2: mtry=91 
## - Fold2: mtry=91 
## + Fold3: mtry= 2 
## - Fold3: mtry= 2 
## + Fold3: mtry=46 
## - Fold3: mtry=46 
## + Fold3: mtry=91 
## - Fold3: mtry=91 
## + Fold4: mtry= 2 
## - Fold4: mtry= 2 
## + Fold4: mtry=46 
## - Fold4: mtry=46 
## + Fold4: mtry=91 
## - Fold4: mtry=91 
## + Fold5: mtry= 2 
## - Fold5: mtry= 2 
## + Fold5: mtry=46 
## - Fold5: mtry=46 
## + Fold5: mtry=91 
## - Fold5: mtry=91 
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 46 on full training set
# Print the fitted model: resampling setup and accuracy/kappa per mtry tried
rfFitupsam
## Random Forest 
## 
## 6500 samples
##   20 predictor
##    2 classes: 'e', 'p' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 5200, 5200, 5200, 5201, 5199 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##    2    0.9480015  0.8954717
##   46    1.0000000  1.0000000
##   91    1.0000000  1.0000000
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 46.
# Variable importance of the fitted model; scale = FALSE keeps the raw
# importance scores instead of caret's rescaled ones. Plot the result.
importance <- rfFitupsam %>% varImp(scale = FALSE)
plot(importance)

TEST THE MODEL

# Predict on the held-out rows. With the default prediction type a caret
# `train` object returns class labels, so despite its name this vector holds
# predicted classes, not probabilities.
predsrfprobsamp <- predict(rfFitupsam, newdata = data_test)

# Compare predictions against the true labels. The reference factor is built
# on the same levels as the predictions: as.factor() alone would drop a class
# that happens to be absent from the test rows, and confusionMatrix() rejects
# predictions that have more levels than the reference.
confusionMatrix(
  predsrfprobsamp,
  factor(data_test$CLASS, levels = levels(predsrfprobsamp))
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   e   p
##          e 841   0
##          p   0 783
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9977, 1)
##     No Information Rate : 0.5179     
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
##                                      
##             Sensitivity : 1.0000     
##             Specificity : 1.0000     
##          Pos Pred Value : 1.0000     
##          Neg Pred Value : 1.0000     
##              Prevalence : 0.5179     
##          Detection Rate : 0.5179     
##    Detection Prevalence : 0.5179     
##       Balanced Accuracy : 1.0000     
##                                      
##        'Positive' Class : e          
##