Good?, Bad?, Indifferent?

An exploration of CART, Naive Bayes, Random Forest, and Linear Discriminant Analysis models, with cross-validation

This project evaluates four models to determine which best predicts whether a wild mushroom is edible or poisonous. We explore Classification and Regression Trees (CART), the Naive Bayes classifier, Random Forests, and Linear Discriminant Analysis.

## Load the libraries we will be using
library(gapminder)
library(here)
library(socviz)
library(tidyverse)
library(data.table)
library(caret)
library(rpart)
library(corrplot)
library(klaR)

Load the dataset and perform EDA

Load the mushroom dataset.

Data downloaded from https://archive.ics.uci.edu/ml/datasets/Mushroom and saved as the .data file “agaricus-lepiota.data”.

Name the dataset “MushroomData” and look at the structure of the dataset. Note that fread() treats the first record as a header row here, which is why the columns get single-letter names and the table holds 8,123 of the dataset’s 8,124 observations; passing header = FALSE would keep every row.

MushroomURL <- 'https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data'
MushroomData <- fread(MushroomURL)
str(MushroomData)
## Classes 'data.table' and 'data.frame':   8123 obs. of  23 variables:
##  $ p: chr  "e" "e" "p" "e" ...
##  $ x: chr  "x" "b" "x" "x" ...
##  $ s: chr  "s" "s" "y" "s" ...
##  $ n: chr  "y" "w" "w" "g" ...
##  $ t: chr  "t" "t" "t" "f" ...
##  $ p: chr  "a" "l" "p" "n" ...
##  $ f: chr  "f" "f" "f" "f" ...
##  $ c: chr  "c" "c" "c" "w" ...
##  $ n: chr  "b" "b" "n" "b" ...
##  $ k: chr  "k" "n" "n" "k" ...
##  $ e: chr  "e" "e" "e" "t" ...
##  $ e: chr  "c" "c" "e" "e" ...
##  $ s: chr  "s" "s" "s" "s" ...
##  $ s: chr  "s" "s" "s" "s" ...
##  $ w: chr  "w" "w" "w" "w" ...
##  $ w: chr  "w" "w" "w" "w" ...
##  $ p: chr  "p" "p" "p" "p" ...
##  $ w: chr  "w" "w" "w" "w" ...
##  $ o: chr  "o" "o" "o" "o" ...
##  $ p: chr  "p" "p" "p" "e" ...
##  $ k: chr  "n" "n" "k" "n" ...
##  $ s: chr  "n" "n" "s" "a" ...
##  $ u: chr  "g" "m" "u" "g" ...
##  - attr(*, ".internal.selfref")=<externalptr>

Clean the data for modeling.

All of the variables are categorical, with varying numbers of levels, and the values within each attribute are single-letter codes. So, for our first step, let’s give the attributes their proper names.
library(tidyverse)

colnames(MushroomData) <- c("Edibility", "CapShape", "CapSurface",
                        "CapColor", "Bruises", "Odor",
                        "GillAttachment", "GillSpacing", "GillSize",
                        "GillColor", "StalkShape", "StalkRoot",
                        "StalkSurfaceAboveRing", "StalkSurfaceBelowRing", "StalkColorAboveRing",
                        "StalkColorBelowRing", "VeilType", "VeilColor",
                        "RingNumber", "RingType", "SporePrintColor",
                        "Population", "Habitat")

MushroomData <- MushroomData %>% map_df(as.factor)  # also converts the data.table to a tibble

levels(MushroomData$Edibility) <- c("edible", "poisonous")
levels(MushroomData$CapShape) <- c("bell", "conical", "flat", "knobbed", "sunken", "convex")
levels(MushroomData$CapColor) <- c("buff", "cinnamon", "red", "gray", "brown", "pink",
                                "green", "purple", "white", "yellow")
levels(MushroomData$CapSurface) <- c("fibrous", "grooves", "scaly", "smooth")
levels(MushroomData$Bruises) <- c("no", "yes")
levels(MushroomData$Odor) <- c("almond", "creosote", "foul", "anise", "musty", "none", "pungent", "spicy", "fishy")
levels(MushroomData$GillAttachment) <- c("attached", "free")
levels(MushroomData$GillSpacing) <- c("close", "crowded")
levels(MushroomData$GillSize) <- c("broad", "narrow")
levels(MushroomData$GillColor) <- c("buff", "red", "gray", "chocolate", "black", "brown", "orange",
                                 "pink", "green", "purple", "white", "yellow")
levels(MushroomData$StalkShape) <- c("enlarging", "tapering")
levels(MushroomData$StalkRoot) <- c("missing", "bulbous", "club", "equal", "rooted")
levels(MushroomData$StalkSurfaceAboveRing) <- c("fibrous", "silky", "smooth", "scaly")
levels(MushroomData$StalkSurfaceBelowRing) <- c("fibrous", "silky", "smooth", "scaly")
levels(MushroomData$StalkColorAboveRing) <- c("buff", "cinnamon", "red", "gray", "brown", "pink",
                                "green", "purple", "white", "yellow")
levels(MushroomData$StalkColorBelowRing) <- c("buff", "cinnamon", "red", "gray", "brown", "pink",
                                "green", "purple", "white", "yellow")
levels(MushroomData$VeilType) <- "partial"
levels(MushroomData$VeilColor) <- c("brown", "orange", "white", "yellow")
levels(MushroomData$RingNumber) <- c("none", "one", "two")
levels(MushroomData$RingType) <- c("evanescent", "flaring", "large", "none", "pendant")
levels(MushroomData$SporePrintColor) <- c("buff", "chocolate", "black", "brown", "orange",
                                        "green", "purple", "white", "yellow")
levels(MushroomData$Population) <- c("abundant", "clustered", "numerous", "scattered", "several", "solitary")
levels(MushroomData$Habitat) <- c("wood", "grasses", "leaves", "meadows", "paths", "urban", "waste")
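A note of caution: the relabeling above depends on the factor levels sorting alphabetically by the original one-letter codes. A safer pattern, sketched below for a single column (an illustrative addition, not part of the original pipeline; the object names raw and bruises are ours), is an explicit code-to-name mapping with forcats::fct_recode():

library(forcats)  # attached with the tidyverse

# Re-read the raw single-letter codes and recode one column explicitly,
# so the labels cannot be scrambled by level ordering.
raw <- fread(MushroomURL)
bruises <- fct_recode(factor(raw$t), no = "f", yes = "t")
table(bruises)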

Let’s look at the relabeled data using str().

str(MushroomData)
## tibble [8,123 x 23] (S3: tbl_df/tbl/data.frame)
##  $ Edibility            : Factor w/ 2 levels "edible","poisonous": 1 1 2 1 1 1 1 2 1 1 ...
##  $ CapShape             : Factor w/ 6 levels "bell","conical",..: 6 1 6 6 6 1 1 6 1 6 ...
##  $ CapSurface           : Factor w/ 4 levels "fibrous","grooves",..: 3 3 4 3 4 3 4 4 3 4 ...
##  $ CapColor             : Factor w/ 10 levels "buff","cinnamon",..: 10 9 9 4 10 9 9 9 10 10 ...
##  $ Bruises              : Factor w/ 2 levels "no","yes": 2 2 2 1 2 2 2 2 2 2 ...
##  $ Odor                 : Factor w/ 9 levels "almond","creosote",..: 1 4 7 6 1 1 4 7 1 4 ...
##  $ GillAttachment       : Factor w/ 2 levels "attached","free": 2 2 2 2 2 2 2 2 2 2 ...
##  $ GillSpacing          : Factor w/ 2 levels "close","crowded": 1 1 1 2 1 1 1 1 1 1 ...
##  $ GillSize             : Factor w/ 2 levels "broad","narrow": 1 1 2 1 1 1 1 2 1 1 ...
##  $ GillColor            : Factor w/ 12 levels "buff","red","gray",..: 5 6 6 5 6 3 6 8 3 3 ...
##  $ StalkShape           : Factor w/ 2 levels "enlarging","tapering": 1 1 1 2 1 1 1 1 1 1 ...
##  $ StalkRoot            : Factor w/ 5 levels "missing","bulbous",..: 3 3 4 4 3 3 3 4 3 3 ...
##  $ StalkSurfaceAboveRing: Factor w/ 4 levels "fibrous","silky",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ StalkSurfaceBelowRing: Factor w/ 4 levels "fibrous","silky",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ StalkColorAboveRing  : Factor w/ 10 levels "buff","cinnamon",..: 8 8 8 8 8 8 8 8 8 8 ...
##  $ StalkColorBelowRing  : Factor w/ 10 levels "buff","cinnamon",..: 8 8 8 8 8 8 8 8 8 8 ...
##  $ VeilType             : Factor w/ 1 level "partial": 1 1 1 1 1 1 1 1 1 1 ...
##  $ VeilColor            : Factor w/ 4 levels "brown","orange",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ RingNumber           : Factor w/ 3 levels "none","one","two": 2 2 2 2 2 2 2 2 2 2 ...
##  $ RingType             : Factor w/ 5 levels "evanescent","flaring",..: 5 5 5 1 5 5 5 5 5 5 ...
##  $ SporePrintColor      : Factor w/ 9 levels "buff","chocolate",..: 4 4 3 4 3 3 4 3 3 4 ...
##  $ Population           : Factor w/ 6 levels "abundant","clustered",..: 3 3 4 1 3 3 4 5 4 3 ...
##  $ Habitat              : Factor w/ 7 levels "wood","grasses",..: 2 4 6 2 2 4 4 2 4 2 ...
Notice that VeilType has only one level, “partial”, so we can remove this column from our data.
MushroomData <- subset(MushroomData, 
                       select = -c(VeilType))
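As a quick supplementary check (an addition to the original write-up), we can confirm that no other factor is constant:

# Count the levels of each remaining factor; every column should now have two or more.
sapply(MushroomData, nlevels)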

Use summary() to take a different look at the composition of our data.

summary(MushroomData)
##      Edibility       CapShape      CapSurface      CapColor    Bruises   
##  edible   :4208   bell   : 452   fibrous:2320   brown  :2283   no :4748  
##  poisonous:3915   conical:   4   grooves:   4   gray   :1840   yes:3375  
##                   flat   :3152   scaly  :2555   red    :1500             
##                   knobbed: 828   smooth :3244   yellow :1072             
##                   sunken :  32                  white  :1040             
##                   convex :3655                  buff   : 168             
##                                                 (Other): 220             
##       Odor       GillAttachment  GillSpacing     GillSize        GillColor   
##  none   :3528   attached: 210   close  :6811   broad :5612   buff     :1728  
##  foul   :2160   free    :7913   crowded:1312   narrow:2511   pink     :1492  
##  spicy  : 576                                                white    :1202  
##  fishy  : 576                                                brown    :1048  
##  almond : 400                                                gray     : 752  
##  anise  : 400                                                chocolate: 732  
##  (Other): 483                                                (Other)  :1169  
##      StalkShape     StalkRoot    StalkSurfaceAboveRing StalkSurfaceBelowRing
##  enlarging:3515   missing:2480   fibrous: 552          fibrous: 600         
##  tapering :4608   bulbous:3776   silky  :2372          silky  :2304         
##                   club   : 556   smooth :5175          smooth :4935         
##                   equal  :1119   scaly  :  24          scaly  : 284         
##                   rooted : 192                                              
##                                                                             
##                                                                             
##  StalkColorAboveRing StalkColorBelowRing  VeilColor    RingNumber 
##  purple :4463        purple :4383        brown :  96   none:  36  
##  green  :1872        green  :1872        orange:  96   one :7487  
##  gray   : 576        gray   : 576        white :7923   two : 600  
##  brown  : 448        brown  : 512        yellow:   8              
##  buff   : 432        buff   : 432                                 
##  pink   : 192        pink   : 192                                 
##  (Other): 140        (Other): 156                                 
##        RingType     SporePrintColor     Population      Habitat    
##  evanescent:2776   white    :2388   abundant : 384   wood   :3148  
##  flaring   :  48   brown    :1968   clustered: 340   grasses:2148  
##  large     :1296   black    :1871   numerous : 400   leaves : 832  
##  none      :  36   chocolate:1632   scattered:1247   meadows: 292  
##  pendant   :3967   green    :  72   several  :4040   paths  :1144  
##                    buff     :  48   solitary :1712   urban  : 367  
##                    (Other)  : 144                    waste  : 192

Check for missing data using missmap(), and count NAs in each column with sapply().

library(Amelia)
## Loading required package: Rcpp
## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.6, built: 2019-11-24)
## ## Copyright (C) 2005-2021 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
missmap(MushroomData)
## Warning: Unknown or uninitialised column: `arguments`.

## Warning: Unknown or uninitialised column: `arguments`.
## Warning: Unknown or uninitialised column: `imputations`.

sapply(MushroomData, function(x) sum(is.na(x)))

Every column returns zero: the raw file encodes unknown stalk roots as “?”, which fread() read as an ordinary string (it became the “missing” level of StalkRoot), so there are no NAs in the table.

Visual Examination of the Dataset.

Mushroom Species Frequency by Cap Colors.

list(unique(MushroomData$CapColor))
## [[1]]
##  [1] yellow   white    gray     brown    red      pink     buff     purple  
##  [9] cinnamon green   
## Levels: buff cinnamon red gray brown pink green purple white yellow

Plot of cap color by frequency.

color <- table(MushroomData$CapColor)
color1 <- as.data.frame(color)    # columns: Var1 (cap color), Freq (count)
color1 <- color1 %>% arrange(Freq)

data <- data.frame(id = 1:10,
                   individual = color1[, 1],
                   value = color1[, 2])

ggplot(data, aes(x=as.factor(individual), y=value))+
    geom_bar(stat="identity", aes(fill=individual))+
    scale_fill_manual("legend", values = c( "orange", "pink", 
    "red", "gray", "brown","pink", "green", "purple", 
    "white","yellow"))+
  ggtitle("Mushroom Species Frequency by Cap Colors")+
  labs(y="Numbers", x = "Cap Color")

Mushroom Species Frequency by Cap Shape

list(unique(MushroomData$CapShape))
## [[1]]
## [1] convex  bell    sunken  flat    knobbed conical
## Levels: bell conical flat knobbed sunken convex

Plot of cap shape by frequency.

shape <- table(MushroomData$CapShape)
shape1 <- as.data.frame(shape)    # columns: Var1 (cap shape), Freq (count)
shape1 <- shape1 %>% arrange(Freq)

data <- data.frame(id = 1:6,
                   individual = shape1[, 1],
                   value = shape1[, 2])

ggplot(data, aes(x=as.factor(individual), y=value))+
    geom_bar(stat="identity", aes(fill=individual))+
    scale_fill_manual("legend", values = c( "brown","pink", "green", "purple", "black","yellow"))+
  ggtitle("Mushroom Species Frequency by Cap Shape")+
  labs(y="Numbers", x = "Cap Shape")

Edibility of Mushrooms by Odor.

list(unique(MushroomData$Odor))
## [[1]]
## [1] almond   anise    pungent  none     foul     creosote fishy    spicy   
## [9] musty   
## Levels: almond creosote foul anise musty none pungent spicy fishy

Plot of edibility counts by odor.

odor_plot <-table(MushroomData[c(1,6)])
barplot(odor_plot,legend.text=TRUE, beside=TRUE, col=c("green","red"), xlab = "Odor",ylab = "Number",
        main="Bar Chart Showing the Edibility of Mushrooms by Odor", cex.names=.75)

Proportion of Edible Mushrooms

edibility_plot <- table(MushroomData$Edibility)
pie(edibility_plot, col=c("green","red"), 
    main="Pie Chart Showing the Edibility of Mushrooms")

Split the dataset into Test and Train sets

# set seed for reproducibility
set.seed(123)  
# 1. Get row numbers for the training data
InTraining <- createDataPartition(MushroomData$Edibility, p = 0.7, list = F)
# 2. Create the training dataset 
training <- MushroomData[InTraining, ]
# 3. Create the testing dataset
testing <- MushroomData[-InTraining, ]
# 4. Create a label vector and the set of predictors for use later.
Label <- training$Edibility
Predictors <- training[, -1]   # every column except the Edibility outcome
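Because createDataPartition() stratifies on the outcome, the training and testing sets should have nearly identical class balance. A quick sanity check (an added step, not in the original analysis):

# Each split should be roughly 52% edible / 48% poisonous
prop.table(table(training$Edibility))
prop.table(table(testing$Edibility))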

Modeling with the Data:

Model Number One: Classification and Regression Tree Model (CART) using rpart() function

In a classification setting, like we have with our mushroom data, the idea is to grow the tree by recursive binary splitting of the predictor space, choosing each split to minimize a node-impurity measure, commonly the Gini index. The Gini index is defined by

\(G = \sum_{k=1}^K \hat{p}_{mk}(1-\hat{p}_{mk})\)

which is a measure of total variance across the \(K\) classes.

An alternative measure is entropy, given by \(D = -\sum_{k=1}^K \hat{p}_{mk}\log \hat{p}_{mk}\).

In both cases, \(\hat{p}_{mk}\) is the proportion of training observations in the \(m\)th region that are from the \(k\)th class.
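To make the two measures concrete, here is a minimal sketch (the gini and entropy helpers are our own, not from any package) evaluated at a nearly pure node, similar to the Odor = almond, anise, none node in the tree printed later:

# Impurity measures for a vector of class proportions p (summing to 1)
gini    <- function(p) sum(p * (1 - p))
entropy <- function(p) -sum(p * log(p))

p <- c(0.973, 0.027)   # roughly the edible/poisonous mix in that node
gini(p)      # ~0.053 -- close to zero, i.e. a nearly pure node
entropy(p)   # ~0.124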

library(rpart)
library(rpart.plot)

rpart.grid <- expand.grid(.cp = 0)
trControl <- trainControl(method = "repeatedcv",
                          number=10, 
                          repeats=5, 
                          verboseIter=F)

rpart.model <- train(Edibility ~.,
                     data = training,
                     method = "rpart",
                     trControl = trControl,
                     tuneGrid = rpart.grid,
                     metric = "Accuracy")
rpart.model
## CART 
## 
## 5687 samples
##   21 predictor
##    2 classes: 'edible', 'poisonous' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 5118, 5118, 5118, 5118, 5118, 5119, ... 
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.9982069  0.9964083
## 
## Tuning parameter 'cp' was held constant at a value of 0

Examine the confusion matrix. Because no newdata is supplied to predict(), it is computed on the training set.

caret::confusionMatrix(data=predict(rpart.model, 
                                    type = "raw"), 
                       reference = Label, 
                       positive="edible")
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  edible poisonous
##   edible      2946         0
##   poisonous      0      2741
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9994, 1)
##     No Information Rate : 0.518      
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
##                                      
##             Sensitivity : 1.000      
##             Specificity : 1.000      
##          Pos Pred Value : 1.000      
##          Neg Pred Value : 1.000      
##              Prevalence : 0.518      
##          Detection Rate : 0.518      
##    Detection Prevalence : 0.518      
##       Balanced Accuracy : 1.000      
##                                      
##        'Positive' Class : edible     
## 

Display the Model Split Details.

# Refit with rpart() directly (overwriting the caret object) so we can
# inspect the split details, variable importance, and cp table.
rpart.model <- rpart(Edibility ~ ., 
                    data = training, 
                    method = "class", 
                    cp = 0.00001)
rpart.model
## n= 5687 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 5687 2741 edible (0.518023563 0.481976437)  
##    2) Odor=almond,anise,none 3027   81 edible (0.973240833 0.026759167)  
##      4) SporePrintColor=buff,chocolate,black,brown,orange,purple,white,yellow 2979   33 edible (0.988922457 0.011077543)  
##        8) StalkColorBelowRing=red,gray,brown,pink,green,purple 2964   18 edible (0.993927126 0.006072874)  
##         16) StalkColorBelowRing=red,gray,pink,green,purple 2907    7 edible (0.997592019 0.002407981)  
##           32) Habitat=wood,grasses,meadows,paths,urban,waste 2763    0 edible (1.000000000 0.000000000) *
##           33) Habitat=leaves 144    7 edible (0.951388889 0.048611111)  
##             66) CapSurface=scaly 137    0 edible (1.000000000 0.000000000) *
##             67) CapSurface=grooves,smooth 7    0 poisonous (0.000000000 1.000000000) *
##         17) StalkColorBelowRing=brown 57   11 edible (0.807017544 0.192982456)  
##           34) StalkRoot=bulbous 46    0 edible (1.000000000 0.000000000) *
##           35) StalkRoot=missing 11    0 poisonous (0.000000000 1.000000000) *
##        9) StalkColorBelowRing=white 15    0 poisonous (0.000000000 1.000000000) *
##      5) SporePrintColor=green 48    0 poisonous (0.000000000 1.000000000) *
##    3) Odor=creosote,foul,musty,pungent,spicy,fishy 2660    0 poisonous (0.000000000 1.000000000) *

What are the most important predictors?

varImp(rpart.model)
##                            Overall
## Bruises                 13.3194444
## CapColor                43.2615798
## CapSurface              13.3194444
## GillAttachment          13.3194444
## GillColor             1115.5470325
## GillSize                11.5221088
## GillSpacing             20.5976399
## Habitat                 12.5345631
## Odor                  2682.1401548
## Population               0.4307432
## SporePrintColor       1676.9208859
## StalkColorAboveRing     27.0152996
## StalkColorBelowRing     62.1056531
## StalkRoot               17.7543860
## StalkSurfaceAboveRing 1039.7917729
## StalkSurfaceBelowRing  949.5308711
## VeilColor               13.7237947
## CapShape                 0.0000000
## StalkShape               0.0000000
## RingNumber               0.0000000
## RingType                 0.0000000

Finding the Lowest Cross-Validation Error

printcp(rpart.model)
## 
## Classification tree:
## rpart(formula = Edibility ~ ., data = training, method = "class", 
##     cp = 1e-05)
## 
## Variables actually used in tree construction:
## [1] CapSurface          Habitat             Odor               
## [4] SporePrintColor     StalkColorBelowRing StalkRoot          
## 
## Root node error: 2741/5687 = 0.48198
## 
## n= 5687 
## 
##          CP nsplit rel error    xerror       xstd
## 1 0.9704487      0 1.0000000 1.0000000 0.01374739
## 2 0.0175119      1 0.0295513 0.0295513 0.00326001
## 3 0.0054725      2 0.0120394 0.0120394 0.00208970
## 4 0.0020066      3 0.0065669 0.0065669 0.00154539
## 5 0.0012769      5 0.0025538 0.0040131 0.00120883
## 6 0.0000100      7 0.0000000 0.0025538 0.00096466

Plot showing the cross-validated (“X-val”) relative error at each cp value.

plotcp(rpart.model)

rpart.model$cptable[which.min(rpart.model$cptable[, "xerror"]), "CP"]
## [1] 1e-05

Pruning Tree with the Lowest Cross-Validation Error

bestcp <- round(rpart.model$cptable[which.min(rpart.model$cptable[, "xerror"]), "CP"], 4)
rpart.modelPruned <- prune(rpart.model, cp = bestcp)

Plot of Pruned Tree

rpart.plot(rpart.modelPruned, 
           extra = 104, 
           box.palette = "GnBu", 
           branch.lty = 3, nn = TRUE)

Model Number Two: Naive Bayes Classifier using caret's train() with method = "nb"

The well-known Bayes formula for conditional probability,

\(P(A \cap B) = P(A,B) = P(A)P(B\vert A) = P(B)P(A \vert B) \Rightarrow P(B\vert A) = \frac{P(B)P(A\vert B)}{P(A)}\)

is used in this package to create a classifier.

In a classification problem, we have some predictors (also called independent variables, covariates, and features) and a result that is our dependent variable (or our target, label, or class). Each of the observations in our dataset has some values for the predictors and a class. From this information we can create a learner that predicts the class for the given features.

The Naive Bayes algorithm computes the probability of each label given the predictor values, and we choose the label, or class, with the highest probability. The algorithm assumes independence of the features; when that assumption largely holds for a given dataset, the resulting model is usually very accurate.
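For intuition, here is a minimal sketch using e1071::naiveBayes() directly (an illustrative addition; the model we actually tune below goes through caret, and nb_direct is our own name):

library(e1071)

# The fitted object stores the class priors and, for each feature, a table
# of conditional probabilities P(feature value | class).
nb_direct <- naiveBayes(Edibility ~ ., data = training)
nb_direct$tables$Odor              # e.g. P(Odor | Edibility)
head(predict(nb_direct, testing))  # the class with the highest posterior wins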

library(e1071)

naiveBayes.model <- train(Edibility ~. , 
                          method = "nb",
                          data = training,
                          # Note: train() expects `trControl =`; `trainControl =` is
                          # not a recognized argument, so caret falls back to its
                          # default bootstrap resampling, as the output below shows.
                          trainControl = trainControl("cv", 
                                                      number=10))
naiveBayes.model
## Naive Bayes 
## 
## 5687 samples
##   21 predictor
##    2 classes: 'edible', 'poisonous' 
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 5687, 5687, 5687, 5687, 5687, 5687, ... 
## Resampling results across tuning parameters:
## 
##   usekernel  Accuracy   Kappa    
##   FALSE            NaN        NaN
##    TRUE      0.9277323  0.8548239
## 
## Tuning parameter 'fL' was held constant at a value of 0
## Tuning
##  parameter 'adjust' was held constant at a value of 1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were fL = 0, usekernel = TRUE and adjust
##  = 1.

Make a prediction using the test set.

naiveBayes.model_pred <- predict(naiveBayes.model, 
                                 newdata = testing)

Evaluate the model with a confusion matrix.

confusionMatrix(naiveBayes.model_pred, 
                testing$Edibility)
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  edible poisonous
##   edible      1237       169
##   poisonous     25      1005
##                                           
##                Accuracy : 0.9204          
##                  95% CI : (0.9089, 0.9308)
##     No Information Rate : 0.5181          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.8398          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9802          
##             Specificity : 0.8560          
##          Pos Pred Value : 0.8798          
##          Neg Pred Value : 0.9757          
##              Prevalence : 0.5181          
##          Detection Rate : 0.5078          
##    Detection Prevalence : 0.5772          
##       Balanced Accuracy : 0.9181          
##                                           
##        'Positive' Class : edible          
## 

Model Number Three: Random Forest Classifier using randomForest() from the randomForest package

Random forest algorithms make a major modification to bagging: they build a large collection of decorrelated trees, by considering only a random subset of the predictors at each split, and then aggregate the trees’ predictions, taking a majority vote in a classification setting like ours.

library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
randomForest.model <- train(Edibility ~.,
                            data = training,
                            method = "rf",
                            metric = "Accuracy",
                            ntree = 500, 
                            # As above, the argument should be `trControl =`; since
                            # `trainControl =` is not matched, caret again defaults
                            # to bootstrap resampling (see the output below).
                            trainControl=trainControl(method = "cv", 
                                                      number = 10),
                            tuneGrid = expand.grid(.mtry = c(2,3,4,5)))
randomForest.model
## Random Forest 
## 
## 5687 samples
##   21 predictor
##    2 classes: 'edible', 'poisonous' 
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 5687, 5687, 5687, 5687, 5687, 5687, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##   2     0.9615557  0.9227906
##   3     0.9944850  0.9889487
##   4     0.9995996  0.9991984
##   5     0.9998289  0.9996577
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 5.

Look at a plot of the results

plot(randomForest.model)

Make a prediction using the model.

randomForest.model_pred <- predict(randomForest.model, 
                                   newdata = testing)

# Test-set misclassification rate
mean(randomForest.model_pred != testing$Edibility) 
## [1] 0
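As an aside (an added sketch, with rf_direct as our own name), randomForest() can also be fit directly, which exposes the out-of-bag error estimate and Gini-based variable importance without going through caret:

# Direct fit with the tuned mtry; classification takes a majority vote over 500 trees.
rf_direct <- randomForest(Edibility ~ ., data = training, ntree = 500, mtry = 5)
rf_direct$confusion    # out-of-bag confusion matrix
varImpPlot(rf_direct)  # mean decrease in Gini for each predictor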

Model Number Four: Linear Discriminant Analysis using lda() from the MASS package

Linear discriminant analysis applies Bayes’ theorem to assign the posterior probability \(p_{k}(x)\) that an observation \(X=x\) belongs to the \(k\)th class. We assume the \(X\)’s are drawn from a multivariate Gaussian distribution with a class-specific mean vector and a common covariance matrix.
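Concretely, if \(\pi_k\) is the prior probability of class \(k\) and \(f_k(x)\) its Gaussian density, Bayes’ theorem gives

\(p_{k}(x) = \frac{\pi_k f_k(x)}{\sum_{l=1}^{K} \pi_l f_l(x)}\)

and LDA assigns an observation to the class with the largest posterior \(p_{k}(x)\).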

# LDA
library(MASS)
lda.model <- lda(Edibility ~ CapShape + CapSurface + CapColor + Bruises + Odor +
                   GillAttachment + GillSpacing + GillSize + GillColor +
                   StalkShape + StalkRoot + StalkSurfaceAboveRing +
                   StalkSurfaceBelowRing + VeilColor + SporePrintColor +
                   Population + Habitat, data = training)

lda.modelPredict <- predict(lda.model, 
                newdata = data.frame(testing))

mean(lda.modelPredict$class!=testing$Edibility)
## [1] 0

10-Fold Cross Validation for LDA Model.

# Manual 10-fold cross-validation. The error recorded is the misclassification
# rate (not MAE). With 568-row folds, the ten folds cover the first 5,680 of
# the 8,123 observations.
cv_error <- c()

for (i in 1:10) {
  fold <- ((i - 1) * 568 + 1):(i * 568)
  train_data <- MushroomData[-fold, ]
  test_data  <- MushroomData[fold, ]

  lda.model <- lda(Edibility ~ CapShape + CapSurface + CapColor + Bruises + Odor +
                     GillAttachment + GillSpacing + GillSize + GillColor +
                     StalkShape + StalkRoot + StalkSurfaceAboveRing +
                     StalkSurfaceBelowRing + VeilColor + SporePrintColor +
                     Population + Habitat, data = train_data)

  lda.modelPredict <- predict(lda.model, test_data)

  cv_error <- c(cv_error, mean(lda.modelPredict$class != test_data$Edibility))
}

mean(cv_error)   # average misclassification rate across the ten folds
## [1] 0.0007042254