Good?, Bad?, Indifferent?
This project evaluates four model options to determine the best predictive ability for deciding if a wild mushroom is edible or poisonous. We explore Classification and Regression Trees (CART), Naive Bayes Classifier, Random Forests, and Linear Discriminant Analysis.
## Load the libraries we will be using
library(gapminder)
library(here)
library(socviz)
library(tidyverse)
library(data.table)
library(caret)
library(rpart)
library(corrplot)
library(klaR)
Data downloaded from https://archive.ics.uci.edu/ml/datasets/Mushroom and saved as .data file “agaricus.lepiota.data”.
rename dataset as “MushroomData” and look at the structure of the dataset.
MushroomURL <- 'https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data'
MushroomData <- fread(MushroomURL)
str(MushroomData)
## Classes 'data.table' and 'data.frame': 8123 obs. of 23 variables:
## $ p: chr "e" "e" "p" "e" ...
## $ x: chr "x" "b" "x" "x" ...
## $ s: chr "s" "s" "y" "s" ...
## $ n: chr "y" "w" "w" "g" ...
## $ t: chr "t" "t" "t" "f" ...
## $ p: chr "a" "l" "p" "n" ...
## $ f: chr "f" "f" "f" "f" ...
## $ c: chr "c" "c" "c" "w" ...
## $ n: chr "b" "b" "n" "b" ...
## $ k: chr "k" "n" "n" "k" ...
## $ e: chr "e" "e" "e" "t" ...
## $ e: chr "c" "c" "e" "e" ...
## $ s: chr "s" "s" "s" "s" ...
## $ s: chr "s" "s" "s" "s" ...
## $ w: chr "w" "w" "w" "w" ...
## $ w: chr "w" "w" "w" "w" ...
## $ p: chr "p" "p" "p" "p" ...
## $ w: chr "w" "w" "w" "w" ...
## $ o: chr "o" "o" "o" "o" ...
## $ p: chr "p" "p" "p" "e" ...
## $ k: chr "n" "n" "k" "n" ...
## $ s: chr "n" "n" "s" "a" ...
## $ u: chr "g" "m" "u" "g" ...
## - attr(*, ".internal.selfref")=<externalptr>
library(tidyverse)
colnames(MushroomData) <- c("Edibility", "CapShape", "CapSurface",
"CapColor", "Bruises", "Odor",
"GillAttachment", "GillSpacing", "GillSize",
"GillColor", "StalkShape", "StalkRoot",
"StalkSurfaceAboveRing", "StalkSurfaceBelowRing", "StalkColorAboveRing",
"StalkColorBelowRing", "VeilType", "VeilColor",
"RingNumber", "RingType", "SporePrintColor",
"Population", "Habitat")
MushroomData <- MushroomData %>% map_df(function(.x) as.factor(.x))
levels(MushroomData$Edibility) <- c("edible", "poisonous")
levels(MushroomData$CapShape) <- c("bell", "conical", "flat", "knobbed", "sunken", "convex")
levels(MushroomData$CapColor) <- c("buff", "cinnamon", "red", "gray", "brown", "pink",
"green", "purple", "white", "yellow")
levels(MushroomData$CapSurface) <- c("fibrous", "grooves", "scaly", "smooth")
levels(MushroomData$Bruises) <- c("no", "yes")
levels(MushroomData$Odor) <- c("almond", "creosote", "foul", "anise", "musty", "none", "pungent", "spicy", "fishy")
levels(MushroomData$GillAttachment) <- c("attached", "free")
levels(MushroomData$GillSpacing) <- c("close", "crowded")
levels(MushroomData$GillSize) <- c("broad", "narrow")
levels(MushroomData$GillColor) <- c("buff", "red", "gray", "chocolate", "black", "brown", "orange",
"pink", "green", "purple", "white", "yellow")
levels(MushroomData$StalkShape) <- c("enlarging", "tapering")
levels(MushroomData$StalkRoot) <- c("missing", "bulbous", "club", "equal", "rooted")
levels(MushroomData$StalkSurfaceAboveRing) <- c("fibrous", "silky", "smooth", "scaly")
levels(MushroomData$StalkSurfaceBelowRing) <- c("fibrous", "silky", "smooth", "scaly")
levels(MushroomData$StalkColorAboveRing) <- c("buff", "cinnamon", "red", "gray", "brown", "pink",
"green", "purple", "white", "yellow")
levels(MushroomData$StalkColorBelowRing) <- c("buff", "cinnamon", "red", "gray", "brown", "pink",
"green", "purple", "white", "yellow")
levels(MushroomData$VeilType) <- "partial"
levels(MushroomData$VeilColor) <- c("brown", "orange", "white", "yellow")
levels(MushroomData$RingNumber) <- c("none", "one", "two")
levels(MushroomData$RingType) <- c("evanescent", "flaring", "large", "none", "pendant")
levels(MushroomData$SporePrintColor) <- c("buff", "chocolate", "black", "brown", "orange",
"green", "purple", "white", "yellow")
levels(MushroomData$Population) <- c("abundant", "clustered", "numerous", "scattered", "several", "solitary")
levels(MushroomData$Habitat) <- c("wood", "grasses", "leaves", "meadows", "paths", "urban", "waste")
str(MushroomData)
## tibble [8,123 x 23] (S3: tbl_df/tbl/data.frame)
## $ Edibility : Factor w/ 2 levels "edible","poisonous": 1 1 2 1 1 1 1 2 1 1 ...
## $ CapShape : Factor w/ 6 levels "bell","conical",..: 6 1 6 6 6 1 1 6 1 6 ...
## $ CapSurface : Factor w/ 4 levels "fibrous","grooves",..: 3 3 4 3 4 3 4 4 3 4 ...
## $ CapColor : Factor w/ 10 levels "buff","cinnamon",..: 10 9 9 4 10 9 9 9 10 10 ...
## $ Bruises : Factor w/ 2 levels "no","yes": 2 2 2 1 2 2 2 2 2 2 ...
## $ Odor : Factor w/ 9 levels "almond","creosote",..: 1 4 7 6 1 1 4 7 1 4 ...
## $ GillAttachment : Factor w/ 2 levels "attached","free": 2 2 2 2 2 2 2 2 2 2 ...
## $ GillSpacing : Factor w/ 2 levels "close","crowded": 1 1 1 2 1 1 1 1 1 1 ...
## $ GillSize : Factor w/ 2 levels "broad","narrow": 1 1 2 1 1 1 1 2 1 1 ...
## $ GillColor : Factor w/ 12 levels "buff","red","gray",..: 5 6 6 5 6 3 6 8 3 3 ...
## $ StalkShape : Factor w/ 2 levels "enlarging","tapering": 1 1 1 2 1 1 1 1 1 1 ...
## $ StalkRoot : Factor w/ 5 levels "missing","bulbous",..: 3 3 4 4 3 3 3 4 3 3 ...
## $ StalkSurfaceAboveRing: Factor w/ 4 levels "fibrous","silky",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ StalkSurfaceBelowRing: Factor w/ 4 levels "fibrous","silky",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ StalkColorAboveRing : Factor w/ 10 levels "buff","cinnamon",..: 8 8 8 8 8 8 8 8 8 8 ...
## $ StalkColorBelowRing : Factor w/ 10 levels "buff","cinnamon",..: 8 8 8 8 8 8 8 8 8 8 ...
## $ VeilType : Factor w/ 1 level "partial": 1 1 1 1 1 1 1 1 1 1 ...
## $ VeilColor : Factor w/ 4 levels "brown","orange",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ RingNumber : Factor w/ 3 levels "none","one","two": 2 2 2 2 2 2 2 2 2 2 ...
## $ RingType : Factor w/ 5 levels "evanescent","flaring",..: 5 5 5 1 5 5 5 5 5 5 ...
## $ SporePrintColor : Factor w/ 9 levels "buff","chocolate",..: 4 4 3 4 3 3 4 3 3 4 ...
## $ Population : Factor w/ 6 levels "abundant","clustered",..: 3 3 4 1 3 3 4 5 4 3 ...
## $ Habitat : Factor w/ 7 levels "wood","grasses",..: 2 4 6 2 2 4 4 2 4 2 ...
MushroomData <- subset(MushroomData,
select = -c(VeilType))
summary(MushroomData)
## Edibility CapShape CapSurface CapColor Bruises
## edible :4208 bell : 452 fibrous:2320 brown :2283 no :4748
## poisonous:3915 conical: 4 grooves: 4 gray :1840 yes:3375
## flat :3152 scaly :2555 red :1500
## knobbed: 828 smooth :3244 yellow :1072
## sunken : 32 white :1040
## convex :3655 buff : 168
## (Other): 220
## Odor GillAttachment GillSpacing GillSize GillColor
## none :3528 attached: 210 close :6811 broad :5612 buff :1728
## foul :2160 free :7913 crowded:1312 narrow:2511 pink :1492
## spicy : 576 white :1202
## fishy : 576 brown :1048
## almond : 400 gray : 752
## anise : 400 chocolate: 732
## (Other): 483 (Other) :1169
## StalkShape StalkRoot StalkSurfaceAboveRing StalkSurfaceBelowRing
## enlarging:3515 missing:2480 fibrous: 552 fibrous: 600
## tapering :4608 bulbous:3776 silky :2372 silky :2304
## club : 556 smooth :5175 smooth :4935
## equal :1119 scaly : 24 scaly : 284
## rooted : 192
##
##
## StalkColorAboveRing StalkColorBelowRing VeilColor RingNumber
## purple :4463 purple :4383 brown : 96 none: 36
## green :1872 green :1872 orange: 96 one :7487
## gray : 576 gray : 576 white :7923 two : 600
## brown : 448 brown : 512 yellow: 8
## buff : 432 buff : 432
## pink : 192 pink : 192
## (Other): 140 (Other): 156
## RingType SporePrintColor Population Habitat
## evanescent:2776 white :2388 abundant : 384 wood :3148
## flaring : 48 brown :1968 clustered: 340 grasses:2148
## large :1296 black :1871 numerous : 400 leaves : 832
## none : 36 chocolate:1632 scattered:1247 meadows: 292
## pendant :3967 green : 72 several :4040 paths :1144
## buff : 48 solitary :1712 urban : 367
## (Other) : 144 waste : 192
library(Amelia)
## Loading required package: Rcpp
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.6, built: 2019-11-24)
## ## Copyright (C) 2005-2021 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
missmap(MushroomData)
## Warning: Unknown or uninitialised column: `arguments`.
## Warning: Unknown or uninitialised column: `arguments`.
## Warning: Unknown or uninitialised column: `imputations`.
sapply(data, function(MushroomData) sum(is.na(MushroomData)))
## Warning in is.na(MushroomData): is.na() applied to non-(list or vector) of type
## 'symbol'
## Warning in is.na(MushroomData): is.na() applied to non-(list or vector) of type
## 'language'
## Warning in is.na(MushroomData): is.na() applied to non-(list or vector) of type
## 'language'
## Warning in is.na(MushroomData): is.na() applied to non-(list or vector) of type
## 'symbol'
## Warning in is.na(MushroomData): is.na() applied to non-(list or vector) of type
## 'language'
## ... list package lib.loc verbose envir overwrite
## 0 0 0 0 0 0 0 0
list(unique(MushroomData$CapColor))
## [[1]]
## [1] yellow white gray brown red pink buff purple
## [9] cinnamon green
## Levels: buff cinnamon red gray brown pink green purple white yellow
color <- table(MushroomData[4])
color1 <- melt(color)
## Warning in melt(color): The melt generic in data.table has been passed a table
## and will attempt to redirect to the relevant reshape2 method; please note that
## reshape2 is deprecated, and this redirection is now deprecated as well. To
## continue using melt methods from reshape2 while both libraries are attached,
## e.g. melt.list, you can prepend the namespace like reshape2::melt(color). In the
## next version, this warning will become an error.
color1<- as.data.frame(color1)
color1 <- color1 %>% arrange(value)
data=data.frame(id=c(1:10),individual=color1[1],
value=color1[2]
)
colnames(data) <-c("id","individual","value")
ggplot(data, aes(x=as.factor(individual), y=value))+
geom_bar(stat="identity", aes(fill=individual))+
scale_fill_manual("legend", values = c( "orange", "pink",
"red", "gray", "brown","pink", "green", "purple",
"white","yellow"))+
ggtitle("Mushroom Species Frequency by Cap Colors")+
labs(y="Numbers", x = "Cap Color")
list(unique(MushroomData$CapShape))
## [[1]]
## [1] convex bell sunken flat knobbed conical
## Levels: bell conical flat knobbed sunken convex
shape <- table(MushroomData[2])
shape1 <- melt(shape)
## Warning in melt(shape): The melt generic in data.table has been passed a table
## and will attempt to redirect to the relevant reshape2 method; please note that
## reshape2 is deprecated, and this redirection is now deprecated as well. To
## continue using melt methods from reshape2 while both libraries are attached,
## e.g. melt.list, you can prepend the namespace like reshape2::melt(shape). In the
## next version, this warning will become an error.
shape1<- as.data.frame(shape1)
shape1 <- shape1 %>% arrange(value)
data=data.frame(id=c(1:6),individual=shape1[1],
value=shape1[2]
)
colnames(data) <-c("id","individual","value")
ggplot(data, aes(x=as.factor(individual), y=value))+
geom_bar(stat="identity", aes(fill=individual))+
scale_fill_manual("legend", values = c( "brown","pink", "green", "purple", "black","yellow"))+
ggtitle("Mushroom Species Frequency by Cap Shape")+
labs(y="Numbers", x = "Cap Shape")
list(unique(MushroomData$Odor))
## [[1]]
## [1] almond anise pungent none foul creosote fishy spicy
## [9] musty
## Levels: almond creosote foul anise musty none pungent spicy fishy
odor_plot <-table(MushroomData[c(1,6)])
barplot(odor_plot,legend.text=TRUE, beside=TRUE, col=c("green","red"), xlab = "Odor",ylab = "Number",
main="Bar Chart Showing the Edibility of Mushrooms by Odor", cex.names=.75)
odor_plot <-table(MushroomData[c(1)])
pie(odor_plot, col=c("green","red"),
main="Pie Chart Showing the Edibility of Mushrooms")
# set seed for reproducibility
set.seed(123)
# 1. Get row numbers for the training data
InTraining <- createDataPartition(MushroomData$Edibility, p = 0.7, list = F)
# 2. Create the training dataset
training <- MushroomData[InTraining, ]
# 3. Create the testing dataset
testing <- MushroomData[-InTraining, ]
# 4. Create a label and set of predictors for use later.
Label <- training$Edibility
Predictors <- training[, 1:21]
In a classification setting, like we have with our mushroom data, the idea is to use recursive binary splitting of the predictor space using a classification error rate, commonly the Gini index. the Gini index is defined by
\(G = \sum_{k=1}^K \hat{p}_{mk}(1-\hat{p}_{mk})\)
which is a measure of total variance across the \(K\) classes.
An alternative that is used is entropy, given by \(D = -\sum_{k=1}^K \hat{p}_{mk}\text{ log }\hat{p}_{mk}\).
In both cases, the \(\hat{p}_{mk}\) is the proportion of training observations in the mth region that are from the kth class.
library(rpart)
library(rpart.plot)
rpart.grid <- expand.grid(.cp = 0)
trControl <- trainControl(method = "repeatedcv",
number=10,
repeats=5,
verboseIter=F)
rpart.model <- train(Edibility ~.,
data = training,
method = "rpart",
trControl = trControl,
tuneGrid = rpart.grid,
metric = "Accuracy")
rpart.model
## CART
##
## 5687 samples
## 21 predictor
## 2 classes: 'edible', 'poisonous'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 5118, 5118, 5118, 5118, 5118, 5119, ...
## Resampling results:
##
## Accuracy Kappa
## 0.9982069 0.9964083
##
## Tuning parameter 'cp' was held constant at a value of 0
caret::confusionMatrix(data=predict(rpart.model,
type = "raw"),
reference = Label,
positive="edible")
## Confusion Matrix and Statistics
##
## Reference
## Prediction edible poisonous
## edible 2946 0
## poisonous 0 2741
##
## Accuracy : 1
## 95% CI : (0.9994, 1)
## No Information Rate : 0.518
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.000
## Specificity : 1.000
## Pos Pred Value : 1.000
## Neg Pred Value : 1.000
## Prevalence : 0.518
## Detection Rate : 0.518
## Detection Prevalence : 0.518
## Balanced Accuracy : 1.000
##
## 'Positive' Class : edible
##
rpart.model <- rpart(Edibility ~ .,
data = training,
method = "class",
cp = 0.00001)
rpart.model
## n= 5687
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 5687 2741 edible (0.518023563 0.481976437)
## 2) Odor=almond,anise,none 3027 81 edible (0.973240833 0.026759167)
## 4) SporePrintColor=buff,chocolate,black,brown,orange,purple,white,yellow 2979 33 edible (0.988922457 0.011077543)
## 8) StalkColorBelowRing=red,gray,brown,pink,green,purple 2964 18 edible (0.993927126 0.006072874)
## 16) StalkColorBelowRing=red,gray,pink,green,purple 2907 7 edible (0.997592019 0.002407981)
## 32) Habitat=wood,grasses,meadows,paths,urban,waste 2763 0 edible (1.000000000 0.000000000) *
## 33) Habitat=leaves 144 7 edible (0.951388889 0.048611111)
## 66) CapSurface=scaly 137 0 edible (1.000000000 0.000000000) *
## 67) CapSurface=grooves,smooth 7 0 poisonous (0.000000000 1.000000000) *
## 17) StalkColorBelowRing=brown 57 11 edible (0.807017544 0.192982456)
## 34) StalkRoot=bulbous 46 0 edible (1.000000000 0.000000000) *
## 35) StalkRoot=missing 11 0 poisonous (0.000000000 1.000000000) *
## 9) StalkColorBelowRing=white 15 0 poisonous (0.000000000 1.000000000) *
## 5) SporePrintColor=green 48 0 poisonous (0.000000000 1.000000000) *
## 3) Odor=creosote,foul,musty,pungent,spicy,fishy 2660 0 poisonous (0.000000000 1.000000000) *
varImp(rpart.model)
## Overall
## Bruises 13.3194444
## CapColor 43.2615798
## CapSurface 13.3194444
## GillAttachment 13.3194444
## GillColor 1115.5470325
## GillSize 11.5221088
## GillSpacing 20.5976399
## Habitat 12.5345631
## Odor 2682.1401548
## Population 0.4307432
## SporePrintColor 1676.9208859
## StalkColorAboveRing 27.0152996
## StalkColorBelowRing 62.1056531
## StalkRoot 17.7543860
## StalkSurfaceAboveRing 1039.7917729
## StalkSurfaceBelowRing 949.5308711
## VeilColor 13.7237947
## CapShape 0.0000000
## StalkShape 0.0000000
## RingNumber 0.0000000
## RingType 0.0000000
printcp(rpart.model)
##
## Classification tree:
## rpart(formula = Edibility ~ ., data = training, method = "class",
## cp = 1e-05)
##
## Variables actually used in tree construction:
## [1] CapSurface Habitat Odor
## [4] SporePrintColor StalkColorBelowRing StalkRoot
##
## Root node error: 2741/5687 = 0.48198
##
## n= 5687
##
## CP nsplit rel error xerror xstd
## 1 0.9704487 0 1.0000000 1.0000000 0.01374739
## 2 0.0175119 1 0.0295513 0.0295513 0.00326001
## 3 0.0054725 2 0.0120394 0.0120394 0.00208970
## 4 0.0020066 3 0.0065669 0.0065669 0.00154539
## 5 0.0012769 5 0.0025538 0.0040131 0.00120883
## 6 0.0000100 7 0.0000000 0.0025538 0.00096466
plotcp(rpart.model)
rpart.model$cptable[which.min(rpart.model$cptable[, "xerror"]), "CP"]
## [1] 1e-05
bestcp <- round(rpart.model$cptable[which.min(rpart.model$cptable[, "xerror"]), "CP"], 4)
rpart.modelPruned <- prune(rpart.model, cp = bestcp)
rpart.plot(rpart.modelPruned,
extra = 104,
box.palette = "GnBu",
branch.lty = 3, nn = TRUE)
The well known Bayes formula for conditional probability,
\(P(A \cap B) = P(A,B) = P(A)P(B\vert A) = P(B)P(A \vert B) \Rightarrow P(B\vert A) = \frac{P(B)P(A\vert B)}{(P(A)}\)
is used in this package to create a classifier.
In a classification problem, we have some predictors (also called independent variables, covariates, and features) and a result that is our dependent variable (or our target, label, or class). Each of the observations in our dataset has some values for the predictors and a class. From this information we can create a learner that predicts the class for the given features.
In the Naive Bayes algorithm, as probability for each label when the predictors values are given. We want to find the label, or class, with the highest probability. The algorithm assumes independence of the features which, if is largely true for a given dataset will usually generate a very accurate model.
library(e1071)
naiveBayes.model <- train(Edibility ~. ,
method = "nb",
data = training,
trainControl = trainControl("cv",
number=10))
naiveBayes.model
## Naive Bayes
##
## 5687 samples
## 21 predictor
## 2 classes: 'edible', 'poisonous'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 5687, 5687, 5687, 5687, 5687, 5687, ...
## Resampling results across tuning parameters:
##
## usekernel Accuracy Kappa
## FALSE NaN NaN
## TRUE 0.9277323 0.8548239
##
## Tuning parameter 'fL' was held constant at a value of 0
## Tuning
## parameter 'adjust' was held constant at a value of 1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were fL = 0, usekernel = TRUE and adjust
## = 1.
naiveBayes.model_pred <- predict(naiveBayes.model,
newdata = testing)
confusionMatrix(naiveBayes.model_pred,
testing$Edibility)
## Confusion Matrix and Statistics
##
## Reference
## Prediction edible poisonous
## edible 1237 169
## poisonous 25 1005
##
## Accuracy : 0.9204
## 95% CI : (0.9089, 0.9308)
## No Information Rate : 0.5181
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8398
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9802
## Specificity : 0.8560
## Pos Pred Value : 0.8798
## Neg Pred Value : 0.9757
## Prevalence : 0.5181
## Detection Rate : 0.5078
## Detection Prevalence : 0.5772
## Balanced Accuracy : 0.9181
##
## 'Positive' Class : edible
##
Random forest algorithms use a major modification to the bagging algorithm that involves building a large collection of decorrelated trees and then takes the average over all of them.
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
randomForest.model <- train(Edibility ~.,
data = training,
method = "rf",
metric = "Accuracy",
ntree = 500,
trainControl=trainControl(method = "cv",
number = 10),
tuneGrid = expand.grid(.mtry = c(2,3,4,5)))
randomForest.model
## Random Forest
##
## 5687 samples
## 21 predictor
## 2 classes: 'edible', 'poisonous'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 5687, 5687, 5687, 5687, 5687, 5687, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.9615557 0.9227906
## 3 0.9944850 0.9889487
## 4 0.9995996 0.9991984
## 5 0.9998289 0.9996577
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 5.
plot(randomForest.model)
randomForest.model_pred <- predict( randomForest.model,
newdata = testing)
mean((randomForest.model_pred!=testing$Edibility)^2)
## [1] 0
Linear discriminant analysis uses an approach to a modification of Bayes theorem to assign the posterior probability, \(p_{k}(x)\) that an \(X=x\) observation belongs to the kth class. We assume the \(X\)’s are drawn from a multivariate Gaussian distribution with a class-specific vector and a common covariance matrix.
#LDA
library(MASS)
library(MASS)
lda.model <- lda(Edibility~CapShape+CapSurface+CapColor+Bruises+Odor+GillAttachment+GillSpacing+GillSize+GillColor+StalkShape+StalkRoot+StalkSurfaceAboveRing+StalkSurfaceBelowRing+VeilColor+SporePrintColor+Population+Habitat, data = training)
lda.modelPredict <- predict(lda.model,
newdata = data.frame(testing))
mean(lda.modelPredict$class!=testing$Edibility)
## [1] 0
MAE=c()
mae=c()
for (i in 1:10){
train_data <- MushroomData[-((i*568-567):(i*568)), ]
test_data <- MushroomData[((i*568-567):(i*568)), ]
lda.model <- lda(Edibility~CapShape+CapSurface+CapColor+Bruises+Odor+
GillAttachment+GillSpacing+GillSize+GillColor+
StalkShape+StalkRoot+StalkSurfaceAboveRing+
StalkSurfaceBelowRing+VeilColor+SporePrintColor+
Population+Habitat, data=train_data)
lda.modelPredict <- predict(lda.model, test_data)
mae=c(mae, mean(lda.modelPredict$class!=test_data$Edibility))
}
MAE4=c(MAE,mean(mae))
MAE4
## [1] 0.0007042254