library(tidyverse) # data wrangling and piping
library(dplyr) # data manipulation (also loaded by tidyverse)
library(rsample) # train/test splitting
library(caret) # confusionMatrix() and modeling utilities
library(e1071) # naiveBayes()
library(ROCR) # ROC curves and AUC
library(partykit) # tree objects and visualization
library(ggplot2) # to visualize data
library(gridExtra) # to display multiple graphs
library(inspectdf) # for EDA
library(tidymodels) # to build tidy models

Explanation

Hi!! Welcome to my LBB. In this LBB I'm going to use the Mushroom dataset and build classification models that predict whether a mushroom is edible or poisonous, using Naive Bayes and a Decision Tree. Enjoy!!

Input Data

data <- read_csv("mushrooms.csv")

Data Inspections

head(data)
dim(data)
#> [1] 8124   23
names(data)
#>  [1] "class"                    "cap-shape"               
#>  [3] "cap-surface"              "cap-color"               
#>  [5] "bruises"                  "odor"                    
#>  [7] "gill-attachment"          "gill-spacing"            
#>  [9] "gill-size"                "gill-color"              
#> [11] "stalk-shape"              "stalk-root"              
#> [13] "stalk-surface-above-ring" "stalk-surface-below-ring"
#> [15] "stalk-color-above-ring"   "stalk-color-below-ring"  
#> [17] "veil-type"                "veil-color"              
#> [19] "ring-number"              "ring-type"               
#> [21] "spore-print-color"        "population"              
#> [23] "habitat"

Data Pre-Processing

glimpse(data)
#> Rows: 8,124
#> Columns: 23
#> $ class                      <chr> "p", "e", "e", "p", "e", "e", "e", "e", "p"…
#> $ `cap-shape`                <chr> "x", "x", "b", "x", "x", "x", "b", "b", "x"…
#> $ `cap-surface`              <chr> "s", "s", "s", "y", "s", "y", "s", "y", "y"…
#> $ `cap-color`                <chr> "n", "y", "w", "w", "g", "y", "w", "w", "w"…
#> $ bruises                    <lgl> TRUE, TRUE, TRUE, TRUE, FALSE, TRUE, TRUE, …
#> $ odor                       <chr> "p", "a", "l", "p", "n", "a", "a", "l", "p"…
#> $ `gill-attachment`          <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, F…
#> $ `gill-spacing`             <chr> "c", "c", "c", "c", "w", "c", "c", "c", "c"…
#> $ `gill-size`                <chr> "n", "b", "b", "n", "b", "b", "b", "b", "n"…
#> $ `gill-color`               <chr> "k", "k", "n", "n", "k", "n", "g", "n", "p"…
#> $ `stalk-shape`              <chr> "e", "e", "e", "e", "t", "e", "e", "e", "e"…
#> $ `stalk-root`               <chr> "e", "c", "c", "e", "e", "c", "c", "c", "e"…
#> $ `stalk-surface-above-ring` <chr> "s", "s", "s", "s", "s", "s", "s", "s", "s"…
#> $ `stalk-surface-below-ring` <chr> "s", "s", "s", "s", "s", "s", "s", "s", "s"…
#> $ `stalk-color-above-ring`   <chr> "w", "w", "w", "w", "w", "w", "w", "w", "w"…
#> $ `stalk-color-below-ring`   <chr> "w", "w", "w", "w", "w", "w", "w", "w", "w"…
#> $ `veil-type`                <chr> "p", "p", "p", "p", "p", "p", "p", "p", "p"…
#> $ `veil-color`               <chr> "w", "w", "w", "w", "w", "w", "w", "w", "w"…
#> $ `ring-number`              <chr> "o", "o", "o", "o", "o", "o", "o", "o", "o"…
#> $ `ring-type`                <chr> "p", "p", "p", "p", "e", "p", "p", "p", "p"…
#> $ `spore-print-color`        <chr> "k", "n", "n", "k", "n", "k", "k", "n", "k"…
#> $ population                 <chr> "s", "n", "n", "s", "a", "n", "n", "s", "v"…
#> $ habitat                    <chr> "u", "g", "m", "u", "g", "g", "m", "m", "g"…

Variable Definitions:

  • class: edible=e, poisonous=p
  • cap-shape: bell=b, conical=c, convex=x, flat=f, knobbed=k, sunken=s
  • cap-surface: fibrous=f, grooves=g, scaly=y, smooth=s
  • cap-color: brown=n, buff=b, cinnamon=c, gray=g, green=r, pink=p, purple=u, red=e, white=w, yellow=y
  • bruises: bruises=t, no=f
  • odor: almond=a, anise=l, creosote=c, fishy=y, foul=f, musty=m, none=n, pungent=p, spicy=s
  • gill-attachment: attached=a, descending=d, free=f, notched=n
  • gill-spacing: close=c, crowded=w, distant=d
  • gill-size: broad=b, narrow=n
  • gill-color: black=k, brown=n, buff=b, chocolate=h, gray=g, green=r, orange=o, pink=p, purple=u, red=e, white=w, yellow=y
  • stalk-shape: enlarging=e, tapering=t
  • stalk-root: bulbous=b, club=c, cup=u, equal=e, rhizomorphs=z, rooted=r, missing=?
  • stalk-surface-above-ring: fibrous=f, scaly=y, silky=k, smooth=s
  • stalk-surface-below-ring: fibrous=f, scaly=y, silky=k, smooth=s
  • stalk-color-above-ring: brown=n, buff=b, cinnamon=c, gray=g, orange=o, pink=p, red=e, white=w, yellow=y
  • stalk-color-below-ring: brown=n, buff=b, cinnamon=c, gray=g, orange=o, pink=p, red=e, white=w, yellow=y
  • veil-type: partial=p, universal=u
  • veil-color: brown=n, orange=o, white=w, yellow=y
  • ring-number: none=n, one=o, two=t
  • ring-type: cobwebby=c, evanescent=e, flaring=f, large=l, none=n, pendant=p, sheathing=s, zone=z
  • spore-print-color: black=k, brown=n, buff=b, chocolate=h, green=r, orange=o, purple=u, white=w, yellow=y
  • population: abundant=a, clustered=c, numerous=n, scattered=s, several=v, solitary=y
  • habitat: grasses=g, leaves=l, meadows=m, paths=p, urban=u, waste=w, woods=d

The data types are still not correct, so we convert every column to a factor. We also relabel the class levels from e and p to edible and poisonous so the output is easier to read. Columns that effectively hold a single value carry no information for classification, so we drop gill-attachment and veil-type.
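
Before dropping anything, we can confirm which columns hold only a single distinct value. A quick sketch using dplyr's n_distinct():

# count distinct values per column; a column with one distinct value
# cannot help separate edible from poisonous mushrooms
data %>% summarise(across(everything(), n_distinct)) %>% glimpse()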

data <- data %>% mutate_all(as.factor) %>% 
  mutate(class = factor(class, levels = c("e","p"), labels = c("edible", "poisonous"))) %>% 
  select(-c("gill-attachment","veil-type"))

Let's check whether there are any missing values:

colSums(is.na(data))
#>                    class                cap-shape              cap-surface 
#>                        0                        0                        0 
#>                cap-color                  bruises                     odor 
#>                        0                        0                        0 
#>             gill-spacing                gill-size               gill-color 
#>                        0                        0                        0 
#>              stalk-shape               stalk-root stalk-surface-above-ring 
#>                        0                        0                        0 
#> stalk-surface-below-ring   stalk-color-above-ring   stalk-color-below-ring 
#>                        0                        0                        0 
#>               veil-color              ring-number                ring-type 
#>                        0                        0                        0 
#>        spore-print-color               population                  habitat 
#>                        0                        0                        0
data_clean <- data %>% na.omit()
anyNA(data_clean)
#> [1] FALSE

Great!! There are no NA values left.
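
One caveat: the variable definitions note that missing stalk-root values are coded as "?" rather than NA, so na.omit() cannot see them and they remain as their own factor level. A quick sketch to count them:

# missing stalk-root entries are coded "?" in this dataset, not NA
sum(data_clean$`stalk-root` == "?")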

head(data_clean)

Cross Validation

Split the data into a training set (80%) and a test set (20%):

set.seed(123)
index <- sample(nrow(data_clean), 0.8*nrow(data_clean))

data_train <- data_clean[index,]
data_test <- data_clean[-index,]
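
Since rsample is already loaded, a stratified split is a hypothetical alternative that keeps the edible/poisonous ratio identical in both sets:

# hypothetical alternative: strata = class preserves the class proportions
set.seed(123)
mushroom_split <- initial_split(data_clean, prop = 0.8, strata = class)
data_train_s <- training(mushroom_split)
data_test_s <- testing(mushroom_split)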

Check the class proportions in the training set:

prop.table(table(data_train$class))
#> 
#>    edible poisonous 
#> 0.5145407 0.4854593

Check the class proportions in the test set:

prop.table(table(data_test$class))
#> 
#>    edible poisonous 
#> 0.5316923 0.4683077

Great! The classes are only slightly imbalanced, so we don't need to perform up- or down-sampling and can proceed to build the model.
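
For reference, if the classes had been badly imbalanced we could have rebalanced the training set, for example with caret's downSample() (a hypothetical sketch, not needed here):

# randomly drops rows of the majority class until both classes are equal
train_balanced <- downSample(x = data_train %>% select(-class),
                             y = data_train$class,
                             yname = "class")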

Data Modeling

Naive Bayes

Build Model Without ‘Laplace’

We use all the predictors available in the data:

model_nb <- naiveBayes(x = data_train %>% select(-class),
                      y = data_train$class)
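
To see what the model learned, we can inspect its class priors and the conditional probability table of a single predictor (odor is used here purely as an example):

# class counts used for the priors, and the learned P(odor level | class)
model_nb$apriori
model_nb$tables$odor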

Prediction

# prediction on the test set
pred_naive <- predict(model_nb, data_test, type = "class") # for the class prediction
prob_naive <- predict(model_nb, data_test, type = "raw") # for the probability

Performance

confusionMatrix(pred_naive, data_test$class)
#> Confusion Matrix and Statistics
#> 
#>            Reference
#> Prediction  edible poisonous
#>   edible       858        93
#>   poisonous      6       668
#>                                                
#>                Accuracy : 0.9391               
#>                  95% CI : (0.9263, 0.9502)     
#>     No Information Rate : 0.5317               
#>     P-Value [Acc > NIR] : < 0.00000000000000022
#>                                                
#>                   Kappa : 0.8768               
#>                                                
#>  Mcnemar's Test P-Value : < 0.00000000000000022
#>                                                
#>             Sensitivity : 0.9931               
#>             Specificity : 0.8778               
#>          Pos Pred Value : 0.9022               
#>          Neg Pred Value : 0.9911               
#>              Prevalence : 0.5317               
#>          Detection Rate : 0.5280               
#>    Detection Prevalence : 0.5852               
#>       Balanced Accuracy : 0.9354               
#>                                                
#>        'Positive' Class : edible               
#> 
# result
naive_table <- select(data_test, class) %>%
  bind_cols(class_pred = pred_naive) %>% 
  bind_cols(class_eprob = round(prob_naive[,1],4)) %>% 
  bind_cols(class_pprob = round(prob_naive[,2],4))

# performance evaluation - confusion matrix
naive_table %>% 
  conf_mat(class, class_pred) %>% 
  autoplot(type = "heatmap")

naive_table %>%
  summarise(
    accuracy = accuracy_vec(class, class_pred),
    sensitivity = sens_vec(class, class_pred),
    specificity = spec_vec(class, class_pred),
    precision = precision_vec(class, class_pred)
  )

ROC

naive_roc <- data.frame(prediction=round(prob_naive[,1],4),
                      trueclass=as.numeric(naive_table$class=="edible"))
head(naive_roc)
naive_roc <- ROCR::prediction(naive_roc$prediction, naive_roc$trueclass) 

# ROC curve
plot(performance(naive_roc, "tpr", "fpr"),
     main = "ROC")
abline(a = 0, b = 1)

AUC

auc_ROCR_n <- performance(naive_roc, measure = "auc")
auc_ROCR_n <- auc_ROCR_n@y.values[[1]]
auc_ROCR_n
#> [1] 0.992321

Based on the model above we get an accuracy of about 94%. Let's see whether the model improves if we add laplace = 1 to the call.
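
Laplace smoothing adds a pseudo-count to every (class, level) pair, so a level never seen for a class in training gets a small positive probability instead of zero, which would otherwise wipe out the whole product of conditionals. A toy calculation of the standard additive-smoothing estimate (n + laplace) / (N + laplace * k), with hypothetical counts:

n <- 0        # times this level was seen within the class
N <- 100      # training rows belonging to the class
k <- 4        # number of levels the predictor has
laplace <- 1  # the pseudo-count
(n + laplace) / (N + laplace * k) # about 0.0096: small, but never zero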

Build Model with ‘Laplace’

model_nb2 <- naiveBayes(x = data_train %>% select(-class),
                      y = data_train$class,
                      laplace = 1)

Prediction

# prediction on the test set
pred_naive <- predict(model_nb2, data_test, type = "class") # for the class prediction
prob_naive <- predict(model_nb2, data_test, type = "raw") # for the probability

Performance

confusionMatrix(pred_naive, data_test$class)
#> Confusion Matrix and Statistics
#> 
#>            Reference
#> Prediction  edible poisonous
#>   edible       860        75
#>   poisonous      4       686
#>                                                
#>                Accuracy : 0.9514               
#>                  95% CI : (0.9398, 0.9613)     
#>     No Information Rate : 0.5317               
#>     P-Value [Acc > NIR] : < 0.00000000000000022
#>                                                
#>                   Kappa : 0.9018               
#>                                                
#>  Mcnemar's Test P-Value : 0.000000000000003391 
#>                                                
#>             Sensitivity : 0.9954               
#>             Specificity : 0.9014               
#>          Pos Pred Value : 0.9198               
#>          Neg Pred Value : 0.9942               
#>              Prevalence : 0.5317               
#>          Detection Rate : 0.5292               
#>    Detection Prevalence : 0.5754               
#>       Balanced Accuracy : 0.9484               
#>                                                
#>        'Positive' Class : edible               
#> 
# result
naive_table <- select(data_test, class) %>%
  bind_cols(class_pred = pred_naive) %>% 
  bind_cols(class_eprob = round(prob_naive[,1],4)) %>% 
  bind_cols(class_pprob = round(prob_naive[,2],4))

# performance evaluation - confusion matrix
naive_table %>% 
  conf_mat(class, class_pred) %>% 
  autoplot(type = "heatmap")

naive_table %>%
  summarise(
    accuracy = accuracy_vec(class, class_pred),
    sensitivity = sens_vec(class, class_pred),
    specificity = spec_vec(class, class_pred),
    precision = precision_vec(class, class_pred)
  )

ROC

naive_roc <- data.frame(prediction=round(prob_naive[,1],4),
                      trueclass=as.numeric(naive_table$class=="edible"))
head(naive_roc)
naive_roc <- ROCR::prediction(naive_roc$prediction, naive_roc$trueclass) 

# ROC curve
plot(performance(naive_roc, "tpr", "fpr"),
     main = "ROC")
abline(a = 0, b = 1)

AUC

auc_ROCR_n <- performance(naive_roc, measure = "auc")
auc_ROCR_n <- auc_ROCR_n@y.values[[1]]
auc_ROCR_n
#> [1] 0.9966783
final_n <- naive_table %>%
  summarise(
    accuracy = accuracy_vec(class, class_pred),
    sensitivity = sens_vec(class, class_pred),
    specificity = spec_vec(class, class_pred),
    precision = precision_vec(class, class_pred)
  )%>% 
  cbind(AUC = auc_ROCR_n)

The model accuracy increases from about 94% to 95% when we add laplace = 1. In our case we are classifying whether a mushroom is safe to eat: some mushrooms are poisonous and must not be eaten. So we need a model with high accuracy and, more importantly, high specificity, because with edible as the positive class, specificity measures how well we catch poisonous mushrooms, and we do not want the model to predict a mushroom as edible when it is actually poisonous.
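
If we ever needed to push specificity higher still, one option (a sketch, traded off against sensitivity) is to predict edible only when the model is very confident, rather than using the default 0.5 cutoff:

# hypothetical stricter cutoff: anything below 99% edible-probability is
# treated as poisonous, reducing the risk of a dangerous false "edible"
pred_strict <- factor(ifelse(prob_naive[, "edible"] >= 0.99,
                             "edible", "poisonous"),
                      levels = c("edible", "poisonous"))
confusionMatrix(pred_strict, data_test$class)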

Decision Tree

Now let's try building a model using a Decision Tree and compare it with the Naive Bayes model.

Build Model

library(rpart) # recursive partitioning decision trees
library(rattle) # fancyRpartPlot()
library(rpart.plot) # tree plotting and rule printing

# model building
model_dtree <- rpart(formula = class ~ ., data = data_train, method = "class")

fancyRpartPlot(model_dtree, sub = NULL)
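
Besides the plot, we can inspect the fitted tree as text: printcp() shows the cross-validated error at each pruning level, and rpart.plot's rpart.rules() prints one decision rule per leaf (a quick inspection sketch):

printcp(model_dtree)      # complexity parameter table: is pruning worthwhile?
rpart.rules(model_dtree)  # the learned rules, one row per leaf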

Prediction

pred_dt <- predict(model_dtree, data_test, type = "class") # for the class prediction
prob_dt <- predict(model_dtree, data_test, type = "prob") # for the probability

Performance

confusionMatrix(pred_dt, data_test$class)
#> Confusion Matrix and Statistics
#> 
#>            Reference
#> Prediction  edible poisonous
#>   edible       864        12
#>   poisonous      0       749
#>                                                
#>                Accuracy : 0.9926               
#>                  95% CI : (0.9871, 0.9962)     
#>     No Information Rate : 0.5317               
#>     P-Value [Acc > NIR] : < 0.00000000000000022
#>                                                
#>                   Kappa : 0.9852               
#>                                                
#>  Mcnemar's Test P-Value : 0.001496             
#>                                                
#>             Sensitivity : 1.0000               
#>             Specificity : 0.9842               
#>          Pos Pred Value : 0.9863               
#>          Neg Pred Value : 1.0000               
#>              Prevalence : 0.5317               
#>          Detection Rate : 0.5317               
#>    Detection Prevalence : 0.5391               
#>       Balanced Accuracy : 0.9921               
#>                                                
#>        'Positive' Class : edible               
#> 
# result
dtree_table <- select(data_test, class) %>%
  bind_cols(class_pred = pred_dt) %>% 
  bind_cols(class_eprob = round(prob_dt[,1],4)) %>% 
  bind_cols(class_pprob = round(prob_dt[,2],4))

# performance evaluation - confusion matrix
dtree_table %>% 
  conf_mat(class, class_pred) %>% 
  autoplot(type = "heatmap")

dtree_table %>%
  summarise(
    accuracy = accuracy_vec(class, class_pred),
    sensitivity = sens_vec(class, class_pred),
    specificity = spec_vec(class, class_pred),
    precision = precision_vec(class, class_pred)
  )

ROC

dtree_roc <- data.frame(prediction=round(prob_dt[,1],4),
                      trueclass=as.numeric(dtree_table$class=="edible"))
head(dtree_roc)
dtree_roc <- ROCR::prediction(dtree_roc$prediction, dtree_roc$trueclass) 

# ROC curve
plot(performance(dtree_roc, "tpr", "fpr"),
     main = "ROC")
abline(a = 0, b = 1)

AUC

auc_ROCR_d <- performance(dtree_roc, measure = "auc")
auc_ROCR_d <- auc_ROCR_d@y.values[[1]]
auc_ROCR_d
#> [1] 0.9921156
final_d <- dtree_table %>%
  summarise(
    accuracy = accuracy_vec(class, class_pred),
    sensitivity = sens_vec(class, class_pred),
    specificity = spec_vec(class, class_pred),
    precision = precision_vec(class, class_pred)
  )%>% 
  cbind(AUC = auc_ROCR_d)

Based on the results above, the Decision Tree reaches an accuracy of 99% and a specificity of 98%.

Conclusion

rbind("Naive Bayes" = final_n, "Decision Tree" = final_d )

Based on the metrics table above, the model built with the Decision Tree algorithm gives the best overall result. It has the highest accuracy (99%) while keeping sensitivity, specificity, and precision all above 98%. Its AUC (0.992) is marginally below that of the Laplace-smoothed Naive Bayes model (0.997), but it wins on every thresholded metric. Therefore, the best model for predicting whether a mushroom is edible or poisonous from its characteristic features is the Decision Tree model.