# Library
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(C50)
## Warning: package 'C50' was built under R version 4.4.2
library(caret)
## Warning: package 'caret' was built under R version 4.4.2
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift

The Data

# Load the cleaned mushroom data set from the working directory.
mush <- read_csv(file = "mushroomsClean.csv")
## Rows: 8124 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (21): toxicity, cap_shape, cap_surface, cap_color, bruises, gill_attach,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Convert every character column to a factor. as_factor() keeps the levels in
# order of first appearance rather than sorting them alphabetically.
mush <- mutate(mush, across(where(is.character), as_factor))

# Inspect the resulting column types and factor levels.
str(mush)
## tibble [8,124 × 21] (S3: tbl_df/tbl/data.frame)
##  $ toxicity           : Factor w/ 2 levels "poison","edible": 1 2 2 1 2 2 2 2 1 2 ...
##  $ cap_shape          : Factor w/ 4 levels "convex","sunken",..: 1 1 2 1 1 1 2 2 1 2 ...
##  $ cap_surface        : Factor w/ 4 levels "smooth","scaly",..: 1 1 1 2 1 2 1 2 2 1 ...
##  $ cap_color          : Factor w/ 10 levels "brown","yellow",..: 1 2 3 3 4 2 3 3 3 2 ...
##  $ bruises            : Factor w/ 2 levels "yes","no": 1 1 1 1 2 1 1 1 1 1 ...
##  $ gill_attach        : Factor w/ 2 levels "free","attached": 1 1 1 1 1 1 1 1 1 1 ...
##  $ gill_spacing       : Factor w/ 2 levels "close","crowded": 1 1 1 1 2 1 1 1 1 1 ...
##  $ gill_size          : Factor w/ 2 levels "narrow","broad": 1 2 2 1 2 2 2 2 1 2 ...
##  $ gill_color         : Factor w/ 12 levels "black","brown",..: 1 1 2 2 1 2 3 2 4 3 ...
##  $ stalk_shape        : Factor w/ 2 levels "enlarging","tapering": 1 1 1 1 2 1 1 1 1 1 ...
##  $ stalk_root         : Factor w/ 5 levels "equal","club",..: 1 2 2 1 1 2 2 2 1 2 ...
##  $ stalk_surface_above: Factor w/ 3 levels "smooth","fibrous",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ stalk_surface_below: Factor w/ 3 levels "smooth","fibrous",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ stalk_color_above  : Factor w/ 9 levels "white","gray",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ stalk_color_below  : Factor w/ 9 levels "white","pink",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ veil_color         : Factor w/ 4 levels "white","brown",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ ring_number        : Factor w/ 3 levels "one","two","none": 1 1 1 1 1 1 1 1 1 1 ...
##  $ ring_type          : Factor w/ 5 levels "pendant","evanescent",..: 1 1 1 1 2 1 1 1 1 1 ...
##  $ spore_print_color  : Factor w/ 9 levels "black","brown",..: 1 2 2 1 2 1 1 2 1 1 ...
##  $ population         : Factor w/ 6 levels "scattered","numerous",..: 1 2 2 1 3 2 2 1 4 1 ...
##  $ habitat            : Factor w/ 7 levels "urban","grasses",..: 1 2 3 1 2 2 3 3 2 3 ...

I would like to model toxicity using decision trees based on a number of variables that seem reasonable. By reasonable, I mean variables that I (definitely not a mycologist) think I could actually distinguish on a mushroom, so that the resulting model is one I would feel confident trusting.

Of the 21 variables provided in the data, I think the following would be noticeable by an average mushroom-eater: cap_shape, cap_color, bruises, gill_color, stalk_shape, stalk_color_above, stalk_color_below, population, and habitat.

# Keep the response plus the nine predictors chosen for the model.
mush_clean <- select(
  mush,
  toxicity,
  cap_shape,
  cap_color,
  bruises,
  gill_color,
  stalk_shape,
  stalk_color_above,
  stalk_color_below,
  population,
  habitat
)

Decision Tree

# Fit a C5.0 classification tree for toxicity on every column of mush_clean,
# then print the fitted model.
mush_model <- C5.0(toxicity ~ ., data = mush_clean)
print(mush_model)
## 
## Call:
## C5.0.formula(formula = toxicity ~ ., data = mush_clean)
## 
## Classification Tree
## Number of samples: 8124 
## Number of predictors: 9 
## 
## Tree size: 33 
## 
## Non-standard options: attempt to group attributes
# Print the tree's splits, training confusion matrix, and attribute usage.
mush_model |> summary()
## 
## Call:
## C5.0.formula(formula = toxicity ~ ., data = mush_clean)
## 
## 
## C5.0 [Release 2.07 GPL Edition]      Wed Feb 12 17:28:46 2025
## -------------------------------
## 
## Class specified by attribute `outcome'
## 
## Read 8124 cases (10 attributes) from undefined.data
## 
## Decision tree:
## 
## gill_color in {buff,green}: poison (1752/34)
## gill_color in {black,brown,gray,pink,white,chocolate,purple,red,yellow,orange}:
## :...stalk_color_above in {brown,buff,cinnamon,yellow}:
##     :...cap_color in {yellow,white,gray,red,pink,buff,purple,cinnamon,
##     :   :             green}: poison (896/20)
##     :   cap_color = brown:
##     :   :...stalk_color_above in {brown,buff,yellow}: edible (16/1)
##     :       stalk_color_above = cinnamon: poison (12)
##     stalk_color_above in {white,gray,pink,red,orange}:
##     :...habitat = urban:
##         :...bruises = yes: poison (272/4)
##         :   bruises = no: edible (96/3)
##         habitat in {grasses,meadows,woods,paths,waste,leaves}:
##         :...stalk_shape = tapering:
##             :...habitat in {meadows,woods,paths,waste,
##             :   :           leaves}: edible (1824/30)
##             :   habitat = grasses:
##             :   :...bruises = yes: poison (144/2)
##             :       bruises = no: edible (768/14)
##             stalk_shape = enlarging:
##             :...stalk_color_above = gray: edible (0)
##                 stalk_color_above = pink: poison (432/5)
##                 stalk_color_above in {white,red,orange}:
##                 :...habitat = woods:
##                     :...cap_color in {purple,green}: edible (32/1)
##                     :   cap_color in {brown,yellow,red,buff,
##                     :   :             cinnamon}: poison (32/1)
##                     :   cap_color in {white,gray,pink}:
##                     :   :...cap_shape in {sunken,conical}: poison (0)
##                     :       cap_shape = flat: edible (8)
##                     :       cap_shape = convex:
##                     :       :...population in {numerous,abundant,
##                     :           :              clustered}: poison (0)
##                     :           population = solitary: edible (4)
##                     :           population in {scattered,several}:
##                     :           :...gill_color in {black,brown,gray,pink,
##                     :               :              purple,red,yellow,
##                     :               :              orange}: poison (194/5)
##                     :               gill_color in {white,
##                     :                              chocolate}: edible (2)
##                     habitat in {grasses,meadows,paths,waste,leaves}:
##                     :...population = abundant: edible (0)
##                         population = several:
##                         :...habitat in {grasses,meadows,
##                         :   :           waste}: poison (112/2)
##                         :   habitat in {paths,leaves}: edible (160/3)
##                         population in {scattered,numerous,solitary,clustered}:
##                         :...population in {numerous,solitary,clustered}:
##                             :...stalk_color_below in {pink,gray,buff,brown,red,
##                             :   :                     yellow,orange,
##                             :   :                     cinnamon}: edible (192/2)
##                             :   stalk_color_below = white:
##                             :   :...habitat in {grasses,meadows,paths,
##                             :       :           waste}: edible (608/12)
##                             :       habitat = leaves: poison (8)
##                             population = scattered:
##                             :...cap_shape = conical: edible (0)
##                                 cap_shape in {convex,sunken}:
##                                 :...cap_color in {yellow,white,gray,red,pink,
##                                 :   :             buff,purple,cinnamon,
##                                 :   :             green}: edible (440/29)
##                                 :   cap_color = brown:
##                                 :   :...habitat = grasses: poison (28/12)
##                                 :       habitat in {meadows,paths,waste,
##                                 :                   leaves}: edible (12)
##                                 cap_shape = flat:
##                                 :...cap_color in {gray,red,pink,buff,purple,
##                                     :             cinnamon,
##                                     :             green}: edible (0)
##                                     cap_color = white: poison (16)
##                                     cap_color in {brown,yellow}: [S1]
## 
## SubTree [S1]
## 
## gill_color = black: poison (4)
## gill_color in {brown,gray,pink,white,chocolate,purple,red,yellow,
##                orange}: edible (60/13)
## 
## 
## Evaluation on training data (8124 cases):
## 
##      Decision Tree   
##    ----------------  
##    Size      Errors  
## 
##      27  193( 2.4%)   <<
## 
## 
##     (a)   (b)    <-classified as
##    ----  ----
##    3817   108    (a): class poison
##      85  4114    (b): class edible
## 
## 
##  Attribute usage:
## 
##  100.00% gill_color
##   78.43% stalk_color_above
##   67.06% habitat
##   62.53% stalk_shape
##   22.65% population
##   21.61% cap_color
##   15.76% bruises
##    9.95% stalk_color_below
##    9.45% cap_shape
## 
## 
## Time: 0.0 secs

Impressively, buff (or green) gill color is a huge indicator of toxicity. Now if only I knew which color “buff” was… Gill color, stalk color, and cap color look like they are dominating the first few decisions, but deeper in the tree there is no obvious pattern in which variable gets used from one decision to the next.

# Draw the fitted tree with the model's default plot method.
mush_model |> plot()

I was curious how the default `plot()` method would render this, and I am not impressed. I’m hoping to come back to this and rework the visualization once I complete the assignment.

Training and Testing

# Hold out 30% of the rows for testing and train on the remaining 70%.
# Fix the RNG seed so the split (and every metric computed from it) is
# reproducible from one knit to the next.
set.seed(2025)

# Split mush_clean rather than mush so the held-out evaluation uses the same
# hand-picked predictors as the model fitted above, not all 21 columns.
n <- nrow(mush_clean)

# sample() silently truncates a fractional size; floor() makes that explicit.
rows2test <- sample(n, size = floor(0.3 * n))
mush_train <- mush_clean[-rows2test, ]
mush_test <- mush_clean[rows2test, ]


# Fit a C5.0 tree on the training rows only.
mush_model2 <- C5.0(toxicity ~ ., data = mush_train)

# Predict a class for each held-out row.
mush_fits <- predict(mush_model2, newdata = mush_test)

# Cross-tabulate predictions against the true labels and print the metrics.
mush_cm <- confusionMatrix(data = mush_fits, reference = mush_test$toxicity)
print(mush_cm)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction poison edible
##     poison   1113     18
##     edible     32   1274
##                                          
##                Accuracy : 0.9795         
##                  95% CI : (0.973, 0.9847)
##     No Information Rate : 0.5302         
##     P-Value [Acc > NIR] : < 2e-16        
##                                          
##                   Kappa : 0.9588         
##                                          
##  Mcnemar's Test P-Value : 0.06599        
##                                          
##             Sensitivity : 0.9721         
##             Specificity : 0.9861         
##          Pos Pred Value : 0.9841         
##          Neg Pred Value : 0.9755         
##              Prevalence : 0.4698         
##          Detection Rate : 0.4567         
##    Detection Prevalence : 0.4641         
##       Balanced Accuracy : 0.9791         
##                                          
##        'Positive' Class : poison         
## 

The model numerically does very well, with an accuracy of 0.979. However, a little less than 3 percent of the poisonous mushrooms in the test set (32 of 1,145) are classified as edible. I eat a lot of mushrooms, so 3 percent, while not being very large statistically, is a little too high for my taste. This is only a single test-train split, too, so repeated cross-validation would give a more reliable estimate of that rate (and might show it to be lower), leaving me a little more confident in the model. I would definitely want to refine the model before using it to judge whether an unknown mushroom is edible.