Load necessary packages

library(data.table)
library(ggplot2)

Read the mushroom data

# Read the UCI mushroom data. The file has no header row and every attribute
# is a single-letter code, so all columns are read as character.
# "character" is the class name fread documents; "char" is not one of them.
shrooms <- fread(
  "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data",
  colClasses = "character",
  header = FALSE
)

# Attribute names in file order, taken from agaricus-lepiota.names.
# setnames() renames the columns by reference.
setnames(
  shrooms,
  c("class", "cap-shape", "cap-surface", "cap-color", "bruises",
    "odor", "gill-attachment", "gill-spacing", "gill-size",
    "gill-color", "stalk-shape", "stalk-root",
    "stalk-surface-above-ring", "stalk-surface-below-ring",
    "stalk-color-above-ring", "stalk-color-below-ring",
    "veil-type", "veil-color", "ring-number", "ring-type",
    "spore-print-color", "population", "habitat")
)

Explore the data

# Compact overview of the table: its size plus the type and first values of
# each column.
utils::str(shrooms)
## Classes 'data.table' and 'data.frame':   8124 obs. of  23 variables:
##  $ class                   : chr  "p" "e" "e" "p" ...
##  $ cap-shape               : chr  "x" "x" "b" "x" ...
##  $ cap-surface             : chr  "s" "s" "s" "y" ...
##  $ cap-color               : chr  "n" "y" "w" "w" ...
##  $ bruises                 : chr  "t" "t" "t" "t" ...
##  $ odor                    : chr  "p" "a" "l" "p" ...
##  $ gill-attachment         : chr  "f" "f" "f" "f" ...
##  $ gill-spacing            : chr  "c" "c" "c" "c" ...
##  $ gill-size               : chr  "n" "b" "b" "n" ...
##  $ gill-color              : chr  "k" "k" "n" "n" ...
##  $ stalk-shape             : chr  "e" "e" "e" "e" ...
##  $ stalk-root              : chr  "e" "c" "c" "e" ...
##  $ stalk-surface-above-ring: chr  "s" "s" "s" "s" ...
##  $ stalk-surface-below-ring: chr  "s" "s" "s" "s" ...
##  $ stalk-color-above-ring  : chr  "w" "w" "w" "w" ...
##  $ stalk-color-below-ring  : chr  "w" "w" "w" "w" ...
##  $ veil-type               : chr  "p" "p" "p" "p" ...
##  $ veil-color              : chr  "w" "w" "w" "w" ...
##  $ ring-number             : chr  "o" "o" "o" "o" ...
##  $ ring-type               : chr  "p" "p" "p" "p" ...
##  $ spore-print-color       : chr  "k" "n" "n" "k" ...
##  $ population              : chr  "s" "n" "n" "s" ...
##  $ habitat                 : chr  "u" "g" "m" "u" ...
##  - attr(*, ".internal.selfref")=<externalptr>

Data legend

The data is rather cryptic. Thankfully, the file agaricus-lepiota.names contains the legend for the attributes (which is where the names above came from):

     1. cap-shape:                bell=b,conical=c,convex=x,flat=f,
                                  knobbed=k,sunken=s
     2. cap-surface:              fibrous=f,grooves=g,scaly=y,smooth=s
     3. cap-color:                brown=n,buff=b,cinnamon=c,gray=g,green=r,
                                  pink=p,purple=u,red=e,white=w,yellow=y
     4. bruises?:                 bruises=t,no=f
     5. odor:                     almond=a,anise=l,creosote=c,fishy=y,foul=f,
                                  musty=m,none=n,pungent=p,spicy=s
     6. gill-attachment:          attached=a,descending=d,free=f,notched=n
     7. gill-spacing:             close=c,crowded=w,distant=d
     8. gill-size:                broad=b,narrow=n
     9. gill-color:               black=k,brown=n,buff=b,chocolate=h,gray=g,
                                  green=r,orange=o,pink=p,purple=u,red=e,
                                  white=w,yellow=y
    10. stalk-shape:              enlarging=e,tapering=t
    11. stalk-root:               bulbous=b,club=c,cup=u,equal=e,
                                  rhizomorphs=z,rooted=r,missing=?
    12. stalk-surface-above-ring: fibrous=f,scaly=y,silky=k,smooth=s
    13. stalk-surface-below-ring: fibrous=f,scaly=y,silky=k,smooth=s
    14. stalk-color-above-ring:   brown=n,buff=b,cinnamon=c,gray=g,orange=o,
                                  pink=p,red=e,white=w,yellow=y
    15. stalk-color-below-ring:   brown=n,buff=b,cinnamon=c,gray=g,orange=o,
                                  pink=p,red=e,white=w,yellow=y
    16. veil-type:                partial=p,universal=u
    17. veil-color:               brown=n,orange=o,white=w,yellow=y
    18. ring-number:              none=n,one=o,two=t
    19. ring-type:                cobwebby=c,evanescent=e,flaring=f,large=l,
                                  none=n,pendant=p,sheathing=s,zone=z
    20. spore-print-color:        black=k,brown=n,buff=b,chocolate=h,green=r,
                                  orange=o,purple=u,white=w,yellow=y
    21. population:               abundant=a,clustered=c,numerous=n,
                                  scattered=s,several=v,solitary=y
    22. habitat:                  grasses=g,leaves=l,meadows=m,paths=p,
                                  urban=u,waste=w,woods=d

From chapter 3 of the course textbook, we know that odor is key, so we’ll keep that field together with a few others—specifically those with fewer options to reduce student fatigue \(\ddot\smile\). In a real classification exercise, all fields should be kept for analysis, of course.

# Build a narrower table: the class label plus four of the attributes.
minishrooms <- shrooms[
  ,
  c("class", "odor", "gill-spacing", "gill-size", "stalk-shape"),
  with = FALSE
]

The next step is to replace the cryptic single-character values with more readable ones. We will use a technique called an “update join”, which combines the speed of data.table’s joins with assignment by reference to overwrite the codes with text from a lookup table. It is similar to an in-place VLOOKUP in Excel, but very fast. See this Stack Overflow question and this data.table tutorial.

First we create the lookup tables; then we use update joins to overwrite the old single-character values.

# Add a logical `edible` column (TRUE where class is "e"), then drop the
# original class column. Both steps modify minishrooms by reference.
minishrooms[, edible := class == "e"]
minishrooms[, class := NULL]

# Lookup tables mapping each single-character code to a readable label.
# The first column of every table carries the same name as the matching
# column in the mushroom table, so it can serve directly as the join key;
# the second column holds the replacement text.

odor <- data.table(
  odor = c("a", "l", "c", "y", "f", "m", "n", "p", "s"),
  Odor = c("almond", "anise", "creosote", "fishy", "foul", "musty", "none",
           "pungent", "spicy")
)

gillspace <- data.table(
  `gill-spacing` = c("c", "w", "d"),
  gillSpace = c("close", "crowded", "distant")
)

gillsize <- data.table(
  `gill-size` = c("b", "n"),
  gillSize = c("broad", "narrow")
)

stalkshape <- data.table(
  `stalk-shape` = c("e", "t"),
  stalkShape = c("enlarging", "tapering")
)

# Update joins: for each lookup table, match rows on the shared code column
# and overwrite that column in minishrooms, by reference, with the readable
# label. The i. prefix marks a column that comes from the lookup table (the
# table passed as i); it is used on every join so the source of the
# replacement value is explicit.
# The calls are chained, which is equivalent to updating minishrooms time and
# again without making separate intermediate versions of it.
# See https://cran.r-project.org/web/packages/data.table/vignettes/datatable-intro.html
minishrooms[
  odor, on = "odor", odor := i.Odor
  ][
  gillspace, on = "gill-spacing", `gill-spacing` := i.gillSpace
  ][
  gillsize, on = "gill-size", `gill-size` := i.gillSize
  ][
  stalkshape, on = "stalk-shape", `stalk-shape` := i.stalkShape
  ]

Now we can look at our smaller, more comprehensible list of mushrooms.

# Show the recoded table. The trailing [] is the data.table idiom for making
# the table print after it has just been updated with := (auto-printing can
# otherwise be suppressed).
minishrooms[]
##          odor gill-spacing gill-size stalk-shape edible
##    1: pungent        close    narrow   enlarging  FALSE
##    2:  almond        close     broad   enlarging   TRUE
##    3:   anise        close     broad   enlarging   TRUE
##    4: pungent        close    narrow   enlarging  FALSE
##    5:    none      crowded     broad    tapering   TRUE
##   ---                                                  
## 8120:    none        close     broad   enlarging   TRUE
## 8121:    none        close     broad   enlarging   TRUE
## 8122:    none        close     broad   enlarging   TRUE
## 8123:   fishy        close    narrow    tapering  FALSE
## 8124:    none        close     broad   enlarging   TRUE

Let’s finish with some graphical analysis:

# Bar chart of mushroom counts per odor, with each bar split by edibility.
ggplot(minishrooms, aes(x = odor, fill = edible)) +
  geom_bar()

It seems that odor is a pretty decent predictor for edibility!