Tasks

  1. Import mushrooms data set
  2. Create a data set with a subset of the columns (include edible/poisonous column and 3 or 4 others)
  3. Add meaningful column names
  4. Replace abbreviations in data

Task 1 - Import data set

# Download the raw mushroom records. The file has no header row, so the
# columns receive R's default names, and every value is kept as a plain
# character string rather than a factor.
mushroom <- read.csv(
  file = "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data",
  header = FALSE,
  sep = ",",
  stringsAsFactors = FALSE
)
# Preview the first six rows.
head(mushroom, n = 6L)
##   V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V20
## 1  p  x  s  n  t  p  f  c  n   k   e   e   s   s   w   w   p   w   o   p
## 2  e  x  s  y  t  a  f  c  b   k   e   c   s   s   w   w   p   w   o   p
## 3  e  b  s  w  t  l  f  c  b   n   e   c   s   s   w   w   p   w   o   p
## 4  p  x  y  w  t  p  f  c  n   n   e   e   s   s   w   w   p   w   o   p
## 5  e  x  s  g  f  n  f  w  b   k   t   e   s   s   w   w   p   w   o   e
## 6  e  x  y  y  t  a  f  c  b   n   e   c   s   s   w   w   p   w   o   p
##   V21 V22 V23
## 1   k   s   u
## 2   n   n   g
## 3   n   n   m
## 4   k   s   u
## 5   n   a   g
## 6   k   n   g

Task 2 - Create subset

# Keep the class column (1) plus seven attribute columns, chosen by position.
# drop = FALSE guarantees the result stays a data frame.
mushSub <- mushroom[, c(1, 4, 6, 14, 15, 21, 22, 23), drop = FALSE]
head(mushSub, n = 6L)
##   V1 V4 V6 V14 V15 V21 V22 V23
## 1  p  n  p   s   w   k   s   u
## 2  e  y  a   s   w   n   n   g
## 3  e  w  l   s   w   n   n   m
## 4  p  w  p   s   w   k   s   u
## 5  e  g  n   s   w   n   a   g
## 6  e  y  a   s   w   k   n   g

Task 3 - Add column names

# Give the eight retained columns descriptive names, in column order.
names(mushSub) <- c(
  "class",
  "capColor",
  "odor",
  "stalkSurfaceBelowRing",
  "stalkColorAboveRing",
  "sporePrintColor",
  "population",
  "habitat"
)
head(mushSub, n = 6L)
##   class capColor odor stalkSurfaceBelowRing stalkColorAboveRing
## 1     p        n    p                     s                   w
## 2     e        y    a                     s                   w
## 3     e        w    l                     s                   w
## 4     p        w    p                     s                   w
## 5     e        g    n                     s                   w
## 6     e        y    a                     s                   w
##   sporePrintColor population habitat
## 1               k          s       u
## 2               n          n       g
## 3               n          n       m
## 4               k          s       u
## 5               n          a       g
## 6               k          n       g

Task 4 - Replace abbreviations

# Decode the single-letter codes in every column of mushSub into readable
# labels. Each column has its own code -> label lookup table, following the
# attribute list in the data dictionary (agaricus-lepiota.names).

# Cap-colour codes; also the basis of the stalk-colour table below.
capColorLabels <- c(
  n = "brown", b = "buff", c = "cinnamon", g = "gray", r = "green",
  p = "pink", u = "purple", e = "red", w = "white", y = "yellow"
)

codeLabels <- list(
  class = c(p = "poisonous", e = "edible"),
  capColor = capColorLabels,
  odor = c(
    a = "almond", l = "anise", c = "creosote", y = "fishy", f = "foul",
    m = "musty", n = "none", p = "pungent", s = "spicy"
  ),
  stalkSurfaceBelowRing = c(
    f = "fibrous", y = "scaly", k = "silky", s = "smooth"
  ),
  # Stalk colours were previously decoded with the cap-colour table only,
  # which left the stalk-specific code "o" (orange) untranslated. The
  # cap-colour entries are all retained so existing results are unchanged.
  stalkColorAboveRing = c(capColorLabels, o = "orange"),
  sporePrintColor = c(
    k = "black", n = "brown", b = "buff", h = "chocolate", r = "green",
    o = "orange", u = "purple", w = "white", y = "yellow"
  ),
  population = c(
    a = "abundant", c = "clustered", n = "numerous", s = "scattered",
    v = "several", y = "solitary"
  ),
  habitat = c(
    g = "grasses", l = "leaves", m = "meadows", p = "paths", u = "urban",
    w = "waste", d = "woods"
  )
)

# Translate each listed column with its own table. Values that are not a
# known code (including NA) are left exactly as they were.
mushSub[names(codeLabels)] <- Map(
  function(codes, labels) {
    known <- codes %in% names(labels)
    codes[known] <- unname(labels[codes[known]])
    codes
  },
  mushSub[names(codeLabels)],
  codeLabels
)

head(mushSub)
##       class capColor    odor stalkSurfaceBelowRing stalkColorAboveRing
## 1 poisonous    brown pungent                smooth               white
## 2    edible   yellow  almond                smooth               white
## 3    edible    white   anise                smooth               white
## 4 poisonous    white pungent                smooth               white
## 5    edible     gray    none                smooth               white
## 6    edible   yellow  almond                smooth               white
##   sporePrintColor population habitat
## 1           black  scattered   urban
## 2           brown   numerous grasses
## 3           brown   numerous meadows
## 4           black  scattered   urban
## 5           brown   abundant grasses
## 6           black   numerous grasses

Further questions

I chose the variables based on the data dictionary’s rules for identifying poisonous mushrooms. I’ll further subset the data set to explore those conditions and find the edible mushrooms using the negations of those rules.

The poisonous rules (disjunctive)

  1. odor=NOT(almond.OR.anise.OR.none)
# Rule p1: rows whose odor is anything other than almond, anise or none.
p1 <- mushSub[!(mushSub$odor %in% c("almond", "anise", "none")), ]
# Cross-tabulate the matching odors against class.
table(p1[["odor"]], p1[["class"]])
##           
##            poisonous
##   creosote       192
##   fishy          576
##   foul          2160
##   musty           36
##   pungent        256
##   spicy          576
  2. spore-print-color=green
# Rule p2: rows with a green spore print. which() keeps only the rows where
# the comparison is TRUE, so any NA comparisons are dropped.
p2 <- mushSub[which(mushSub$sporePrintColor == "green"), ]
table(p2[["sporePrintColor"]], p2[["class"]])
##        
##         poisonous
##   green        72
  3. odor=none.AND.stalk-surface-below-ring=scaly.AND.(stalk-color-above-ring=NOT.brown)
# Rule p3: odorless rows with a scaly stalk surface below the ring and a
# stalk colour above the ring that is not brown. which() keeps only rows
# where the whole condition is TRUE.
p3 <- mushSub[
  which(
    mushSub$odor == "none" &
      mushSub$stalkSurfaceBelowRing == "scaly" &
      mushSub$stalkColorAboveRing != "brown"
  ),
]
table(p3[["stalkColorAboveRing"]], p3[["class"]])
##         
##          poisonous
##   white         32
##   yellow         8
  4. habitat=leaves.AND.cap-color=white
# Rule p4: white-capped rows whose habitat is leaves. which() keeps only
# rows where the condition is TRUE.
p4 <- mushSub[
  which(mushSub$habitat == "leaves" & mushSub$capColor == "white"),
]
# Three-way table: habitat by cap colour, one slice per class.
table(p4[["habitat"]], p4[["capColor"]], p4[["class"]])
## , ,  = poisonous
## 
##         
##          white
##   leaves     8

4a. population=clustered.AND.cap_color=white

# Rule p4a: white-capped rows whose population is clustered. which() keeps
# only rows where the condition is TRUE.
p4a <- mushSub[
  which(mushSub$population == "clustered" & mushSub$capColor == "white"),
]
# Three-way table: population by cap colour, one slice per class.
table(p4a[["population"]], p4a[["capColor"]], p4a[["class"]])
## , ,  = poisonous
## 
##            
##             white
##   clustered     8

Finding the edible mushrooms

By taking the negation of all the poisonous rules together, I can discover the edible mushrooms within the data set. I can also identify how many poisonous mushrooms come up in this subset and calculate the accuracy of the rules. Note that rule p3 is removed from the logical statement as it conflicts with rule p1.

  1. Using rule p4:
# Candidate edible rows: those that fail rules p1, p2 and p4. which() keeps
# only rows where the combined condition is TRUE.
# The negation of p3 is deliberately left out of the filter; it would read:
#   (odor != "none" | stalkSurfaceBelowRing != "scaly" |
#      stalkColorAboveRing == "brown")
# NOTE(review): this term is logically compatible with the p1 term, so
# confirm whether it really needs to stay disabled.
e1 <- mushSub[
  which(
    mushSub$odor %in% c("almond", "anise", "none") &  # not p1
      mushSub$sporePrintColor != "green" &  # not p2
      !(mushSub$habitat == "leaves" & mushSub$capColor == "white")  # not p4
  ),
]
# Count how many of the remaining rows fall in each class.
table(e1[["class"]])
## 
##    edible poisonous 
##      4208        40

With 40 poisonous mushrooms in the subset, the rules are 99.51% accurate when using p4.

  2. Using rule p4a:
# Same filter as the p4 variant, but with rule p4a in place of p4: keep rows
# that fail rules p1, p2 and p4a. This overwrites the earlier e1.
# The negation of p3 is deliberately left out of the filter; it would read:
#   (odor != "none" | stalkSurfaceBelowRing != "scaly" |
#      stalkColorAboveRing == "brown")
e1 <- mushSub[
  which(
    mushSub$odor %in% c("almond", "anise", "none") &  # not p1
      mushSub$sporePrintColor != "green" &  # not p2
      !(mushSub$population == "clustered" & mushSub$capColor == "white")  # not p4a
  ),
]
# Count how many of the remaining rows fall in each class.
table(e1[["class"]])
## 
##    edible poisonous 
##      4208        40

Substituting rule p4a for p4 also results in 40 poisonous mushrooms and the same accuracy rate of 99.51%.