# Load the raw mushroom records from the UCI repository. The file has no
# header row, so columns arrive as V1..V23; values are kept as character
# strings rather than factors so they can be recoded in place later.
mushroom <- read.csv(
  file = "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data",
  header = FALSE,
  sep = ",",
  stringsAsFactors = FALSE
)
# Preview the first six records.
head(mushroom, n = 6L)
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V20
## 1 p x s n t p f c n k e e s s w w p w o p
## 2 e x s y t a f c b k e c s s w w p w o p
## 3 e b s w t l f c b n e c s s w w p w o p
## 4 p x y w t p f c n n e e s s w w p w o p
## 5 e x s g f n f w b k t e s s w w p w o e
## 6 e x y y t a f c b n e c s s w w p w o p
## V21 V22 V23
## 1 k s u
## 2 n n g
## 3 n n m
## 4 k s u
## 5 n a g
## 6 k n g
# Keep only the eight columns used in the rest of the analysis, selected by
# position in the raw file (they are given descriptive names further down).
mushSub <- mushroom[, c(1, 4, 6, 14, 15, 21, 22, 23), drop = FALSE]
# Preview the reduced data frame.
head(mushSub, n = 6L)
## V1 V4 V6 V14 V15 V21 V22 V23
## 1 p n p s w k s u
## 2 e y a s w n n g
## 3 e w l s w n n m
## 4 p w p s w k s u
## 5 e g n s w n a g
## 6 e y a s w k n g
# Replace the positional V* column names with descriptive ones, in the same
# order as the columns selected into mushSub.
names(mushSub) <- c(
  "class",
  "capColor",
  "odor",
  "stalkSurfaceBelowRing",
  "stalkColorAboveRing",
  "sporePrintColor",
  "population",
  "habitat"
)
# Preview the renamed columns.
head(mushSub, n = 6L)
## class capColor odor stalkSurfaceBelowRing stalkColorAboveRing
## 1 p n p s w
## 2 e y a s w
## 3 e w l s w
## 4 p w p s w
## 5 e g n s w
## 6 e y a s w
## sporePrintColor population habitat
## 1 k s u
## 2 n n g
## 3 n n m
## 4 k s u
## 5 n a g
## 6 k n g
# Recode the single-letter codes in each mushSub column to descriptive labels.
#
# mushLabels maps a column name to a named character vector whose names are
# the raw codes and whose values are the labels. Any code without an entry is
# left unchanged, so an unexpected value stays visible instead of becoming NA.
mushLabels <- list(
  class = c(p = "poisonous", e = "edible"),
  capColor = c(
    n = "brown", b = "buff", c = "cinnamon", g = "gray", r = "green",
    p = "pink", u = "purple", e = "red", w = "white", y = "yellow"
  ),
  odor = c(
    a = "almond", l = "anise", c = "creosote", y = "fishy", f = "foul",
    m = "musty", n = "none", p = "pungent", s = "spicy"
  ),
  stalkSurfaceBelowRing = c(
    f = "fibrous", y = "scaly", k = "silky", s = "smooth"
  ),
  # "o" (orange) is added here: the UCI data dictionary lists it for stalk
  # colours, and without it those rows kept the raw letter "o".
  # "r" (green) and "u" (purple) are kept from the earlier mapping for
  # backward compatibility; they are harmless if the codes never occur.
  stalkColorAboveRing = c(
    n = "brown", b = "buff", c = "cinnamon", g = "gray", r = "green",
    o = "orange", p = "pink", u = "purple", e = "red", w = "white",
    y = "yellow"
  ),
  sporePrintColor = c(
    k = "black", n = "brown", b = "buff", h = "chocolate", r = "green",
    o = "orange", u = "purple", w = "white", y = "yellow"
  ),
  population = c(
    a = "abundant", c = "clustered", n = "numerous", s = "scattered",
    v = "several", y = "solitary"
  ),
  habitat = c(
    g = "grasses", l = "leaves", m = "meadows", p = "paths", u = "urban",
    w = "waste", d = "woods"
  )
)

# Apply each lookup to its column in one pass. Only positions whose code has
# an entry in the lookup are overwritten.
mushSub[names(mushLabels)] <- Map(
  function(codes, labels) {
    known <- codes %in% names(labels)
    codes[known] <- unname(labels[codes[known]])
    codes
  },
  mushSub[names(mushLabels)],
  mushLabels
)

# Preview the recoded data.
head(mushSub)
## class capColor odor stalkSurfaceBelowRing stalkColorAboveRing
## 1 poisonous brown pungent smooth white
## 2 edible yellow almond smooth white
## 3 edible white anise smooth white
## 4 poisonous white pungent smooth white
## 5 edible gray none smooth white
## 6 edible yellow almond smooth white
## sporePrintColor population habitat
## 1 black scattered urban
## 2 brown numerous grasses
## 3 brown numerous meadows
## 4 black scattered urban
## 5 brown abundant grasses
## 6 black numerous grasses
I chose the variables based on the data dictionary’s rules for identifying poisonous mushrooms. I’ll further subset the data set to explore those conditions and find the edible mushrooms using the negations of those rules.
# Rule p1: rows whose odor is anything other than almond, anise or none.
p1 <- mushSub[which(!is.element(mushSub$odor, c("almond", "anise", "none"))), ]
# Cross-tabulate the matching rows by odor and class.
table(p1[["odor"]], p1[["class"]])
##
## poisonous
## creosote 192
## fishy 576
## foul 2160
## musty 36
## pungent 256
## spicy 576
# Rule p2: rows with a green spore print.
p2 <- mushSub[which(mushSub$sporePrintColor == "green"), ]
# Cross-tabulate the matching rows by spore print colour and class.
table(p2[["sporePrintColor"]], p2[["class"]])
##
## poisonous
## green 72
# Rule p3: no odor, scaly stalk surface below the ring, and a stalk colour
# above the ring that is anything but brown.
p3 <- mushSub[which(
  mushSub$odor == "none" &
    mushSub$stalkSurfaceBelowRing == "scaly" &
    mushSub$stalkColorAboveRing != "brown"
), ]
# Cross-tabulate the matching rows by stalk colour and class.
table(p3[["stalkColorAboveRing"]], p3[["class"]])
##
## poisonous
## white 32
## yellow 8
# Rule p4: white-capped mushrooms whose habitat is leaves.
p4 <- mushSub[which(
  mushSub$habitat == "leaves" & mushSub$capColor == "white"
), ]
# Three-way table of habitat by cap colour by class.
table(p4[["habitat"]], p4[["capColor"]], p4[["class"]])
## , , = poisonous
##
##
## white
## leaves 8
Rule 4a, an alternative to rule p4: population = clustered AND cap color = white.
# Rule p4a: white-capped mushrooms with a clustered population.
p4a <- mushSub[which(
  mushSub$population == "clustered" & mushSub$capColor == "white"
), ]
# Three-way table of population by cap colour by class.
table(p4a[["population"]], p4a[["capColor"]], p4a[["class"]])
## , , = poisonous
##
##
## white
## clustered 8
By taking the negation of all the poisonous rules together, I can discover the edible mushrooms within the data set. I can also identify how many poisonous mushrooms come up in this subset and calculate the accuracy of the rules. Note that rule p3 is removed from the logical statement as it conflicts with rule p1.
# Candidate edible mushrooms: rows that match none of rules p1, p2 and p4.
# The negation of rule p3 is left disabled, exactly as before.
# NOTE(review): the disabled p3 clause only drops odor == "none" rows that are
# scaly and non-brown, so it looks logically compatible with the p1 clause --
# confirm whether it really needs to stay out.
e1 <- mushSub[which(with(
  mushSub,
  odor %in% c("almond", "anise", "none") & # not p1
    !(sporePrintColor == "green") & # not p2
    # (odor != "none" | stalkSurfaceBelowRing != "scaly" |
    #   stalkColorAboveRing == "brown") & # not p3 (disabled)
    !(habitat == "leaves" & capColor == "white") # not p4
)), ]
# Count how many rows of each class remain.
table(e1[["class"]])
##
## edible poisonous
## 4208 40
With 40 poisonous mushrooms in the subset, the rules are 99.51% accurate when using p4.
# Same edible-candidate filter, with rule p4a used in place of p4.
# The negation of rule p3 is left disabled, exactly as before.
# NOTE(review): the disabled p3 clause only drops odor == "none" rows that are
# scaly and non-brown, so it looks logically compatible with the p1 clause --
# confirm whether it really needs to stay out.
e1 <- mushSub[which(with(
  mushSub,
  odor %in% c("almond", "anise", "none") & # not p1
    !(sporePrintColor == "green") & # not p2
    # (odor != "none" | stalkSurfaceBelowRing != "scaly" |
    #   stalkColorAboveRing == "brown") & # not p3 (disabled)
    !(population == "clustered" & capColor == "white") # not p4a
)), ]
# Count how many rows of each class remain.
table(e1[["class"]])
##
## edible poisonous
## 4208 40
Substituting rule p4a for p4 also results in 40 poisonous mushrooms and the same accuracy rate of 99.51%.