Overview

Mushroom Data Set. This data set is sourced from the UCI repository of machine learning databases. Hypothetical samples corresponding to 23 species of gilled mushrooms in the Agaricus and Lepiota family are categorized as either poisonous or edible based on physical attributes. More information is available at the Mushroom Data Set Repository link.

Task Summary

The tasks below are performed using R and RStudio for this assignment. The R Markdown (.Rmd) file is published to rpubs.com and saved in a GitHub repository.

  • Source mushroom dataset
  • Create Data frame of selected columns
  • Add column names
  • Replace abbreviations
  • Logical rules for mushroom data sets

Source mushroom dataset

This step loads the data set into R from its GitHub location.

# Attach readr as the original chunk does; the read below uses base read.csv().
library(readr)

# Number of rows shown by each head()/tail() preview in this report.
count <- 3

# Location of the raw mushroom data file on GitHub.
dataUrl <- "https://raw.githubusercontent.com/rnivas2028/MSDS/Data607/Assignment1/DataSet/agaricus-lepiota.data"

# The file has no header row, so columns get the default names V1, V2, ...
# read.csv() already splits on commas; strings stay character, not factor.
dataSet <- read.csv(
  file = dataUrl,
  header = FALSE,
  stringsAsFactors = FALSE
)

Let's find how many rows and columns are in the original dataset.

# Size of the raw data as (rows, columns).
c(nrow(dataSet), ncol(dataSet))
## [1] 8124   23
# First `count` rows of the raw data.
head(dataSet, n = count)
##   V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V20 V21
## 1  p  x  s  n  t  p  f  c  n   k   e   e   s   s   w   w   p   w   o   p   k
## 2  e  x  s  y  t  a  f  c  b   k   e   c   s   s   w   w   p   w   o   p   n
## 3  e  b  s  w  t  l  f  c  b   n   e   c   s   s   w   w   p   w   o   p   n
##   V22 V23
## 1   s   u
## 2   n   g
## 3   n   m
# Last `count` rows of the raw data.
tail(dataSet, n = count)
##      V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V20 V21
## 8122  e  f  s  n  f  n  a  c  b   n   e   ?   s   s   o   o   p   o   o   p   b
## 8123  p  k  y  n  f  y  f  c  n   b   t   ?   s   k   w   w   p   w   o   e   w
## 8124  e  x  s  n  f  n  a  c  b   y   e   ?   s   s   o   o   p   o   o   p   o
##      V22 V23
## 8122   c   l
## 8123   v   l
## 8124   c   l

Create sub dataset

# Keep twelve of the 23 raw columns, picked by position.
dataSubSet <- dataSet[, c(1, 4, 6, 7, 8, 9, 10, 14, 15, 21, 22, 23)]
# Size of the reduced data as (rows, columns).
c(nrow(dataSubSet), ncol(dataSubSet))
## [1] 8124   12
# First `count` rows of the reduced data.
head(dataSubSet, n = count)
##   V1 V4 V6 V7 V8 V9 V10 V14 V15 V21 V22 V23
## 1  p  n  p  f  c  n   k   s   w   k   s   u
## 2  e  y  a  f  c  b   k   s   w   n   n   g
## 3  e  w  l  f  c  b   n   s   w   n   n   m
# Last `count` rows of the reduced data.
tail(dataSubSet, n = count)
##      V1 V4 V6 V7 V8 V9 V10 V14 V15 V21 V22 V23
## 8122  e  n  n  a  c  b   n   s   o   b   c   l
## 8123  p  n  y  f  c  n   b   k   w   w   v   l
## 8124  e  n  n  a  c  b   y   s   o   o   c   l

Add column names

# Replace the default V* names with descriptive ones, listed in the same
# order as the columns that were selected above.
names(dataSubSet) <- c(
  "classes",
  "capColor",
  "odor",
  "gillAttachment",
  "gillSpacing",
  "gillSize",
  "gillColor",
  "stalkSurfaceBelowRing",
  "stalkColorAboveRing",
  "sporePrintColor",
  "population",
  "habitat"
)
# Show the first six rows under the new names.
head(dataSubSet, n = 6L)
##   classes capColor odor gillAttachment gillSpacing gillSize gillColor
## 1       p        n    p              f           c        n         k
## 2       e        y    a              f           c        b         k
## 3       e        w    l              f           c        b         n
## 4       p        w    p              f           c        n         n
## 5       e        g    n              f           w        b         k
## 6       e        y    a              f           c        b         n
##   stalkSurfaceBelowRing stalkColorAboveRing sporePrintColor population habitat
## 1                     s                   w               k          s       u
## 2                     s                   w               n          n       g
## 3                     s                   w               n          n       m
## 4                     s                   w               k          s       u
## 5                     s                   w               n          a       g
## 6                     s                   w               k          n       g

Replace abbreviations

# Expand the single-letter codes into readable labels.
#
# Each entry of `codeLabels` is a lookup table for one column, written as
# label = "code". A column is translated in a single pass, and any value that
# has no entry in its table is left untouched.
#
# Fix: stalkColorAboveRing now translates "o" to "orange". That code occurs in
# the data (see the last rows of the subset preview) but was never replaced.
# The "r" (green) and "u" (purple) entries were dropped from that table because
# they are not levels of stalk-color-above-ring in the UCI attribute list.
#
# The gill* columns are deliberately left as raw codes, as before, so the
# printed preview below stays unchanged.
codeLabels <- list(
  classes = c(poisonous = "p", edible = "e"),
  capColor = c(
    brown = "n", buff = "b", cinnamon = "c", gray = "g", green = "r",
    pink = "p", purple = "u", red = "e", white = "w", yellow = "y"
  ),
  odor = c(
    almond = "a", anise = "l", creosote = "c", fishy = "y", foul = "f",
    musty = "m", none = "n", pungent = "p", spicy = "s"
  ),
  stalkSurfaceBelowRing = c(
    fibrous = "f", scaly = "y", silky = "k", smooth = "s"
  ),
  stalkColorAboveRing = c(
    brown = "n", buff = "b", cinnamon = "c", gray = "g", orange = "o",
    pink = "p", red = "e", white = "w", yellow = "y"
  ),
  sporePrintColor = c(
    black = "k", brown = "n", buff = "b", chocolate = "h", green = "r",
    orange = "o", purple = "u", white = "w", yellow = "y"
  ),
  population = c(
    abundant = "a", clustered = "c", numerous = "n", scattered = "s",
    several = "v", solitary = "y"
  ),
  habitat = c(
    grasses = "g", leaves = "l", meadows = "m", paths = "p", urban = "u",
    waste = "w", woods = "d"
  )
)

for (columnName in names(codeLabels)) {
  lookup <- codeLabels[[columnName]]
  # Position of each cell's code in the table; NA when the code is unknown.
  position <- match(dataSubSet[[columnName]], lookup)
  known <- !is.na(position)
  dataSubSet[[columnName]][known] <- names(lookup)[position[known]]
}
head(dataSubSet)
##     classes capColor    odor gillAttachment gillSpacing gillSize gillColor
## 1 poisonous    brown pungent              f           c        n         k
## 2    edible   yellow  almond              f           c        b         k
## 3    edible    white   anise              f           c        b         n
## 4 poisonous    white pungent              f           c        n         n
## 5    edible     gray    none              f           w        b         k
## 6    edible   yellow  almond              f           c        b         n
##   stalkSurfaceBelowRing stalkColorAboveRing sporePrintColor population habitat
## 1                smooth               white           black  scattered   urban
## 2                smooth               white           brown   numerous grasses
## 3                smooth               white           brown   numerous meadows
## 4                smooth               white           black  scattered   urban
## 5                smooth               white           brown   abundant grasses
## 6                smooth               white           black   numerous grasses

Logical rules for mushroom data sets

Disjunctive rules for poisonous mushrooms, from most general to most specific:

P_1) odor=NOT(almond.OR.anise.OR.none)

# Rule P_1: keep rows whose odor is anything other than almond, anise or none.
excludedOdors <- c("almond", "anise", "none")
p1_ordorDataSet <- dataSubSet[!(dataSubSet$odor %in% excludedOdors), ]
nrow(p1_ordorDataSet)
## [1] 3796

P_2) spore-print-color=green

# Rule P_2: keep rows with a green spore print.
# which() keeps only TRUE matches, the same way subset() ignores NA conditions.
hasGreenSporePrint <- dataSubSet$sporePrintColor == "green"
p2_sporePrintColorDataSet <- dataSubSet[which(hasGreenSporePrint), ]
nrow(p2_sporePrintColorDataSet)
## [1] 72

P_3) odor=none.AND.stalk-surface-below-ring=scaly.AND.(stalk-color-above-ring=NOT.brown)

# Rule P_3: no odor, scaly stalk surface below the ring, and a stalk color
# above the ring that is anything but brown. All three must hold.
hasNoOdor <- dataSubSet$odor == "none"
isScalyBelowRing <- dataSubSet$stalkSurfaceBelowRing == "scaly"
isNotBrownAboveRing <- dataSubSet$stalkColorAboveRing != "brown"
# which() keeps only TRUE matches, the same way subset() ignores NA conditions.
p3_ordorDataSet <- dataSubSet[
  which(hasNoOdor & isScalyBelowRing & isNotBrownAboveRing),
]
nrow(p3_ordorDataSet)
## [1] 40

P_4) habitat=leaves.AND.cap-color=white, or alternatively P_4) population=clustered.AND.cap-color=white

# Both forms of rule P_4 require a white cap; compute that condition once.
hasWhiteCap <- dataSubSet$capColor == "white"
# First form: white cap and found among leaves.
p4a_habitatDataSet <- dataSubSet[
  which(dataSubSet$habitat == "leaves" & hasWhiteCap),
]
nrow(p4a_habitatDataSet)
## [1] 8
# Second form: white cap and clustered population.
p4a_population <- dataSubSet[
  which(dataSubSet$population == "clustered" & hasWhiteCap),
]
nrow(p4a_population)
## [1] 8

Conclusions

Below are the findings after applying the disjunctive rules for poisonous mushrooms to the data set.

# The rules are disjunctive (OR-ed together), so a mushroom matched by more
# than one rule must be counted only once. Adding the per-rule row counts is
# correct only if the rules never overlap; counting distinct rows is correct
# either way. Each rule subset keeps the row names of dataSubSet, so the row
# names identify the matched mushrooms.
flaggedRows <- unique(c(
  rownames(p1_ordorDataSet),
  rownames(p2_sporePrintColorDataSet),
  rownames(p3_ordorDataSet),
  rownames(p4a_habitatDataSet)
))
poisonouMushCunt <- length(flaggedRows)
print(poisonouMushCunt)
## [1] 3916
# Reference counts taken directly from the class label.
edibleDataSubSet <- subset(dataSubSet, classes == "edible")
nrow(edibleDataSubSet)
## [1] 4208
poisonousDataSubSet <- subset(dataSubSet, classes == "poisonous")
nrow(poisonousDataSubSet)
## [1] 3916

The count of poisonous mushrooms obtained from these rules is the same as the count of poisonous mushrooms in the original data set.

Percentage of poisonous mushrooms is:

# Share of all mushrooms that the rules flag as poisonous, in percent.
totalMushrooms <- nrow(dataSubSet)
pctPoisnousMushrooms <- (poisonouMushCunt / totalMushrooms) * 100
print(pctPoisnousMushrooms)
## [1] 48.20286
# The remaining share is the edible percentage.
pctEdibleMushrooms <- 100 - pctPoisnousMushrooms
print(pctEdibleMushrooms)
## [1] 51.79714