Mushroom Data Set: This data set is sourced from the UCI repository of machine learning databases. Hypothetical samples corresponding to 23 species of gilled mushrooms in the Agaricus and Lepiota family are categorized as either poisonous or edible based on physical attributes. More information is available at the Mushroom Data Set repository link.
The tasks below are performed using R and RStudio for this assignment. The R Markdown (.Rmd) file is to be published to rpubs.com and saved in a GitHub repository.
This step loads the data set into R from its GitHub location.
library (readr)
# Number of rows shown in each head()/tail() preview of the data.
count <- 3
# Location of the raw, header-less, comma-separated mushroom data file.
dataUrl <- "https://raw.githubusercontent.com/rnivas2028/MSDS/Data607/Assignment1/DataSet/agaricus-lepiota.data"
# Every field is a one-letter category code, so all columns are read as
# character; this keeps read.csv() from guessing another type for any column.
dataSet <- read.csv(
  dataUrl,
  header = FALSE,
  sep = ",",
  colClasses = "character",
  stringsAsFactors = FALSE
)
# Let's find how many rows and columns are in the original data set.
# Size of the raw data set, reported as (rows, columns).
c(nrow(dataSet), ncol(dataSet))
## [1] 8124 23
# First `count` rows of the raw data.
head(dataSet, n = count)
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V20 V21
## 1 p x s n t p f c n k e e s s w w p w o p k
## 2 e x s y t a f c b k e c s s w w p w o p n
## 3 e b s w t l f c b n e c s s w w p w o p n
## V22 V23
## 1 s u
## 2 n g
## 3 n m
# Last `count` rows of the raw data.
tail(dataSet, n = count)
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V20 V21
## 8122 e f s n f n a c b n e ? s s o o p o o p b
## 8123 p k y n f y f c n b t ? s k w w p w o e w
## 8124 e x s n f n a c b y e ? s s o o p o o p o
## V22 V23
## 8122 c l
## 8123 v l
## 8124 c l
# Keep only the twelve columns used by the analysis, selected by position;
# all rows are retained.
dataSubSet <- dataSet[, c(1, 4, 6:10, 14, 15, 21:23), drop = FALSE]
# Size of the reduced data set, reported as (rows, columns).
c(nrow(dataSubSet), ncol(dataSubSet))
## [1] 8124 12
# First `count` rows of the reduced data.
head(dataSubSet, n = count)
## V1 V4 V6 V7 V8 V9 V10 V14 V15 V21 V22 V23
## 1 p n p f c n k s w k s u
## 2 e y a f c b k s w n n g
## 3 e w l f c b n s w n n m
# Last `count` rows of the reduced data.
tail(dataSubSet, n = count)
## V1 V4 V6 V7 V8 V9 V10 V14 V15 V21 V22 V23
## 8122 e n n a c b n s o b c l
## 8123 p n y f c n b k w w v l
## 8124 e n n a c b y s o o c l
# Replace the positional V* names with descriptive attribute names, listed in
# the same order as the retained columns.
names(dataSubSet) <- c(
  "classes", "capColor", "odor",
  "gillAttachment", "gillSpacing", "gillSize", "gillColor",
  "stalkSurfaceBelowRing", "stalkColorAboveRing",
  "sporePrintColor", "population", "habitat"
)
# First six rows under the new column names.
head(dataSubSet, n = 6)
## classes capColor odor gillAttachment gillSpacing gillSize gillColor
## 1 p n p f c n k
## 2 e y a f c b k
## 3 e w l f c b n
## 4 p w p f c n n
## 5 e g n f w b k
## 6 e y a f c b n
## stalkSurfaceBelowRing stalkColorAboveRing sporePrintColor population habitat
## 1 s w k s u
## 2 s w n n g
## 3 s w n n m
## 4 s w k s u
## 5 s w n a g
## 6 s w k n g
# Lookup tables that translate the one-letter codes into readable labels, one
# named vector per decoded column (names are codes, values are labels). The
# gill* columns have no table and keep their letter codes.
codeLabels <- list(
  classes = c(p = "poisonous", e = "edible"),
  capColor = c(
    n = "brown", b = "buff", c = "cinnamon", g = "gray", r = "green",
    p = "pink", u = "purple", e = "red", w = "white", y = "yellow"
  ),
  odor = c(
    a = "almond", l = "anise", c = "creosote", y = "fishy", f = "foul",
    m = "musty", n = "none", p = "pungent", s = "spicy"
  ),
  stalkSurfaceBelowRing = c(
    f = "fibrous", y = "scaly", k = "silky", s = "smooth"
  ),
  # "o" (orange) occurs in this column of the data (e.g. rows 8122 and 8124)
  # and previously had no mapping, so those rows were left as the bare code.
  stalkColorAboveRing = c(
    n = "brown", b = "buff", c = "cinnamon", g = "gray", r = "green",
    o = "orange", p = "pink", u = "purple", e = "red", w = "white",
    y = "yellow"
  ),
  sporePrintColor = c(
    k = "black", n = "brown", b = "buff", h = "chocolate", r = "green",
    o = "orange", u = "purple", w = "white", y = "yellow"
  ),
  population = c(
    a = "abundant", c = "clustered", n = "numerous", s = "scattered",
    v = "several", y = "solitary"
  ),
  habitat = c(
    g = "grasses", l = "leaves", m = "meadows", p = "paths", u = "urban",
    w = "waste", d = "woods"
  )
)

# Replace every code found in `labels` with its label. Values without an
# entry (including NA) are returned unchanged, so applying this twice gives
# the same result as applying it once.
decodeColumn <- function(codes, labels) {
  known <- codes %in% names(labels)
  codes[known] <- unname(labels[codes[known]])
  codes
}

# Decode each listed column with its own lookup table.
dataSubSet[names(codeLabels)] <- Map(
  decodeColumn,
  dataSubSet[names(codeLabels)],
  codeLabels
)

# First six rows after decoding.
head(dataSubSet, n = 6)
## classes capColor odor gillAttachment gillSpacing gillSize gillColor
## 1 poisonous brown pungent f c n k
## 2 edible yellow almond f c b k
## 3 edible white anise f c b n
## 4 poisonous white pungent f c n n
## 5 edible gray none f w b k
## 6 edible yellow almond f c b n
## stalkSurfaceBelowRing stalkColorAboveRing sporePrintColor population habitat
## 1 smooth white black scattered urban
## 2 smooth white brown numerous grasses
## 3 smooth white brown numerous meadows
## 4 smooth white black scattered urban
## 5 smooth white brown abundant grasses
## 6 smooth white black numerous grasses
Disjunctive rules for poisonous mushrooms, from most general to most specific:
P_1) odor=NOT(almond.OR.anise.OR.none)
# Rule P_1: rows whose odor is anything other than almond, anise or none.
benignOdors <- c("almond", "anise", "none")
p1_ordorDataSet <- dataSubSet[!(dataSubSet$odor %in% benignOdors), ]
nrow(p1_ordorDataSet)
## [1] 3796
P_2) spore-print-color=green
# Rule P_2: rows with a green spore print. which() drops rows where the
# comparison is NA, matching how subset() treats them.
p2_sporePrintColorDataSet <- dataSubSet[which(dataSubSet$sporePrintColor == "green"), ]
nrow(p2_sporePrintColorDataSet)
## [1] 72
P_3) odor=none.AND.stalk-surface-below-ring=scaly.AND.(stalk-color-above-ring=NOT.brown)
# Rule P_3: odorless rows with a scaly stalk surface below the ring and a
# stalk colour above the ring that is not brown. which() drops rows where the
# combined condition is NA, matching how subset() treats them.
matchesRuleP3 <- with(
  dataSubSet,
  odor == "none" &
    stalkSurfaceBelowRing == "scaly" &
    stalkColorAboveRing != "brown"
)
p3_ordorDataSet <- dataSubSet[which(matchesRuleP3), ]
nrow(p3_ordorDataSet)
## [1] 40
P_4) habitat=leaves.AND.cap-color=white, or alternatively P_4') population=clustered.AND.cap-color=white
# Both forms of rule P_4 require a white cap; compute that mask once.
hasWhiteCap <- dataSubSet$capColor == "white"
# Rule P_4, first form: white-capped rows whose habitat is leaves.
p4a_habitatDataSet <- dataSubSet[which(dataSubSet$habitat == "leaves" & hasWhiteCap), ]
nrow(p4a_habitatDataSet)
## [1] 8
# Rule P_4, second form: white-capped rows whose population is clustered.
p4a_population <- dataSubSet[which(dataSubSet$population == "clustered" & hasWhiteCap), ]
nrow(p4a_population)
## [1] 8
Below are the findings after applying the disjunctive rules for poisonous mushrooms to the data set.
# Count the mushrooms flagged by at least one of the rules P_1 to P_4.
# Each rule subset keeps the row names of dataSubSet, so the distinct row
# names across the subsets identify the flagged mushrooms. Adding the
# per-rule row counts instead would count a mushroom once for every rule it
# matches.
ruleDataSets <- list(
  p1_ordorDataSet,
  p2_sporePrintColorDataSet,
  p3_ordorDataSet,
  p4a_habitatDataSet
)
flaggedRowNames <- unique(unlist(lapply(ruleDataSets, rownames)))
poisonouMushCunt <- length(flaggedRowNames)
print(poisonouMushCunt)
## [1] 3916
# Split the data by the recorded class label and count each group, for
# comparison with the rule-based count.
recordedClass <- dataSubSet$classes
edibleDataSubSet <- dataSubSet[which(recordedClass == "edible"), ]
nrow(edibleDataSubSet)
## [1] 4208
poisonousDataSubSet <- dataSubSet[which(recordedClass == "poisonous"), ]
nrow(poisonousDataSubSet)
## [1] 3916
The count of poisonous mushrooms identified by these rules is the same as the count of poisonous mushrooms in the original data set.
Percentage of poisonous mushrooms is:
# Share of all mushrooms flagged as poisonous, expressed as a percentage
# (the fraction is computed first, then scaled by 100).
poisonousFraction <- poisonouMushCunt / nrow(dataSubSet)
pctPoisnousMushrooms <- poisonousFraction * 100
print(pctPoisnousMushrooms)
## [1] 48.20286
# The remaining mushrooms are treated as edible.
pctEdibleMushrooms <- 100 - pctPoisnousMushrooms
print(pctEdibleMushrooms)
## [1] 51.79714