Reading in Mushroom Data from Internet
mush <- read.csv("https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data")
Subsetting Data and Renaming Columns
mush1 <- mush[,c(1, 2, 3, 6, 21,23)]
colnames(mush1) <- c("safety", "cap_shape", "cap_surface", "odor", "spore print color", "habitat")
Renaming Characeristics of Mushrooms
mush1$safety <-gsub ("e","edible",mush1$safety)
mush1$safety <-gsub ("p","poisonous",mush1$safety)
levels(mush1$cap_shape) <- list(bell = "b", conical ="c", convex = "x", flat = "f", knobbed ="k", sunken="s")
levels(mush1$cap_surface) <-list(fibrous = "f", grooves = "g", scaly = "y", smooth = "s")
mush1$odor <- gsub ("\\<a\\>", "almond", mush1$odor)
mush1$odor <- gsub ("\\<l\\>", "anise", mush1$odor)
mush1$odor <- gsub ("\\<c\\>", "creosote", mush1$odor)
mush1$odor <- gsub ("\\<y\\>", "fishy", mush1$odor)
mush1$odor <- gsub ("\\<f\\>", "foul", mush1$odor)
mush1$odor <- gsub ("\\<m\\>", "musty", mush1$odor)
mush1$odor <- gsub ("\\<n\\>", "none", mush1$odor)
mush1$odor <- gsub ("\\<p\\>", "pungent", mush1$odor)
mush1$odor <- gsub ("\\<s\\>", "spicy", mush1$odor)
mush1$`spore print color` <- gsub ("\\<k\\>", "black", mush1$`spore print color`)
mush1$`spore print color` <- gsub ("\\<n\\>", "brown", mush1$`spore print color`)
mush1$`spore print color` <- gsub ("\\<b\\>", "buff", mush1$`spore print color`)
mush1$`spore print color` <- gsub ("\\<h\\>", "chocolate", mush1$`spore print color`)
mush1$`spore print color` <- gsub ("\\<r\\>", "green", mush1$`spore print color`)
mush1$`spore print color` <- gsub ("\\<o\\>", "orange", mush1$`spore print color`)
mush1$`spore print color` <- gsub ("\\<u\\>", "purple", mush1$`spore print color`)
mush1$`spore print color` <- gsub ("\\<w\\>", "white", mush1$`spore print color`)
mush1$`spore print color` <- gsub ("\\<y\\>", "yellow", mush1$`spore print color`)
mush1$habitat <- gsub ("\\<g\\>", "grasses", mush1$habitat)
mush1$habitat <- gsub ("\\<l\\>", "leaves", mush1$habitat)
mush1$habitat <- gsub ("\\<m\\>", "meadows", mush1$habitat)
mush1$habitat <- gsub ("\\<p\\>", "paths", mush1$habitat)
mush1$habitat <- gsub ("\\<u\\>", "urban", mush1$habitat)
mush1$habitat <- gsub ("\\<w\\>", "waste", mush1$habitat)
mush1$habitat <- gsub ("\\<d\\>", "woods", mush1$habitat)
head(mush1)
## safety cap_shape cap_surface odor spore print color habitat
## 1 edible convex smooth almond brown grasses
## 2 edible bell smooth anise brown meadows
## 3 poisonous convex scaly pungent black urban
## 4 edible convex smooth none brown grasses
## 5 edible convex scaly almond black grasses
## 6 edible bell smooth almond black meadows
Data Summary
summary(mush1)
## safety cap_shape cap_surface odor
## Length:8123 bell : 452 fibrous:2320 Length:8123
## Class :character conical: 4 grooves: 4 Class :character
## Mode :character convex :3655 scaly :3244 Mode :character
## flat :3152 smooth :2555
## knobbed: 828
## sunken : 32
## spore print color habitat
## Length:8123 Length:8123
## Class :character Class :character
## Mode :character Mode :character
##
##
##
num_obs <-nrow(mush1)
Function to determine proportion of a particular factor in the data set
ratio <-function(col_num, category,num_obs){
num_category <- nrow(mush1[mush1[,col_num]==category,])
prob_category <- num_category/num_obs
return(prob_category)
}
Calculating the Probability of a mushroom being edible or poisonous
prob_edible <-ratio(1, "edible", num_obs)
prob_edible
## [1] 0.5180352
prob_poison <-ratio(1, "poisonous", num_obs)
prob_poison
## [1] 0.4819648
Function to Calculate Entropy
entropy <- function(prob){
en <- -1*prob*log2(prob)
return(en)
}
Calculating Entropy of Parent Set - Whether Musrhoom is edible or poisonous
entropy_parent <- entropy(prob_edible) + entropy(prob_poison)
entropy_parent
## [1] 0.9990613
Calculating Probability According to Cap Shape
prob_bell <-ratio(2, "bell", num_obs)
prob_conical <-ratio(2, "conical", num_obs)
prob_convex <-ratio(2, "convex", num_obs)
prob_flat <-ratio(2, "flat", num_obs)
prob_knobbed <-ratio(2, "knobbed", num_obs)
prob_sunken <-ratio(2, "sunken", num_obs)
bell_edible <- nrow(subset(mush1, safety=="edible" & cap_shape=="bell"))
bell_poison <- nrow(subset(mush1, safety=="poisonous" & cap_shape=="bell"))
bell_total <- nrow(subset(mush1, cap_shape=="bell"))
conical_edible <- nrow(subset(mush1, safety=="edible" & cap_shape=="conical"))
conical_poison <- nrow(subset(mush1, safety=="poisonous" & cap_shape=="conical"))
conical_total <- nrow(subset(mush1, cap_shape=="conical"))
conical_edible
## [1] 0
conical_poison
## [1] 4
conical_total
## [1] 4
convex_edible <- nrow(subset(mush1, safety=="edible" & cap_shape=="convex"))
convex_poison <- nrow(subset(mush1, safety=="poisonous" & cap_shape=="convex"))
convex_total <- nrow(subset(mush1, cap_shape=="convex"))
flat_edible <- nrow(subset(mush1, safety=="edible" & cap_shape=="flat"))
flat_poison <- nrow(subset(mush1, safety=="poisonous" & cap_shape=="flat"))
flat_total <- nrow(subset(mush1, cap_shape=="flat"))
knobbed_edible <- nrow(subset(mush1, safety=="edible" & cap_shape=="knobbed"))
knobbed_poison <- nrow(subset(mush1, safety=="poisonous" & cap_shape=="knobbed"))
knobbed_total <- nrow(subset(mush1, cap_shape=="knobbed"))
sunken_edible <- nrow(subset(mush1, safety=="edible" & cap_shape=="sunken"))
sunken_poison <- nrow(subset(mush1, safety=="poisonous" & cap_shape=="sunken"))
sunken_total <- nrow(subset(mush1, cap_shape=="sunken"))
sunken_edible
## [1] 32
sunken_poison
## [1] 0
sunken_total
## [1] 32
Calculating Entropy for Cap Shape
entropy_cap_shape <- entropy(bell_edible/bell_total) + entropy(bell_poison/bell_total) + entropy(convex_edible/convex_total) + entropy(convex_poison/convex_total) + entropy(flat_edible/flat_total) + entropy(flat_poison/flat_total) + entropy(knobbed_edible/knobbed_total) + entropy(knobbed_poison/knobbed_total)
entropy_cap_shape
## [1] 3.33413
Calculating Probability According to Odor
prob_almond <-ratio(4, "almond", num_obs)
prob_anise <-ratio(4, "anise", num_obs)
prob_creosote <-ratio(4, "creosote", num_obs)
prob_fishy <-ratio(4, "fishy", num_obs)
prob_foul <-ratio(4, "foul", num_obs)
prob_musty <-ratio(4, "musty", num_obs)
prob_none <-ratio(4, "none", num_obs)
prob_pungent <-ratio(4, "pungent", num_obs)
prob_spicy <-ratio(4, "spicy", num_obs)
almond_edible <- nrow(subset(mush1, safety=="edible" & odor=="almond"))
almond_poison <- nrow(subset(mush1, safety=="poisonous" & odor=="almond"))
almond_total <- nrow(subset(mush1, odor=="almond"))
anise_edible <- nrow(subset(mush1, safety=="edible" & odor=="anise"))
anise_poison <- nrow(subset(mush1, safety=="poisonous" & odor=="anise"))
anise_total <- nrow(subset(mush1, odor=="anise"))
creosote_edible <- nrow(subset(mush1, safety=="edible" & odor=="creosote"))
creosote_poison <- nrow(subset(mush1, safety=="poisonous" & odor=="creosote"))
creosote_total <- nrow(subset(mush1, odor=="creosote"))
fishy_edible <- nrow(subset(mush1, safety=="edible" & odor=="fishy"))
fishy_poison <- nrow(subset(mush1, safety=="poisonous" & odor=="fishy"))
fishy_total <- nrow(subset(mush1, odor=="fishy"))
foul_edible <- nrow(subset(mush1, safety=="edible" & odor=="foul"))
foul_poison <- nrow(subset(mush1, safety=="poisonous" & odor=="foul"))
foul_total <- nrow(subset(mush1, odor=="foul"))
musty_edible <- nrow(subset(mush1, safety=="edible" & odor=="musty"))
musty_poison <- nrow(subset(mush1, safety=="poisonous" & odor=="musty"))
musty_total <- nrow(subset(mush1, odor=="musty"))
none_edible <- nrow(subset(mush1, safety=="edible" & odor=="none"))
none_poison <- nrow(subset(mush1, safety=="poisonous" & odor=="none"))
none_total <- nrow(subset(mush1, odor=="none"))
pungent_edible <- nrow(subset(mush1, safety=="edible" & odor=="pungent"))
pungent_poison <- nrow(subset(mush1, safety=="poisonous" & odor=="pungent"))
pungent_total <- nrow(subset(mush1, odor=="pungent"))
spicy_edible <- nrow(subset(mush1, safety=="edible" & odor=="spicy"))
spicy_poison <- nrow(subset(mush1, safety=="poisonous" & odor=="spicy"))
spicy_total <- nrow(subset(mush1, odor=="spicy"))
Calculating Entropy for Odor
entropy_odor <- entropy(almond_edible/almond_total) + entropy(almond_poison/almond_total) + entropy(anise_edible/anise_total) + entropy(anise_poison/anise_total) + entropy(creosote_edible/creosote_total) + entropy(creosote_poison/creosote_total) + entropy(fishy_edible/fishy_total) + entropy(fishy_poison/fishy_total) + entropy(foul_edible/foul_total) + entropy(foul_poison/foul_total) + entropy(musty_edible/musty_total) + entropy(musty_poison/musty_total) + entropy(none_edible/none_total) + entropy(none_poison/none_total) + entropy(pungent_edible/pungent_total) + entropy(pungent_poison/pungent_total)+ entropy(spicy_edible/spicy_total) + entropy(spicy_poison/spicy_total)
entropy_odor
## [1] NaN
Calculating Entropy for Odor
entropy_odor <- entropy(almond_edible/almond_total)
+ entropy(almond_poison/almond_total)
## [1] NaN
+ entropy(anise_edible/anise_total)
## [1] 0
+ entropy(anise_poison/anise_total)
## [1] NaN
+ entropy(creosote_edible/creosote_total)
## [1] NaN
+ entropy(creosote_poison/creosote_total)
## [1] 0
+ entropy(fishy_edible/fishy_total)
## [1] NaN
+ entropy(fishy_poison/fishy_total)
## [1] 0
+ entropy(foul_edible/foul_total)
## [1] NaN
+ entropy(foul_poison/foul_total)
## [1] 0
+ entropy(musty_edible/musty_total)
## [1] NaN
+ entropy(musty_poison/musty_total)
## [1] 0
+ entropy(none_edible/none_total)
## [1] 0.04822709
+ entropy(none_poison/none_total)
## [1] 0.1659097
+ entropy(pungent_edible/pungent_total)
## [1] NaN
+ entropy(pungent_poison/pungent_total)
## [1] 0
+ entropy(spicy_edible/spicy_total)
## [1] NaN
+ entropy(spicy_poison/spicy_total)
## [1] 0
entropy_odor
## [1] 0
Calculating Entropy for Odor
entropy_odor <- entropy(none_edible/none_total) + entropy(none_poison/none_total)
entropy_odor
## [1] 0.2141368