Reading in Mushroom Data from Internet

# Download the UCI mushroom data set.
# The file has no header row, so header = FALSE keeps the first record as data
# instead of silently turning it into column names (which drops one row).
# stringsAsFactors = TRUE is spelled out because the factor-based recoding
# further down relies on it and the default changed to FALSE in R 4.0.0.
mush <- read.csv(
  "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data",
  header = FALSE,
  stringsAsFactors = TRUE
)

Subsetting Data and Renaming Columns

# Keep only the columns analysed below (selected by position in the raw file)
# and give them descriptive names in one step.
mush1 <- setNames(
  mush[, c(1, 2, 3, 6, 21, 23)],
  c(
    "safety", "cap_shape", "cap_surface",
    "odor", "spore print color", "habitat"
  )
)

Renaming Characteristics of Mushrooms

# Expand the one-letter class codes into readable labels.
# "e" is expanded first; "edible" contains no "p", so the second (outer) pass
# only rewrites the poisonous entries.
mush1$safety <- gsub("p", "poisonous", gsub("e", "edible", mush1$safety))

# Recode cap shape and cap surface from their one-letter codes to labelled
# factors. factor() is used instead of `levels<-` so the recoding also works
# when the columns arrive as character vectors (the read.csv() default since
# R 4.0.0). Level order follows the order listed here, and codes that are not
# listed become NA.
mush1$cap_shape <- factor(
  mush1$cap_shape,
  levels = c("b", "c", "x", "f", "k", "s"),
  labels = c("bell", "conical", "convex", "flat", "knobbed", "sunken")
)

mush1$cap_surface <- factor(
  mush1$cap_surface,
  levels = c("f", "g", "y", "s"),
  labels = c("fibrous", "grooves", "scaly", "smooth")
)

# Replace one-letter codes with readable labels.
#
# values: vector of codes to rewrite.
# labels: named character vector; each name is a code, each value its label.
#
# Every code is matched as a whole word (\< ... \>) and the substitutions are
# applied one after another in the order given, so a label produced by an
# earlier substitution is never rewritten by a later single-letter pattern.
expand_codes <- function(values, labels) {
  for (code in names(labels)) {
    values <- gsub(paste0("\\<", code, "\\>"), labels[[code]], values)
  }
  values
}

mush1$odor <- expand_codes(
  mush1$odor,
  c(
    a = "almond", l = "anise", c = "creosote", y = "fishy", f = "foul",
    m = "musty", n = "none", p = "pungent", s = "spicy"
  )
)

mush1$`spore print color` <- expand_codes(
  mush1$`spore print color`,
  c(
    k = "black", n = "brown", b = "buff", h = "chocolate", r = "green",
    o = "orange", u = "purple", w = "white", y = "yellow"
  )
)

mush1$habitat <- expand_codes(
  mush1$habitat,
  c(
    g = "grasses", l = "leaves", m = "meadows", p = "paths",
    u = "urban", w = "waste", d = "woods"
  )
)

# Show the first rows of the recoded data.
head(mush1)
##      safety cap_shape cap_surface    odor spore print color habitat
## 1    edible    convex      smooth  almond             brown grasses
## 2    edible      bell      smooth   anise             brown meadows
## 3 poisonous    convex       scaly pungent             black   urban
## 4    edible    convex      smooth    none             brown grasses
## 5    edible    convex       scaly  almond             black grasses
## 6    edible      bell      smooth  almond             black meadows

Data Summary

# Print a per-column summary of mush1.
summary(mush1)
##     safety            cap_shape     cap_surface       odor          
##  Length:8123        bell   : 452   fibrous:2320   Length:8123       
##  Class :character   conical:   4   grooves:   4   Class :character  
##  Mode  :character   convex :3655   scaly  :3244   Mode  :character  
##                     flat   :3152   smooth :2555                     
##                     knobbed: 828                                    
##                     sunken :  32                                    
##  spore print color    habitat         
##  Length:8123        Length:8123       
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
## 
# Total number of observations (rows) in the working data set.
num_obs <- dim(mush1)[1L]

Function to determine proportion of a particular factor in the data set

# Proportion of observations whose value in one column equals `category`.
#
# col_num:  index (or name) of the column in `data` to inspect.
# category: value to count.
# num_obs:  denominator; defaults to the number of rows in `data`.
# data:     data frame to inspect. Defaults to the global `mush1`, so existing
#           three-argument calls behave as before.
#
# Returns the number of matching rows divided by `num_obs`. Missing values in
# the column are not counted as matches (row-subsetting with an NA condition
# would have counted them).
ratio <- function(col_num, category, num_obs = nrow(data), data = mush1) {
  num_category <- sum(data[[col_num]] == category, na.rm = TRUE)
  num_category / num_obs
}

Calculating the Probability of a mushroom being edible or poisonous

# Class priors: share of edible and of poisonous mushrooms among all rows.
prob_edible <- ratio("safety", "edible", num_obs)
prob_edible
## [1] 0.5180352
prob_poison <- ratio("safety", "poisonous", num_obs)
prob_poison
## [1] 0.4819648

Function to Calculate Entropy

# Entropy contribution of a single outcome, in bits: -p * log2(p).
#
# prob: numeric vector of probabilities.
#
# Returns a numeric vector of the same length. By the usual convention
# 0 * log2(0) is taken to be 0; evaluated literally it is 0 * -Inf = NaN,
# which would poison any sum that includes a pure (single-class) subset.
entropy <- function(prob) {
  en <- -1 * prob * log2(prob)
  en[!is.na(prob) & prob == 0] <- 0
  en
}

Calculating Entropy of Parent Set - Whether Mushroom is edible or poisonous

# Entropy of the parent set: the edible term plus the poisonous term.
entropy_parent <- Reduce(
  `+`,
  lapply(list(prob_edible, prob_poison), entropy)
)
entropy_parent
## [1] 0.9990613

Calculating Probability According to Cap Shape

# Share of all observations that fall into each cap shape.
prob_bell    <- ratio("cap_shape", "bell", num_obs)
prob_conical <- ratio("cap_shape", "conical", num_obs)
prob_convex  <- ratio("cap_shape", "convex", num_obs)
prob_flat    <- ratio("cap_shape", "flat", num_obs)
prob_knobbed <- ratio("cap_shape", "knobbed", num_obs)
prob_sunken  <- ratio("cap_shape", "sunken", num_obs)


# For every cap shape, count the edible rows, the poisonous rows and all rows.
# Each count is the number of rows for which the condition is TRUE; rows where
# the condition is NA are ignored.
bell_edible <- with(mush1, sum(safety == "edible" & cap_shape == "bell", na.rm = TRUE))
bell_poison <- with(mush1, sum(safety == "poisonous" & cap_shape == "bell", na.rm = TRUE))
bell_total <- with(mush1, sum(cap_shape == "bell", na.rm = TRUE))

conical_edible <- with(mush1, sum(safety == "edible" & cap_shape == "conical", na.rm = TRUE))
conical_poison <- with(mush1, sum(safety == "poisonous" & cap_shape == "conical", na.rm = TRUE))
conical_total <- with(mush1, sum(cap_shape == "conical", na.rm = TRUE))

conical_edible
## [1] 0
conical_poison
## [1] 4
conical_total
## [1] 4
convex_edible <- with(mush1, sum(safety == "edible" & cap_shape == "convex", na.rm = TRUE))
convex_poison <- with(mush1, sum(safety == "poisonous" & cap_shape == "convex", na.rm = TRUE))
convex_total <- with(mush1, sum(cap_shape == "convex", na.rm = TRUE))

flat_edible <- with(mush1, sum(safety == "edible" & cap_shape == "flat", na.rm = TRUE))
flat_poison <- with(mush1, sum(safety == "poisonous" & cap_shape == "flat", na.rm = TRUE))
flat_total <- with(mush1, sum(cap_shape == "flat", na.rm = TRUE))

knobbed_edible <- with(mush1, sum(safety == "edible" & cap_shape == "knobbed", na.rm = TRUE))
knobbed_poison <- with(mush1, sum(safety == "poisonous" & cap_shape == "knobbed", na.rm = TRUE))
knobbed_total <- with(mush1, sum(cap_shape == "knobbed", na.rm = TRUE))

sunken_edible <- with(mush1, sum(safety == "edible" & cap_shape == "sunken", na.rm = TRUE))
sunken_poison <- with(mush1, sum(safety == "poisonous" & cap_shape == "sunken", na.rm = TRUE))
sunken_total <- with(mush1, sum(cap_shape == "sunken", na.rm = TRUE))


sunken_edible
## [1] 32
sunken_poison
## [1] 0
sunken_total
## [1] 32

Calculating Entropy for Cap Shape

# Unweighted sum of the edible and poisonous entropy terms for bell, convex,
# flat and knobbed caps, added left to right. Conical and sunken caps (each
# made up of a single class in the recorded counts) are not included.
entropy_cap_shape <- Reduce(
  `+`,
  c(
    entropy(c(bell_edible, bell_poison) / bell_total),
    entropy(c(convex_edible, convex_poison) / convex_total),
    entropy(c(flat_edible, flat_poison) / flat_total),
    entropy(c(knobbed_edible, knobbed_poison) / knobbed_total)
  )
)

entropy_cap_shape
## [1] 3.33413

Calculating Information Gain for Cap Shape

# Information gain for cap shape: parent entropy minus the entropy of each
# shape's subset weighted by that shape's share of the data. Only bell, convex,
# flat and knobbed are subtracted; conical and sunken do not appear.
ig_cap_shape <- local({
  bell_h <- entropy(bell_edible / bell_total) + entropy(bell_poison / bell_total)
  convex_h <- entropy(convex_edible / convex_total) + entropy(convex_poison / convex_total)
  flat_h <- entropy(flat_edible / flat_total) + entropy(flat_poison / flat_total)
  knobbed_h <- entropy(knobbed_edible / knobbed_total) + entropy(knobbed_poison / knobbed_total)

  entropy_parent -
    prob_bell * bell_h -
    prob_convex * convex_h -
    prob_flat * flat_h -
    prob_knobbed * knobbed_h
})

ig_cap_shape
## [1] 0.04880828

Calculating Probability According to Odor

# Share of all observations that fall into each odor category.
prob_almond   <- ratio("odor", "almond", num_obs)
prob_anise    <- ratio("odor", "anise", num_obs)
prob_creosote <- ratio("odor", "creosote", num_obs)
prob_fishy    <- ratio("odor", "fishy", num_obs)
prob_foul     <- ratio("odor", "foul", num_obs)
prob_musty    <- ratio("odor", "musty", num_obs)
prob_none     <- ratio("odor", "none", num_obs)
prob_pungent  <- ratio("odor", "pungent", num_obs)
prob_spicy    <- ratio("odor", "spicy", num_obs)

# For every odor, count the edible rows, the poisonous rows and all rows.
# Each count is the number of rows for which the condition is TRUE; rows where
# the condition is NA are ignored.
almond_edible <- with(mush1, sum(safety == "edible" & odor == "almond", na.rm = TRUE))
almond_poison <- with(mush1, sum(safety == "poisonous" & odor == "almond", na.rm = TRUE))
almond_total <- with(mush1, sum(odor == "almond", na.rm = TRUE))

anise_edible <- with(mush1, sum(safety == "edible" & odor == "anise", na.rm = TRUE))
anise_poison <- with(mush1, sum(safety == "poisonous" & odor == "anise", na.rm = TRUE))
anise_total <- with(mush1, sum(odor == "anise", na.rm = TRUE))

creosote_edible <- with(mush1, sum(safety == "edible" & odor == "creosote", na.rm = TRUE))
creosote_poison <- with(mush1, sum(safety == "poisonous" & odor == "creosote", na.rm = TRUE))
creosote_total <- with(mush1, sum(odor == "creosote", na.rm = TRUE))

fishy_edible <- with(mush1, sum(safety == "edible" & odor == "fishy", na.rm = TRUE))
fishy_poison <- with(mush1, sum(safety == "poisonous" & odor == "fishy", na.rm = TRUE))
fishy_total <- with(mush1, sum(odor == "fishy", na.rm = TRUE))

foul_edible <- with(mush1, sum(safety == "edible" & odor == "foul", na.rm = TRUE))
foul_poison <- with(mush1, sum(safety == "poisonous" & odor == "foul", na.rm = TRUE))
foul_total <- with(mush1, sum(odor == "foul", na.rm = TRUE))

musty_edible <- with(mush1, sum(safety == "edible" & odor == "musty", na.rm = TRUE))
musty_poison <- with(mush1, sum(safety == "poisonous" & odor == "musty", na.rm = TRUE))
musty_total <- with(mush1, sum(odor == "musty", na.rm = TRUE))

none_edible <- with(mush1, sum(safety == "edible" & odor == "none", na.rm = TRUE))
none_poison <- with(mush1, sum(safety == "poisonous" & odor == "none", na.rm = TRUE))
none_total <- with(mush1, sum(odor == "none", na.rm = TRUE))

pungent_edible <- with(mush1, sum(safety == "edible" & odor == "pungent", na.rm = TRUE))
pungent_poison <- with(mush1, sum(safety == "poisonous" & odor == "pungent", na.rm = TRUE))
pungent_total <- with(mush1, sum(odor == "pungent", na.rm = TRUE))

spicy_edible <- with(mush1, sum(safety == "edible" & odor == "spicy", na.rm = TRUE))
spicy_poison <- with(mush1, sum(safety == "poisonous" & odor == "spicy", na.rm = TRUE))
spicy_total <- with(mush1, sum(odor == "spicy", na.rm = TRUE))

Calculating Entropy for Odor

# Unweighted sum of the edible and poisonous entropy terms over all nine odor
# categories, added left to right. A single NaN term makes the whole sum NaN,
# which is what the recorded output below shows.
entropy_odor <- Reduce(
  `+`,
  c(
    entropy(c(almond_edible, almond_poison) / almond_total),
    entropy(c(anise_edible, anise_poison) / anise_total),
    entropy(c(creosote_edible, creosote_poison) / creosote_total),
    entropy(c(fishy_edible, fishy_poison) / fishy_total),
    entropy(c(foul_edible, foul_poison) / foul_total),
    entropy(c(musty_edible, musty_poison) / musty_total),
    entropy(c(none_edible, none_poison) / none_total),
    entropy(c(pungent_edible, pungent_poison) / pungent_total),
    entropy(c(spicy_edible, spicy_poison) / spicy_total)
  )
)

entropy_odor
## [1] NaN

Calculating Entropy for Odor, Term by Term

# NOTE: the assignment on the next line is a complete statement on its own.
# Every following line that starts with `+` is parsed as a separate expression
# (a unary plus applied to one entropy term), so its value is printed but it is
# NOT added to entropy_odor. After this chunk entropy_odor therefore holds only
# the almond/edible term, which is why the final value printed is 0.
# The printed values do serve as a term-by-term listing of which terms are NaN.
entropy_odor <-  entropy(almond_edible/almond_total) 
+ entropy(almond_poison/almond_total) 
## [1] NaN
+ entropy(anise_edible/anise_total) 
## [1] 0
+ entropy(anise_poison/anise_total) 
## [1] NaN
+ entropy(creosote_edible/creosote_total) 
## [1] NaN
+ entropy(creosote_poison/creosote_total) 
## [1] 0
+ entropy(fishy_edible/fishy_total) 
## [1] NaN
+ entropy(fishy_poison/fishy_total) 
## [1] 0
+ entropy(foul_edible/foul_total) 
## [1] NaN
+ entropy(foul_poison/foul_total) 
## [1] 0
+ entropy(musty_edible/musty_total) 
## [1] NaN
+ entropy(musty_poison/musty_total) 
## [1] 0
+ entropy(none_edible/none_total) 
## [1] 0.04822709
+ entropy(none_poison/none_total) 
## [1] 0.1659097
+ entropy(pungent_edible/pungent_total) 
## [1] NaN
+ entropy(pungent_poison/pungent_total)
## [1] 0
+ entropy(spicy_edible/spicy_total) 
## [1] NaN
+ entropy(spicy_poison/spicy_total)
## [1] 0
entropy_odor
## [1] 0

Calculating Entropy for Odor Using Only the "none" Category

# Entropy of the "none" odor subset: its edible term plus its poisonous term.
entropy_odor <- Reduce(
  `+`,
  entropy(c(none_edible, none_poison) / none_total)
)

entropy_odor
## [1] 0.2141368

Calculating Information Gain for Odor

# Information gain for odor: parent entropy minus the entropy of the "none"
# subset weighted by its share of the data. No other odor category is
# subtracted here.
ig_odor <- local({
  none_h <- entropy(none_edible / none_total) + entropy(none_poison / none_total)
  entropy_parent - prob_none * none_h
})


ig_odor
## [1] 0.9060569