Introduction

We would like to classify mushrooms as edible or poisonous based on their characteristics, using data visualization techniques.

Attribute Information: (classes: edible=e, poisonous=p)

cap-shape: bell=b,conical=c,convex=x,flat=f, knobbed=k,sunken=s
cap-surface: fibrous=f,grooves=g,scaly=y,smooth=s
cap-color: brown=n,buff=b,cinnamon=c,gray=g,green=r,pink=p,purple=u,red=e,white=w,yellow=y
bruises: bruises=t,no=f
odor: almond=a,anise=l,creosote=c,fishy=y,foul=f,musty=m,none=n,pungent=p,spicy=s
gill-attachment: attached=a,descending=d,free=f,notched=n
gill-spacing: close=c,crowded=w,distant=d
gill-size: broad=b,narrow=n
gill-color: black=k,brown=n,buff=b,chocolate=h,gray=g, green=r,orange=o,pink=p,purple=u,red=e,white=w,yellow=y
stalk-shape: enlarging=e,tapering=t
stalk-root: bulbous=b,club=c,cup=u,equal=e,rhizomorphs=z,rooted=r,missing=?
stalk-surface-above-ring: fibrous=f,scaly=y,silky=k,smooth=s
stalk-surface-below-ring: fibrous=f,scaly=y,silky=k,smooth=s
stalk-color-above-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o,pink=p,red=e,white=w,yellow=y
stalk-color-below-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o,pink=p,red=e,white=w,yellow=y
veil-type: partial=p,universal=u
veil-color: brown=n,orange=o,white=w,yellow=y
ring-number: none=n,one=o,two=t
ring-type: cobwebby=c,evanescent=e,flaring=f,large=l,none=n,pendant=p,sheathing=s,zone=z
spore-print-color: black=k,brown=n,buff=b,chocolate=h,green=r,orange=o,purple=u,white=w,yellow=y
population: abundant=a,clustered=c,numerous=n,scattered=s,several=v,solitary=y
habitat: grasses=g,leaves=l,meadows=m,paths=p,urban=u,waste=w,woods=d

Load Packages

library(tidyverse)
library(gridExtra) # for multiple graphs
library(DT) 
library(reshape2) # for melt

Load Dataset

# Read every column as text. All attributes are single-letter codes, so the
# column types are pinned explicitly instead of relying on readr's type
# guessing (this also silences the column-specification message).
data <- read_csv(
  "mushrooms.csv",
  col_types = cols(.default = col_character())
)

# Interactive preview of the first rows.
datatable(head(data))

# Every attribute is categorical, so store each column as a factor.
data <- as_tibble(lapply(data, as.factor))

Summary of variables

# Per-column level counts for all factor variables.
data %>% summary()
##  class    cap-shape cap-surface   cap-color    bruises       odor     
##  e:4208   b: 452    f:2320      n      :2284   f:4748   n      :3528  
##  p:3916   c:   4    g:   4      g      :1840   t:3376   f      :2160  
##           f:3152    s:2556      e      :1500            s      : 576  
##           k: 828    y:3244      y      :1072            y      : 576  
##           s:  32                w      :1040            a      : 400  
##           x:3656                b      : 168            l      : 400  
##                                 (Other): 220            (Other): 484  
##  gill-attachment gill-spacing gill-size   gill-color   stalk-shape
##  a: 210          c:6812       b:5612    b      :1728   e:3516     
##  f:7914          w:1312       n:2512    p      :1492   t:4608     
##                                         w      :1202              
##                                         n      :1048              
##                                         g      : 752              
##                                         h      : 732              
##                                         (Other):1170              
##  stalk-root stalk-surface-above-ring stalk-surface-below-ring
##  ?:2480     f: 552                   f: 600                  
##  b:3776     k:2372                   k:2304                  
##  c: 556     s:5176                   s:4936                  
##  e:1120     y:  24                   y: 284                  
##  r: 192                                                      
##                                                              
##                                                              
##  stalk-color-above-ring stalk-color-below-ring veil-type veil-color
##  w      :4464           w      :4384           p:8124    n:  96    
##  p      :1872           p      :1872                     o:  96    
##  g      : 576           g      : 576                     w:7924    
##  n      : 448           n      : 512                     y:   8    
##  b      : 432           b      : 432                               
##  o      : 192           o      : 192                               
##  (Other): 140           (Other): 156                               
##  ring-number ring-type spore-print-color population habitat 
##  n:  36      e:2776    w      :2388      a: 384     d:3148  
##  o:7488      f:  48    n      :1968      c: 340     g:2148  
##  t: 600      l:1296    k      :1872      n: 400     l: 832  
##              n:  36    h      :1632      s:1248     m: 292  
##              p:3968    r      :  72      v:4040     p:1144  
##                        b      :  48      y:1712     u: 368  
##                        (Other): 144                 w: 192

The summary shows that veil-type takes a single value, p (partial), for every mushroom. It carries no information, so we drop it.

# veil-type is constant across all rows, so remove the column.
data <- data %>% select(-`veil-type`)

Are there any missing values?

# Share of non-NA entries per column, drawn as a horizontal bar chart.
# NOTE(review): only NA counts as missing here. The attribute list codes a
# missing stalk-root as "?", which is an ordinary factor level and is therefore
# not detected by is.na() -- confirm whether that is intended.
tibble(
  column = names(data),
  fill_rate = colMeans(!is.na(data))
) %>%
  ggplot(aes(x = column, y = fill_rate)) +
  geom_col(fill = "blue") + # geom_col() == geom_bar(stat = "identity")
  coord_flip()

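No column contains NA values. Note, however, that stalk-root encodes missing values as "?" (2480 rows in the summary above), which this check does not count.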

Exploration

How many mushrooms are edible or poisonous?

# Number of rows per class and its share of the whole data set,
# rounded to a whole percent.
data %>%
  count(class) %>%
  mutate(percent = round(n / sum(n) * 100))
## # A tibble: 2 x 3
##    class     n percent
##   <fctr> <int>   <dbl>
## 1      e  4208      52
## 2      p  3916      48

How does the cap affect the class?

The relevant factors are those whose names start with cap-.

# Class proportions (edible = e, poisonous = p) within each level of the three
# cap attributes. geom_bar() counts rows per category by default, so it
# replaces geom_histogram(stat = "count"), which is meant for continuous data
# and warns about ignored binning parameters.
p1 <- ggplot(data, aes(x = `cap-shape`, fill = class)) +
  geom_bar(position = "fill")
p2 <- ggplot(data, aes(x = `cap-surface`, fill = class)) +
  geom_bar(position = "fill")
p3 <- ggplot(data, aes(x = `cap-color`, fill = class)) +
  geom_bar(position = "fill")

# Stack the three panels vertically.
grid.arrange(p1, p2, p3, ncol = 1)

Bruises and Odor

# Bruises and odor by class: proportions (position = "fill") and absolute
# counts side by side (position = "dodge"). geom_bar() is the categorical
# counting geom; geom_histogram(stat = "count") only worked by overriding the
# binning stat and emitted warnings.
p1 <- ggplot(data, aes(x = bruises, fill = class)) +
  geom_bar(position = "fill")
p2 <- ggplot(data, aes(x = odor, fill = class)) +
  geom_bar(position = "fill")
p3 <- ggplot(data, aes(x = odor, fill = class)) +
  geom_bar(position = "dodge")
p4 <- ggplot(data, aes(x = bruises, fill = class)) +
  geom_bar(position = "dodge")

# Top row: bruises (proportion, count); bottom row: odor (proportion, count).
grid.arrange(p1, p4, p2, p3, ncol = 2)

How does the gill affect the class?

# Class proportions within each level of the four gill attributes.
# geom_bar() (default stat = "count") is used instead of
# geom_histogram(stat = "count"), which is intended for continuous variables.
p1 <- ggplot(data, aes(x = `gill-attachment`, fill = class)) +
  geom_bar(position = "fill")
p2 <- ggplot(data, aes(x = `gill-spacing`, fill = class)) +
  geom_bar(position = "fill")
p3 <- ggplot(data, aes(x = `gill-size`, fill = class)) +
  geom_bar(position = "fill")
p4 <- ggplot(data, aes(x = `gill-color`, fill = class)) +
  geom_bar(position = "fill")

# 2 x 2 grid.
grid.arrange(p1, p2, p3, p4, ncol = 2)

We should also look at the same plots with position = “dodge”, which shows the absolute counts per class.

# Absolute counts per class, side by side, for each gill attribute.
# geom_bar() is the idiomatic geom for counting a discrete variable and avoids
# the warnings raised by geom_histogram(stat = "count").
p1 <- ggplot(data, aes(x = `gill-attachment`, fill = class)) +
  geom_bar(position = "dodge")
p2 <- ggplot(data, aes(x = `gill-spacing`, fill = class)) +
  geom_bar(position = "dodge")
p3 <- ggplot(data, aes(x = `gill-size`, fill = class)) +
  geom_bar(position = "dodge")
p4 <- ggplot(data, aes(x = `gill-color`, fill = class)) +
  geom_bar(position = "dodge")

# 2 x 2 grid.
grid.arrange(p1, p2, p3, p4, ncol = 2)

How does the stalk affect the class?

# Class proportions within each level of the six stalk attributes.
# geom_bar() counts rows per category by default; it replaces
# geom_histogram(stat = "count"), which is designed for binned continuous data.
p1 <- ggplot(data, aes(x = `stalk-surface-below-ring`, fill = class)) +
  geom_bar(position = "fill")
p2 <- ggplot(data, aes(x = `stalk-surface-above-ring`, fill = class)) +
  geom_bar(position = "fill")
p3 <- ggplot(data, aes(x = `stalk-shape`, fill = class)) +
  geom_bar(position = "fill")
p4 <- ggplot(data, aes(x = `stalk-root`, fill = class)) +
  geom_bar(position = "fill")
p5 <- ggplot(data, aes(x = `stalk-color-below-ring`, fill = class)) +
  geom_bar(position = "fill")
p6 <- ggplot(data, aes(x = `stalk-color-above-ring`, fill = class)) +
  geom_bar(position = "fill")

# Three rows of two panels.
grid.arrange(p1, p2, p3, p4, p5, p6, ncol = 2)

# Absolute counts per class, side by side, for each stalk attribute.
# geom_bar() is used in place of geom_histogram(stat = "count"): same bars,
# without the "ignoring unknown parameters" warnings.
p1 <- ggplot(data, aes(x = `stalk-surface-below-ring`, fill = class)) +
  geom_bar(position = "dodge")
p2 <- ggplot(data, aes(x = `stalk-surface-above-ring`, fill = class)) +
  geom_bar(position = "dodge")
p3 <- ggplot(data, aes(x = `stalk-shape`, fill = class)) +
  geom_bar(position = "dodge")
p4 <- ggplot(data, aes(x = `stalk-root`, fill = class)) +
  geom_bar(position = "dodge")
p5 <- ggplot(data, aes(x = `stalk-color-below-ring`, fill = class)) +
  geom_bar(position = "dodge")
p6 <- ggplot(data, aes(x = `stalk-color-above-ring`, fill = class)) +
  geom_bar(position = "dodge")

# Three rows of two panels.
grid.arrange(p1, p2, p3, p4, p5, p6, ncol = 2)

# Fraction of mushrooms whose stalk colour is the same above and below the ring.
sum(
  data$`stalk-color-above-ring` == data$`stalk-color-below-ring`,
  na.rm = TRUE
) / nrow(data)
## [1] 0.6238306
# Same comparison for the stalk surface above vs. below the ring.
sum(
  data$`stalk-surface-above-ring` == data$`stalk-surface-below-ring`,
  na.rm = TRUE
) / nrow(data)
## [1] 0.770064

spore-print-color

# Spore print colour by class: absolute counts (left) and proportions (right).
# geom_bar() is the counting geom for discrete variables and replaces
# geom_histogram(stat = "count").
p1 <- ggplot(data, aes(x = `spore-print-color`, fill = class)) +
  geom_bar(position = "dodge")
p2 <- ggplot(data, aes(x = `spore-print-color`, fill = class)) +
  geom_bar(position = "fill")

grid.arrange(p1, p2, ncol = 2)

How does the ring affect the class?

# Ring type and ring number by class: counts (left column) and proportions
# (right column). geom_bar() replaces geom_histogram(stat = "count"), which is
# meant for continuous data and warns about ignored binning parameters.
p1 <- ggplot(data, aes(x = `ring-type`, fill = class)) +
  geom_bar(position = "dodge")
p2 <- ggplot(data, aes(x = `ring-type`, fill = class)) +
  geom_bar(position = "fill")
p3 <- ggplot(data, aes(x = `ring-number`, fill = class)) +
  geom_bar(position = "dodge")
p4 <- ggplot(data, aes(x = `ring-number`, fill = class)) +
  geom_bar(position = "fill")

grid.arrange(p1, p2, p3, p4, ncol = 2)

veil-color

# Veil colour by class: absolute counts (left) and proportions (right).
# geom_bar() counts rows per category by default, so no stat override is needed
# as it was with geom_histogram(stat = "count").
p1 <- ggplot(data, aes(x = `veil-color`, fill = class)) +
  geom_bar(position = "dodge")
p2 <- ggplot(data, aes(x = `veil-color`, fill = class)) +
  geom_bar(position = "fill")

grid.arrange(p1, p2, ncol = 2)

Population and Habitat

# Population and habitat by class: counts (left column) and proportions
# (right column). geom_bar() is the idiomatic geom for a discrete x and
# replaces geom_histogram(stat = "count").
p1 <- ggplot(data, aes(x = population, fill = class)) +
  geom_bar(position = "dodge")
p2 <- ggplot(data, aes(x = population, fill = class)) +
  geom_bar(position = "fill")
p3 <- ggplot(data, aes(x = habitat, fill = class)) +
  geom_bar(position = "dodge")
p4 <- ggplot(data, aes(x = habitat, fill = class)) +
  geom_bar(position = "fill")

grid.arrange(p1, p2, p3, p4, ncol = 2)

We can say that population and habitat are important for classification.

Parallel Coordinates

library(parcoords)
library(htmlwidgets)

# Interactive parallel-coordinates widget over the class and six selected
# attributes, with lines coloured by class.
# NOTE(review): d3.scale.category10() is the d3 v3 spelling of the categorical
# colour scale -- confirm it matches the d3 version bundled with the installed
# parcoords package.
parcoords(
  select(
    data,
    class, odor, `gill-color`, `stalk-color-below-ring`,
    `spore-print-color`, habitat, population
  ),
  rownames = FALSE,
  brushMode = "1D-axes", # brush along individual axes
  reorderable = TRUE, # axes can be dragged into a new order
  color = list(
    colorBy = "class",
    colorScale = htmlwidgets::JS("d3.scale.category10()")
  )
)

Results