We would like to classify mushrooms as edible or poisonous based on their characteristics, using data visualization techniques.
Attribute Information: (classes: edible=e, poisonous=p)
cap-shape: bell=b,conical=c,convex=x,flat=f, knobbed=k,sunken=s
cap-surface: fibrous=f,grooves=g,scaly=y,smooth=s
cap-color: brown=n,buff=b,cinnamon=c,gray=g,green=r,pink=p,purple=u,red=e,white=w,yellow=y
bruises: bruises=t,no=f
odor: almond=a,anise=l,creosote=c,fishy=y,foul=f,musty=m,none=n,pungent=p,spicy=s
gill-attachment: attached=a,descending=d,free=f,notched=n
gill-spacing: close=c,crowded=w,distant=d
gill-size: broad=b,narrow=n
gill-color: black=k,brown=n,buff=b,chocolate=h,gray=g, green=r,orange=o,pink=p,purple=u,red=e,white=w,yellow=y
stalk-shape: enlarging=e,tapering=t
stalk-root: bulbous=b,club=c,cup=u,equal=e,rhizomorphs=z,rooted=r,missing=?
stalk-surface-above-ring: fibrous=f,scaly=y,silky=k,smooth=s
stalk-surface-below-ring: fibrous=f,scaly=y,silky=k,smooth=s
stalk-color-above-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o,pink=p,red=e,white=w,yellow=y
stalk-color-below-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o,pink=p,red=e,white=w,yellow=y
veil-type: partial=p,universal=u
veil-color: brown=n,orange=o,white=w,yellow=y
ring-number: none=n,one=o,two=t
ring-type: cobwebby=c,evanescent=e,flaring=f,large=l,none=n,pendant=p,sheathing=s,zone=z
spore-print-color: black=k,brown=n,buff=b,chocolate=h,green=r,orange=o,purple=u,white=w,yellow=y
population: abundant=a,clustered=c,numerous=n,scattered=s,several=v,solitary=y
habitat: grasses=g,leaves=l,meadows=m,paths=p,urban=u,waste=w,woods=d
library(tidyverse)
library(gridExtra) # for multiple graphs
library(DT)
library(reshape2) # for melt
# Load the raw mushroom records and show the first rows as an interactive table.
data <- read_csv("mushrooms.csv")
data %>%
  head() %>%
  datatable()

# Convert every column to a factor, then summarise the level counts.
data <- data %>%
  lapply(as.factor) %>%
  as_tibble()
summary(data)
## class cap-shape cap-surface cap-color bruises odor
## e:4208 b: 452 f:2320 n :2284 f:4748 n :3528
## p:3916 c: 4 g: 4 g :1840 t:3376 f :2160
## f:3152 s:2556 e :1500 s : 576
## k: 828 y:3244 y :1072 y : 576
## s: 32 w :1040 a : 400
## x:3656 b : 168 l : 400
## (Other): 220 (Other): 484
## gill-attachment gill-spacing gill-size gill-color stalk-shape
## a: 210 c:6812 b:5612 b :1728 e:3516
## f:7914 w:1312 n:2512 p :1492 t:4608
## w :1202
## n :1048
## g : 752
## h : 732
## (Other):1170
## stalk-root stalk-surface-above-ring stalk-surface-below-ring
## ?:2480 f: 552 f: 600
## b:3776 k:2372 k:2304
## c: 556 s:5176 s:4936
## e:1120 y: 24 y: 284
## r: 192
##
##
## stalk-color-above-ring stalk-color-below-ring veil-type veil-color
## w :4464 w :4384 p:8124 n: 96
## p :1872 p :1872 o: 96
## g : 576 g : 576 w:7924
## n : 448 n : 512 y: 8
## b : 432 b : 432
## o : 192 o : 192
## (Other): 140 (Other): 156
## ring-number ring-type spore-print-color population habitat
## n: 36 e:2776 w :2388 a: 384 d:3148
## o:7488 f: 48 n :1968 c: 340 g:2148
## t: 600 l:1296 k :1872 n: 400 l: 832
## n: 36 h :1632 s:1248 m: 292
## p:3968 r : 72 v:4040 p:1144
## b : 48 y:1712 u: 368
## (Other): 144 w: 192
We see that veil-type takes only a single value, p (partial), so it carries no information for classification. We drop it.
# veil-type has a single level (p), so it cannot help separate the classes.
data <- select(data, -`veil-type`)

# Fill rate = share of non-NA values in each column.
# NOTE: only true NA values are counted here; the `?` code that the attribute
# list uses for a missing stalk-root is an ordinary factor level to is.na().
fill_rates <- tibble(
  column = names(data),
  fill_rate = vapply(
    data,
    function(x) mean(!is.na(x)),
    numeric(1),
    USE.NAMES = FALSE
  )
)

# geom_col() plots the precomputed values directly (no stat = "identity").
ggplot(fill_rates, aes(x = column, y = fill_rate)) +
  geom_col(fill = "blue") +
  coord_flip()
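There are no NA values. Note, however, that stalk-root encodes missing values as `?` (2480 rows in the summary above), which this check does not detect.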
# Class balance: number of mushrooms per class and its rounded percentage.
data %>%
  count(class) %>%
  mutate(percent = round(n / sum(n) * 100))
## # A tibble: 2 x 3
## class n percent
## <fctr> <int> <dbl>
## 1 e 4208 52
## 2 p 3916 48
We begin with the attributes whose names start with cap-.
# Class proportions within each level of the cap attributes.
# geom_bar() counts rows per category by default, so no stat override is
# needed; position = "fill" scales every bar to a total height of 1.
p1 <- ggplot(data, aes(x = `cap-shape`)) +
  geom_bar(aes(fill = class), position = "fill")
p2 <- ggplot(data, aes(x = `cap-surface`)) +
  geom_bar(aes(fill = class), position = "fill")
p3 <- ggplot(data, aes(x = `cap-color`)) +
  geom_bar(aes(fill = class), position = "fill")
grid.arrange(p1, p2, p3, ncol = 1)
If cap-shape is conical, the mushroom is poisonous. If cap-shape is sunken, it is edible, and most bell-shaped mushrooms are edible. If cap-surface is grooves, it is poisonous (only 4 observations). If cap-color is green or purple, it is edible.
# Bruises and odor: class proportions (position = "fill") shown next to the
# raw counts per class (position = "dodge").
# geom_bar() counts rows per category by default, so no stat override is needed.
p1 <- ggplot(data, aes(x = bruises)) +
  geom_bar(aes(fill = class), position = "fill")
p2 <- ggplot(data, aes(x = odor)) +
  geom_bar(aes(fill = class), position = "fill")
p3 <- ggplot(data, aes(x = odor)) +
  geom_bar(aes(fill = class), position = "dodge")
p4 <- ggplot(data, aes(x = bruises)) +
  geom_bar(aes(fill = class), position = "dodge")
grid.arrange(p1, p4, p2, p3, ncol = 2)
Odor is an important factor and it directly affects the class of a mushroom.
# Class proportions within each level of the gill attributes.
# geom_bar() counts rows per category by default, so no stat override is needed.
p1 <- ggplot(data, aes(x = `gill-attachment`)) +
  geom_bar(aes(fill = class), position = "fill")
p2 <- ggplot(data, aes(x = `gill-spacing`)) +
  geom_bar(aes(fill = class), position = "fill")
p3 <- ggplot(data, aes(x = `gill-size`)) +
  geom_bar(aes(fill = class), position = "fill")
p4 <- ggplot(data, aes(x = `gill-color`)) +
  geom_bar(aes(fill = class), position = "fill")
grid.arrange(p1, p2, p3, p4, ncol = 2)
We also need to look at the same plots with position “dodge” to see the absolute counts.
# Raw counts per class, side by side, for each level of the gill attributes.
# geom_bar() counts rows per category by default, so no stat override is needed.
p1 <- ggplot(data, aes(x = `gill-attachment`)) +
  geom_bar(aes(fill = class), position = "dodge")
p2 <- ggplot(data, aes(x = `gill-spacing`)) +
  geom_bar(aes(fill = class), position = "dodge")
p3 <- ggplot(data, aes(x = `gill-size`)) +
  geom_bar(aes(fill = class), position = "dodge")
p4 <- ggplot(data, aes(x = `gill-color`)) +
  geom_bar(aes(fill = class), position = "dodge")
grid.arrange(p1, p2, p3, p4, ncol = 2)
gill-color is an important factor.
# Class proportions within each level of the stalk attributes.
# geom_bar() counts rows per category by default, so no stat override is needed.
p1 <- ggplot(data, aes(x = `stalk-surface-below-ring`)) +
  geom_bar(aes(fill = class), position = "fill")
p2 <- ggplot(data, aes(x = `stalk-surface-above-ring`)) +
  geom_bar(aes(fill = class), position = "fill")
p3 <- ggplot(data, aes(x = `stalk-shape`)) +
  geom_bar(aes(fill = class), position = "fill")
p4 <- ggplot(data, aes(x = `stalk-root`)) +
  geom_bar(aes(fill = class), position = "fill")
p5 <- ggplot(data, aes(x = `stalk-color-below-ring`)) +
  geom_bar(aes(fill = class), position = "fill")
p6 <- ggplot(data, aes(x = `stalk-color-above-ring`)) +
  geom_bar(aes(fill = class), position = "fill")
grid.arrange(p1, p2, p3, p4, p5, p6, ncol = 2)
# Raw counts per class, side by side, for each level of the stalk attributes.
# geom_bar() counts rows per category by default, so no stat override is needed.
p1 <- ggplot(data, aes(x = `stalk-surface-below-ring`)) +
  geom_bar(aes(fill = class), position = "dodge")
p2 <- ggplot(data, aes(x = `stalk-surface-above-ring`)) +
  geom_bar(aes(fill = class), position = "dodge")
p3 <- ggplot(data, aes(x = `stalk-shape`)) +
  geom_bar(aes(fill = class), position = "dodge")
p4 <- ggplot(data, aes(x = `stalk-root`)) +
  geom_bar(aes(fill = class), position = "dodge")
p5 <- ggplot(data, aes(x = `stalk-color-below-ring`)) +
  geom_bar(aes(fill = class), position = "dodge")
p6 <- ggplot(data, aes(x = `stalk-color-above-ring`)) +
  geom_bar(aes(fill = class), position = "dodge")
grid.arrange(p1, p2, p3, p4, p5, p6, ncol = 2)
# Share of rows whose stalk attribute is the same above and below the ring.
# The columns are compared as character because `==` on two factors raises
# "level sets of factors are different" unless both share the same levels.
# NA comparisons are dropped from the numerator only, so they count as
# non-matches.
sum(
  as.character(data$`stalk-color-above-ring`) ==
    as.character(data$`stalk-color-below-ring`),
  na.rm = TRUE
) / nrow(data) # for color
## [1] 0.6238306
sum(
  as.character(data$`stalk-surface-above-ring`) ==
    as.character(data$`stalk-surface-below-ring`),
  na.rm = TRUE
) / nrow(data) # for surface
## [1] 0.770064
# Spore print colour: raw counts per class (dodge) next to class proportions
# (fill). geom_bar() counts rows per category by default, so no stat override
# is needed.
p1 <- ggplot(data, aes(x = `spore-print-color`)) +
  geom_bar(aes(fill = class), position = "dodge")
p2 <- ggplot(data, aes(x = `spore-print-color`)) +
  geom_bar(aes(fill = class), position = "fill")
grid.arrange(p1, p2, ncol = 2)
# Ring type and ring number: raw counts per class (dodge) next to class
# proportions (fill). geom_bar() counts rows per category by default, so no
# stat override is needed.
p1 <- ggplot(data, aes(x = `ring-type`)) +
  geom_bar(aes(fill = class), position = "dodge")
p2 <- ggplot(data, aes(x = `ring-type`)) +
  geom_bar(aes(fill = class), position = "fill")
p3 <- ggplot(data, aes(x = `ring-number`)) +
  geom_bar(aes(fill = class), position = "dodge")
p4 <- ggplot(data, aes(x = `ring-number`)) +
  geom_bar(aes(fill = class), position = "fill")
grid.arrange(p1, p2, p3, p4, ncol = 2)
# Veil colour: raw counts per class (dodge) next to class proportions (fill).
# geom_bar() counts rows per category by default, so no stat override is needed.
p1 <- ggplot(data, aes(x = `veil-color`)) +
  geom_bar(aes(fill = class), position = "dodge")
p2 <- ggplot(data, aes(x = `veil-color`)) +
  geom_bar(aes(fill = class), position = "fill")
grid.arrange(p1, p2, ncol = 2)
# Population and habitat: raw counts per class (dodge) next to class
# proportions (fill). geom_bar() counts rows per category by default, so no
# stat override is needed.
p1 <- ggplot(data, aes(x = `population`)) +
  geom_bar(aes(fill = class), position = "dodge")
p2 <- ggplot(data, aes(x = `population`)) +
  geom_bar(aes(fill = class), position = "fill")
p3 <- ggplot(data, aes(x = `habitat`)) +
  geom_bar(aes(fill = class), position = "dodge")
p4 <- ggplot(data, aes(x = `habitat`)) +
  geom_bar(aes(fill = class), position = "fill")
grid.arrange(p1, p2, p3, p4, ncol = 2)
We can say that population and habitat are important for classification.
library(parcoords)
library(htmlwidgets)
# Parallel-coordinates widget over the class and six selected attributes,
# with lines coloured by class.
key_attributes <- select(
  data,
  class, odor, `gill-color`, `stalk-color-below-ring`, `spore-print-color`,
  habitat, population
)
parcoords(
  key_attributes,
  brushMode = "1D-axes",
  reorderable = TRUE,
  rownames = FALSE,
  color = list(
    colorBy = "class",
    colorScale = htmlwidgets::JS("d3.scale.category10()")
  )
)