library(tidyverse)
# Column names for the 23 fields of the UCI mushroom file, in file order.
# They are supplied here because the file is read with header = FALSE.
col_names <- c(
  "Class", "CapShape", "CapSurface", "CapColor", "Bruises", "Odor",
  "GillAttachment", "GillSpacing", "GillSize", "GillColor", "StalkShape",
  "StalkRoot", "StalkAbove", "StalkBelow", "ColorAbove", "ColorBelow",
  "VeilType", "VeilColor", "RingNumber", "RingType", "SporeColor",
  "Population", "Habitat"
)

# Read the raw data straight from the UCI repository.
# stringsAsFactors = TRUE is requested explicitly: since R 4.0 read.csv()
# returns character columns by default, and summary() would then report only
# length/class/mode instead of per-level counts.
mash_uci <- read.csv(
  url("https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"),
  header = FALSE,
  col.names = col_names,
  stringsAsFactors = TRUE
)

# Keep a local copy of the data with the column names attached.
# NOTE(review): the output path is hard-coded and assumes C:/ds607 exists.
write_csv(mash_uci, "C:/ds607/mash_uci.csv")

# Compact column-by-column overview of the loaded data.
glimpse(mash_uci)
## Observations: 8,124
## Variables: 23
## $ Class <fct> p, e, e, p, e, e, e, e, p, e, e, e, e, p, e, e,...
## $ CapShape <fct> x, x, b, x, x, x, b, b, x, b, x, x, b, x, x, s,...
## $ CapSurface <fct> s, s, s, y, s, y, s, y, y, s, y, y, s, y, f, f,...
## $ CapColor <fct> n, y, w, w, g, y, w, w, w, y, y, y, y, w, n, g,...
## $ Bruises <fct> t, t, t, t, f, t, t, t, t, t, t, t, t, t, f, f,...
## $ Odor <fct> p, a, l, p, n, a, a, l, p, a, l, a, a, p, n, n,...
## $ GillAttachment <fct> f, f, f, f, f, f, f, f, f, f, f, f, f, f, f, f,...
## $ GillSpacing <fct> c, c, c, c, w, c, c, c, c, c, c, c, c, c, w, c,...
## $ GillSize <fct> n, b, b, n, b, b, b, b, n, b, b, b, b, n, b, n,...
## $ GillColor <fct> k, k, n, n, k, n, g, n, p, g, g, n, w, k, n, k,...
## $ StalkShape <fct> e, e, e, e, t, e, e, e, e, e, e, e, e, e, t, e,...
## $ StalkRoot <fct> e, c, c, e, e, c, c, c, e, c, c, c, c, e, e, e,...
## $ StalkAbove <fct> s, s, s, s, s, s, s, s, s, s, s, s, s, s, s, s,...
## $ StalkBelow <fct> s, s, s, s, s, s, s, s, s, s, s, s, s, s, f, s,...
## $ ColorAbove <fct> w, w, w, w, w, w, w, w, w, w, w, w, w, w, w, w,...
## $ ColorBelow <fct> w, w, w, w, w, w, w, w, w, w, w, w, w, w, w, w,...
## $ VeilType <fct> p, p, p, p, p, p, p, p, p, p, p, p, p, p, p, p,...
## $ VeilColor <fct> w, w, w, w, w, w, w, w, w, w, w, w, w, w, w, w,...
## $ RingNumber <fct> o, o, o, o, o, o, o, o, o, o, o, o, o, o, o, o,...
## $ RingType <fct> p, p, p, p, e, p, p, p, p, p, p, p, p, p, e, p,...
## $ SporeColor <fct> k, n, n, k, n, k, k, n, k, k, n, k, n, n, k, n,...
## $ Population <fct> s, n, n, s, a, n, n, s, v, s, n, s, s, v, a, y,...
## $ Habitat <fct> u, g, m, u, g, g, m, m, g, m, g, m, g, u, g, u,...
# Per-column summary of the full data set.
mash_uci %>% summary()
## Class CapShape CapSurface CapColor Bruises Odor
## e:4208 b: 452 f:2320 n :2284 f:4748 n :3528
## p:3916 c: 4 g: 4 g :1840 t:3376 f :2160
## f:3152 s:2556 e :1500 s : 576
## k: 828 y:3244 y :1072 y : 576
## s: 32 w :1040 a : 400
## x:3656 b : 168 l : 400
## (Other): 220 (Other): 484
## GillAttachment GillSpacing GillSize GillColor StalkShape StalkRoot
## a: 210 c:6812 b:5612 b :1728 e:3516 ?:2480
## f:7914 w:1312 n:2512 p :1492 t:4608 b:3776
## w :1202 c: 556
## n :1048 e:1120
## g : 752 r: 192
## h : 732
## (Other):1170
## StalkAbove StalkBelow ColorAbove ColorBelow VeilType VeilColor
## f: 552 f: 600 w :4464 w :4384 p:8124 n: 96
## k:2372 k:2304 p :1872 p :1872 o: 96
## s:5176 s:4936 g : 576 g : 576 w:7924
## y: 24 y: 284 n : 448 n : 512 y: 8
## b : 432 b : 432
## o : 192 o : 192
## (Other): 140 (Other): 156
## RingNumber RingType SporeColor Population Habitat
## n: 36 e:2776 w :2388 a: 384 d:3148
## o:7488 f: 48 n :1968 c: 340 g:2148
## t: 600 l:1296 k :1872 n: 400 l: 832
## n: 36 h :1632 s:1248 m: 292
## p:3968 r : 72 v:4040 p:1144
## b : 48 y:1712 u: 368
## (Other): 144 w: 192
My first impression is that odor and color could be an easy way to detect whether a mushroom is poisonous or not. Further analysis, such as feature selection, would be necessary if the project is going to continue.
# Reduce the data to five candidate predictors plus the Class label.
sub_mush <- select(
  mash_uci,
  Bruises, Odor, GillSize, ColorAbove, SporeColor, Class
)

# Save the subset; this happens before any relabelling, so the file holds the
# original single-letter codes.
write_csv(sub_mush, "C:/ds607/sub_mush.csv")

# Rows and columns of the subset.
dim(sub_mush)
## [1] 8124 6
# Replace the single-letter bruise codes with readable labels.
sub_mush <- sub_mush %>%
  mutate(Bruises = fct_recode(Bruises, bruises = "t", no = "f"))
# Replace the single-letter odor codes with readable labels.
# "foul" = "f" was missing from the mapping, which left that level as a bare
# "f" next to the spelled-out labels.
sub_mush$Odor <- fct_recode(sub_mush$Odor,
  "almond" = "a",
  "anise" = "l",
  "creosote" = "c",
  "fishy" = "y",
  "foul" = "f",
  "musty" = "m",
  "none" = "n",
  "pungent" = "p",
  "spicy" = "s"
)
# Gill size: code "b" means "broad" (the label was previously misspelled
# "board").
sub_mush$GillSize <- fct_recode(sub_mush$GillSize,
  "broad" = "b",
  "narrow" = "n"
)

# ColorAbove codes to colour names. The "cinnamon" label used to be a string
# literal broken across two lines, which put a newline character at the start
# of the level name.
sub_mush$ColorAbove <- fct_recode(sub_mush$ColorAbove,
  "brown" = "n",
  "buff" = "b",
  "cinnamon" = "c",
  "gray" = "g",
  "orange" = "o",
  "pink" = "p",
  "red" = "e",
  "white" = "w",
  "yellow" = "y"
)

# SporeColor codes to colour names.
sub_mush$SporeColor <- fct_recode(sub_mush$SporeColor,
  "black" = "k",
  "buff" = "b",
  "brown" = "n",
  "chocolate" = "h",
  "green" = "r",
  "orange" = "o",
  "purple" = "u",
  "white" = "w",
  "yellow" = "y"
)

# Class label: edible vs. poisonous.
sub_mush$Class <- fct_recode(sub_mush$Class,
  "edible" = "e",
  "poisonous" = "p"
)

# Show the first 20 relabelled rows.
head(sub_mush, n = 20)
## Bruises Odor GillSize ColorAbove SporeColor Class
## 1 bruises pungent narrow white black poisonous
## 2 bruises almond broad white brown edible
## 3 bruises anise broad white brown edible
## 4 bruises pungent narrow white black poisonous
## 5 no none broad white brown edible
## 6 bruises almond broad white black edible
## 7 bruises almond broad white black edible
## 8 bruises anise broad white brown edible
## 9 bruises pungent narrow white black poisonous
## 10 bruises almond broad white black edible
## 11 bruises anise broad white brown edible
## 12 bruises almond broad white black edible
## 13 bruises almond broad white brown edible
## 14 bruises pungent narrow white brown poisonous
## 15 no none broad white black edible
## 16 no none narrow white brown edible
## 17 no none broad white brown edible
## 18 bruises pungent narrow white black poisonous
## 19 bruises pungent narrow white brown poisonous
## 20 bruises pungent narrow white brown poisonous