1. Load all the required libraries

library(tidyverse)

2. Download the dataset from UCI and take a glimpse of it

# Column names for the mushroom data. The file is read with header = FALSE,
# so the names are supplied here, in file order.
col_names <- c(
  "Class", "CapShape", "CapSurface", "CapColor", "Bruises", "Odor",
  "GillAttachment", "GillSpacing", "GillSize", "GillColor", "StalkShape",
  "StalkRoot", "StalkAbove", "StalkBelow", "ColorAbove", "ColorBelow",
  "VeilType", "VeilColor", "RingNumber", "RingType", "SporeColor",
  "Population", "Habitat"
)

# stringsAsFactors = TRUE is set explicitly: since R 4.0.0 read.csv() reads
# strings as character by default, whereas the glimpse()/summary() output
# recorded below shows every column as a factor with per-level counts.
mash_uci <- read.csv(
  url("https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"),
  header = FALSE,
  col.names = col_names,
  stringsAsFactors = TRUE
)

# Keep a local copy of the raw data.
# NOTE(review): absolute, machine-specific path -- confirm the directory
# exists before running on another machine.
write_csv(mash_uci, "C:/ds607/mash_uci.csv")

# Quick look at the column types and the first few values of each column.
glimpse(mash_uci)
## Observations: 8,124
## Variables: 23
## $ Class          <fct> p, e, e, p, e, e, e, e, p, e, e, e, e, p, e, e,...
## $ CapShape       <fct> x, x, b, x, x, x, b, b, x, b, x, x, b, x, x, s,...
## $ CapSurface     <fct> s, s, s, y, s, y, s, y, y, s, y, y, s, y, f, f,...
## $ CapColor       <fct> n, y, w, w, g, y, w, w, w, y, y, y, y, w, n, g,...
## $ Bruises        <fct> t, t, t, t, f, t, t, t, t, t, t, t, t, t, f, f,...
## $ Odor           <fct> p, a, l, p, n, a, a, l, p, a, l, a, a, p, n, n,...
## $ GillAttachment <fct> f, f, f, f, f, f, f, f, f, f, f, f, f, f, f, f,...
## $ GillSpacing    <fct> c, c, c, c, w, c, c, c, c, c, c, c, c, c, w, c,...
## $ GillSize       <fct> n, b, b, n, b, b, b, b, n, b, b, b, b, n, b, n,...
## $ GillColor      <fct> k, k, n, n, k, n, g, n, p, g, g, n, w, k, n, k,...
## $ StalkShape     <fct> e, e, e, e, t, e, e, e, e, e, e, e, e, e, t, e,...
## $ StalkRoot      <fct> e, c, c, e, e, c, c, c, e, c, c, c, c, e, e, e,...
## $ StalkAbove     <fct> s, s, s, s, s, s, s, s, s, s, s, s, s, s, s, s,...
## $ StalkBelow     <fct> s, s, s, s, s, s, s, s, s, s, s, s, s, s, f, s,...
## $ ColorAbove     <fct> w, w, w, w, w, w, w, w, w, w, w, w, w, w, w, w,...
## $ ColorBelow     <fct> w, w, w, w, w, w, w, w, w, w, w, w, w, w, w, w,...
## $ VeilType       <fct> p, p, p, p, p, p, p, p, p, p, p, p, p, p, p, p,...
## $ VeilColor      <fct> w, w, w, w, w, w, w, w, w, w, w, w, w, w, w, w,...
## $ RingNumber     <fct> o, o, o, o, o, o, o, o, o, o, o, o, o, o, o, o,...
## $ RingType       <fct> p, p, p, p, e, p, p, p, p, p, p, p, p, p, e, p,...
## $ SporeColor     <fct> k, n, n, k, n, k, k, n, k, k, n, k, n, n, k, n,...
## $ Population     <fct> s, n, n, s, a, n, n, s, v, s, n, s, s, v, a, y,...
## $ Habitat        <fct> u, g, m, u, g, g, m, m, g, m, g, m, g, u, g, u,...
# Per-column summary of the full data set.
mash_uci %>% summary()
##  Class    CapShape CapSurface    CapColor    Bruises       Odor     
##  e:4208   b: 452   f:2320     n      :2284   f:4748   n      :3528  
##  p:3916   c:   4   g:   4     g      :1840   t:3376   f      :2160  
##           f:3152   s:2556     e      :1500            s      : 576  
##           k: 828   y:3244     y      :1072            y      : 576  
##           s:  32              w      :1040            a      : 400  
##           x:3656              b      : 168            l      : 400  
##                               (Other): 220            (Other): 484  
##  GillAttachment GillSpacing GillSize   GillColor    StalkShape StalkRoot
##  a: 210         c:6812      b:5612   b      :1728   e:3516     ?:2480   
##  f:7914         w:1312      n:2512   p      :1492   t:4608     b:3776   
##                                      w      :1202              c: 556   
##                                      n      :1048              e:1120   
##                                      g      : 752              r: 192   
##                                      h      : 732                       
##                                      (Other):1170                       
##  StalkAbove StalkBelow   ColorAbove     ColorBelow   VeilType VeilColor
##  f: 552     f: 600     w      :4464   w      :4384   p:8124   n:  96   
##  k:2372     k:2304     p      :1872   p      :1872            o:  96   
##  s:5176     s:4936     g      : 576   g      : 576            w:7924   
##  y:  24     y: 284     n      : 448   n      : 512            y:   8   
##                        b      : 432   b      : 432                     
##                        o      : 192   o      : 192                     
##                        (Other): 140   (Other): 156                     
##  RingNumber RingType   SporeColor   Population Habitat 
##  n:  36     e:2776   w      :2388   a: 384     d:3148  
##  o:7488     f:  48   n      :1968   c: 340     g:2148  
##  t: 600     l:1296   k      :1872   n: 400     l: 832  
##             n:  36   h      :1632   s:1248     m: 292  
##             p:3968   r      :  72   v:4040     p:1144  
##                      b      :  48   y:1712     u: 368  
##                      (Other): 144              w: 192

3. Subset the dataset

My first impression is that odor and color could be an easy way to detect whether a mushroom is poisonous or not. Further analysis, such as feature selection, would be necessary if the project is going to continue.

# Keep five attribute columns plus the Class column, save the subset to disk,
# and print its dimensions.
sub_mush <- select(
  mash_uci,
  Bruises, Odor, GillSize, ColorAbove, SporeColor, Class
)
write_csv(sub_mush, "C:/ds607/sub_mush.csv")
dim(sub_mush)
## [1] 8124    6

4. Replace the abbreviations

# Replace the one-letter codes in each kept column with readable labels.
# In fct_recode() the new label is on the left and the existing code on the
# right.
sub_mush <- sub_mush %>%
  mutate(
    Bruises = fct_recode(
      Bruises,
      "bruises" = "t",
      "no" = "f"
    ),
    # "foul" = "f" was missing from the original mapping, which left the
    # second most frequent odor code (f: 2160 rows in the summary) unrecoded.
    Odor = fct_recode(
      Odor,
      "almond" = "a",
      "anise" = "l",
      "creosote" = "c",
      "fishy" = "y",
      "foul" = "f",
      "musty" = "m",
      "none" = "n",
      "pungent" = "p",
      "spicy" = "s"
    ),
    # NOTE(review): "board" is presumably a typo for "broad"; it is left
    # unchanged here so the label matches the recorded head() output --
    # confirm against the data set's attribute description and fix both.
    GillSize = fct_recode(
      GillSize,
      "board" = "b",
      "narrow" = "n"
    ),
    # The "cinnamon" label was previously split across two lines inside the
    # string literal, which produced a level name starting with a newline
    # and leading spaces.
    ColorAbove = fct_recode(
      ColorAbove,
      "brown" = "n",
      "buff" = "b",
      "cinnamon" = "c",
      "gray" = "g",
      "orange" = "o",
      "pink" = "p",
      "red" = "e",
      "white" = "w",
      "yellow" = "y"
    ),
    SporeColor = fct_recode(
      SporeColor,
      "black" = "k",
      "buff" = "b",
      "brown" = "n",
      "chocolate" = "h",
      "green" = "r",
      "orange" = "o",
      "purple" = "u",
      "white" = "w",
      "yellow" = "y"
    ),
    Class = fct_recode(
      Class,
      "edible" = "e",
      "poisonous" = "p"
    )
  )

5. Take a look at the first 20 rows to make sure the abbreviations are replaced

# Print the first 20 rows to confirm the codes were replaced by labels.
sub_mush %>% head(n = 20)
##    Bruises    Odor GillSize ColorAbove SporeColor     Class
## 1  bruises pungent   narrow      white      black poisonous
## 2  bruises  almond    board      white      brown    edible
## 3  bruises   anise    board      white      brown    edible
## 4  bruises pungent   narrow      white      black poisonous
## 5       no    none    board      white      brown    edible
## 6  bruises  almond    board      white      black    edible
## 7  bruises  almond    board      white      black    edible
## 8  bruises   anise    board      white      brown    edible
## 9  bruises pungent   narrow      white      black poisonous
## 10 bruises  almond    board      white      black    edible
## 11 bruises   anise    board      white      brown    edible
## 12 bruises  almond    board      white      black    edible
## 13 bruises  almond    board      white      brown    edible
## 14 bruises pungent   narrow      white      brown poisonous
## 15      no    none    board      white      black    edible
## 16      no    none   narrow      white      brown    edible
## 17      no    none    board      white      brown    edible
## 18 bruises pungent   narrow      white      black poisonous
## 19 bruises pungent   narrow      white      brown poisonous
## 20 bruises pungent   narrow      white      brown poisonous