Moldy old Mushroom Data

Load libraries and read in the data

library(RCurl)
library(dplyr)
# Fetch the raw UCI mushroom file as text, then parse that text as
# header-less, comma-separated data without converting strings to factors.
mush_url <- getURL(
  "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"
)
mush_data <- read.csv(
  text = mush_url,
  sep = ",",
  header = FALSE,
  stringsAsFactors = FALSE
)
# Preview the first six rows.
head(mush_data, n = 6L)
##   V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V20
## 1  p  x  s  n  t  p  f  c  n   k   e   e   s   s   w   w   p   w   o   p
## 2  e  x  s  y  t  a  f  c  b   k   e   c   s   s   w   w   p   w   o   p
## 3  e  b  s  w  t  l  f  c  b   n   e   c   s   s   w   w   p   w   o   p
## 4  p  x  y  w  t  p  f  c  n   n   e   e   s   s   w   w   p   w   o   p
## 5  e  x  s  g  f  n  f  w  b   k   t   e   s   s   w   w   p   w   o   e
## 6  e  x  y  y  t  a  f  c  b   n   e   c   s   s   w   w   p   w   o   p
##   V21 V22 V23
## 1   k   s   u
## 2   n   n   g
## 3   n   n   m
## 4   k   s   u
## 5   n   a   g
## 6   k   n   g

Rename columns from the data dictionary

# Name the 23 columns after the attributes in the UCI data dictionary, in file
# order. The 16th attribute is the stalk colour *below* the ring; it was
# previously a second copy of "stalk color above ring", which left the data
# frame with two identically named columns.
colnames(mush_data) <- c(
  "edibility",
  "cap_shape",
  "cap_surface",
  "cap_color",
  "bruises",
  "odor",
  "gill attachment",
  "gill spacing",
  "gill size",
  "gill color",
  "stalk shape",
  "stalk root",
  "stalk surface above ring",
  "stalk surface below ring",
  "stalk color above ring",
  "stalk color below ring",
  "veil type",
  "veil color",
  "ring number",
  "ring type",
  "spore print color",
  "population",
  "habitat"
)

head(mush_data)
##   edibility cap_shape cap_surface cap_color bruises odor gill attachment
## 1         p         x           s         n       t    p               f
## 2         e         x           s         y       t    a               f
## 3         e         b           s         w       t    l               f
## 4         p         x           y         w       t    p               f
## 5         e         x           s         g       f    n               f
## 6         e         x           y         y       t    a               f
##   gill spacing gill size gill color stalk shape stalk root
## 1            c         n          k           e          e
## 2            c         b          k           e          c
## 3            c         b          n           e          c
## 4            c         n          n           e          e
## 5            w         b          k           t          e
## 6            c         b          n           e          c
##   stalk surface above ring stalk surface below ring stalk color above ring
## 1                        s                        s                      w
## 2                        s                        s                      w
## 3                        s                        s                      w
## 4                        s                        s                      w
## 5                        s                        s                      w
## 6                        s                        s                      w
##   stalk color below ring veil type veil color ring number ring type
## 1                      w         p          w           o         p
## 2                      w         p          w           o         p
## 3                      w         p          w           o         p
## 4                      w         p          w           o         p
## 5                      w         p          w           o         e
## 6                      w         p          w           o         p
##   spore print color population habitat
## 1                 k          s       u
## 2                 n          n       g
## 3                 n          n       m
## 4                 k          s       u
## 5                 n          a       g
## 6                 k          n       g

Choose a subset of columns:

# Keep all rows but only the six attributes from edibility through odor.
mush_data_cols <- mush_data[
  ,
  c("edibility", "cap_shape", "cap_surface", "cap_color", "bruises", "odor"),
  drop = FALSE
]
head(mush_data_cols, n = 6L)
##   edibility cap_shape cap_surface cap_color bruises odor
## 1         p         x           s         n       t    p
## 2         e         x           s         y       t    a
## 3         e         b           s         w       t    l
## 4         p         x           y         w       t    p
## 5         e         x           s         g       f    n
## 6         e         x           y         y       t    a

Replace the single-letter codes with meaningful values

# Decode the edibility class. Only the codes "p" and "e" are translated; any
# other value is kept as-is instead of being silently labelled "edible",
# which is how the case_when() decoders for the other attributes behave.
mush_data_cols$edibility <- case_when(
  mush_data_cols$edibility == "p" ~ "poisonous",
  mush_data_cols$edibility == "e" ~ "edible",
  TRUE ~ mush_data_cols$edibility
)

# Expand the one-letter cap-shape codes into words; a code with no mapping
# passes through unchanged.
mush_data_cols <- mutate(
  mush_data_cols,
  cap_shape = case_when(
    cap_shape == "b" ~ "bell",
    cap_shape == "c" ~ "conical",
    cap_shape == "x" ~ "convex",
    cap_shape == "f" ~ "flat",
    cap_shape == "k" ~ "knobbed",
    cap_shape == "s" ~ "sunken",
    TRUE ~ cap_shape
  )
)

# Expand the one-letter cap-surface codes into words; a code with no mapping
# passes through unchanged.
mush_data_cols <- mutate(
  mush_data_cols,
  cap_surface = case_when(
    cap_surface == "f" ~ "fibrous",
    cap_surface == "g" ~ "grooves",
    cap_surface == "y" ~ "scaly",
    cap_surface == "s" ~ "smooth",
    TRUE ~ cap_surface
  )
)

# Expand the one-letter cap-colour codes into colour names; a code with no
# mapping passes through unchanged.
mush_data_cols <- mutate(
  mush_data_cols,
  cap_color = case_when(
    cap_color == "n" ~ "brown",
    cap_color == "b" ~ "buff",
    cap_color == "c" ~ "cinnamon",
    cap_color == "g" ~ "gray",
    cap_color == "r" ~ "green",
    cap_color == "p" ~ "pink",
    cap_color == "u" ~ "purple",
    cap_color == "e" ~ "red",
    cap_color == "w" ~ "white",
    cap_color == "y" ~ "yellow",
    TRUE ~ cap_color
  )
)
# Decode the bruises flag. Only the codes "t" and "f" are translated; any
# other value is kept as-is instead of being silently labelled "no", which is
# how the case_when() decoders for the other attributes behave.
mush_data_cols$bruises <- case_when(
  mush_data_cols$bruises == "t" ~ "bruises",
  mush_data_cols$bruises == "f" ~ "no",
  TRUE ~ mush_data_cols$bruises
)

# Expand the one-letter odor codes into words; a code with no mapping passes
# through unchanged.
mush_data_cols <- mutate(
  mush_data_cols,
  odor = case_when(
    odor == "a" ~ "almond",
    odor == "l" ~ "anise",
    odor == "c" ~ "creosote",
    odor == "y" ~ "fishy",
    odor == "f" ~ "foul",
    odor == "m" ~ "musty",
    odor == "n" ~ "none",
    odor == "p" ~ "pungent",
    odor == "s" ~ "spicy",
    TRUE ~ odor
  )
)

Final subset of the Mushroom Data with meaningful column and data names

# Show the first six rows of the decoded subset.
head(mush_data_cols, n = 6L)
##   edibility cap_shape cap_surface cap_color bruises    odor
## 1 poisonous    convex      smooth     brown bruises pungent
## 2    edible    convex      smooth    yellow bruises  almond
## 3    edible      bell      smooth     white bruises   anise
## 4 poisonous    convex       scaly     white bruises pungent
## 5    edible    convex      smooth      gray      no    none
## 6    edible    convex       scaly    yellow bruises  almond