Load Libraries

library(tidyverse)

Load Data

The original dataset is located at https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data

# Read the headerless data file from GitHub into a data frame.
# stringsAsFactors = TRUE is set explicitly: R >= 4.0.0 reads strings as
# character by default, while the str() output and the levels() calls later in
# this file expect every column to be a factor.
mushrooms <- read.csv(
  "https://raw.githubusercontent.com/albert-gilharry/data607-assignment1/master/agaricus-lepiota.data",
  header = FALSE,
  stringsAsFactors = TRUE
)

Rename Columns

Rename all columns for the sake of completeness.

# Assign descriptive names to all 23 columns, in file order.
names(mushrooms) <- c(
  "class",
  "cap-shape", "cap-surface", "cap-color",
  "bruises", "odor",
  "gill-attachment", "gill-spacing", "gill-size", "gill-color",
  "stalk-shape", "stalk-root",
  "stalk-surface-above-ring", "stalk-surface-below-ring",
  "stalk-color-above-ring", "stalk-color-below-ring",
  "veil-type", "veil-color",
  "ring-number", "ring-type",
  "spore-print-color", "population", "habitat"
)

Extract Columns of Interest

# Keep only the five columns examined in the rest of this file.
mushrooms <- mushrooms %>%
  select(class, `cap-shape`, `cap-color`, odor, habitat)

Preview Data

# Print the first six rows; values are still single-letter codes at this point.
head(mushrooms, n = 6L)
##   class cap-shape cap-color odor habitat
## 1     p         x         n    p       u
## 2     e         x         y    a       g
## 3     e         b         w    l       m
## 4     p         x         w    p       u
## 5     e         x         g    n       g
## 6     e         x         y    a       g

Transformations are necessary to make more sense of this data.

Examine Data Structure

# Show each column's type and its first few values.
str(object = mushrooms)
## 'data.frame':    8124 obs. of  5 variables:
##  $ class    : Factor w/ 2 levels "e","p": 2 1 1 2 1 1 1 1 2 1 ...
##  $ cap-shape: Factor w/ 6 levels "b","c","f","k",..: 6 6 1 6 6 6 1 1 6 1 ...
##  $ cap-color: Factor w/ 10 levels "b","c","e","g",..: 5 10 9 9 4 10 9 9 9 10 ...
##  $ odor     : Factor w/ 9 levels "a","c","f","l",..: 7 1 4 7 6 1 1 4 7 1 ...
##  $ habitat  : Factor w/ 7 levels "d","g","l","m",..: 6 2 4 6 2 2 4 4 2 4 ...

Replace Abbreviations

Care was taken to ensure that the abbreviations in the factor levels reference the correct entries in the data dictionary.

# Translate coded values into data-dictionary labels through an explicit
# code -> label lookup. Assigning a bare label vector to levels() matches by
# position only, so it mislabels values if the level order ever differs and
# leaves the values untouched when a column is character rather than factor.
decode_levels <- function(x, labels) {
  # x: factor or character vector of codes.
  # labels: named character vector (names = codes, values = labels); its order
  #   becomes the level order of the result.
  # Returns a factor; stops if x holds a non-NA code that `labels` lacks, so
  # an unmapped value can never turn into a silent NA.
  unknown <- setdiff(unique(as.character(x)), c(names(labels), NA))
  if (length(unknown) > 0) {
    stop("Unmapped code(s): ", paste(unknown, collapse = ", "), call. = FALSE)
  }
  factor(x, levels = names(labels), labels = unname(labels))
}

levels(mushrooms$class)
## [1] "e" "p"
mushrooms$class <- decode_levels(
  mushrooms$class,
  c(e = "edible", p = "poisonous")
)

levels(mushrooms$`cap-shape`)
## [1] "b" "c" "f" "k" "s" "x"
mushrooms$`cap-shape` <- decode_levels(
  mushrooms$`cap-shape`,
  c(
    b = "bell", c = "conical", f = "flat",
    k = "knobbed", s = "sunken", x = "convex"
  )
)

levels(mushrooms$`cap-color`)
##  [1] "b" "c" "e" "g" "n" "p" "r" "u" "w" "y"
mushrooms$`cap-color` <- decode_levels(
  mushrooms$`cap-color`,
  c(
    b = "buff", c = "cinnamon", e = "red", g = "gray", n = "brown",
    p = "pink", r = "green", u = "purple", w = "white", y = "yellow"
  )
)

levels(mushrooms$odor)
## [1] "a" "c" "f" "l" "m" "n" "p" "s" "y"
mushrooms$odor <- decode_levels(
  mushrooms$odor,
  c(
    a = "almond", c = "creosote", f = "foul", l = "anise", m = "musty",
    n = "none", p = "pungent", s = "spicy", y = "fishy"
  )
)

levels(mushrooms$habitat)
## [1] "d" "g" "l" "m" "p" "u" "w"
mushrooms$habitat <- decode_levels(
  mushrooms$habitat,
  c(
    d = "woods", g = "grasses", l = "leaves", m = "meadows",
    p = "paths", u = "urban", w = "waste"
  )
)

Transformed Data

# Print the first six rows again, now showing the descriptive labels.
head(x = mushrooms, n = 6L)
##       class cap-shape cap-color    odor habitat
## 1 poisonous    convex     brown pungent   urban
## 2    edible    convex    yellow  almond grasses
## 3    edible      bell     white   anise meadows
## 4 poisonous    convex     white pungent   urban
## 5    edible    convex      gray    none grasses
## 6    edible    convex    yellow  almond grasses

Users now have a better initial understanding of the data.

Summarize Data

# Tabulate how often each label occurs in every column.
summary(object = mushrooms)
##        class        cap-shape      cap-color         odor     
##  edible   :4208   bell   : 452   brown  :2284   none   :3528  
##  poisonous:3916   conical:   4   gray   :1840   foul   :2160  
##                   flat   :3152   red    :1500   spicy  : 576  
##                   knobbed: 828   yellow :1072   fishy  : 576  
##                   sunken :  32   white  :1040   almond : 400  
##                   convex :3656   buff   : 168   anise  : 400  
##                                  (Other): 220   (Other): 484  
##     habitat    
##  woods  :3148  
##  grasses:2148  
##  leaves : 832  
##  meadows: 292  
##  paths  :1144  
##  urban  : 368  
##  waste  : 192