library(tidyverse)
The original dataset is located at https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data
# Load the raw data from the GitHub mirror of the UCI file. The file has no
# header row, so header = FALSE keeps the first record as data.
# stringsAsFactors is set explicitly: its default became FALSE in R 4.0.0, and
# the later steps call levels() on these columns, which requires factors.
mushrooms <- read.csv(
  "https://raw.githubusercontent.com/albert-gilharry/data607-assignment1/master/agaricus-lepiota.data",
  header = FALSE,
  stringsAsFactors = TRUE
)
Rename all columns for the sake of completeness.
# Attach the 23 attribute names to the unnamed columns, keep only the five
# attributes examined in the rest of the script, and preview the first rows.
mushrooms <- setNames(
  mushrooms,
  c(
    "class", "cap-shape", "cap-surface", "cap-color", "bruises", "odor",
    "gill-attachment", "gill-spacing", "gill-size", "gill-color",
    "stalk-shape", "stalk-root", "stalk-surface-above-ring",
    "stalk-surface-below-ring", "stalk-color-above-ring",
    "stalk-color-below-ring", "veil-type", "veil-color", "ring-number",
    "ring-type", "spore-print-color", "population", "habitat"
  )
)
mushrooms <- mushrooms %>%
  select("class", "cap-shape", "cap-color", "odor", "habitat")
mushrooms %>% head()
## class cap-shape cap-color odor habitat
## 1 p x n p u
## 2 e x y a g
## 3 e b w l m
## 4 p x w p u
## 5 e x g n g
## 6 e x y a g
Transformations are necessary to make more sense of this data.
# Show each column's type and its first few values before recoding.
mushrooms %>% str()
## 'data.frame': 8124 obs. of 5 variables:
## $ class : Factor w/ 2 levels "e","p": 2 1 1 2 1 1 1 1 2 1 ...
## $ cap-shape: Factor w/ 6 levels "b","c","f","k",..: 6 6 1 6 6 6 1 1 6 1 ...
## $ cap-color: Factor w/ 10 levels "b","c","e","g",..: 5 10 9 9 4 10 9 9 9 10 ...
## $ odor : Factor w/ 9 levels "a","c","f","l",..: 7 1 4 7 6 1 1 4 7 1 ...
## $ habitat : Factor w/ 7 levels "d","g","l","m",..: 6 2 4 6 2 2 4 4 2 4 ...
Care was taken to ensure that the abbreviations in the levels reference the correct metadata in the data dictionary.
# Replace the single-letter codes in each column with descriptive labels.
#
# Each code is paired explicitly with its label instead of relying on the
# alphabetical order of the observed factor levels, so a missing or unexpected
# code cannot silently shift every label by one position. The codes are listed
# in alphabetical order so the resulting level order matches the original.
#
# values: a factor or character vector of single-letter codes.
# codes:  the codes that may appear in `values`.
# labels: the descriptive label for each entry of `codes`, in the same order.
# Returns a factor whose levels are `labels`; stops on a code not in `codes`.
relabel_codes <- function(values, codes, labels) {
  stopifnot(length(codes) == length(labels))
  values <- as.character(values)
  unknown <- setdiff(values[!is.na(values)], codes)
  if (length(unknown) > 0) {
    stop(
      "Unrecognised code(s): ", paste(unknown, collapse = ", "),
      call. = FALSE
    )
  }
  factor(values, levels = codes, labels = labels)
}

# The levels() calls only display the codes present before relabelling; they
# print NULL if a column was read as character rather than factor.
levels(mushrooms$class)
## [1] "e" "p"
mushrooms$class <- relabel_codes(
  mushrooms$class,
  codes = c("e", "p"),
  labels = c("edible", "poisonous")
)
levels(mushrooms$`cap-shape`)
## [1] "b" "c" "f" "k" "s" "x"
mushrooms$`cap-shape` <- relabel_codes(
  mushrooms$`cap-shape`,
  codes = c("b", "c", "f", "k", "s", "x"),
  labels = c("bell", "conical", "flat", "knobbed", "sunken", "convex")
)
levels(mushrooms$`cap-color`)
## [1] "b" "c" "e" "g" "n" "p" "r" "u" "w" "y"
mushrooms$`cap-color` <- relabel_codes(
  mushrooms$`cap-color`,
  codes = c("b", "c", "e", "g", "n", "p", "r", "u", "w", "y"),
  labels = c(
    "buff", "cinnamon", "red", "gray", "brown",
    "pink", "green", "purple", "white", "yellow"
  )
)
levels(mushrooms$odor)
## [1] "a" "c" "f" "l" "m" "n" "p" "s" "y"
mushrooms$odor <- relabel_codes(
  mushrooms$odor,
  codes = c("a", "c", "f", "l", "m", "n", "p", "s", "y"),
  labels = c(
    "almond", "creosote", "foul", "anise", "musty",
    "none", "pungent", "spicy", "fishy"
  )
)
levels(mushrooms$habitat)
## [1] "d" "g" "l" "m" "p" "u" "w"
mushrooms$habitat <- relabel_codes(
  mushrooms$habitat,
  codes = c("d", "g", "l", "m", "p", "u", "w"),
  labels = c(
    "woods", "grasses", "leaves", "meadows", "paths", "urban", "waste"
  )
)
# Preview the relabelled data.
head(mushrooms)
## class cap-shape cap-color odor habitat
## 1 poisonous convex brown pungent urban
## 2 edible convex yellow almond grasses
## 3 edible bell white anise meadows
## 4 poisonous convex white pungent urban
## 5 edible convex gray none grasses
## 6 edible convex yellow almond grasses
Users now have a better initial understanding of the data.
# Tabulate how often each label occurs in every retained column.
mushrooms %>% summary()
## class cap-shape cap-color odor
## edible :4208 bell : 452 brown :2284 none :3528
## poisonous:3916 conical: 4 gray :1840 foul :2160
## flat :3152 red :1500 spicy : 576
## knobbed: 828 yellow :1072 fishy : 576
## sunken : 32 white :1040 almond : 400
## convex :3656 buff : 168 anise : 400
## (Other): 220 (Other): 484
## habitat
## woods :3148
## grasses:2148
## leaves : 832
## meadows: 292
## paths :1144
## urban : 368
## waste : 192