library(data.table)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(readr)
From the Data Dictionary: This data set includes descriptions of hypothetical samples corresponding to 23 species of gilled mushrooms in the Agaricus and Lepiota Family (pp. 500-525). Each species is identified as definitely edible, definitely poisonous, or of unknown edibility and not recommended. This latter class was combined with the poisonous one. The Guide clearly states that there is no simple rule for determining the edibility of a mushroom; no rule like "leaflets three, let it be" for Poisonous Oak and Ivy.
# Location of the mushroom data file (comma-separated, no header row).
mushroomURL <- "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"
# Read the file as a header-less CSV. read.csv() already returns a
# data.frame, so no further conversion is needed. stringsAsFactors = FALSE
# keeps the single-letter codes as character vectors on R versions before
# 4.0 too; the code-to-label replacement further down relies on that.
mushroomData <- read.csv(
  mushroomURL,
  header = FALSE,
  sep = ",",
  stringsAsFactors = FALSE
)
# Number of columns read from the file.
ncol(mushroomData)
## [1] 23
# Number of rows read from the file.
nrow(mushroomData)
## [1] 8124
# Preview the first rows; with header = FALSE the columns carry the
# default names V1, V2, ...
head(mushroomData)
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V20 V21
## 1 p x s n t p f c n k e e s s w w p w o p k
## 2 e x s y t a f c b k e c s s w w p w o p n
## 3 e b s w t l f c b n e c s s w w p w o p n
## 4 p x y w t p f c n n e e s s w w p w o p k
## 5 e x s g f n f w b k t e s s w w p w o e n
## 6 e x y y t a f c b n e c s s w w p w o p k
## V22 V23
## 1 s u
## 2 n g
## 3 n m
## 4 s u
## 5 a g
## 6 n g
# Rows and columns in a single call.
dim(mushroomData)
## [1] 8124 23
# Count how many rows carry each code in the first column (V1).
table(mushroomData$V1)
##
## e p
## 4208 3916
# List the distinct codes that occur in V1.
unique(mushroomData$V1)
## [1] "p" "e"
# Descriptive column names, one per column position of the raw data.
headers <- c(
  "edib-or-poison",
  "cap-shape",
  "cap-surface",
  "cap-color",
  "bruises?",
  "odor",
  "gill-attachment",
  "gill-spacing",
  "gill-size",
  "gill-color",
  "stalk-shape",
  "stalk-root",
  "stalk-surface-above-ring",
  "stalk-surface-below-ring",
  "stalk-color-above-ring",
  "stalk-color-below-ring",
  "veil-type",
  "veil-color",
  "ring-number",
  "ring-type",
  "spore-print-color",
  "population",
  "habitat"
)
# Lookup table that translates single-letter codes into readable labels.
# Result: a character data.frame with one row per (column, code) pair and
# columns X1 = column name, X2 = code, X3 = label.
relabels <- local({
  # For each column: a named vector whose names are the codes and whose
  # values are the labels.
  label_maps <- list(
    "edib-or-poison" = c(e = "edible", p = "poisonous"),
    "odor" = c(
      a = "almond", l = "anise", c = "creosote", y = "fishy", f = "foul",
      m = "musty", n = "none", p = "pungent", s = "spicy"
    ),
    "cap-color" = c(
      n = "brown", b = "buff", c = "cinnamon", g = "gray", r = "green",
      p = "pink", u = "purple", e = "red", w = "white", y = "yellow"
    ),
    "population" = c(
      a = "abundant", c = "clustered", n = "numerous", s = "scattered",
      v = "several", y = "solitary"
    ),
    "habitat" = c(
      g = "grasses", l = "leaves", m = "meadows", p = "paths",
      u = "urban", w = "waste", d = "woods"
    )
  )
  # Flatten the nested mapping into three parallel columns.
  data.frame(
    X1 = rep(names(label_maps), lengths(label_maps)),
    X2 = unlist(lapply(label_maps, names), use.names = FALSE),
    X3 = unlist(label_maps, use.names = FALSE),
    stringsAsFactors = FALSE
  )
})
# Give the leading columns their descriptive names in one vectorised
# assignment. seq_along() replaces 1:length(), which would iterate over
# c(1, 0) if `headers` were empty.
names(mushroomData)[seq_along(headers)] <- headers
# Keep only the class column and the four attributes that get relabelled.
mushroomData <- mushroomData %>%
  select(`edib-or-poison`, odor, `cap-color`, population, habitat)
# Replace each single-letter code with its label, one row of `relabels`
# (X1 = column name, X2 = code, X3 = label) at a time.
# seq_len(nrow()) gives an empty sequence when `relabels` has no rows,
# whereas 1:length() would iterate over c(1, 0).
for (i in seq_len(nrow(relabels))) {
  column <- relabels$X1[i]
  is_code <- mushroomData[[column]] == relabels$X2[i]
  mushroomData[[column]] <- replace(
    mushroomData[[column]],
    is_code,
    relabels$X3[i]
  )
}
# Show the first rows after renaming, column selection and relabelling.
head(mushroomData)
## edib-or-poison odor cap-color population habitat
## 1 poisonous pungent brown scattered urban
## 2 edible almond yellow numerous grasses
## 3 edible anise white numerous meadows
## 4 poisonous pungent white scattered urban
## 5 edible none gray abundant grasses
## 6 edible almond yellow numerous grasses
# Show the last rows of the relabelled data.
tail(mushroomData)
## edib-or-poison odor cap-color population habitat
## 8119 poisonous foul brown several woods
## 8120 edible none brown clustered leaves
## 8121 edible none brown several leaves
## 8122 edible none brown clustered leaves
## 8123 poisonous fishy brown several leaves
## 8124 edible none brown clustered leaves