Submitted by Zachary Herold
First, I load some useful packages.
## Warning: package 'dplyr' was built under R version 3.5.1
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
I set the URL of the dataset as a variable, performing the read.csv function on it, before converting the raw data into a dataframe. I wanted to manipulate the data as strings, rather than factors, and so I used lapply, with the argument as.character. I also set stringsAsFactors = F, but I did not test if this has any concrete effect.
# Location of the UCI mushroom data set (comma-separated, one record per line).
url1 <- "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"

# The file has no header row. With the default header = TRUE the first record
# is consumed as column names and silently dropped from the data (8123 rows
# instead of 8124). colClasses = "character" keeps every code as a string.
x1 <- read.csv(url1, header = FALSE, colClasses = "character")

# Later steps refer to columns by the names read.csv() used to derive from the
# first record ("p", "n", "p.1", "k.1", "u", ...). Rebuild exactly those names
# so that code keeps working, while the first record now stays in the data.
names(x1) <- make.names(unlist(x1[1, ], use.names = FALSE), unique = TRUE)

# as.character is a safeguard only; the columns are already character.
data <- data.frame(lapply(x1, as.character), stringsAsFactors = FALSE)
str(data)
## 'data.frame': 8123 obs. of 23 variables:
## $ p : chr "e" "e" "p" "e" ...
## $ x : chr "x" "b" "x" "x" ...
## $ s : chr "s" "s" "y" "s" ...
## $ n : chr "y" "w" "w" "g" ...
## $ t : chr "t" "t" "t" "f" ...
## $ p.1: chr "a" "l" "p" "n" ...
## $ f : chr "f" "f" "f" "f" ...
## $ c : chr "c" "c" "c" "w" ...
## $ n.1: chr "b" "b" "n" "b" ...
## $ k : chr "k" "n" "n" "k" ...
## $ e : chr "e" "e" "e" "t" ...
## $ e.1: chr "c" "c" "e" "e" ...
## $ s.1: chr "s" "s" "s" "s" ...
## $ s.2: chr "s" "s" "s" "s" ...
## $ w : chr "w" "w" "w" "w" ...
## $ w.1: chr "w" "w" "w" "w" ...
## $ p.2: chr "p" "p" "p" "p" ...
## $ w.2: chr "w" "w" "w" "w" ...
## $ o : chr "o" "o" "o" "o" ...
## $ p.3: chr "p" "p" "p" "e" ...
## $ k.1: chr "n" "n" "k" "n" ...
## $ s.3: chr "n" "n" "s" "a" ...
## $ u : chr "g" "m" "u" "g" ...
Using dplyr’s select function with the given headers as arguments, I reduce the dataframe to the columns specifying edible vs. poisonous, cap color, odor, spore print color and habitat, as indicated in the dataset attribution information found at https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.names.
# Keep only the five attributes of interest:
#   p   = edible vs. poisonous
#   n   = cap color
#   p.1 = odor
#   k.1 = spore print color
#   u   = habitat
data_sub <- select(data, p, n, p.1, k.1, u)
print(head(data_sub))
## p n p.1 k.1 u
## 1 e y a n g
## 2 e w l n m
## 3 p w p k u
## 4 e g n n g
## 5 e y a k g
## 6 e w a k m
I rename the values so that they are no longer abbreviated. For this I use plyr’s revalue function.
# Lookup tables mapping the single-letter codes to readable labels.
class_labels <- c("e" = "edible", "p" = "poison")
cap_color_labels <- c(
  "n" = "brown", "b" = "buff", "c" = "cinnamon", "g" = "gray", "r" = "green",
  "p" = "pink", "u" = "purple", "e" = "red", "w" = "white", "y" = "yellow"
)
odor_labels <- c(
  "a" = "almond", "l" = "anise", "c" = "creosote", "y" = "fishy", "f" = "foul",
  "m" = "musty", "n" = "none", "p" = "pungent", "s" = "spicy"
)
spore_color_labels <- c(
  "k" = "black", "n" = "brown", "b" = "buff", "h" = "chocolate", "r" = "green",
  "o" = "orange", "u" = "purple", "w" = "white", "y" = "yellow"
)
habitat_labels <- c(
  "g" = "grasses", "l" = "leaves", "m" = "meadows", "p" = "paths",
  "u" = "urban", "w" = "waste", "d" = "woods"
)

# Replace the abbreviated codes column by column with plyr's revalue().
# The cap color label "cinnamon" was previously misspelled "cinnamom".
data_sub$p <- revalue(data_sub$p, class_labels)
data_sub$n <- revalue(data_sub$n, cap_color_labels)
data_sub$p.1 <- revalue(data_sub$p.1, odor_labels)
data_sub$k.1 <- revalue(data_sub$k.1, spore_color_labels)
data_sub$u <- revalue(data_sub$u, habitat_labels)
print(head(data_sub))
## p n p.1 k.1 u
## 1 edible yellow almond brown grasses
## 2 edible white anise brown meadows
## 3 poison white pungent black urban
## 4 edible gray none brown grasses
## 5 edible yellow almond black grasses
## 6 edible white almond black meadows
I rename the columns only after revaluing the values, which keeps the code for the previous revaluing step as compact as possible.
# Map each abbreviated column name to its descriptive replacement.
new_names <- c(
  "p" = "poisonous",
  "n" = "cap_color",
  "p.1" = "odor",
  "k.1" = "spore_color",
  "u" = "habitat"
)

# Locate every old name explicitly. Comparing colnames() to a vector with `==`
# is elementwise and only works while the columns sit in exactly that order.
name_pos <- match(names(new_names), colnames(data_sub))
stopifnot(!anyNA(name_pos))
colnames(data_sub)[name_pos] <- unname(new_names)
print(head(data_sub))
## poisonous cap_color odor spore_color habitat
## 1 edible yellow almond brown grasses
## 2 edible white anise brown meadows
## 3 poison white pungent black urban
## 4 edible gray none brown grasses
## 5 edible yellow almond black grasses
## 6 edible white almond black meadows
As a last step, I filter out all the edible mushrooms, then drop the previous edible vs. poison column by making it NULL. I discover that all the poisonous mushrooms are found in an urban or grasses habitat. So I rearrange the order of the columns, beginning with the habitat classifier, and putting the odor last, which is almost always pungent.
# Keep only the rows labelled "poison".
data_poison <- filter(data_sub, poisonous == "poison")
## Warning: package 'bindrcpp' was built under R version 3.5.1
# The class column is constant after filtering, so drop it.
data_poison$poisonous <- NULL
# Reorder by name rather than by position, so the result does not depend on
# where each column happens to sit after the drop above.
data_poison <- data_poison[c("habitat", "spore_color", "cap_color", "odor")]
# Sort by habitat, then spore color, then cap color.
data_poison_sorted <- arrange(data_poison, habitat, spore_color, cap_color)
head(data_poison_sorted)
## habitat spore_color cap_color odor
## 1 grasses black brown pungent
## 2 grasses black brown pungent
## 3 grasses black brown pungent
## 4 grasses black brown pungent
## 5 grasses black brown pungent
## 6 grasses black brown pungent