Libraries loaded: plyr, stringr, tidyverse, reshape2
## Load mushroom data from URL
## The UCI file has no header row: with header = TRUE the first record
## (p,x,s,n,t,p,...) was consumed as column names and lost from the data.
## header = FALSE keeps it; columns are then named V1..V23.
## stringsAsFactors = TRUE keeps the columns as factors on R >= 4.0, which
## the levels() checks below rely on. read.csv() already returns a
## data.frame, so no as.data.frame() step is needed.
## NOTE(review): the recorded output below (whitespace collapsed) was
## updated by hand for the header fix -- re-run to confirm.
mushrooms <- read.csv(
  url("https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"),
  header = FALSE,
  stringsAsFactors = TRUE
)
## Check data
head(mushrooms)
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V20 V21 V22 V23
## 1 p x s n t p f c n k e e s s w w p w o p k s u
## 2 e x s y t a f c b k e c s s w w p w o p n n g
## 3 e b s w t l f c b n e c s s w w p w o p n n m
## 4 p x y w t p f c n n e e s s w w p w o p k s u
## 5 e x s g f n f w b k t e s s w w p w o e n a g
## 6 e x y y t a f c b n e c s s w w p w o p k n g
## subset columns for class (V1), cap-color (V4), odor (V6), and habitat (V23)
shrooms <- mushrooms[, c("V1", "V4", "V6", "V23")]
## Check data
head(shrooms)
## V1 V4 V6 V23
## 1 p n p u
## 2 e y a g
## 3 e w l m
## 4 p w p u
## 5 e g n g
## 6 e y a g
## rename columns using plyr
## (namespaced: dplyr, attached via tidyverse, also exports rename() with a
## different argument convention and would mask plyr's version)
shrooms <- plyr::rename(
  shrooms,
  c("V1" = "poison_class", "V4" = "cap_color", "V6" = "odor", "V23" = "habitat")
)
## check levels, missing values in shrooms$poison_class
levels(shrooms$poison_class)
## [1] "e" "p"
## anyNA() tests for missing values; is.null() only tests whether the
## column exists at all
anyNA(shrooms$poison_class)
## [1] FALSE
## rename levels using str_detect (in stringr package) and ifelse
## Note: this only works because there are only two levels
shrooms$poison_class <- ifelse(
  str_detect(as.character(shrooms$poison_class), "e"),
  "edible",
  "poisonous"
)
## repeat for remaining columns, levels
levels(shrooms$cap_color)
## [1] "b" "c" "e" "g" "n" "p" "r" "u" "w" "y"
## result: too many levels for ifelse! use a code -> label lookup instead

## Replace single-letter codes with readable labels.
##   x      : factor or character vector of codes
##   labels : named character vector, names = codes, values = labels
## Codes without an entry in `labels` (and NA) are left unchanged, matching
## what a chain of anchored gsub("^code$", ...) calls would do.
## Returns a factor, for easy checking work.
recode_codes <- function(x, labels) {
  x <- as.character(x)
  known <- x %in% names(labels)
  x[known] <- unname(labels[x[known]])
  factor(x)
}

cap_color_labels <- c(
  b = "buff", e = "red", n = "brown", c = "cinnamon", g = "grey",
  r = "green", w = "white", p = "pink", y = "yellow", u = "purple"
)
odor_labels <- c(
  a = "almond", l = "anise", c = "creosote", y = "fishy", f = "foul",
  m = "musty", n = "none", p = "pungent", s = "spicy"
)
habitat_labels <- c(
  d = "woods", w = "waste", l = "leaves", g = "grasses", m = "meadows",
  p = "paths", u = "urban"
)

shrooms$cap_color <- recode_codes(shrooms$cap_color, cap_color_labels)
shrooms$odor <- recode_codes(shrooms$odor, odor_labels)
shrooms$habitat <- recode_codes(shrooms$habitat, habitat_labels)
## Check data
head(shrooms)
## poison_class cap_color odor habitat
## 1 poisonous brown pungent urban
## 2 edible yellow almond grasses
## 3 edible white anise meadows
## 4 poisonous white pungent urban
## 5 edible grey none grasses
## 6 edible yellow almond grasses
tail(shrooms)
## poison_class cap_color odor habitat
## 8119 poisonous brown foul woods
## 8120 edible brown none leaves
## 8121 edible brown none leaves
## 8122 edible brown none leaves
## 8123 poisonous brown fishy leaves
## 8124 edible brown none leaves
Graphing poison_class against each of the subsetted characteristics
# poison_class vs. odor
# Cross-tabulate class (rows) against odor (columns); with beside = TRUE each
# odor gets one bar per class, coloured in row order.
poison_by_odor <- table(shrooms[c(1, 3)])
barplot(
  poison_by_odor,
  legend.text = TRUE,
  beside = TRUE,
  col = c("light blue", "dark orange"),
  xlab = "Odor",  # was misspelled "Odort"
  ylab = "Species Count",
  main = "Frequency of Poisonous Mushrooms by Odor"
)
# poison_class vs. habitat
# Counts of each class (rows) per habitat (columns), drawn as grouped bars:
# one bar per class within each habitat, coloured in row order.
poison_by_habitat <- table(shrooms[, c(1, 4)])
barplot(
  height = poison_by_habitat,
  beside = TRUE,
  legend.text = TRUE,
  col = c("light blue", "dark orange"),
  main = "Frequency of Poisonous Mushrooms by Habitat",
  xlab = "Habitat",
  ylab = "Species Count"
)
# poison_class vs. cap_color
# Stored under its own name: this table was previously assigned to
# poison_by_habitat, overwriting the habitat table with cap-colour counts.
poison_by_cap_color <- table(shrooms[c(1, 2)])
barplot(
  poison_by_cap_color,
  legend.text = TRUE,
  beside = TRUE,
  col = c("light blue", "dark orange"),
  xlab = "Cap Color",
  ylab = "Species Count",
  main = "Frequency of Poisonous Mushrooms by Cap Color"
)
# fancy color scale using ggplot
# Count mushrooms per cap colour, reshape the 1-d table to long form
# (one row per colour) and sort ascending by count.
cap_color <- table(shrooms[2])
# melt() already returns a data frame, so no as.data.frame() step is needed
cap_color_melt <- melt(cap_color) %>% arrange(value)
# Plotting frame: a running id, the colour label and its count.
# id follows the actual number of rows instead of a hard-coded 1:10, so the
# frame still builds if the number of cap colours in the data changes.
data <- data.frame(
  id = seq_len(nrow(cap_color_melt)),
  individual = cap_color_melt[[1]],
  value = cap_color_melt[[2]]
)
# Make the plot
# Fill colours are keyed by cap-colour label. An unnamed vector is matched to
# the factor levels purely by position (alphabetical order), so any added or
# missing level would silently shift every colour; a named vector is matched
# by value instead.
cap_color_fills <- c(
  brown = "#8B4513",
  buff = "#f0dc82",
  cinnamon = "#9d4535",
  green = "#008000",
  grey = "gray",
  pink = "#FFC0CB",
  purple = "purple",
  red = "red",
  white = "white",
  yellow = "#FFFF99"
)
p <- ggplot(data, aes(x = as.factor(individual), y = value)) +
  geom_bar(stat = "identity", aes(fill = individual)) +
  scale_fill_manual("legend", values = cap_color_fills) +
  ggtitle("Mushroom Species Frequency by Cap Colors") +
  labs(y = "Number of Species", x = "Cap Color")
p