Change column names
# Replace the column names of the mushroom data with descriptive labels,
# listed in column order, then preview the first rows.
names(mushrooms_df) <- c(
  "Edibility",
  "Cap Shape", "Cap Surface", "Cap Color",
  "Bruises?", "Odor",
  "Gill Attachment", "Gill Spacing", "Gill Size", "Gill Color",
  "Stalk Shape", "Stalk Root",
  "Stalk, surface Above Ring", "Stalk, surface below ring",
  "Stalk, color above ring", "Stalk, color below ring",
  "veil type", "veil color",
  "ring number", "ring type",
  "spore print color", "population", "habitat"
)
head(mushrooms_df)
## Edibility Cap Shape Cap Surface Cap Color Bruises? Odor Gill Attachment
## 1 p x s n t p f
## 2 e x s y t a f
## 3 e b s w t l f
## 4 p x y w t p f
## 5 e x s g f n f
## 6 e x y y t a f
## Gill Spacing Gill Size Gill Color Stalk Shape Stalk Root
## 1 c n k e e
## 2 c b k e c
## 3 c b n e c
## 4 c n n e e
## 5 w b k t e
## 6 c b n e c
## Stalk, surface Above Ring Stalk, surface below ring
## 1 s s
## 2 s s
## 3 s s
## 4 s s
## 5 s s
## 6 s s
## Stalk, color above ring Stalk, color below ring veil type veil color
## 1 w w p w
## 2 w w p w
## 3 w w p w
## 4 w w p w
## 5 w w p w
## 6 w w p w
## ring number ring type spore print color population habitat
## 1 o p k s u
## 2 o p n n g
## 3 o p n n m
## 4 o p k s u
## 5 o e n a g
## 6 o p k n g
My subset will be based on Edibility, as required, and also odor, population, and habitat.
# Keep only the four columns used in the rest of the analysis; all rows are
# retained. drop = FALSE guarantees the result stays a data frame.
kept_columns <- c("Edibility", "Odor", "population", "habitat")
subshroom_df <- mushrooms_df[, kept_columns, drop = FALSE]
head(subshroom_df)
## Edibility Odor population habitat
## 1 p p s u
## 2 e a n g
## 3 e l n m
## 4 p p s u
## 5 e n a g
## 6 e a n g
To reassign values in the data frame, the plyr library is useful.
library(plyr)
Change the column values to descriptive names (listed as name=code):
- classes: edible=e, poisonous=p
- odor: almond=a, anise=l, creosote=c, fishy=y, foul=f, musty=m, none=n, pungent=p, spicy=s
- population: abundant=a, clustered=c, numerous=n, scattered=s, several=v, solitary=y
- habitat: grasses=g, leaves=l, meadows=m, paths=p, urban=u, waste=w, woods=d
# Replace the single-letter codes in each column with descriptive labels.
# Every column gets one lookup vector (code = label) and a single revalue()
# call, so the full mapping for a column can be read in one place.
edibility_labels <- c("e" = "edible", "p" = "poisonous")

odor_labels <- c(
  "a" = "almond", "l" = "anise", "c" = "creosote",
  "y" = "fishy", "f" = "foul", "m" = "musty",
  "n" = "none", "p" = "pungent", "s" = "spicy"
)

population_labels <- c(
  "a" = "abundant", "c" = "clustered", "n" = "numerous",
  "s" = "scattered", "v" = "several", "y" = "solitary"
)

# "p" is labelled "paths" (previously misspelled "pathes").
habitat_labels <- c(
  "g" = "grasses", "l" = "leaves", "m" = "meadows", "p" = "paths",
  "u" = "urban", "w" = "waste", "d" = "woods"
)

subshroom_df$Edibility <- revalue(subshroom_df$Edibility, edibility_labels)
subshroom_df$Odor <- revalue(subshroom_df$Odor, odor_labels)
subshroom_df$population <- revalue(subshroom_df$population, population_labels)
subshroom_df$habitat <- revalue(subshroom_df$habitat, habitat_labels)

# Preview the recoded data.
head(subshroom_df)
## Edibility Odor population habitat
## 1 poisonous pungent scattered urban
## 2 edible almond numerous grasses
## 3 edible anise numerous meadows
## 4 poisonous pungent scattered urban
## 5 edible none abundant grasses
## 6 edible almond numerous grasses
Let’s take a look at the data to see if there are any interesting patterns. Since these data are categorical, pie charts might be handy for looking at how the proportions of edible vs. poisonous mushrooms are distributed. I used http://www.statmethods.net/graphs/pie.html as a guide.
# Count the mushrooms in each edibility class and draw the counts as a pie
# chart; each slice label shows the class name with its count on a new line.
Edibility_pie <- table(subshroom_df[["Edibility"]])
lbls <- paste(names(Edibility_pie), Edibility_pie, sep = " \n ")
pie(
  Edibility_pie,
  labels = lbls,
  main = "Pie Chart of Mushroom Edibility\n (with sample sizes)"
)

Next we will look at Odor
# Count the mushrooms per odor type and draw the counts as a pie chart;
# each slice label shows the odor name with its count on a new line.
Odor_pie <- table(subshroom_df[["Odor"]])
lbls <- paste(names(Odor_pie), Odor_pie, sep = " \n ")
pie(
  Odor_pie,
  labels = lbls,
  main = "Mushroom Odor Pie Chart\n (with sample sizes)"
)

That’s a little crowded, though I am surprised to find “none” as the largest category. Let’s try a bar chart.
# Bar chart of the number of mushrooms per odor type.
odor_barplot <- table(subshroom_df[["Odor"]])
barplot(
  odor_barplot,
  main = "Mushroom Odor Distribution",
  xlab = "Odor Types",
  cex.names = 0.45  # shrink the bar labels so every odor name is drawn
)

I had to make the odor type labels print small so that all types would be visible. We will look at population next.
# Bar chart of the number of mushrooms per population type.
pop_barplot <- table(subshroom_df[["population"]])
barplot(
  pop_barplot,
  main = "Mushroom Population Distribution",
  xlab = "Population Types",
  cex.names = 0.45  # same reduced bar-label size as the odor chart
)

Finally we will look at habitat.
# Bar chart of the number of mushrooms per habitat type.
hab_barplot <- table(subshroom_df[["habitat"]])
barplot(
  hab_barplot,
  main = "Mushroom Habitat Distribution",
  xlab = "Habitat Types",
  cex.names = 0.45  # same reduced bar-label size as the other bar charts
)
