In this assignment we are tasked with downloading a famous dataset about mushrooms from https://archive.ics.uci.edu/ml/datasets/Mushroom, loading the data into a data frame, giving the columns meaningful names, and keeping a subset of the columns for a closer look.

Load Data

Import file as CSV

The data file has no header row, so we must indicate “header = FALSE” when reading it.

# Read the mushroom data directly from the UCI repository.
# header = FALSE: the file has no header row, so read.csv assigns the default
# column names V1, V2, ... instead of consuming the first record as names.
# na.strings: the UCI data description documents "?" as the marker for a
# missing attribute value; mapping it to NA keeps it from being treated as a
# real category. "NA" is kept in the vector to preserve read.csv's default.
mushrooms_df <- read.csv(
  "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data",
  header = FALSE,
  sep = ",",
  na.strings = c("NA", "?")
)
# Preview the first rows to confirm the import.
head(mushrooms_df)
##   V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V20
## 1  p  x  s  n  t  p  f  c  n   k   e   e   s   s   w   w   p   w   o   p
## 2  e  x  s  y  t  a  f  c  b   k   e   c   s   s   w   w   p   w   o   p
## 3  e  b  s  w  t  l  f  c  b   n   e   c   s   s   w   w   p   w   o   p
## 4  p  x  y  w  t  p  f  c  n   n   e   e   s   s   w   w   p   w   o   p
## 5  e  x  s  g  f  n  f  w  b   k   t   e   s   s   w   w   p   w   o   e
## 6  e  x  y  y  t  a  f  c  b   n   e   c   s   s   w   w   p   w   o   p
##   V21 V22 V23
## 1   k   s   u
## 2   n   n   g
## 3   n   n   m
## 4   k   s   u
## 5   n   a   g
## 6   k   n   g

Change column names

# Replace the default V1, V2, ... column names with the attribute names,
# listed in the same order as the columns appear in the file.
names(mushrooms_df) <- c(
  "Edibility",
  "Cap Shape", "Cap Surface", "Cap Color",
  "Bruises?", "Odor",
  "Gill Attachment", "Gill Spacing", "Gill Size", "Gill Color",
  "Stalk Shape", "Stalk Root",
  "Stalk, surface Above Ring", "Stalk, surface below ring",
  "Stalk, color above ring", "Stalk, color below ring",
  "veil type", "veil color",
  "ring number", "ring type",
  "spore print color", "population", "habitat"
)
# Preview the data again to check the new names.
head(mushrooms_df)
##   Edibility Cap Shape Cap Surface Cap Color Bruises? Odor Gill Attachment
## 1         p         x           s         n        t    p               f
## 2         e         x           s         y        t    a               f
## 3         e         b           s         w        t    l               f
## 4         p         x           y         w        t    p               f
## 5         e         x           s         g        f    n               f
## 6         e         x           y         y        t    a               f
##   Gill Spacing Gill Size Gill Color Stalk Shape Stalk Root
## 1            c         n          k           e          e
## 2            c         b          k           e          c
## 3            c         b          n           e          c
## 4            c         n          n           e          e
## 5            w         b          k           t          e
## 6            c         b          n           e          c
##   Stalk, surface Above Ring Stalk, surface below ring
## 1                         s                         s
## 2                         s                         s
## 3                         s                         s
## 4                         s                         s
## 5                         s                         s
## 6                         s                         s
##   Stalk, color above ring Stalk, color below ring veil type veil color
## 1                       w                       w         p          w
## 2                       w                       w         p          w
## 3                       w                       w         p          w
## 4                       w                       w         p          w
## 5                       w                       w         p          w
## 6                       w                       w         p          w
##   ring number ring type spore print color population habitat
## 1           o         p                 k          s       u
## 2           o         p                 n          n       g
## 3           o         p                 n          n       m
## 4           o         p                 k          s       u
## 5           o         e                 n          a       g
## 6           o         p                 k          n       g

My subset will be based on Edibility, as required, and also odor, population, and habitat.

# Keep all rows but only the four columns used in the rest of the analysis.
# drop = FALSE guarantees the result stays a data frame.
subshroom_df <- mushrooms_df[
  ,
  c("Edibility", "Odor", "population", "habitat"),
  drop = FALSE
]
head(subshroom_df)
##   Edibility Odor population habitat
## 1         p    p          s       u
## 2         e    a          n       g
## 3         e    l          n       m
## 4         p    p          s       u
## 5         e    n          a       g
## 6         e    a          n       g

To reassign values in the data frame, the plyr library is useful.

library(plyr)

Change column values to descriptive names

Let’s take a look at the data to see if there are any interesting patterns. Since these data are categorical, pie charts might be handy for looking at how the mushrooms are split between edible and poisonous. I used http://www.statmethods.net/graphs/pie.html as a guide.

# Count how many mushrooms fall into each edibility category.
Edibility_pie <- table(subshroom_df[["Edibility"]])
# Build one label per slice: category name, a line break, then its count.
# paste() separates the pieces with a single space by default.
lbls <- paste(names(Edibility_pie), "\n", Edibility_pie)
pie(
  x = Edibility_pie,
  labels = lbls,
  main = "Pie Chart of Mushroom Edibility\n (with sample sizes)"
)

Next we will look at Odor

# Count how many mushrooms fall into each odor category.
Odor_pie <- table(subshroom_df[["Odor"]])
# Build one label per slice: category name, a line break, then its count.
# paste() separates the pieces with a single space by default.
lbls <- paste(names(Odor_pie), "\n", Odor_pie)
pie(
  x = Odor_pie,
  labels = lbls,
  main = "Mushroom Odor Pie Chart\n (with sample sizes)"
)

That’s a little crowded, though I am surprised to find “none” as the largest category. Let’s try a bar chart.

# Tabulate the odor categories and draw one bar per category.
odor_barplot <- table(subshroom_df[["Odor"]])
barplot(
  height = odor_barplot,
  main = "Mushroom Odor Distribution",
  xlab = "Odor Types",
  cex.names = 0.45  # shrink the axis labels so every category name fits
)

I had to shrink the odor type labels so that all of the types would be visible. We will look at population next.

# Tabulate the population categories and draw one bar per category.
pop_barplot <- table(subshroom_df[["population"]])
barplot(
  height = pop_barplot,
  main = "Mushroom Population Distribution",
  xlab = "Population Types",
  cex.names = 0.45  # small axis labels
)

Finally we will look at habitat.

# Tabulate the habitat categories and draw one bar per category.
hab_barplot <- table(subshroom_df[["habitat"]])
barplot(
  height = hab_barplot,
  main = "Mushroom Habitat Distribution",
  xlab = "Habitat Types",
  cex.names = 0.45  # small axis labels
)