Mushroom Dataset

This assignment uses a dataset about mushrooms in the Agaricus and Lepiota Family. The data set can be found in the UCI repository https://archive.ics.uci.edu/ml/datasets/Mushroom.

We take the data, and create a data frame with a subset of the attributes in the dataset. We include the first attribute that indicates edible or poisonous (“type”) and four other attributes (“cap_shape”, “cap_color”, “odor”, and “population”).

Load packages

library(stringr)
library(XML)
library(RCurl)
## Loading required package: bitops

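I didn’t use the next block of code because the first line of the data it returns contains numbers mixed together with the letters, and those numbers are not in the data on the web page. Consequently, the first line ends up with different columns from the rest of the data. I could not work out why at the time; the likely cause is that read.csv defaults to header = TRUE, so the first record is consumed as column names and duplicated names are made unique with numeric suffixes (for example p.1, s.1). Passing header = FALSE would avoid this.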

#table <-read.csv("https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data", sep=",")
#head(table)

Of course, I could use regular expressions to clean up the numbers that appeared but, instead, I did the following to retrieve the data.

Parsing data from web site

# Open a read-only connection to the raw UCI data file, read every record into
# a character vector (one element per line), and then close the connection.
# The connection is opened explicitly by url(..., "r"), so readLines() does not
# close it for us; without close() it would stay open and leak.
con <- url("https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data", "r")
data <- readLines(con)
close(con)
head(data)
## [1] "p,x,s,n,t,p,f,c,n,k,e,e,s,s,w,w,p,w,o,p,k,s,u"
## [2] "e,x,s,y,t,a,f,c,b,k,e,c,s,s,w,w,p,w,o,p,n,n,g"
## [3] "e,b,s,w,t,l,f,c,b,n,e,c,s,s,w,w,p,w,o,p,n,n,m"
## [4] "p,x,y,w,t,p,f,c,n,n,e,e,s,s,w,w,p,w,o,p,k,s,u"
## [5] "e,x,s,g,f,n,f,w,b,k,t,e,s,s,w,w,p,w,o,e,n,a,g"
## [6] "e,x,y,y,t,a,f,c,b,n,e,c,s,s,w,w,p,w,o,p,k,n,g"

Write the data to a temporary CSV file and read it back as a data frame

# Round-trip the raw lines through a temporary file so read.csv() can parse
# them. The widest record decides how many columns (V1, V2, ...) are created,
# and fill = TRUE pads any shorter record.
tf <- tempfile()
writeLines(data, con = tf)
ncol <- max(count.fields(tf, sep = ","))
raw_col_names <- paste0("V", seq_len(ncol))
df <- read.csv(tf, header = FALSE, fill = TRUE, col.names = raw_col_names)
# The temporary file is no longer needed once the data frame exists.
unlink(tf)
head(df)
##   V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V20
## 1  p  x  s  n  t  p  f  c  n   k   e   e   s   s   w   w   p   w   o   p
## 2  e  x  s  y  t  a  f  c  b   k   e   c   s   s   w   w   p   w   o   p
## 3  e  b  s  w  t  l  f  c  b   n   e   c   s   s   w   w   p   w   o   p
## 4  p  x  y  w  t  p  f  c  n   n   e   e   s   s   w   w   p   w   o   p
## 5  e  x  s  g  f  n  f  w  b   k   t   e   s   s   w   w   p   w   o   e
## 6  e  x  y  y  t  a  f  c  b   n   e   c   s   s   w   w   p   w   o   p
##   V21 V22 V23
## 1   k   s   u
## 2   n   n   g
## 3   n   n   m
## 4   k   s   u
## 5   n   a   g
## 6   k   n   g

Subsetting the data frame and renaming columns

# Keep only the five columns of interest by position: 1, 2, 4, 6 and 22.
keep_cols <- c(1, 2, 4, 6, 22)
df <- df[, keep_cols]
head(df)
##   V1 V2 V4 V6 V22
## 1  p  x  n  p   s
## 2  e  x  y  a   n
## 3  e  b  w  l   n
## 4  p  x  w  p   s
## 5  e  x  g  n   a
## 6  e  x  y  a   n
# Replace the positional V* names of the retained columns with descriptive
# ones, then echo the new names.
new_names <- c("type", "cap_shape", "cap_color", "odor", "population")
colnames(df) <- new_names
names(df)
## [1] "type"       "cap_shape"  "cap_color"  "odor"       "population"

Cleanse type

# Expand the one-letter class code into a readable label using an exact
# lookup: "p" -> "poisonous", "e" -> "edible".
# Any other value (or NA) becomes NA instead of silently falling through to
# "edible", which is the unsafe direction for a poisonous/edible label.
type_labels <- c(p = "poisonous", e = "edible")
# as.character() guards against factor input, which would otherwise index the
# lookup by integer level code rather than by name.
df$type <- unname(type_labels[as.character(df$type)])
df[1:5, ]
##        type cap_shape cap_color odor population
## 1 poisonous         x         n    p          s
## 2    edible         x         y    a          n
## 3    edible         b         w    l          n
## 4 poisonous         x         w    p          s
## 5    edible         x         g    n          a

Cleanse cap_shape

# Expand the one-letter cap-shape code into its full name via a named lookup
# vector (code -> label).
# Codes are trimmed and lower-cased first, so matching stays case-insensitive.
# Unknown codes and NA map to NA rather than defaulting to the first label,
# and the lookup also works for a data frame with a single row.
cap_shape_labels <- c(
  b = "bell", c = "conical", x = "convex",
  f = "flat", k = "knobbed", s = "sunken"
)
cap_shape_codes <- tolower(trimws(as.character(df$cap_shape)))
df$cap_shape <- unname(cap_shape_labels[cap_shape_codes])
df[1:5, ]
##        type cap_shape cap_color odor population
## 1 poisonous    convex         n    p          s
## 2    edible    convex         y    a          n
## 3    edible      bell         w    l          n
## 4 poisonous    convex         w    p          s
## 5    edible    convex         g    n          a

Cleanse cap_color

# Expand the one-letter cap-color code into its full name via a named lookup
# vector (code -> label).
# Codes are trimmed and lower-cased first, so matching stays case-insensitive.
# Unknown codes and NA map to NA rather than defaulting to the first label.
cap_color_labels <- c(
  n = "brown", b = "buff", c = "cinnamon", g = "gray", r = "green",
  p = "pink", u = "purple", e = "red", w = "white", y = "yellow"
)
cap_color_codes <- tolower(trimws(as.character(df$cap_color)))
df$cap_color <- unname(cap_color_labels[cap_color_codes])

Cleanse odor

# Expand the one-letter odor code into its full name via a named lookup vector
# (code -> label).
# Codes are trimmed and lower-cased first, so matching stays case-insensitive.
# Unknown codes and NA map to NA rather than defaulting to the first label.
odor_labels <- c(
  a = "almond", l = "anise", c = "creosote", y = "fishy", f = "foul",
  m = "musty", n = "none", p = "pungent", s = "spicy"
)
odor_codes <- tolower(trimws(as.character(df$odor)))
df$odor <- unname(odor_labels[odor_codes])

Cleanse population

# Expand the one-letter population code into its full name via a named lookup
# vector (code -> label).
# Codes are trimmed and lower-cased first, so matching stays case-insensitive.
# Unknown codes and NA map to NA rather than defaulting to the first label.
population_labels <- c(
  a = "abundant", c = "clustered", n = "numerous",
  s = "scattered", v = "several", y = "solitary"
)
population_codes <- tolower(trimws(as.character(df$population)))
df$population <- unname(population_labels[population_codes])

df[1:5, ]
##        type cap_shape cap_color    odor population
## 1 poisonous    convex     brown pungent  scattered
## 2    edible    convex    yellow  almond   numerous
## 3    edible      bell     white   anise   numerous
## 4 poisonous    convex     white pungent  scattered
## 5    edible    convex      gray    none   abundant
# Summarise every column of the cleaned data frame. All five columns are
# character vectors at this point, so summary() reports only length, class and
# mode for each one rather than per-value counts.
summary(df)
##      type            cap_shape          cap_color        
##  Length:8124        Length:8124        Length:8124       
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##      odor            population       
##  Length:8124        Length:8124       
##  Class :character   Class :character  
##  Mode  :character   Mode  :character