This assignment uses a dataset about mushrooms in the Agaricus and Lepiota Family. The data set can be found in the UCI repository https://archive.ics.uci.edu/ml/datasets/Mushroom.
We take the data, and create a data frame with a subset of the attributes in the dataset. We include the first attribute that indicates edible or poisonous (“type”) and four other attributes (“cap_shape”, “cap_color”, “odor”, and “population”).
library(stringr)
library(XML)
library(RCurl)
## Loading required package: bitops
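I didn’t use the next block of code because, for a reason I couldn’t immediately identify, the first line of the data obtained includes some numbers mixed together that are not in the data on the web page. Consequently, the first line ends up with a different number of columns from the rest of the data. (A likely cause: read.csv defaults to header = TRUE, so the first record is consumed as column names, and duplicated names are then given numeric suffixes such as p.1.)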
#table <-read.csv("https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data", sep=",")
#head(table)
Of course, I could use regular expressions to clean up the numbers that appeared but, instead, I did the following to retrieve the data.
# Read the raw file into a character vector, one element per record.
# url(..., "r") opens the connection immediately, so it has to be closed by
# hand; `finally` guarantees that happens even if the download fails part-way.
con <- url("https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data", "r")
data <- tryCatch(readLines(con), finally = close(con))
head(data)
## [1] "p,x,s,n,t,p,f,c,n,k,e,e,s,s,w,w,p,w,o,p,k,s,u"
## [2] "e,x,s,y,t,a,f,c,b,k,e,c,s,s,w,w,p,w,o,p,n,n,g"
## [3] "e,b,s,w,t,l,f,c,b,n,e,c,s,s,w,w,p,w,o,p,n,n,m"
## [4] "p,x,y,w,t,p,f,c,n,n,e,e,s,s,w,w,p,w,o,p,k,s,u"
## [5] "e,x,s,g,f,n,f,w,b,k,t,e,s,s,w,w,p,w,o,e,n,a,g"
## [6] "e,x,y,y,t,a,f,c,b,n,e,c,s,s,w,w,p,w,o,p,k,n,g"
# Round-trip the downloaded lines through a temporary file so that the widest
# record decides how many columns read.csv() allocates.
tf <- tempfile()
df <- tryCatch(
  {
    writeLines(data, tf)
    # Called n_fields (not ncol) to avoid reusing the name of base::ncol().
    n_fields <- max(count.fields(tf, sep = ","))
    read.csv(
      tf,
      fill = TRUE,
      header = FALSE,
      col.names = paste0("V", seq_len(n_fields))
    )
  },
  # Remove the temporary file whether or not parsing succeeded.
  finally = unlink(tf)
)
head(df)
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V20
## 1 p x s n t p f c n k e e s s w w p w o p
## 2 e x s y t a f c b k e c s s w w p w o p
## 3 e b s w t l f c b n e c s s w w p w o p
## 4 p x y w t p f c n n e e s s w w p w o p
## 5 e x s g f n f w b k t e s s w w p w o e
## 6 e x y y t a f c b n e c s s w w p w o p
## V21 V22 V23
## 1 k s u
## 2 n n g
## 3 n n m
## 4 k s u
## 5 n a g
## 6 k n g
# Keep only the columns at positions 1, 2, 4, 6 and 22.
keep_cols <- c(1, 2, 4, 6, 22)
df <- df[, keep_cols]
head(df)
## V1 V2 V4 V6 V22
## 1 p x n p s
## 2 e x y a n
## 3 e b w l n
## 4 p x w p s
## 5 e x g n a
## 6 e x y a n
# Give the five retained columns descriptive names, then show them.
attribute_names <- c("type", "cap_shape", "cap_color", "odor", "population")
df <- setNames(df, attribute_names)
names(df)
## [1] "type" "cap_shape" "cap_color" "odor" "population"
# Expand the one-letter class code into a readable label. An exact lookup is
# used instead of a regex test, so only "p" and "e" are translated; any other
# value becomes NA rather than being reported as "edible".
type_labels <- c(p = "poisonous", e = "edible")
# as.character() makes the lookup index by code even if the column is a factor.
df$type <- unname(type_labels[as.character(df$type)])
df[1:5, ]
## type cap_shape cap_color odor population
## 1 poisonous x n p s
## 2 edible x y a n
## 3 edible b w l n
## 4 poisonous x w p s
## 5 edible x g n a
# Decode the one-letter cap-shape codes into readable labels.
# The lookup vector maps label (name) -> code (value).
shape_codes <- c(
  "bell" = "b", "conical" = "c", "convex" = "x",
  "flat" = "f", "knobbed" = "k", "sunken" = "s"
)
# Exact, case-insensitive lookup: match() gives the position of each code in
# the table, or NA for a code that is not listed, so an unknown value stays NA
# instead of silently receiving the first label.
df$cap_shape <- names(shape_codes)[
  match(tolower(as.character(df$cap_shape)), shape_codes)
]
df[1:5, ]
## type cap_shape cap_color odor population
## 1 poisonous convex n p s
## 2 edible convex y a n
## 3 edible bell w l n
## 4 poisonous convex w p s
## 5 edible convex g n a
# Translate a vector of one-letter codes into labels.
#   codes  - vector (character or factor) of codes to translate
#   lookup - named character vector mapping label (name) -> code (value)
# Returns a character vector of labels; a code absent from `lookup` gives NA
# rather than silently receiving the first label. Matching is exact and
# case-insensitive.
decode_codes <- function(codes, lookup) {
  names(lookup)[match(tolower(as.character(codes)), lookup)]
}

color_codes <- c(
  "brown" = "n", "buff" = "b", "cinnamon" = "c", "gray" = "g", "green" = "r",
  "pink" = "p", "purple" = "u", "red" = "e", "white" = "w", "yellow" = "y"
)
df$cap_color <- decode_codes(df$cap_color, color_codes)

odor_codes <- c(
  "almond" = "a", "anise" = "l", "creosote" = "c", "fishy" = "y", "foul" = "f",
  "musty" = "m", "none" = "n", "pungent" = "p", "spicy" = "s"
)
df$odor <- decode_codes(df$odor, odor_codes)

population_codes <- c(
  "abundant" = "a", "clustered" = "c", "numerous" = "n",
  "scattered" = "s", "several" = "v", "solitary" = "y"
)
df$population <- decode_codes(df$population, population_codes)

df[1:5, ]
## type cap_shape cap_color odor population
## 1 poisonous convex brown pungent scattered
## 2 edible convex yellow almond numerous
## 3 edible bell white anise numerous
## 4 poisonous convex white pungent scattered
## 5 edible convex gray none abundant
# Summarise every column of df. As the output below shows, the columns are
# character vectors, so summary() reports only Length, Class and Mode.
summary(df)
## type cap_shape cap_color
## Length:8124 Length:8124 Length:8124
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
## odor population
## Length:8124 Length:8124
## Class :character Class :character
## Mode :character Mode :character