Your task is to study the dataset and the associated description of the data (i.e. “data dictionary”). You may need to look around a bit, but it’s there!

Your deliverable is the R code that performs these transformation tasks.

Setup

Mushroom Documentation

## Loading required package: RCurl
## Loading required package: bitops
## Loading required package: plyr

Load & Preview Data

# Download the raw UCI mushroom data as a single string; getURL() comes from
# the RCurl package loaded in the setup step.
url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"
mush <- getURL(url)

# The file has no header row, so the columns get the default names V1..V23.
# Keep the one-letter codes as character (not factor) so they can be recoded
# in place later. Spell out FALSE: the shorthand `F` is an ordinary variable
# that can be reassigned.
my.mush <- read.csv(text = mush, header = FALSE, sep = ",",
                    stringsAsFactors = FALSE)
head(my.mush)
  V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V20
1  p  x  s  n  t  p  f  c  n   k   e   e   s   s   w   w   p   w   o   p
2  e  x  s  y  t  a  f  c  b   k   e   c   s   s   w   w   p   w   o   p
3  e  b  s  w  t  l  f  c  b   n   e   c   s   s   w   w   p   w   o   p
4  p  x  y  w  t  p  f  c  n   n   e   e   s   s   w   w   p   w   o   p
5  e  x  s  g  f  n  f  w  b   k   t   e   s   s   w   w   p   w   o   e
6  e  x  y  y  t  a  f  c  b   n   e   c   s   s   w   w   p   w   o   p
  V21 V22 V23
1   k   s   u
2   n   n   g
3   n   n   m
4   k   s   u
5   n   a   g
6   k   n   g

Subset & transform data

# Keep four columns of the raw data, selected by position (1, 5, 22, 23),
# and give them descriptive names.
new.mush <- my.mush[, c(1, 5, 22, 23)]
colnames(new.mush) <- c("edibility", "has_bruises", "population", "habitat")
# alternative: rename()

# One lookup vector per column: names are the one-letter codes found in the
# raw data, values are the labels that replace them.
code_labels <- list(
  edibility = c(e = "edible", p = "poisonous"),
  has_bruises = c(t = "TRUE", f = "FALSE"),
  population = c(
    a = "abundant", c = "clustered", n = "numerous",
    s = "scattered", v = "several", y = "solitary"
  ),
  habitat = c(
    g = "grasses", l = "leaves", m = "meadows", p = "paths",
    u = "urban", w = "waste", d = "woods"
  )
)

# Swap each known code for its label. Values with no entry in the lookup
# (including NA) are left exactly as they were.
new.mush[names(code_labels)] <- Map(
  function(codes, lookup) {
    hit <- match(codes, names(lookup))
    matched <- !is.na(hit)
    codes[matched] <- unname(lookup[hit[matched]])
    codes
  },
  new.mush[names(code_labels)],
  code_labels
)

Convert back to factors & preview data

# Turn every column into a factor. Assigning into `new.mush[]` replaces the
# columns while keeping the object a data frame.
new.mush[] <- lapply(new.mush, function(column) factor(column))

# Preview the first six rows.
head(new.mush, n = 6L)
  edibility has_bruises population habitat
1 poisonous        TRUE  scattered   urban
2    edible        TRUE   numerous grasses
3    edible        TRUE   numerous meadows
4 poisonous        TRUE  scattered   urban
5    edible       FALSE   abundant grasses
6    edible        TRUE   numerous grasses
# Inspect the structure to confirm each column's type and levels.
str(new.mush)
'data.frame':   8124 obs. of  4 variables:
 $ edibility  : Factor w/ 2 levels "edible","poisonous": 2 1 1 2 1 1 1 1 2 1 ...
 $ has_bruises: Factor w/ 2 levels "FALSE","TRUE": 2 2 2 2 1 2 2 2 2 2 ...
 $ population : Factor w/ 6 levels "abundant","clustered",..: 4 3 3 4 1 3 3 4 5 4 ...
 $ habitat    : Factor w/ 7 levels "grasses","leaves",..: 5 1 3 5 1 1 3 3 1 3 ...

What proportion of the poisonous mushrooms falls into each population category?

# Population labels of the poisonous mushrooms only. which() discards any
# row whose comparison is NA, matching what subset() does on a vector.
p.mush <- new.mush$population[which(new.mush$edibility == "poisonous")]

# Share of each population level, rounded to two decimals. The double
# transpose turns the one-dimensional table into a single-column layout.
poison_share <- prop.table(table(p.mush))
round(t(t(poison_share)), digits = 2)
           
p.mush      [,1]
  abundant  0.00
  clustered 0.01
  numerous  0.00
  scattered 0.09
  several   0.73
  solitary  0.17