Data 607_Week 01 Assignment

Task

Our task is to study the famous Mushrooms Dataset and the associated description of the data (i.e. “data dictionary”). We should take the data, and create a data frame with a subset of the columns in the dataset. We should include the column that indicates edible or poisonous and three or four other columns. We should also add meaningful column names and replace the abbreviations used in the data—for example, in the appropriate column, “e” might become “edible.” Our deliverable is the R code to perform these transformation tasks.

01. Loading Mushrooms Dataset

Dataset Source: https://archive.ics.uci.edu/ml/datasets/Mushroom

# Read the raw, header-less, comma-separated mushroom records straight from
# the course repository; columns get the default names V1..V23.
mushroom <- read.table(
  file = "https://raw.githubusercontent.com/ahmshahparan/DATA607_WEEK01/master/agaricus-lepiota.data",
  header = FALSE,
  sep = ","
)
# Preview the first six records.
head(mushroom, n = 6L)

##   V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V20
## 1  p  x  s  n  t  p  f  c  n   k   e   e   s   s   w   w   p   w   o   p
## 2  e  x  s  y  t  a  f  c  b   k   e   c   s   s   w   w   p   w   o   p
## 3  e  b  s  w  t  l  f  c  b   n   e   c   s   s   w   w   p   w   o   p
## 4  p  x  y  w  t  p  f  c  n   n   e   e   s   s   w   w   p   w   o   p
## 5  e  x  s  g  f  n  f  w  b   k   t   e   s   s   w   w   p   w   o   e
## 6  e  x  y  y  t  a  f  c  b   n   e   c   s   s   w   w   p   w   o   p
##   V21 V22 V23
## 1   k   s   u
## 2   n   n   g
## 3   n   n   m
## 4   k   s   u
## 5   n   a   g
## 6   k   n   g

02. Subsetting the Dataset

# Keep the class column (V1) plus five descriptive attributes:
# V2, V4, V6, V22 and V23.
kept_columns <- c("V1", "V2", "V4", "V6", "V22", "V23")
dataSubset <- mushroom[, kept_columns, drop = FALSE]
head(dataSubset, n = 6L)

##   V1 V2 V4 V6 V22 V23
## 1  p  x  n  p   s   u
## 2  e  x  y  a   n   g
## 3  e  b  w  l   n   m
## 4  p  x  w  p   s   u
## 5  e  x  g  n   a   g
## 6  e  x  y  a   n   g

03. Renaming the Columns

# Give the selected columns meaningful names.
#
# `column_names` maps each original name (V1, V2, ...) to its descriptive
# name. The rename looks every current column up in this mapping, so it stays
# correct even if the columns are reordered; assigning the named vector
# directly would silently ignore the names and rename by position instead.
column_names <- c(
  V1 = "Class",
  V2 = "Cap-Shape",
  V4 = "Cap-Color",
  V6 = "Odor",
  V22 = "Population",
  V23 = "Habitat"
)
# Fail early rather than produce NA column names for an unmapped column.
stopifnot(all(names(dataSubset) %in% names(column_names)))
names(dataSubset) <- unname(column_names[names(dataSubset)])
head(dataSubset)

##   Class Cap-Shape Cap-Color Odor Population Habitat
## 1     p         x         n    p          s       u
## 2     e         x         y    a          n       g
## 3     e         b         w    l          n       m
## 4     p         x         w    p          s       u
## 5     e         x         g    n          a       g
## 6     e         x         y    a          n       g

04. Replacing the Attribute Abbreviations

# Translate the single-letter codes of every selected column into the
# descriptive labels from the data dictionary.
#
# Each column becomes a factor whose levels are the descriptive labels, so the
# original one-letter codes do not linger as unused levels. This works whether
# the columns were read in as factors or as character vectors.

# Recode `x` using `lookup`, a character vector of labels named by code
# (e.g. c(e = "edible")). Returns a factor the same length as `x`.
# Values that are not codes in `lookup` are kept unchanged (as extra levels)
# instead of being turned into NA.
recode_values <- function(x, lookup) {
  values <- as.character(x)
  known <- values %in% names(lookup)
  values[known] <- unname(lookup[values[known]])
  extra <- sort(unique(values[!known & !is.na(values)]))
  # union() keeps the levels unique, so re-running the recode is harmless.
  factor(values, levels = union(unname(lookup), extra))
}

# One code -> label table per column, keyed by column name.
value_labels <- list(
  Class = c(e = "edible", p = "poisonous"),
  `Cap-Shape` = c(
    b = "bell", c = "conical", x = "convex",
    f = "flat", k = "knobbed", s = "sunken"
  ),
  `Cap-Color` = c(
    n = "brown", b = "buff", c = "cinnamon", g = "gray", r = "green",
    p = "pink", u = "purple", e = "red", w = "white", y = "yellow"
  ),
  Odor = c(
    a = "almond", l = "anise", c = "creosote", y = "fishy", f = "foul",
    m = "musty", n = "none", p = "pungent", s = "spicy"
  ),
  Population = c(
    s = "scattered", n = "numerous", a = "abundant",
    c = "clustered", v = "several", y = "solitary"
  ),
  Habitat = c(
    g = "grasses", l = "leaves", m = "meadows", p = "paths",
    u = "urban", w = "waste", d = "woods"
  )
)

for (column in names(value_labels)) {
  dataSubset[[column]] <- recode_values(
    dataSubset[[column]],
    value_labels[[column]]
  )
}

05. Transformed Dataset

# Show the first six rows of the renamed, recoded subset.
head(dataSubset, n = 6L)

##       Class Cap-Shape Cap-Color    Odor Population Habitat
## 1 poisonous    convex     brown pungent  scattered   urban
## 2    edible    convex    yellow  almond   numerous grasses
## 3    edible      bell     white   anise   numerous meadows
## 4 poisonous    convex     white pungent  scattered   urban
## 5    edible    convex      gray    none   abundant grasses
## 6    edible    convex    yellow  almond   numerous grasses