Our task is to study the well-known Mushroom dataset and its accompanying description of the data (i.e., the “data dictionary”). We take the data and create a data frame containing a subset of the dataset's columns: the column that indicates whether a mushroom is edible or poisonous, plus three or four other columns. We also add meaningful column names and replace the abbreviations used in the data—for example, in the appropriate column, “e” becomes “edible.” Our deliverable is the R code that performs these transformations.
Dataset Source: https://archive.ics.uci.edu/ml/datasets/Mushroom
# Read the comma-separated mushroom records straight from the course
# repository. No header is read from the file, so R names the columns
# V1, V2, ... automatically.
mushroom <- read.table(
  file = "https://raw.githubusercontent.com/ahmshahparan/DATA607_WEEK01/master/agaricus-lepiota.data",
  sep = ","
)
# Preview the first six rows of the raw data.
head(mushroom, n = 6L)
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V20
## 1 p x s n t p f c n k e e s s w w p w o p
## 2 e x s y t a f c b k e c s s w w p w o p
## 3 e b s w t l f c b n e c s s w w p w o p
## 4 p x y w t p f c n n e e s s w w p w o p
## 5 e x s g f n f w b k t e s s w w p w o e
## 6 e x y y t a f c b n e c s s w w p w o p
## V21 V22 V23
## 1 k s u
## 2 n n g
## 3 n n m
## 4 k s u
## 5 n a g
## 6 k n g
# Keep six of the raw columns (V1, V2, V4, V6, V22 and V23), selected by
# name with standard bracket indexing; all rows are retained.
dataSubset <- mushroom[, c("V1", "V2", "V4", "V6", "V22", "V23")]
head(dataSubset)
## V1 V2 V4 V6 V22 V23
## 1 p x n p s u
## 2 e x y a n g
## 3 e b w l n m
## 4 p x w p s u
## 5 e x g n a g
## 6 e x y a n g
# Give the selected columns descriptive names.
# Assigning a vector to names()/colnames() is purely positional -- any names
# carried by that vector are ignored -- so the old -> new mapping is applied
# explicitly by matching on the current column names.
column_labels <- c(
  V1 = "Class", V2 = "Cap-Shape", V4 = "Cap-Color",
  V6 = "Odor", V22 = "Population", V23 = "Habitat"
)
label_idx <- match(names(dataSubset), names(column_labels))
# Columns with no entry in the lookup keep their current name, which also
# makes re-running this step harmless.
names(dataSubset)[!is.na(label_idx)] <-
  unname(column_labels[label_idx[!is.na(label_idx)]])
head(dataSubset)
## Class Cap-Shape Cap-Color Odor Population Habitat
## 1 p x n p s u
## 2 e x y a n g
## 3 e b w l n m
## 4 p x w p s u
## 5 e x g n a g
## 6 e x y a n g
# Replace the single-letter codes with readable labels.
# One lookup per column: the names are the codes found in the raw data,
# the values are the labels that replace them.
code_labels <- list(
  Class = c(e = "edible", p = "poisonous"),
  `Cap-Shape` = c(
    b = "bell", c = "conical", x = "convex",
    f = "flat", k = "knobbed", s = "sunken"
  ),
  `Cap-Color` = c(
    n = "brown", b = "buff", c = "cinnamon", g = "gray", r = "green",
    p = "pink", u = "purple", e = "red", w = "white", y = "yellow"
  ),
  Odor = c(
    a = "almond", l = "anise", c = "creosote", y = "fishy", f = "foul",
    m = "musty", n = "none", p = "pungent", s = "spicy"
  ),
  Population = c(
    s = "scattered", n = "numerous", a = "abundant",
    c = "clustered", v = "several", y = "solitary"
  ),
  Habitat = c(
    g = "grasses", l = "leaves", m = "meadows", p = "paths",
    u = "urban", w = "waste", d = "woods"
  )
)

# Translate codes to labels while keeping the column's type.
#   x      : factor or character vector of codes.
#   labels : named character vector (code -> label).
# Returns `x` with every code that has an entry in `labels` replaced by its
# label; values without an entry (including NA) are left unchanged.
recode_codes <- function(x, labels) {
  if (is.factor(x)) {
    # Rename the levels in place so the one-letter codes do not linger as
    # unused levels next to the new labels.
    hit <- match(levels(x), names(labels))
    levels(x)[!is.na(hit)] <- unname(labels[hit[!is.na(hit)]])
  } else {
    # Character columns (the read.table() default since R 4.0.0) have no
    # levels to extend, so the values are replaced directly.
    hit <- match(x, names(labels))
    x[!is.na(hit)] <- unname(labels[hit[!is.na(hit)]])
  }
  x
}

# Fail early with a clear message if an expected column is missing.
stopifnot(names(code_labels) %in% names(dataSubset))

# Recode every listed column in one pass.
dataSubset[names(code_labels)] <- Map(
  recode_codes,
  dataSubset[names(code_labels)],
  code_labels
)
head(dataSubset)
## Class Cap-Shape Cap-Color Odor Population Habitat
## 1 poisonous convex brown pungent scattered urban
## 2 edible convex yellow almond numerous grasses
## 3 edible bell white anise numerous meadows
## 4 poisonous convex white pungent scattered urban
## 5 edible convex gray none abundant grasses
## 6 edible convex yellow almond numerous grasses