A famous—if slightly moldy—dataset about mushrooms can be found in the UCI repository here: https://archive.ics.uci.edu/ml/datasets/Mushroom. Study the dataset and the associated description of the data (i.e., the “data dictionary”). Take the data and create a data frame with a subset of the columns in the dataset. Add meaningful column names and replace the abbreviations used in the data. The deliverable is the R code to perform these transformation tasks.
# Location of the raw UCI mushroom data (comma-separated, no header row).
data <- "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"

# Read the file with every column as a factor. stringsAsFactors is set
# explicitly because its default changed to FALSE in R 4.0.0; being explicit
# gives the same column types regardless of the R version in use.
mushroom <- read.table(
  file = data,
  header = FALSE,
  sep = ",",
  stringsAsFactors = TRUE
)

# Keep five of the columns (addressed by read.table's default V<n> names)
# and give them descriptive names.
subset <- mushroom[, c("V1", "V2", "V4", "V6", "V23")]
names(subset) <- c("Class", "Cap-Shape", "Cap-Color", "Odor", "Habitat")
# Replace the single-letter codes in each column of `subset` with descriptive
# labels.

# Translate the codes in `x` into labels.
#
# x:      a factor or character vector of codes.
# lookup: a named character vector; names are codes, values are labels.
#
# Returns a factor whose levels are the labels in `lookup` order, followed by
# any values of `x` that have no entry in `lookup` (these are kept unchanged
# rather than turned into NA). The original codes do not linger as unused
# levels, and already-translated values pass through untouched, so the
# recoding can safely be run more than once.
decode_column <- function(x, lookup) {
  values <- as.character(x)
  known <- values %in% names(lookup)
  values[known] <- unname(lookup[values[known]])
  factor(values, levels = union(unname(lookup), values[!known]))
}

subset[["Class"]] <- decode_column(
  subset[["Class"]],
  c(e = "edible", p = "poisonous")
)

subset[["Cap-Shape"]] <- decode_column(
  subset[["Cap-Shape"]],
  c(
    b = "bell", c = "conical", x = "convex",
    f = "flat", k = "knobbed", s = "sunken"
  )
)

subset[["Cap-Color"]] <- decode_column(
  subset[["Cap-Color"]],
  c(
    n = "brown", b = "buff", c = "cinnamon", g = "gray", r = "green",
    p = "pink", u = "purple", e = "red", w = "white", y = "yellow"
  )
)

subset[["Odor"]] <- decode_column(
  subset[["Odor"]],
  c(
    a = "almond", l = "anise", c = "creosote", y = "fishy", f = "foul",
    m = "musty", n = "none", p = "pungent", s = "spicy"
  )
)

subset[["Habitat"]] <- decode_column(
  subset[["Habitat"]],
  c(
    g = "grasses", l = "leaves", m = "meadows", p = "paths",
    u = "urban", w = "waste", d = "woods"
  )
)

# Show the first rows to confirm the codes were replaced.
head(subset)
## Class Cap-Shape Cap-Color Odor Habitat
## 1 poisonous convex brown pungent urban
## 2 edible convex yellow almond grasses
## 3 edible bell white anise meadows
## 4 poisonous convex white pungent urban
## 5 edible convex gray none grasses
## 6 edible convex yellow almond grasses
Source for the approach I used to replace the abbreviations with descriptions: https://stackoverflow.com/questions/11810605/replace-contents-of-factor-column-in-r-dataframe