The vector below supplies both the element names for the mushroom_attributes list and the column headers for the mushrooms tbl_df we will create.
# Column headers for the mushrooms data; the same values are reused as the
# element names of the attribute lookup list.
header_values <- c(
  "mushroom_class", "cap_shape", "cap_surface", "cap_color", "bruises",
  "odor", "gill_attachment", "gill_spacing", "gill_size", "gill_color",
  "stalk_shape", "stalk_root", "stalk_surface_above_ring",
  "stalk_surface_below_ring", "stalk_color_above_ring",
  "stalk_color_below_ring", "veil_type", "veil_color", "ring_number",
  "ring_type", "spore_print_color", "population", "habitat"
)
The code below creates a list of named vectors of unequal length to store the abbreviations and attribute names. Each set of mushroom attributes is its own vector, with the abbreviations as the names of its elements. This was done in order to store the mushroom attributes as a generalized, unsorted object which can be updated and referenced as needed.
# Readable attribute values, one character vector per column. Entries are
# positional (unnamed) here; the label above each vector is the column header
# that is attached to it later in the script.
mushroom_attributes <- list(
  # mushroom_class
  c("edible", "poisonous"),
  # cap_shape
  c("bell", "conical", "convex", "flat", "knobbed", "sunken"),
  # cap_surface
  c("fibrous", "grooves", "scaly", "smooth"),
  # cap_color
  c(
    "brown", "buff", "cinnamon", "gray", "green", "pink", "purple", "red",
    "white", "yellow"
  ),
  # bruises
  c("bruises", "no"),
  # odor
  c(
    "almond", "anise", "creosote", "fishy", "foul", "musty", "none",
    "pungent", "spicy"
  ),
  # gill_attachment
  c("attached", "descending", "free", "notched"),
  # gill_spacing
  c("close", "crowded", "distant"),
  # gill_size
  c("broad", "narrow"),
  # gill_color
  c(
    "black", "brown", "buff", "chocolate", "gray", "green", "orange", "pink",
    "purple", "red", "white", "yellow"
  ),
  # stalk_shape
  c("enlarging", "tapering"),
  # stalk_root
  c("bulbous", "club", "cup", "equal", "rhizomorphs", "rooted", "missing"),
  # stalk_surface_above_ring
  c("fibrous", "scaly", "silky", "smooth"),
  # stalk_surface_below_ring
  c("fibrous", "scaly", "silky", "smooth"),
  # stalk_color_above_ring
  c(
    "brown", "buff", "cinnamon", "gray", "orange", "pink", "red", "white",
    "yellow"
  ),
  # stalk_color_below_ring
  c(
    "brown", "buff", "cinnamon", "gray", "orange", "pink", "red", "white",
    "yellow"
  ),
  # veil_type
  c("partial", "universal"),
  # veil_color
  c("brown", "orange", "white", "yellow"),
  # ring_number
  c("none", "one", "two"),
  # ring_type
  c(
    "cobwebby", "evanescent", "flaring", "large", "none", "pendant",
    "sheathing", "zone"
  ),
  # spore_print_color
  c(
    "black", "brown", "buff", "chocolate", "green", "orange", "purple",
    "white", "yellow"
  ),
  # population
  c("abundant", "clustered", "numerous", "scattered", "several", "solitary"),
  # habitat
  c("grasses", "leaves", "meadows", "paths", "urban", "waste", "woods")
)
# One-letter codes used in the raw data, one character vector per column.
# Each vector lines up position-for-position with the corresponding vector of
# readable values in mushroom_attributes.
mushroom_abbreviations <- list(
  # mushroom_class
  c("e", "p"),
  # cap_shape
  c("b", "c", "x", "f", "k", "s"),
  # cap_surface
  c("f", "g", "y", "s"),
  # cap_color
  c("n", "b", "c", "g", "r", "p", "u", "e", "w", "y"),
  # bruises
  c("t", "f"),
  # odor
  c("a", "l", "c", "y", "f", "m", "n", "p", "s"),
  # gill_attachment
  c("a", "d", "f", "n"),
  # gill_spacing
  c("c", "w", "d"),
  # gill_size
  c("b", "n"),
  # gill_color
  c("k", "n", "b", "h", "g", "r", "o", "p", "u", "e", "w", "y"),
  # stalk_shape
  c("e", "t"),
  # stalk_root
  c("b", "c", "u", "e", "z", "r", "?"),
  # stalk_surface_above_ring
  c("f", "y", "k", "s"),
  # stalk_surface_below_ring
  c("f", "y", "k", "s"),
  # stalk_color_above_ring
  c("n", "b", "c", "g", "o", "p", "e", "w", "y"),
  # stalk_color_below_ring
  c("n", "b", "c", "g", "o", "p", "e", "w", "y"),
  # veil_type
  c("p", "u"),
  # veil_color
  c("n", "o", "w", "y"),
  # ring_number
  c("n", "o", "t"),
  # ring_type
  c("c", "e", "f", "l", "n", "p", "s", "z"),
  # spore_print_color
  c("k", "n", "b", "h", "r", "o", "u", "w", "y"),
  # population
  c("a", "c", "n", "s", "v", "y"),
  # habitat
  c("g", "l", "m", "p", "u", "w", "d")
)
# The attribute values, abbreviations and headers are matched purely by
# position, so fail early if the three objects do not line up.
stopifnot(
  length(mushroom_attributes) == length(mushroom_abbreviations),
  all(lengths(mushroom_attributes) == lengths(mushroom_abbreviations)),
  length(mushroom_attributes) == length(header_values)
)
# Name every attribute vector with its abbreviations so values can be looked
# up by code. seq_along() is used instead of 1:length() so that an empty list
# yields zero iterations rather than indices 1 and 0.
for (i in seq_along(mushroom_attributes)) {
  names(mushroom_attributes[[i]]) <- mushroom_abbreviations[[i]]
}
# Name each list element after its column header.
names(mushroom_attributes) <- header_values
We will make use of the readr package and the read_delim function to read in the mushrooms data for easier data manipulation later on. After the file is read, we will output the head of the tbl_df object.
# Location of the raw data file (comma-delimited, read without a header row).
data_url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"
# Load the readr library to create a tbl_df object; this is needed to make use of the dplyr library later on in our output
library(readr)
# Read every column as character instead of relying on type guessing: the
# values are one-letter codes, and a column holding only "t"/"f" may be guessed
# as logical by some readr versions, which would break the name-based lookup
# of abbreviations later on.
# NOTE: "?" is read as NA, so the "?" -> "missing" entry for stalk_root in the
# attribute lookup is never matched; those rows stay NA.
mushrooms <- read_delim(
  file = data_url,
  delim = ",",
  na = c("?"),
  col_names = FALSE,
  col_types = cols(.default = col_character())
)
names(mushrooms) <- header_values
print(mushrooms)
## # A tibble: 8,124 x 23
## mushroom_class cap_shape cap_surface cap_color bruises odor
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 p x s n t p
## 2 e x s y t a
## 3 e b s w t l
## 4 p x y w t p
## 5 e x s g f n
## 6 e x y y t a
## 7 e b s w t a
## 8 e b y w t l
## 9 p x y w t p
## 10 e b s y t a
## # ... with 8,114 more rows, and 17 more variables: gill_attachment <chr>,
## # gill_spacing <chr>, gill_size <chr>, gill_color <chr>,
## # stalk_shape <chr>, stalk_root <chr>, stalk_surface_above_ring <chr>,
## # stalk_surface_below_ring <chr>, stalk_color_above_ring <chr>,
## # stalk_color_below_ring <chr>, veil_type <chr>, veil_color <chr>,
## # ring_number <chr>, ring_type <chr>, spore_print_color <chr>,
## # population <chr>, habitat <chr>
The function below transforms the mushroom tbl_df so that the abbreviations are replaced with their more readable attribute names.
# Replace abbreviation codes with their readable attribute names.
#
# Arguments:
#   data_set         Data frame / tbl_df whose columns hold abbreviation codes.
#   data_headers     Character vector naming the columns to translate.
#   attribute_lookup Named list with one named character vector per column
#                    (names = abbreviations, values = readable names).
#                    Defaults to the script-level mushroom_attributes list.
#
# Returns data_set with each listed column replaced by a plain (unnamed)
# character vector of readable names. Codes that are NA or absent from the
# lookup become NA. Columns not listed in data_headers are left untouched.
#
# Stops with an error if a requested header is missing from data_set or from
# attribute_lookup; otherwise the assignment below would receive NULL and
# silently delete the column.
mushroom_transform <- function(data_set, data_headers,
                               attribute_lookup = mushroom_attributes) {
  usable <- intersect(names(data_set), names(attribute_lookup))
  unknown <- setdiff(data_headers, usable)
  if (length(unknown) > 0) {
    stop(
      "No data column and/or attribute lookup for: ",
      paste(unknown, collapse = ", "),
      call. = FALSE
    )
  }
  for (header in data_headers) {
    # Coerce to character so factor columns are matched by label rather than
    # by their underlying integer codes.
    codes <- as.character(data_set[[header]])
    # unname() drops the abbreviation names that name-based indexing carries
    # over, leaving an ordinary character column.
    data_set[[header]] <- unname(attribute_lookup[[header]][codes])
  }
  data_set
}
# Load the dplyr library to make use of the select function
library(dplyr)
# Create a character vector of header columns we want returned
query <- c("mushroom_class", "bruises", "population", "habitat")
# Use the select function to output a subset of our mushrooms tbl_df.
# all_of() marks `query` as an external character vector of column names, so
# it cannot be confused with a data column called "query", and it errors if
# any requested column is missing.
mushroom_output <- select(mushrooms, all_of(query))
print(mushroom_output)
## # A tibble: 8,124 x 4
## mushroom_class bruises population habitat
## <chr> <chr> <chr> <chr>
## 1 p t s u
## 2 e t n g
## 3 e t n m
## 4 p t s u
## 5 e f a g
## 6 e t n g
## 7 e t n m
## 8 e t s m
## 9 p t v g
## 10 e t s m
## # ... with 8,114 more rows
# Translate the abbreviations in every selected column of the previous output
# into their readable names, then display the result
mushroom_output2 <- mushroom_transform(
  data_set = mushroom_output,
  data_headers = query
)
print(mushroom_output2)
## # A tibble: 8,124 x 4
## mushroom_class bruises population habitat
## <chr> <chr> <chr> <chr>
## 1 poisonous bruises scattered urban
## 2 edible bruises numerous grasses
## 3 edible bruises numerous meadows
## 4 poisonous bruises scattered urban
## 5 edible no abundant grasses
## 6 edible bruises numerous grasses
## 7 edible bruises numerous meadows
## 8 edible bruises scattered meadows
## 9 poisonous bruises several grasses
## 10 edible bruises scattered meadows
## # ... with 8,114 more rows