Use read.csv and readLines to get the data directly from the web.
require(tidyr, quietly = T, warn.conflicts = F)
## Warning: package 'tidyr' was built under R version 3.5.1
require(dplyr, quietly = T, warn.conflicts = F)
## Warning: package 'dplyr' was built under R version 3.5.1
# Download the mushroom data set. The file is read without a header row, so
# the columns get placeholder names until real ones are assigned later.
data_url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"
# TRUE/FALSE are spelled out: T and F are ordinary variables that can be
# reassigned, whereas TRUE and FALSE are reserved words.
mushroom_df <- read.csv(data_url, header = FALSE, stringsAsFactors = FALSE)
# Download the companion .names file as a character vector, one element per
# line of text; it is parsed further down for the column names.
attribute_url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.names"
attribute_text <- readLines(attribute_url)
Use regular expressions and R string functions to find, clean, and populate the column names, thereby (mostly) avoiding manual labor.
# Lines 106-140 of the attribute file are the slice that gets parsed for the
# attribute labels (and, further down, for the value abbreviations).
headers_text <- attribute_text[106:140]
# Break every line apart at the periods.
headers_list <- strsplit(headers_text, "\\.")
# Flatten the pieces and keep only those that contain a colon.
column_names <- unlist(headers_list)
column_names <- column_names[grep(":", column_names)]
# The label is whatever precedes the first colon, minus surrounding blanks.
column_names <- trimws(unlist(lapply(strsplit(column_names, ":"), `[`, 1)))
# The first parsed label is overwritten with a fixed name.
column_names[1] <- "edibility"
# Swap hyphens for underscores first, then drop question marks.
column_names <- gsub("\\?", "", gsub("-", "_", column_names))
names(mushroom_df) <- column_names
Make the data frame more readable by replacing the one-letter abbreviations with the full words. Do this by first wrangling the abbreviations into a list of data frames (where each list element corresponds to a column of mushroom_df), and then looping through the list to match each abbreviation with its full word and substitute that word into mushroom_df.
# Glue the header lines into one string and cut it at every colon.
abbrev_dict <- unlist(strsplit(paste0(headers_text, collapse = ""), ":"))
# Discard the first two pieces and hold the rest as a list, one entry per piece.
abbrev_dict <- as.list(abbrev_dict[-c(1, 2)])
# Within each piece: split on commas, trim blanks, split again on spaces, and
# keep only the tokens that contain an equals sign.
abbrev_dict <- lapply(abbrev_dict, function(piece) {
  comma_parts <- trimws(unlist(strsplit(piece, ",")))
  tokens <- unlist(strsplit(comma_parts, " "))
  tokens[grep("=", tokens)]
})
# Strip closing parentheses from the tokens of the first entry.
abbrev_dict[[1]] <- gsub(")", "", abbrev_dict[[1]])
# Turn each token vector into a two-column data frame (full, short). The piped
# as.data.frame() call is kept as is because separate() addresses the
# resulting single column by the name `.`.
abbrev_dict <- lapply(abbrev_dict, function(tokens) {
  tokens %>%
    as.data.frame() %>%
    separate(., `.`, into = c("full", "short"), sep = "=")
})
# Replace the one-letter codes in every column of mushroom_df with the full
# words from the matching entry of abbrev_dict.
#
# match() looks up the whole column in one vectorised call and always returns
# an integer vector. The previous sapply(which(...)) form scanned the
# dictionary once per cell and returned a list (breaking the indexing) as soon
# as a code had zero or several dictionary hits. seq_len() also keeps the loop
# from running when the data frame has no columns.
for (i in seq_len(ncol(mushroom_df))) {
  codes <- mushroom_df[[i]]
  hits <- match(codes, abbrev_dict[[i]]$short)
  # Surface codes without a translation instead of failing or hiding them.
  if (anyNA(hits)) {
    warning(
      "Column ", i, " contains codes with no dictionary entry; ",
      "they are set to NA.",
      call. = FALSE
    )
  }
  mushroom_df[[i]] <- abbrev_dict[[i]]$full[hits]
}
This looks much nicer!
# Show the leading rows of the recoded data, limited to the first ten columns,
# as a formatted table.
knitr::kable(select(head(mushroom_df), 1:10))
| edibility | cap_shape | cap_surface | cap_color | bruises | odor | gill_attachment | gill_spacing | gill_size | gill_color |
|---|---|---|---|---|---|---|---|---|---|
| poisonous | convex | smooth | brown | bruises | pungent | free | close | narrow | black |
| edible | convex | smooth | yellow | bruises | almond | free | close | broad | black |
| edible | bell | smooth | white | bruises | anise | free | close | broad | brown |
| poisonous | convex | scaly | white | bruises | pungent | free | close | narrow | brown |
| edible | convex | smooth | gray | no | none | free | crowded | broad | black |
| edible | convex | scaly | yellow | bruises | almond | free | close | broad | brown |