Download the table describing each column and save it in a text file.
# I got the idea for this from one of the sample programs
# provided in the weekly materials section. Basically, I
# just copied and pasted the column name descriptions into a
# text file and formatted them.
# Load the colon-separated description table that was saved by hand; the
# file has no header row, so every line is read as data.
mushroom_table_names <- read.csv(
  file = "./mushroom_table_names.txt",
  header = FALSE,
  sep = ":",
  stringsAsFactors = FALSE
)
# The first field of each line (the text before the first ":") is used as
# the column name.
mushroom_col_names <- mushroom_table_names[[1]]
Read the data and apply column names from the data table
# Location of the raw data file; it is comma-separated and is read without
# a header row, so the names come from mushroom_col_names instead.
mushroom_url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"
mushroom_df <- read.csv(
  file = mushroom_url,
  header = FALSE,
  sep = ",",
  col.names = mushroom_col_names,
  # To treat the missing information (coded "?") as NA, uncomment the next
  # line. This came from one of the provided examples; the column value
  # conversions further down take the idea a bit further.
  # na.strings = "?",
  stringsAsFactors = FALSE
)
# Print the first rows to check that the column names were applied.
head(mushroom_df)
## type cap.shape cap.surface cap.color bruises. odor gill.attachment
## 1 p x s n t p f
## 2 e x s y t a f
## 3 e b s w t l f
## 4 p x y w t p f
## 5 e x s g f n f
## 6 e x y y t a f
## gill.spacing gill.size gill.color stalk.shape stalk.root
## 1 c n k e e
## 2 c b k e c
## 3 c b n e c
## 4 c n n e e
## 5 w b k t e
## 6 c b n e c
## stalk.surface.above.ring stalk.surface.below.ring stalk.color.above.ring
## 1 s s w
## 2 s s w
## 3 s s w
## 4 s s w
## 5 s s w
## 6 s s w
## stalk.color.below.ring veil.type veil.color ring.number ring.type
## 1 w p w o p
## 2 w p w o p
## 3 w p w o p
## 4 w p w o p
## 5 w p w o e
## 6 w p w o p
## spore.print.color population habitat
## 1 k s u
## 2 n n g
## 3 n n m
## 4 k s u
## 5 n a g
## 6 k n g
Convert the subsetted data frame from abbreviations to full descriptions.
# Replace the one-letter codes in each listed column with their full
# descriptions. Every column is converted the same way, so the column names
# are listed once and handled in a loop; to convert another column, add its
# name to `columns_to_convert`.
columns_to_convert <- c("type", "cap.color", "bruises.", "odor", "stalk.root")

for (column_name in columns_to_convert) {
  # vapply() guarantees a character vector of the same length as the column,
  # even when the data frame has zero rows (sapply() would return a list).
  mushroom_subset_df[[column_name]] <- vapply(
    mushroom_subset_df[[column_name]],
    function(code) {
      # Missing codes stay missing instead of being looked up.
      if (is.na(code)) {
        return(NA_character_)
      }
      # NOTE(review): table_converter() is defined elsewhere; this assumes it
      # returns a character description for a single code - confirm.
      description <- table_converter(column_name, code)
      # Keep only the first result, and use NA when the lookup returns
      # nothing, so an empty or multi-valued lookup cannot break the column.
      if (length(description) == 0L) {
        NA_character_
      } else {
        as.character(description[[1L]])
      }
    },
    character(1)
  )
}
# I figured I would include code to convert all columns to
# highlight how simple it is; however, I have commented out the
# non-subsetted columns in order to reduce run-time.
# mushroom_subset_df$cap.shape <-
# sapply(mushroom_subset_df$cap.shape, function(x){ x <-
# ifelse(is.na(x),NA,table_converter('cap.shape', x)) } )
# mushroom_subset_df$cap.surface <-
# sapply(mushroom_subset_df$cap.surface, function(x){ x <-
# ifelse(is.na(x),NA,table_converter('cap.surface', x)) } )
# mushroom_subset_df$gill.attachment <-
# sapply(mushroom_subset_df$gill.attachment, function(x){ x
# <- ifelse(is.na(x),NA,table_converter('gill.attachment',
# x)) } ) mushroom_subset_df$gill.spacing <-
# sapply(mushroom_subset_df$gill.spacing, function(x){ x <-
# ifelse(is.na(x),NA,table_converter('gill.spacing', x)) } )
# mushroom_subset_df$gill.size <-
# sapply(mushroom_subset_df$gill.size, function(x){ x <-
# ifelse(is.na(x),NA,table_converter('gill.size', x)) } )
# mushroom_subset_df$gill.color <-
# sapply(mushroom_subset_df$gill.color, function(x){ x <-
# ifelse(is.na(x),NA,table_converter('gill.color', x)) } )
# mushroom_subset_df$stalk.shape <-
# sapply(mushroom_subset_df$stalk.shape, function(x){ x <-
# ifelse(is.na(x),NA,table_converter('stalk.shape', x)) } )
# mushroom_subset_df$stalk.surface.above.ring <-
# sapply(mushroom_subset_df$stalk.surface.above.ring,
# function(x){ x <-
# ifelse(is.na(x),NA,table_converter('stalk.surface.above.ring',
# x)) } ) mushroom_subset_df$stalk.surface.below.ring <-
# sapply(mushroom_subset_df$stalk.surface.below.ring,
# function(x){ x <-
# ifelse(is.na(x),NA,table_converter('stalk.surface.below.ring',
# x)) } ) mushroom_subset_df$stalk.color.above.ring <-
# sapply(mushroom_subset_df$stalk.color.above.ring,
# function(x){ x <-
# ifelse(is.na(x),NA,table_converter('stalk.color.above.ring',
# x)) } ) mushroom_subset_df$stalk.color.below.ring <-
# sapply(mushroom_subset_df$stalk.color.below.ring,
# function(x){ x <-
# ifelse(is.na(x),NA,table_converter('stalk.color.below.ring',
# x)) } ) mushroom_subset_df$veil.type <-
# sapply(mushroom_subset_df$veil.type, function(x){ x <-
# ifelse(is.na(x),NA,table_converter('veil.type', x)) } )
# mushroom_subset_df$veil.color <-
# sapply(mushroom_subset_df$veil.color, function(x){ x <-
# ifelse(is.na(x),NA,table_converter('veil.color', x)) } )
# mushroom_subset_df$ring.number <-
# sapply(mushroom_subset_df$ring.number, function(x){ x <-
# ifelse(is.na(x),NA,table_converter('ring.number', x)) } )
# mushroom_subset_df$ring.type <-
# sapply(mushroom_subset_df$ring.type, function(x){ x <-
# ifelse(is.na(x),NA,table_converter('ring.type', x)) } )
# mushroom_subset_df$spore.print.color <-
# sapply(mushroom_subset_df$spore.print.color, function(x){ x
# <- ifelse(is.na(x),NA,table_converter('spore.print.color',
# x)) } ) mushroom_subset_df$population <-
# sapply(mushroom_subset_df$population, function(x){ x <-
# ifelse(is.na(x),NA,table_converter('population', x)) } )
# mushroom_subset_df$habitat <-
# sapply(mushroom_subset_df$habitat, function(x){ x <-
# ifelse(is.na(x),NA,table_converter('habitat', x)) } )
head(mushroom_subset_df)
## type bruises. odor cap.color stalk.root
## 2 edible bruises almond yellow club
## 3 edible bruises anise white club
## 5 edible no none gray equal
## 6 edible bruises almond yellow club
## 7 edible bruises almond white club
## 8 edible bruises anise white club
Summary bar charts showing the count of properties for each column
# Attach the plotting packages. library() stops immediately with a clear
# error when a package is not installed, whereas require() only returns
# FALSE and lets the script fail later at the first plotting call.
library(ggplot2)
library(grid)
library(gridExtra)

# All four charts share the same data and the same x-axis (type); only the
# fill variable differs, so the common part is built once.
type_plot <- ggplot(mushroom_subset_df, aes(type))

# One dodged bar chart per property: bars are grouped by type and split
# side by side by the property used for the fill colour.
p1 <- type_plot + geom_bar(aes(fill = bruises.), position = "dodge")
p2 <- type_plot + geom_bar(aes(fill = odor), position = "dodge")
p3 <- type_plot + geom_bar(aes(fill = cap.color), position = "dodge")
p4 <- type_plot + geom_bar(aes(fill = stalk.root), position = "dodge")

# Arrange the four charts in a two-column grid.
grid.arrange(p1, p2, p3, p4, ncol = 2)
