Grando 8 Homework

# Standard setup: the assignment folder lives in a different place on
# my Windows and Linux machines, so pick the matching path and make it
# the working directory.
setwd(if (Sys.info()["sysname"] == "Windows") {
    "~/Masters/DATA607/Week1/Assignment"
} else {
    "~/Documents/Masters/DATA607/Week1/Assignment"
})

Download the table describing each column and save it in a text file.

# Idea borrowed from one of the sample programs in the weekly materials
# section: the table name descriptions were copy/pasted into a text
# file and formatted so that each line can be split on ":".  The first
# field of every line is used as the column name.
mushroom_table_names <- read.csv(
    "./mushroom_table_names.txt",
    sep = ":",
    header = FALSE,
    stringsAsFactors = FALSE
)
mushroom_col_names <- mushroom_table_names[[1]]

Read the data and apply column names from the data table

# To treat the missing information ("?") as NA, add `na.strings = "?"`
# to the read.csv() call.  I also saw this in one of the examples
# provided to us, but I took it a bit further in the column value
# conversions below.
mushroom_url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"
mushroom_df <- read.csv(
    mushroom_url,
    sep = ",",
    header = FALSE,
    stringsAsFactors = FALSE,
    col.names = mushroom_col_names
)
head(mushroom_df)
##   type cap.shape cap.surface cap.color bruises. odor gill.attachment
## 1    p         x           s         n        t    p               f
## 2    e         x           s         y        t    a               f
## 3    e         b           s         w        t    l               f
## 4    p         x           y         w        t    p               f
## 5    e         x           s         g        f    n               f
## 6    e         x           y         y        t    a               f
##   gill.spacing gill.size gill.color stalk.shape stalk.root
## 1            c         n          k           e          e
## 2            c         b          k           e          c
## 3            c         b          n           e          c
## 4            c         n          n           e          e
## 5            w         b          k           t          e
## 6            c         b          n           e          c
##   stalk.surface.above.ring stalk.surface.below.ring stalk.color.above.ring
## 1                        s                        s                      w
## 2                        s                        s                      w
## 3                        s                        s                      w
## 4                        s                        s                      w
## 5                        s                        s                      w
## 6                        s                        s                      w
##   stalk.color.below.ring veil.type veil.color ring.number ring.type
## 1                      w         p          w           o         p
## 2                      w         p          w           o         p
## 3                      w         p          w           o         p
## 4                      w         p          w           o         p
## 5                      w         p          w           o         e
## 6                      w         p          w           o         p
##   spore.print.color population habitat
## 1                 k          s       u
## 2                 n          n       g
## 3                 n          n       m
## 4                 k          s       u
## 5                 n          a       g
## 6                 k          n       g

Create a function that returns a converted value for each column value based on the name of the column entered and the abbreviation given.

# Translate abbreviation(s) used in one column into full descriptions.
#
# The lookup entry for a column is a string of comma-separated
# "description=abbreviation" pairs.  After splitting on "," and "=",
# the odd elements are the descriptions and the even elements are the
# abbreviations.
#
# Args:
#   name_of_col: Name of the column, as it appears in `col_names`.
#   ab:          Abbreviation(s) to translate; may be a vector.
#   col_names:   Column names, in the same order as the rows of
#                `table_names`.  Defaults to colnames(mushroom_df).
#   table_names: Data frame whose second column holds the pairs.
#                Defaults to mushroom_table_names.
#
# Returns:
#   A character vector the same length as `ab`.  Abbreviations with no
#   entry in the lookup (including NA) yield NA instead of a
#   zero-length result, so callers always get one value per input.
#
# Raises:
#   An error when `name_of_col` is not exactly one known column name.
table_converter <- function(name_of_col, ab,
                            col_names = colnames(mushroom_df),
                            table_names = mushroom_table_names) {
    column_number <- match(name_of_col, col_names)
    if (length(column_number) != 1L || is.na(column_number)) {
        stop("Unknown column name: ", paste(name_of_col, collapse = ", "),
            call. = FALSE)
    }
    pairs <- as.character(table_names[[2]][column_number])
    # trimws() tolerates spaces around the separators in the
    # hand-formatted text file.
    sublist <- trimws(unlist(strsplit(pairs, split = ",|=")))
    sublist_values <- sublist[c(TRUE, FALSE)]
    sublist_keys <- sublist[c(FALSE, TRUE)]
    # match() is vectorised over `ab` and gives NA for unknown keys.
    sublist_values[match(ab, sublist_keys)]
}

Subset the dataframe.

# Keep only the rows with a broad ("b") gill size, along with a handful
# of columns.  The gill size was picked purely to demonstrate selecting
# a specific group by column value, no other reason.  which() drops
# rows where the comparison is NA.
mushroom_subset_df <- mushroom_df[
    which(mushroom_df$gill.size == "b"),
    c("type", "bruises.", "odor", "cap.color", "stalk.root"),
    drop = FALSE
]

Convert the subsetted data frame from abbreviations to full descriptions.

# Replace every abbreviation in the subset with its full description.
#
# Instead of repeating one sapply() call per column, iterate over the
# column names of the subset.  Each distinct abbreviation is looked up
# once with table_converter() and the result is mapped back onto the
# rows with match(), so the number of lookups depends on the number of
# distinct codes rather than the number of rows.  Missing values, and
# codes with no entry in the lookup table, become NA, which keeps every
# column a plain character vector.
mushroom_subset_df[] <- lapply(names(mushroom_subset_df), function(col_name) {
    abbreviations <- as.character(mushroom_subset_df[[col_name]])
    codes <- unique(abbreviations[!is.na(abbreviations)])
    descriptions <- vapply(codes, function(code) {
        full <- as.character(table_converter(col_name, code))
        # Guard against a lookup that finds nothing for this code.
        if (length(full) == 0L) NA_character_ else full[1L]
    }, character(1), USE.NAMES = FALSE)
    descriptions[match(abbreviations, codes)]
})
# I figured I would include code to convert all columns to
# highlight how simple it is; however, I have commented out the
# non-subsetted columns in order to reduce run-time.


# mushroom_subset_df$cap.shape <-
# sapply(mushroom_subset_df$cap.shape, function(x){ x <-
# ifelse(is.na(x),NA,table_converter('cap.shape', x)) } )
# mushroom_subset_df$cap.surface <-
# sapply(mushroom_subset_df$cap.surface, function(x){ x <-
# ifelse(is.na(x),NA,table_converter('cap.surface', x)) } )
# mushroom_subset_df$gill.attachment <-
# sapply(mushroom_subset_df$gill.attachment, function(x){ x
# <- ifelse(is.na(x),NA,table_converter('gill.attachment',
# x)) } ) mushroom_subset_df$gill.spacing <-
# sapply(mushroom_subset_df$gill.spacing, function(x){ x <-
# ifelse(is.na(x),NA,table_converter('gill.spacing', x)) } )
# mushroom_subset_df$gill.size <-
# sapply(mushroom_subset_df$gill.size, function(x){ x <-
# ifelse(is.na(x),NA,table_converter('gill.size', x)) } )
# mushroom_subset_df$gill.color <-
# sapply(mushroom_subset_df$gill.color, function(x){ x <-
# ifelse(is.na(x),NA,table_converter('gill.color', x)) } )
# mushroom_subset_df$stalk.shape <-
# sapply(mushroom_subset_df$stalk.shape, function(x){ x <-
# ifelse(is.na(x),NA,table_converter('stalk.shape', x)) } )
# mushroom_subset_df$stalk.surface.above.ring <-
# sapply(mushroom_subset_df$stalk.surface.above.ring,
# function(x){ x <-
# ifelse(is.na(x),NA,table_converter('stalk.surface.above.ring',
# x)) } ) mushroom_subset_df$stalk.surface.below.ring <-
# sapply(mushroom_subset_df$stalk.surface.below.ring,
# function(x){ x <-
# ifelse(is.na(x),NA,table_converter('stalk.surface.below.ring',
# x)) } ) mushroom_subset_df$stalk.color.above.ring <-
# sapply(mushroom_subset_df$stalk.color.above.ring,
# function(x){ x <-
# ifelse(is.na(x),NA,table_converter('stalk.color.above.ring',
# x)) } ) mushroom_subset_df$stalk.color.below.ring <-
# sapply(mushroom_subset_df$stalk.color.below.ring,
# function(x){ x <-
# ifelse(is.na(x),NA,table_converter('stalk.color.below.ring',
# x)) } ) mushroom_subset_df$veil.type <-
# sapply(mushroom_subset_df$veil.type, function(x){ x <-
# ifelse(is.na(x),NA,table_converter('veil.type', x)) } )
# mushroom_subset_df$veil.color <-
# sapply(mushroom_subset_df$veil.color, function(x){ x <-
# ifelse(is.na(x),NA,table_converter('veil.color', x)) } )
# mushroom_subset_df$ring.number <-
# sapply(mushroom_subset_df$ring.number, function(x){ x <-
# ifelse(is.na(x),NA,table_converter('ring.number', x)) } )
# mushroom_subset_df$ring.type <-
# sapply(mushroom_subset_df$ring.type, function(x){ x <-
# ifelse(is.na(x),NA,table_converter('ring.type', x)) } )
# mushroom_subset_df$spore.print.color <-
# sapply(mushroom_subset_df$spore.print.color, function(x){ x
# <- ifelse(is.na(x),NA,table_converter('spore.print.color',
# x)) } ) mushroom_subset_df$population <-
# sapply(mushroom_subset_df$population, function(x){ x <-
# ifelse(is.na(x),NA,table_converter('population', x)) } )
# mushroom_subset_df$habitat <-
# sapply(mushroom_subset_df$habitat, function(x){ x <-
# ifelse(is.na(x),NA,table_converter('habitat', x)) } )
# Preview the first rows of the subset after the conversions above.
head(mushroom_subset_df)
##     type bruises.    odor cap.color stalk.root
## 2 edible  bruises  almond    yellow       club
## 3 edible  bruises   anise     white       club
## 5 edible       no    none      gray      equal
## 6 edible  bruises  almond    yellow       club
## 7 edible  bruises  almond     white       club
## 8 edible  bruises   anise     white       club

Summary bar charts showing the count of properties for each column

# Load the plotting packages with library() so that a missing package
# stops the script here with a clear error; require() would only return
# FALSE and let the failure surface later at the first plotting call.
library(ggplot2)
library(grid)
library(gridExtra)

# Every chart counts mushrooms by type; only the fill variable differs,
# so the shared base plot is built once and reused.
type_plot <- ggplot(mushroom_subset_df, aes(type))
p1 <- type_plot + geom_bar(aes(fill = bruises.), position = "dodge")
p2 <- type_plot + geom_bar(aes(fill = odor), position = "dodge")
p3 <- type_plot + geom_bar(aes(fill = cap.color), position = "dodge")
p4 <- type_plot + geom_bar(aes(fill = stalk.root), position = "dodge")

# Lay the four charts out in a two-column grid.
grid.arrange(p1, p2, p3, p4, ncol = 2)