Data607_Week1

Grando 8 Homework

# Standard setup: point the working directory at the assignment folder.
# The folder sits under a different parent on my Windows and Linux
# machines, so the path is chosen from the operating system name that
# Sys.info() reports.
setwd(
    if (Sys.info()[["sysname"]] == "Windows") {
        "~/Masters/DATA607/Week1/Assignment"
    } else {
        "~/Documents/Masters/DATA607/Week1/Assignment"
    }
)

Download the table describing each column and save it in a text file.

# Idea borrowed from one of the sample programs in the weekly materials:
# the table name descriptions were copy/pasted into a text file and
# formatted by hand.  The file is read with ":" as the field separator and
# the first field of every line is used as a column name.
mushroom_table_names <- read.csv(
    "./mushroom_table_names.txt",
    sep = ":",
    header = FALSE,
    stringsAsFactors = FALSE
)
mushroom_col_names <- mushroom_table_names[[1]]

Read the data and apply column names from the data table

# Location of the raw, header-less, comma-separated mushroom data.
mushroom_url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"

# Column names come from the description table read earlier.
# To treat the missing information as NA, add na.strings = "?" to the call
# below (seen in one of the provided examples).  It is left out here
# because the column value conversions further down take this a bit
# further.
mushroom_df <- read.csv(
    mushroom_url,
    sep = ",",
    header = FALSE,
    stringsAsFactors = FALSE,
    col.names = mushroom_col_names
)
head(mushroom_df)

##   type cap.shape cap.surface cap.color bruises. odor gill.attachment
## 1    p         x           s         n        t    p               f
## 2    e         x           s         y        t    a               f
## 3    e         b           s         w        t    l               f
## 4    p         x           y         w        t    p               f
## 5    e         x           s         g        f    n               f
## 6    e         x           y         y        t    a               f
##   gill.spacing gill.size gill.color stalk.shape stalk.root
## 1            c         n          k           e          e
## 2            c         b          k           e          c
## 3            c         b          n           e          c
## 4            c         n          n           e          e
## 5            w         b          k           t          e
## 6            c         b          n           e          c
##   stalk.surface.above.ring stalk.surface.below.ring stalk.color.above.ring
## 1                        s                        s                      w
## 2                        s                        s                      w
## 3                        s                        s                      w
## 4                        s                        s                      w
## 5                        s                        s                      w
## 6                        s                        s                      w
##   stalk.color.below.ring veil.type veil.color ring.number ring.type
## 1                      w         p          w           o         p
## 2                      w         p          w           o         p
## 3                      w         p          w           o         p
## 4                      w         p          w           o         p
## 5                      w         p          w           o         e
## 6                      w         p          w           o         p
##   spore.print.color population habitat
## 1                 k          s       u
## 2                 n          n       g
## 3                 n          n       m
## 4                 k          s       u
## 5                 n          a       g
## 6                 k          n       g

Create a function that returns a converted value for each column value based on the name of the column entered and the abbreviation given.

# Translate abbreviation(s) from one mushroom column into full descriptions.
#
# The lookup is built from the second column of `mushroom_table_names`:
# the row at the same position as `name_of_col` in `colnames(mushroom_df)`
# is split on "," and "=" into alternating description / abbreviation
# tokens.  Surrounding whitespace on each token is ignored.
#
# Args:
#   name_of_col: Name of a single column of `mushroom_df`.
#   ab: Abbreviation(s) to translate.  May be a vector, so a whole column
#     can be converted in one call.
#
# Returns:
#   A character vector with one description per element of `ab`.  Elements
#   that are NA or have no matching abbreviation come back as NA.
#
# Raises:
#   An error if `name_of_col` is not exactly one column name of
#   `mushroom_df`.
table_converter <- function(name_of_col, ab) {
    column_number <- match(name_of_col, colnames(mushroom_df))
    if (length(column_number) != 1L || is.na(column_number)) {
        stop(
            "Unknown column name: ",
            paste(name_of_col, collapse = ", "),
            call. = FALSE
        )
    }

    description_row <- as.character(mushroom_table_names[[2]])[column_number]
    tokens <- trimws(strsplit(description_row, split = ",|=")[[1]])

    # Tokens alternate: description, abbreviation, description, ...
    descriptions <- tokens[c(TRUE, FALSE)]
    abbreviations <- tokens[c(FALSE, TRUE)]

    # match() is vectorised over `ab` and yields NA for anything not found.
    descriptions[match(ab, abbreviations)]
}

Subset the dataframe.

# Keep only the rows with one particular gill size (broad, "b") and a
# handful of columns.  The choice of gill size has no special meaning; it
# simply demonstrates selecting a specific group by column value.
# which() discards rows where the comparison is NA.
mushroom_subset_df <- mushroom_df[
    which(mushroom_df$gill.size == "b"),
    c("type", "bruises.", "odor", "cap.color", "stalk.root"),
    drop = FALSE
]

Convert the subsetted data frame from abbreviations to full descriptions.

# Convert the subsetted columns from abbreviations to full descriptions.
# Every column is treated the same way, so the work is driven by a vector
# of column names instead of one copy of the conversion code per column.
# Each distinct abbreviation in a column is looked up only once and the
# result is then mapped back onto the rows; this keeps the run time down
# compared with calling table_converter() for every single row.
columns_to_convert <- c("type", "cap.color", "bruises.", "odor", "stalk.root")
for (column_name in columns_to_convert) {
    codes <- mushroom_subset_df[[column_name]]
    distinct_codes <- unique(codes[!is.na(codes)])

    # `[1]` guarantees exactly one value per abbreviation: the first match,
    # or NA if the lookup comes back empty.
    descriptions <- vapply(
        distinct_codes,
        function(code) table_converter(column_name, code)[1],
        character(1),
        USE.NAMES = FALSE
    )

    # match() gives NA for NA codes, so missing values stay NA.
    mushroom_subset_df[[column_name]] <-
        descriptions[match(codes, distinct_codes)]
}

# Any other column can be converted the same way by adding its name to
# columns_to_convert, provided the column is present in mushroom_subset_df
# (the subset step keeps only the five columns listed here).
head(mushroom_subset_df)

##     type bruises.    odor cap.color stalk.root
## 2 edible  bruises  almond    yellow       club
## 3 edible  bruises   anise     white       club
## 5 edible       no    none      gray      equal
## 6 edible  bruises  almond    yellow       club
## 7 edible  bruises  almond     white       club
## 8 edible  bruises   anise     white       club

Summary bar charts showing the count of properties for each column

# Attach the plotting packages.  library() stops with an error when a
# package is not installed, whereas require() only warns and returns
# FALSE, which would postpone the failure to the first plotting call.
library(ggplot2)
library(grid)
library(gridExtra)

# All four charts share the same data and x axis (mushroom type); only the
# fill variable differs, so build the common base once and add a dodged
# bar layer per property.
type_axis <- ggplot(mushroom_subset_df, aes(x = type))

p1 <- type_axis + geom_bar(aes(fill = bruises.), position = "dodge")
p2 <- type_axis + geom_bar(aes(fill = odor), position = "dodge")
p3 <- type_axis + geom_bar(aes(fill = cap.color), position = "dodge")
p4 <- type_axis + geom_bar(aes(fill = stalk.root), position = "dodge")

# Lay the four charts out in a two-column grid.
grid.arrange(p1, p2, p3, p4, ncol = 2)

Data607_Week1

John Grando

August 28, 2017

Grando 8 Homework

Download the table describing each column and save it in a text file.

Read the data and apply column names from the data table

Create a function that returns a converted value for each column value based on the name of the column entered and the abbreviation given.

Subset the dataframe.

Convert the subsetted data frame from abbreviations to full descriptions.

Summary bar charts showing the count of properties for each column