DATA 607 HW1

Load the mushroom data as a data frame from the GitHub raw file.

Change the names of each column so they are identifiable.

# Give each of the 23 columns of `df` a descriptive name.
# NOTE(review): "gill-attatchment" and "stack-surface-above-ring" look like
# misspellings of "gill-attachment" and "stalk-surface-above-ring". They are
# kept verbatim because the printed output below shows these exact names.
names(df) <- c(
  "Poisonous?", "Cap-Shape", "Cap-Surface", "Cap-Color",
  "Bruises?", "Odor",
  "gill-attatchment", "gill-spacing", "gill-size", "gill-color",
  "stalk-shape", "stalk-root",
  "stack-surface-above-ring", "stalk-surface-below-ring",
  "stalk-color-above-ring", "stalk-color-below-ring",
  "veil-type", "veil-color",
  "ring-number", "ring-type",
  "spore-print-color", "population", "habitat"
)
# Preview the first rows to confirm the new names were applied.
head(df)

##   Poisonous? Cap-Shape Cap-Surface Cap-Color Bruises? Odor
## 1          e         x           s         y        t    a
## 2          e         b           s         w        t    l
## 3          p         x           y         w        t    p
## 4          e         x           s         g        f    n
## 5          e         x           y         y        t    a
## 6          e         b           s         w        t    a
##   gill-attatchment gill-spacing gill-size gill-color stalk-shape
## 1                f            c         b          k           e
## 2                f            c         b          n           e
## 3                f            c         n          n           e
## 4                f            w         b          k           t
## 5                f            c         b          n           e
## 6                f            c         b          g           e
##   stalk-root stack-surface-above-ring stalk-surface-below-ring
## 1          c                        s                        s
## 2          c                        s                        s
## 3          e                        s                        s
## 4          e                        s                        s
## 5          c                        s                        s
## 6          c                        s                        s
##   stalk-color-above-ring stalk-color-below-ring veil-type veil-color
## 1                      w                      w         p          w
## 2                      w                      w         p          w
## 3                      w                      w         p          w
## 4                      w                      w         p          w
## 5                      w                      w         p          w
## 6                      w                      w         p          w
##   ring-number ring-type spore-print-color population habitat
## 1           o         p                 n          n       g
## 2           o         p                 n          n       m
## 3           o         p                 k          s       u
## 4           o         e                 n          a       g
## 5           o         p                 k          n       g
## 6           o         p                 k          n       m

Subset the mushroom data so that only the features of interest are kept. I want to identify which cap colors, odors and habitats are associated with poisonous mushrooms. The subset therefore consists of four features: the poisonous/edible label, cap color, odor and habitat.

# Keep only the edibility label and the three features under study.
# data.frame() runs with check.names = TRUE (its default), which rewrites the
# non-syntactic names "Poisonous?" and "Cap-Color" to "Poisonous." and
# "Cap.Color". The later recoding steps refer to those rewritten names.
mushrooms <- data.frame(
  df[, c("Poisonous?", "Cap-Color", "Odor", "habitat"), drop = FALSE],
  check.names = TRUE
)

Next, the one-letter codes in each feature are replaced with descriptive labels so that the values can be understood. Each column is first converted with the as.character function, because the values must be character strings before they can be replaced.

# Recode the edibility label from its one-letter code to a readable word.
# The column is converted with as.character() first so the labels can be
# assigned as plain strings. Values not listed in `labels` (including NA)
# are left unchanged.
mushrooms$Poisonous. <- local({
  labels <- c(e = "edible", p = "poisonous")
  values <- as.character(mushrooms$Poisonous.)
  known <- values %in% names(labels)
  values[known] <- unname(labels[values[known]])
  values
})


# Recode cap color from one-letter codes to color names.
# The column is converted with as.character() first so the labels can be
# assigned as plain strings. Values not listed in `labels` (including NA)
# are left unchanged.
mushrooms$Cap.Color <- local({
  labels <- c(
    n = "brown",
    e = "red",
    g = "gray",
    y = "yellow",
    w = "white",
    b = "buff",
    c = "cinnamon",
    r = "green",
    p = "pink",
    u = "purple"
  )
  values <- as.character(mushrooms$Cap.Color)
  known <- values %in% names(labels)
  values[known] <- unname(labels[values[known]])
  values
})

# Recode odor from one-letter codes to descriptive names.
# The column is converted with as.character() first so the labels can be
# assigned as plain strings. Values not listed in `labels` (including NA)
# are left unchanged.
mushrooms$Odor <- local({
  labels <- c(
    a = "almond",
    l = "anise",
    c = "creosote",
    y = "fishy",
    f = "foul",
    m = "musty",
    n = "none",
    p = "pungent",
    s = "spicy"
  )
  values <- as.character(mushrooms$Odor)
  known <- values %in% names(labels)
  values[known] <- unname(labels[values[known]])
  values
})

# Recode habitat from one-letter codes to descriptive names.
# The column is converted with as.character() first so the labels can be
# assigned as plain strings. Values not listed in `labels` (including NA)
# are left unchanged.
mushrooms$habitat <- local({
  labels <- c(
    g = "grasses",
    l = "leaves",
    m = "meadows",
    p = "paths",
    u = "urban",
    w = "waste",
    d = "woods"
  )
  values <- as.character(mushrooms$habitat)
  known <- values %in% names(labels)
  values[known] <- unname(labels[values[known]])
  values
})
# Preview the first rows of the recoded data.
head(mushrooms)

##   Poisonous. Cap.Color    Odor habitat
## 1     edible    yellow  almond grasses
## 2     edible     white   anise meadows
## 3  poisonous     white pungent   urban
## 4     edible      gray    none grasses
## 5     edible    yellow  almond grasses
## 6     edible     white  almond meadows

Now that the data is correctly structured, we are ready for analysis. In this next step the data is split into two datasets based on whether the mushrooms are poisonous or edible.

# Split the recoded data by edibility label. which() keeps only the rows
# where the comparison is TRUE, so rows with an NA label are dropped.
PoisonMushrooms <- mushrooms[
  which(mushrooms$Poisonous. == "poisonous"), , drop = FALSE
]
EdibleMushrooms <- mushrooms[
  which(mushrooms$Poisonous. == "edible"), , drop = FALSE
]

A bar graph of mushroom counts by habitat is created for each of the edible and poisonous subsets.

# Count the mushrooms found in each habitat, separately for each subset.
phabfreq <- table(PoisonMushrooms$habitat)
ehabfreq <- table(EdibleMushrooms$habitat)

# One bar chart per subset, with habitats ordered from most to least frequent.
# TRUE is spelled out because T is an ordinary variable that can be
# reassigned, which would silently change the sort order.
barplot(phabfreq[order(phabfreq, decreasing = TRUE)],
        main = "Poisonous",
        xlab = "Habitat",
        ylab = "Count")

barplot(ehabfreq[order(ehabfreq, decreasing = TRUE)],
        main = "Edible",
        xlab = "Habitat",
        ylab = "Count")

Next, three bar graphs are created, one each for the color, odor and habitat features. Each plot shows, side by side, the proportion of edible and of poisonous mushrooms that fall into each category of the feature.

# Cross-tabulate the edibility label (rows) against cap color (columns), then
# convert each row to proportions so that every edibility class sums to 1.
table1 <- table(mushrooms$Poisonous., mushrooms$Cap.Color)
table2 <- prop.table(table1, 1)
# Grouped bars: one bar per edibility class for each cap color.
# - legend.text is written in full instead of relying on partial matching
#   of `legend`.
# - The y-axis is labelled as a within-class proportion because the bars cover
#   both classes, not only poisonous mushrooms.
barplot(table2,
        beside = TRUE,
        legend.text = rownames(table2),
        main = "Cap Color by Edibility",
        xlab = "Cap Color",
        ylab = "Proportion within Edibility Class",
        col = c("blue", "red"))

# Cross-tabulate the edibility label (rows) against odor (columns), then
# convert each row to proportions so that every edibility class sums to 1.
table1 <- table(mushrooms$Poisonous., mushrooms$Odor)
table2 <- prop.table(table1, 1)
# Grouped bars: one bar per edibility class for each odor.
# - legend.text is written in full instead of relying on partial matching
#   of `legend`.
# - The y-axis is labelled as a within-class proportion because the bars cover
#   both classes, not only poisonous mushrooms.
barplot(table2,
        beside = TRUE,
        legend.text = rownames(table2),
        main = "Odor by Edibility",
        xlab = "Odor",
        ylab = "Proportion within Edibility Class",
        col = c("blue", "red"))

# Cross-tabulate the edibility label (rows) against habitat (columns), then
# convert each row to proportions so that every edibility class sums to 1.
table1 <- table(mushrooms$Poisonous., mushrooms$habitat)
table2 <- prop.table(table1, 1)
# Grouped bars: one bar per edibility class for each habitat.
# - legend.text is written in full instead of relying on partial matching
#   of `legend`.
# - The y-axis is labelled as a within-class proportion because the bars cover
#   both classes, not only poisonous mushrooms.
barplot(table2,
        beside = TRUE,
        legend.text = rownames(table2),
        main = "Habitat by Edibility",
        xlab = "Habitat",
        ylab = "Proportion within Edibility Class",
        col = c("blue", "red"))

DATA 607 HW1

Will Outcault

8/29/2019