Get The Data

Use getURL function to get the data from my github repository. The assign the data to x.

library(RCurl)
x <- getURL("https://raw.githubusercontent.com/MundyMSDS/DATA607/master/mushroom.csv")

Import The Data

Will utlized the tidyverse library and related tools to complete this assignment.

library (tidyverse)
mushrooms <-read_csv(x, col_names= c("isEdible",
                                     "cap_shape",
                                     "cap_surface",
                                     "cap_color",
                                     "bruises",
                                     "odor",
                                     "gill_attachment",
                                     "gill_spacing",
                                     "gill_size",
                                     "gill_color",
                                     "stalk_shape",
                                     "stalk_root",
                                     "stalk_surface_above_ring",
                                     "stalk_surface_below_ring",
                                     "stalk_color_above_ring",
                                     "stalk_color_below_ring",
                                     "veil_type",
                                     "veil_color",
                                     "ring_number",
                                     "ring_type",
                                     "spore_print_color",
                                     "population",
                                     "habitat")
)
glimpse(mushrooms)
## Observations: 8,124
## Variables: 23
## $ isEdible                 <chr> "p", "e", "e", "p", "e", "e", "e", "e...
## $ cap_shape                <chr> "x", "x", "b", "x", "x", "x", "b", "b...
## $ cap_surface              <chr> "s", "s", "s", "y", "s", "y", "s", "y...
## $ cap_color                <chr> "n", "y", "w", "w", "g", "y", "w", "w...
## $ bruises                  <lgl> TRUE, TRUE, TRUE, TRUE, FALSE, TRUE, ...
## $ odor                     <chr> "p", "a", "l", "p", "n", "a", "a", "l...
## $ gill_attachment          <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FA...
## $ gill_spacing             <chr> "c", "c", "c", "c", "w", "c", "c", "c...
## $ gill_size                <chr> "n", "b", "b", "n", "b", "b", "b", "b...
## $ gill_color               <chr> "k", "k", "n", "n", "k", "n", "g", "n...
## $ stalk_shape              <chr> "e", "e", "e", "e", "t", "e", "e", "e...
## $ stalk_root               <chr> "e", "c", "c", "e", "e", "c", "c", "c...
## $ stalk_surface_above_ring <chr> "s", "s", "s", "s", "s", "s", "s", "s...
## $ stalk_surface_below_ring <chr> "s", "s", "s", "s", "s", "s", "s", "s...
## $ stalk_color_above_ring   <chr> "w", "w", "w", "w", "w", "w", "w", "w...
## $ stalk_color_below_ring   <chr> "w", "w", "w", "w", "w", "w", "w", "w...
## $ veil_type                <chr> "p", "p", "p", "p", "p", "p", "p", "p...
## $ veil_color               <chr> "w", "w", "w", "w", "w", "w", "w", "w...
## $ ring_number              <chr> "o", "o", "o", "o", "o", "o", "o", "o...
## $ ring_type                <chr> "p", "p", "p", "p", "e", "p", "p", "p...
## $ spore_print_color        <chr> "k", "n", "n", "k", "n", "k", "k", "n...
## $ population               <chr> "s", "n", "n", "s", "a", "n", "n", "s...
## $ habitat                  <chr> "u", "g", "m", "u", "g", "g", "m", "m...

Tidy The Data

We will define each variable as a factor and then apply the required data transformations.

mushrooms <- mushrooms %>% map_df(function(.x) as.factor(.x))

levels(mushrooms$isEdible) <- c("edible", "poisonous")
levels(mushrooms$cap_shape) <- c("bell", "conical", "flat", "knobbed", "sunken", "convex")
levels(mushrooms$cap_color) <- c("buff", "cinnamon", "red", "gray", "brown", "pink", 
                                 "green", "purple", "white", "yellow")
levels(mushrooms$cap_surface) <- c("fibrous", "grooves", "scaly", "smooth")
levels(mushrooms$bruises) <- c("no", "yes")
levels(mushrooms$odor) <- c("almond", "creosote", "foul", "anise", "musty", "none", "pungent", "spicy", "fishy")
levels(mushrooms$gill_attachment) <- c("attached", "free")
levels(mushrooms$gill_spacing) <- c("close", "crowded")
levels(mushrooms$gill_size) <- c("broad", "narrow")
levels(mushrooms$gill_color) <- c("buff", "red", "gray", "chocolate", "black", "brown", "orange", 
                                  "pink", "green", "purple", "white", "yellow")
levels(mushrooms$stalk_shape) <- c("enlarging", "tapering")
levels(mushrooms$stalk_root) <- c("missing", "bulbous", "club", "equal", "rooted")
levels(mushrooms$stalk_surface_above_ring) <- c("fibrous", "silky", "smooth", "scaly")
levels(mushrooms$stalk_surface_below_ring) <- c("fibrous", "silky", "smooth", "scaly")
levels(mushrooms$stalk_color_above_ring) <- c("buff", "cinnamon", "red", "gray", "brown", "pink", 
                                              "green", "purple", "white", "yellow")
levels(mushrooms$stalk_color_below_ring) <- c("buff", "cinnamon", "red", "gray", "brown", "pink", 
                                              "green", "purple", "white", "yellow")
levels(mushrooms$veil_type) <- "partial"
levels(mushrooms$veil_color) <- c("brown", "orange", "white", "yellow")
levels(mushrooms$ring_number) <- c("none", "one", "two")
levels(mushrooms$ring_type) <- c("evanescent", "flaring", "large", "none", "pendant")
levels(mushrooms$spore_print_color) <- c("buff", "chocolate", "black", "brown", "orange", 
                                         "green", "purple", "white", "yellow")
levels(mushrooms$population) <- c("abundant", "clustered", "numerous", "scattered", "several", "solitary")
levels(mushrooms$habitat) <- c("wood", "grasses", "leaves", "meadows", "paths", "urban", "waste")

Here are the transformed facators

str(mushrooms)
## Classes 'tbl_df', 'tbl' and 'data.frame':    8124 obs. of  23 variables:
##  $ isEdible                : Factor w/ 2 levels "edible","poisonous": 2 1 1 2 1 1 1 1 2 1 ...
##  $ cap_shape               : Factor w/ 6 levels "bell","conical",..: 6 6 1 6 6 6 1 1 6 1 ...
##  $ cap_surface             : Factor w/ 4 levels "fibrous","grooves",..: 3 3 3 4 3 4 3 4 4 3 ...
##  $ cap_color               : Factor w/ 10 levels "buff","cinnamon",..: 5 10 9 9 4 10 9 9 9 10 ...
##  $ bruises                 : Factor w/ 2 levels "no","yes": 2 2 2 2 1 2 2 2 2 2 ...
##  $ odor                    : Factor w/ 9 levels "almond","creosote",..: 7 1 4 7 6 1 1 4 7 1 ...
##  $ gill_attachment         : Factor w/ 2 levels "attached","free": 1 1 1 1 1 1 1 1 1 1 ...
##  $ gill_spacing            : Factor w/ 2 levels "close","crowded": 1 1 1 1 2 1 1 1 1 1 ...
##  $ gill_size               : Factor w/ 2 levels "broad","narrow": 2 1 1 2 1 1 1 1 2 1 ...
##  $ gill_color              : Factor w/ 12 levels "buff","red","gray",..: 5 5 6 6 5 6 3 6 8 3 ...
##  $ stalk_shape             : Factor w/ 2 levels "enlarging","tapering": 1 1 1 1 2 1 1 1 1 1 ...
##  $ stalk_root              : Factor w/ 5 levels "missing","bulbous",..: 4 3 3 4 4 3 3 3 4 3 ...
##  $ stalk_surface_above_ring: Factor w/ 4 levels "fibrous","silky",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ stalk_surface_below_ring: Factor w/ 4 levels "fibrous","silky",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ stalk_color_above_ring  : Factor w/ 10 levels "buff","cinnamon",..: 8 8 8 8 8 8 8 8 8 8 ...
##  $ stalk_color_below_ring  : Factor w/ 10 levels "buff","cinnamon",..: 8 8 8 8 8 8 8 8 8 8 ...
##  $ veil_type               : Factor w/ 1 level "partial": 1 1 1 1 1 1 1 1 1 1 ...
##  $ veil_color              : Factor w/ 4 levels "brown","orange",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ ring_number             : Factor w/ 3 levels "none","one","two": 2 2 2 2 2 2 2 2 2 2 ...
##  $ ring_type               : Factor w/ 5 levels "evanescent","flaring",..: 5 5 5 5 1 5 5 5 5 5 ...
##  $ spore_print_color       : Factor w/ 9 levels "buff","chocolate",..: 3 4 4 3 4 3 3 4 3 3 ...
##  $ population              : Factor w/ 6 levels "abundant","clustered",..: 4 3 3 4 1 3 3 4 5 4 ...
##  $ habitat                 : Factor w/ 7 levels "wood","grasses",..: 6 2 4 6 2 2 4 4 2 4 ...

Remove factor (veil_type) with one level.

mushrooms <- mushrooms %>% select(- veil_type)

Check for missing data / values

map_dbl(mushrooms, function(.x) {sum(is.na(.x))})
##                 isEdible                cap_shape              cap_surface 
##                        0                        0                        0 
##                cap_color                  bruises                     odor 
##                        0                        0                        0 
##          gill_attachment             gill_spacing                gill_size 
##                      210                        0                        0 
##               gill_color              stalk_shape               stalk_root 
##                        0                        0                        0 
## stalk_surface_above_ring stalk_surface_below_ring   stalk_color_above_ring 
##                        0                        0                        0 
##   stalk_color_below_ring               veil_color              ring_number 
##                        0                        0                        0 
##                ring_type        spore_print_color               population 
##                        0                        0                        0 
##                  habitat 
##                        0

Remove gill_attachment because it has missing values

mushrooms <- mushrooms %>% select(- gill_attachment)

Select isEdible and four or five other columns.

mushrooms <- mushrooms %>% select(isEdible, odor, cap_surface, cap_color, gill_color, cap_shape)

Show summary of the mushrooms dataframe (tibble)

summary(mushrooms)
##       isEdible         odor       cap_surface     cap_color   
##  edible   :4208   none   :3528   fibrous:2320   brown  :2284  
##  poisonous:3916   foul   :2160   grooves:   4   gray   :1840  
##                   spicy  : 576   scaly  :2556   red    :1500  
##                   fishy  : 576   smooth :3244   yellow :1072  
##                   almond : 400                  white  :1040  
##                   anise  : 400                  buff   : 168  
##                   (Other): 484                  (Other): 220  
##      gill_color     cap_shape   
##  buff     :1728   bell   : 452  
##  pink     :1492   conical:   4  
##  white    :1202   flat   :3152  
##  brown    :1048   knobbed: 828  
##  gray     : 752   sunken :  32  
##  chocolate: 732   convex :3656  
##  (Other)  :1170

Explore The Selected Data

Use ggplot2 to visulize the data to help determine the interesting variables.

ggplot(mushrooms, aes(x = cap_surface, y = cap_color, col = isEdible)) + 
  geom_jitter(alpha = 0.5) + 
  scale_color_manual(breaks = c("edible", "poisonous"), 
                     values = c("green", "red"))

Fibrous mushrooms appear the safest to eat.

ggplot(mushrooms, aes(x = cap_shape, y = cap_color, col = isEdible)) + 
  geom_jitter(alpha = 0.5) + 
  scale_color_manual(breaks = c("edible", "poisonous"), 
                     values = c("green", "red"))

Bell shaped mushrooms seem like the safe bet.

ggplot(mushrooms, aes(x = gill_color, y = cap_color, col = isEdible)) + 
  geom_jitter(alpha = 0.5) + 
  scale_color_manual(breaks = c("edible", "poisonous"), 
                     values = c("green", "red"))

ggplot(mushrooms, aes(x = isEdible, y = odor, col = isEdible)) + 
  geom_jitter(alpha = 0.5) + 
  scale_color_manual(breaks = c("edible", "poisonous"), 
                     values = c("green", "red"))

‘’’ Inspiration 1. R for Data Science 2. Machine Learning with R 3. ggplot2 (the book) ‘’’