Use getURL function to get the data from my github repository. The assign the data to x.
library(RCurl)
x <- getURL("https://raw.githubusercontent.com/MundyMSDS/DATA607/master/mushroom.csv")
Will utlized the tidyverse library and related tools to complete this assignment.
library (tidyverse)
mushrooms <-read_csv(x, col_names= c("isEdible",
"cap_shape",
"cap_surface",
"cap_color",
"bruises",
"odor",
"gill_attachment",
"gill_spacing",
"gill_size",
"gill_color",
"stalk_shape",
"stalk_root",
"stalk_surface_above_ring",
"stalk_surface_below_ring",
"stalk_color_above_ring",
"stalk_color_below_ring",
"veil_type",
"veil_color",
"ring_number",
"ring_type",
"spore_print_color",
"population",
"habitat")
)
glimpse(mushrooms)
## Observations: 8,124
## Variables: 23
## $ isEdible <chr> "p", "e", "e", "p", "e", "e", "e", "e...
## $ cap_shape <chr> "x", "x", "b", "x", "x", "x", "b", "b...
## $ cap_surface <chr> "s", "s", "s", "y", "s", "y", "s", "y...
## $ cap_color <chr> "n", "y", "w", "w", "g", "y", "w", "w...
## $ bruises <lgl> TRUE, TRUE, TRUE, TRUE, FALSE, TRUE, ...
## $ odor <chr> "p", "a", "l", "p", "n", "a", "a", "l...
## $ gill_attachment <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FA...
## $ gill_spacing <chr> "c", "c", "c", "c", "w", "c", "c", "c...
## $ gill_size <chr> "n", "b", "b", "n", "b", "b", "b", "b...
## $ gill_color <chr> "k", "k", "n", "n", "k", "n", "g", "n...
## $ stalk_shape <chr> "e", "e", "e", "e", "t", "e", "e", "e...
## $ stalk_root <chr> "e", "c", "c", "e", "e", "c", "c", "c...
## $ stalk_surface_above_ring <chr> "s", "s", "s", "s", "s", "s", "s", "s...
## $ stalk_surface_below_ring <chr> "s", "s", "s", "s", "s", "s", "s", "s...
## $ stalk_color_above_ring <chr> "w", "w", "w", "w", "w", "w", "w", "w...
## $ stalk_color_below_ring <chr> "w", "w", "w", "w", "w", "w", "w", "w...
## $ veil_type <chr> "p", "p", "p", "p", "p", "p", "p", "p...
## $ veil_color <chr> "w", "w", "w", "w", "w", "w", "w", "w...
## $ ring_number <chr> "o", "o", "o", "o", "o", "o", "o", "o...
## $ ring_type <chr> "p", "p", "p", "p", "e", "p", "p", "p...
## $ spore_print_color <chr> "k", "n", "n", "k", "n", "k", "k", "n...
## $ population <chr> "s", "n", "n", "s", "a", "n", "n", "s...
## $ habitat <chr> "u", "g", "m", "u", "g", "g", "m", "m...
We will define each variable as a factor and then apply the required data transformations.
mushrooms <- mushrooms %>% map_df(function(.x) as.factor(.x))
levels(mushrooms$isEdible) <- c("edible", "poisonous")
levels(mushrooms$cap_shape) <- c("bell", "conical", "flat", "knobbed", "sunken", "convex")
levels(mushrooms$cap_color) <- c("buff", "cinnamon", "red", "gray", "brown", "pink",
"green", "purple", "white", "yellow")
levels(mushrooms$cap_surface) <- c("fibrous", "grooves", "scaly", "smooth")
levels(mushrooms$bruises) <- c("no", "yes")
levels(mushrooms$odor) <- c("almond", "creosote", "foul", "anise", "musty", "none", "pungent", "spicy", "fishy")
levels(mushrooms$gill_attachment) <- c("attached", "free")
levels(mushrooms$gill_spacing) <- c("close", "crowded")
levels(mushrooms$gill_size) <- c("broad", "narrow")
levels(mushrooms$gill_color) <- c("buff", "red", "gray", "chocolate", "black", "brown", "orange",
"pink", "green", "purple", "white", "yellow")
levels(mushrooms$stalk_shape) <- c("enlarging", "tapering")
levels(mushrooms$stalk_root) <- c("missing", "bulbous", "club", "equal", "rooted")
levels(mushrooms$stalk_surface_above_ring) <- c("fibrous", "silky", "smooth", "scaly")
levels(mushrooms$stalk_surface_below_ring) <- c("fibrous", "silky", "smooth", "scaly")
levels(mushrooms$stalk_color_above_ring) <- c("buff", "cinnamon", "red", "gray", "brown", "pink",
"green", "purple", "white", "yellow")
levels(mushrooms$stalk_color_below_ring) <- c("buff", "cinnamon", "red", "gray", "brown", "pink",
"green", "purple", "white", "yellow")
levels(mushrooms$veil_type) <- "partial"
levels(mushrooms$veil_color) <- c("brown", "orange", "white", "yellow")
levels(mushrooms$ring_number) <- c("none", "one", "two")
levels(mushrooms$ring_type) <- c("evanescent", "flaring", "large", "none", "pendant")
levels(mushrooms$spore_print_color) <- c("buff", "chocolate", "black", "brown", "orange",
"green", "purple", "white", "yellow")
levels(mushrooms$population) <- c("abundant", "clustered", "numerous", "scattered", "several", "solitary")
levels(mushrooms$habitat) <- c("wood", "grasses", "leaves", "meadows", "paths", "urban", "waste")
Here are the transformed facators
str(mushrooms)
## Classes 'tbl_df', 'tbl' and 'data.frame': 8124 obs. of 23 variables:
## $ isEdible : Factor w/ 2 levels "edible","poisonous": 2 1 1 2 1 1 1 1 2 1 ...
## $ cap_shape : Factor w/ 6 levels "bell","conical",..: 6 6 1 6 6 6 1 1 6 1 ...
## $ cap_surface : Factor w/ 4 levels "fibrous","grooves",..: 3 3 3 4 3 4 3 4 4 3 ...
## $ cap_color : Factor w/ 10 levels "buff","cinnamon",..: 5 10 9 9 4 10 9 9 9 10 ...
## $ bruises : Factor w/ 2 levels "no","yes": 2 2 2 2 1 2 2 2 2 2 ...
## $ odor : Factor w/ 9 levels "almond","creosote",..: 7 1 4 7 6 1 1 4 7 1 ...
## $ gill_attachment : Factor w/ 2 levels "attached","free": 1 1 1 1 1 1 1 1 1 1 ...
## $ gill_spacing : Factor w/ 2 levels "close","crowded": 1 1 1 1 2 1 1 1 1 1 ...
## $ gill_size : Factor w/ 2 levels "broad","narrow": 2 1 1 2 1 1 1 1 2 1 ...
## $ gill_color : Factor w/ 12 levels "buff","red","gray",..: 5 5 6 6 5 6 3 6 8 3 ...
## $ stalk_shape : Factor w/ 2 levels "enlarging","tapering": 1 1 1 1 2 1 1 1 1 1 ...
## $ stalk_root : Factor w/ 5 levels "missing","bulbous",..: 4 3 3 4 4 3 3 3 4 3 ...
## $ stalk_surface_above_ring: Factor w/ 4 levels "fibrous","silky",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ stalk_surface_below_ring: Factor w/ 4 levels "fibrous","silky",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ stalk_color_above_ring : Factor w/ 10 levels "buff","cinnamon",..: 8 8 8 8 8 8 8 8 8 8 ...
## $ stalk_color_below_ring : Factor w/ 10 levels "buff","cinnamon",..: 8 8 8 8 8 8 8 8 8 8 ...
## $ veil_type : Factor w/ 1 level "partial": 1 1 1 1 1 1 1 1 1 1 ...
## $ veil_color : Factor w/ 4 levels "brown","orange",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ ring_number : Factor w/ 3 levels "none","one","two": 2 2 2 2 2 2 2 2 2 2 ...
## $ ring_type : Factor w/ 5 levels "evanescent","flaring",..: 5 5 5 5 1 5 5 5 5 5 ...
## $ spore_print_color : Factor w/ 9 levels "buff","chocolate",..: 3 4 4 3 4 3 3 4 3 3 ...
## $ population : Factor w/ 6 levels "abundant","clustered",..: 4 3 3 4 1 3 3 4 5 4 ...
## $ habitat : Factor w/ 7 levels "wood","grasses",..: 6 2 4 6 2 2 4 4 2 4 ...
Remove factor (veil_type) with one level.
mushrooms <- mushrooms %>% select(- veil_type)
Check for missing data / values
map_dbl(mushrooms, function(.x) {sum(is.na(.x))})
## isEdible cap_shape cap_surface
## 0 0 0
## cap_color bruises odor
## 0 0 0
## gill_attachment gill_spacing gill_size
## 210 0 0
## gill_color stalk_shape stalk_root
## 0 0 0
## stalk_surface_above_ring stalk_surface_below_ring stalk_color_above_ring
## 0 0 0
## stalk_color_below_ring veil_color ring_number
## 0 0 0
## ring_type spore_print_color population
## 0 0 0
## habitat
## 0
Remove gill_attachment because it has missing values
mushrooms <- mushrooms %>% select(- gill_attachment)
Select isEdible and four or five other columns.
mushrooms <- mushrooms %>% select(isEdible, odor, cap_surface, cap_color, gill_color, cap_shape)
Show summary of the mushrooms dataframe (tibble)
summary(mushrooms)
## isEdible odor cap_surface cap_color
## edible :4208 none :3528 fibrous:2320 brown :2284
## poisonous:3916 foul :2160 grooves: 4 gray :1840
## spicy : 576 scaly :2556 red :1500
## fishy : 576 smooth :3244 yellow :1072
## almond : 400 white :1040
## anise : 400 buff : 168
## (Other): 484 (Other): 220
## gill_color cap_shape
## buff :1728 bell : 452
## pink :1492 conical: 4
## white :1202 flat :3152
## brown :1048 knobbed: 828
## gray : 752 sunken : 32
## chocolate: 732 convex :3656
## (Other) :1170
Use ggplot2 to visulize the data to help determine the interesting variables.
ggplot(mushrooms, aes(x = cap_surface, y = cap_color, col = isEdible)) +
geom_jitter(alpha = 0.5) +
scale_color_manual(breaks = c("edible", "poisonous"),
values = c("green", "red"))
Fibrous mushrooms appear the safest to eat.
ggplot(mushrooms, aes(x = cap_shape, y = cap_color, col = isEdible)) +
geom_jitter(alpha = 0.5) +
scale_color_manual(breaks = c("edible", "poisonous"),
values = c("green", "red"))
Bell shaped mushrooms seem like the safe bet.
ggplot(mushrooms, aes(x = gill_color, y = cap_color, col = isEdible)) +
geom_jitter(alpha = 0.5) +
scale_color_manual(breaks = c("edible", "poisonous"),
values = c("green", "red"))
ggplot(mushrooms, aes(x = isEdible, y = odor, col = isEdible)) +
geom_jitter(alpha = 0.5) +
scale_color_manual(breaks = c("edible", "poisonous"),
values = c("green", "red"))
‘’’ Inspiration 1. R for Data Science 2. Machine Learning with R 3. ggplot2 (the book) ‘’’