library(plyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0 ✔ readr 1.1.1
## ✔ tibble 1.4.2 ✔ purrr 0.2.5
## ✔ tidyr 0.8.2 ✔ stringr 1.3.1
## ✔ ggplot2 3.1.0 ✔ forcats 0.3.0
## ── Conflicts ──────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::arrange() masks plyr::arrange()
## ✖ purrr::compact() masks plyr::compact()
## ✖ dplyr::count() masks plyr::count()
## ✖ dplyr::failwith() masks plyr::failwith()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::id() masks plyr::id()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::mutate() masks plyr::mutate()
## ✖ dplyr::rename() masks plyr::rename()
## ✖ dplyr::summarise() masks plyr::summarise()
## ✖ dplyr::summarize() masks plyr::summarize()
# Read the mushroom data set straight from the UCI repository.
# The file has no header row, so readr assigns the names X1..X23.
# Every field holds a one-letter category code. The column types are pinned
# to character instead of being guessed, so codes such as "t"/"f" can never
# be read as anything other than text. Supplying col_types also means readr
# no longer prints its "Parsed with column specification" message.
shroomsdf <- read_csv(
  "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data",
  col_names = FALSE,
  col_types = cols(.default = col_character())
)
# preview the first six rows of the raw import
head(shroomsdf, n = 6L)
## # A tibble: 6 x 23
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 p x s n t p f c n k e e
## 2 e x s y t a f c b k e c
## 3 e b s w t l f c b n e c
## 4 p x y w t p f c n n e e
## 5 e x s g f n f w b k t e
## 6 e x y y t a f c b n e c
## # ... with 11 more variables: X13 <chr>, X14 <chr>, X15 <chr>, X16 <chr>,
## # X17 <chr>, X18 <chr>, X19 <chr>, X20 <chr>, X21 <chr>, X22 <chr>,
## # X23 <chr>
# list the distinct codes found in a few of the raw columns
unique(shroomsdf[["X1"]])
## [1] "p" "e"
unique(shroomsdf[["X3"]])
## [1] "s" "y" "f" "g"
unique(shroomsdf[["X23"]])
## [1] "u" "g" "m" "d" "p" "w" "l"
# Give every column a descriptive name: X1..X23 map, in order, onto the
# labels listed below. rename() is called through the plyr namespace
# because dplyr masks plyr's version once both packages are attached.
shrooms <- plyr::rename(
  shroomsdf,
  setNames(
    c(
      "class",
      "cap_shape", "cap_surface", "cap_color",
      "bruises", "odor",
      "gill_attachment", "gill_spacing", "gill_size", "gill_color",
      "stalk_shape", "stalk_root",
      "stalk_surface_above_ring", "stalk_surface_below_ring",
      "stalk_color_above_ring", "stalk_color_below_ring",
      "veil_type", "veil_color",
      "ring_number", "ring_type",
      "spore_print_color", "population", "habitat"
    ),
    paste0("X", 1:23)
  )
)
# check that the descriptive column names are in place
head(shrooms, n = 6L)
## # A tibble: 6 x 23
## class cap_shape cap_surface cap_color bruises odor gill_attachment
## <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 p x s n t p f
## 2 e x s y t a f
## 3 e b s w t l f
## 4 p x y w t p f
## 5 e x s g f n f
## 6 e x y y t a f
## # ... with 16 more variables: gill_spacing <chr>, gill_size <chr>,
## # gill_color <chr>, stalk_shape <chr>, stalk_root <chr>,
## # stalk_surface_above_ring <chr>, stalk_surface_below_ring <chr>,
## # stalk_color_above_ring <chr>, stalk_color_below_ring <chr>,
## # veil_type <chr>, veil_color <chr>, ring_number <chr>, ring_type <chr>,
## # spore_print_color <chr>, population <chr>, habitat <chr>
# list the available columns before picking a subset
names(shrooms)
## [1] "class" "cap_shape"
## [3] "cap_surface" "cap_color"
## [5] "bruises" "odor"
## [7] "gill_attachment" "gill_spacing"
## [9] "gill_size" "gill_color"
## [11] "stalk_shape" "stalk_root"
## [13] "stalk_surface_above_ring" "stalk_surface_below_ring"
## [15] "stalk_color_above_ring" "stalk_color_below_ring"
## [17] "veil_type" "veil_color"
## [19] "ring_number" "ring_type"
## [21] "spore_print_color" "population"
## [23] "habitat"
# keep five of the 23 columns, then print the reduced table
shrooms_sm <- shrooms[
  ,
  c("class", "cap_shape", "bruises", "odor", "population")
]
shrooms_sm
## # A tibble: 8,124 x 5
## class cap_shape bruises odor population
## <chr> <chr> <chr> <chr> <chr>
## 1 p x t p s
## 2 e x t a n
## 3 e b t l n
## 4 p x t p s
## 5 e x f n a
## 6 e x t a n
## 7 e b t a n
## 8 e b t l s
## 9 p x t p v
## 10 e b t a s
## # ... with 8,114 more rows
# Replace the one-letter codes in each kept column with readable labels.
# mapvalues() is namespace-qualified, as plyr::rename() is earlier in the
# script, so these calls keep working if another attached package masks
# the name. In every call, `from` and `to` are matched position by position.

# unique(shrooms_sm$cap_shape)
# bell=b,conical=c,convex=x,flat=f,knobbed=k,sunken=s
shrooms_sm$cap_shape <- plyr::mapvalues(
  shrooms_sm$cap_shape,
  from = c("x", "b", "s", "f", "k", "c"),
  to = c("convex", "bell", "sunken", "flat", "knobbed", "conical")
)
# unique(shrooms_sm$cap_shape)

# unique(shrooms_sm$class)
shrooms_sm$class <- plyr::mapvalues(
  shrooms_sm$class,
  from = c("p", "e"),
  to = c("poisonous", "edible")
)

# unique(shrooms_sm$bruises)
shrooms_sm$bruises <- plyr::mapvalues(
  shrooms_sm$bruises,
  from = c("t", "f"),
  to = c("bruises", "no")
)

shrooms_sm$odor <- plyr::mapvalues(
  shrooms_sm$odor,
  from = c("a", "l", "c", "y", "f", "m", "n", "p", "s"),
  to = c(
    "almond", "anise", "creosote", "fishy", "foul",
    "musty", "none", "pungent", "spicy"
  )
)

shrooms_sm$population <- plyr::mapvalues(
  shrooms_sm$population,
  from = c("a", "c", "n", "s", "v", "y"),
  to = c(
    "abundant", "clustered", "numerous", "scattered",
    "several", "solitary"
  )
)
# unique(shrooms_sm$population)
# preview the relabelled subset
head(shrooms_sm, n = 6L)
## # A tibble: 6 x 5
## class cap_shape bruises odor population
## <chr> <chr> <chr> <chr> <chr>
## 1 poisonous convex bruises pungent scattered
## 2 edible convex bruises almond numerous
## 3 edible bell bruises anise numerous
## 4 poisonous convex bruises pungent scattered
## 5 edible convex no none abundant
## 6 edible convex bruises almond numerous