library(plyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0     ✔ readr   1.1.1
## ✔ tibble  1.4.2     ✔ purrr   0.2.5
## ✔ tidyr   0.8.2     ✔ stringr 1.3.1
## ✔ ggplot2 3.1.0     ✔ forcats 0.3.0
## ── Conflicts ──────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::arrange()   masks plyr::arrange()
## ✖ purrr::compact()   masks plyr::compact()
## ✖ dplyr::count()     masks plyr::count()
## ✖ dplyr::failwith()  masks plyr::failwith()
## ✖ dplyr::filter()    masks stats::filter()
## ✖ dplyr::id()        masks plyr::id()
## ✖ dplyr::lag()       masks stats::lag()
## ✖ dplyr::mutate()    masks plyr::mutate()
## ✖ dplyr::rename()    masks plyr::rename()
## ✖ dplyr::summarise() masks plyr::summarise()
## ✖ dplyr::summarize() masks plyr::summarize()
# Read the UCI mushroom data straight from the repository. The file has no
# header row (col_names = FALSE), so readr names the columns X1, X2, ...
#
# Every field is a single-letter category code, so all columns are pinned to
# character rather than left to readr's type guessing. Depending on the readr
# version, a column that only holds "t"/"f" (such as the bruises column) may
# otherwise be guessed as logical, and the code -> label recoding further down
# (which matches on the strings "t" and "f") would then silently do nothing.
shroomsdf <- read_csv(
  paste0(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/",
    "mushroom/agaricus-lepiota.data"
  ),
  col_names = FALSE,
  col_types = cols(.default = col_character())
)
## Parsed with column specification:
## cols(
##   .default = col_character()
## )
## See spec(...) for full column specifications.
# Preview the first six rows of the raw import
head(shroomsdf, n = 6L)
## # A tibble: 6 x 23
##   X1    X2    X3    X4    X5    X6    X7    X8    X9    X10   X11   X12  
##   <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 p     x     s     n     t     p     f     c     n     k     e     e    
## 2 e     x     s     y     t     a     f     c     b     k     e     c    
## 3 e     b     s     w     t     l     f     c     b     n     e     c    
## 4 p     x     y     w     t     p     f     c     n     n     e     e    
## 5 e     x     s     g     f     n     f     w     b     k     t     e    
## 6 e     x     y     y     t     a     f     c     b     n     e     c    
## # ... with 11 more variables: X13 <chr>, X14 <chr>, X15 <chr>, X16 <chr>,
## #   X17 <chr>, X18 <chr>, X19 <chr>, X20 <chr>, X21 <chr>, X22 <chr>,
## #   X23 <chr>
# Inspect the distinct codes present in a few of the raw columns
unique(shroomsdf[["X1"]])
## [1] "p" "e"
unique(shroomsdf[["X3"]])
## [1] "s" "y" "f" "g"
unique(shroomsdf[["X23"]])
## [1] "u" "g" "m" "d" "p" "w" "l"
# Give every column a descriptive name. The labels are listed in file order,
# so pairing them with X1..X23 produces the old-name -> new-name lookup that
# plyr::rename() expects (plyr:: is spelled out because dplyr masks rename()).
shrooms <- plyr::rename(
  shroomsdf,
  setNames(
    c(
      "class",
      "cap_shape", "cap_surface", "cap_color",
      "bruises", "odor",
      "gill_attachment", "gill_spacing", "gill_size", "gill_color",
      "stalk_shape", "stalk_root",
      "stalk_surface_above_ring", "stalk_surface_below_ring",
      "stalk_color_above_ring", "stalk_color_below_ring",
      "veil_type", "veil_color",
      "ring_number", "ring_type",
      "spore_print_color", "population", "habitat"
    ),
    paste0("X", seq_len(23L))
  )
)

# Check that the new headers are in place
head(shrooms, n = 6L)
## # A tibble: 6 x 23
##   class cap_shape cap_surface cap_color bruises odor  gill_attachment
##   <chr> <chr>     <chr>       <chr>     <chr>   <chr> <chr>          
## 1 p     x         s           n         t       p     f              
## 2 e     x         s           y         t       a     f              
## 3 e     b         s           w         t       l     f              
## 4 p     x         y           w         t       p     f              
## 5 e     x         s           g         f       n     f              
## 6 e     x         y           y         t       a     f              
## # ... with 16 more variables: gill_spacing <chr>, gill_size <chr>,
## #   gill_color <chr>, stalk_shape <chr>, stalk_root <chr>,
## #   stalk_surface_above_ring <chr>, stalk_surface_below_ring <chr>,
## #   stalk_color_above_ring <chr>, stalk_color_below_ring <chr>,
## #   veil_type <chr>, veil_color <chr>, ring_number <chr>, ring_type <chr>,
## #   spore_print_color <chr>, population <chr>, habitat <chr>
# List the available column names before picking a subset
names(shrooms)
##  [1] "class"                    "cap_shape"               
##  [3] "cap_surface"              "cap_color"               
##  [5] "bruises"                  "odor"                    
##  [7] "gill_attachment"          "gill_spacing"            
##  [9] "gill_size"                "gill_color"              
## [11] "stalk_shape"              "stalk_root"              
## [13] "stalk_surface_above_ring" "stalk_surface_below_ring"
## [15] "stalk_color_above_ring"   "stalk_color_below_ring"  
## [17] "veil_type"                "veil_color"              
## [19] "ring_number"              "ring_type"               
## [21] "spore_print_color"        "population"              
## [23] "habitat"
# Keep only the five attributes that get recoded below, then display them
shrooms_sm <- shrooms[
  ,
  c("class", "cap_shape", "bruises", "odor", "population")
]
print(shrooms_sm)
## # A tibble: 8,124 x 5
##    class cap_shape bruises odor  population
##    <chr> <chr>     <chr>   <chr> <chr>     
##  1 p     x         t       p     s         
##  2 e     x         t       a     n         
##  3 e     b         t       l     n         
##  4 p     x         t       p     s         
##  5 e     x         f       n     a         
##  6 e     x         t       a     n         
##  7 e     b         t       a     n         
##  8 e     b         t       l     s         
##  9 p     x         t       p     v         
## 10 e     b         t       a     s         
## # ... with 8,114 more rows
# Replace the single-letter codes with readable labels.
# Each entry of code_tables maps one column to its code -> label table
# (names are the codes, values are the labels). The tables are applied one
# column at a time, in the order listed, through plyr::mapvalues().
shrooms_sm <- local({
  code_tables <- list(
    # bell=b, conical=c, convex=x, flat=f, knobbed=k, sunken=s
    cap_shape = c(
      x = "convex", b = "bell", s = "sunken",
      f = "flat", k = "knobbed", c = "conical"
    ),
    class = c(p = "poisonous", e = "edible"),
    bruises = c(t = "bruises", f = "no"),
    odor = c(
      a = "almond", l = "anise", c = "creosote", y = "fishy", f = "foul",
      m = "musty", n = "none", p = "pungent", s = "spicy"
    ),
    population = c(
      a = "abundant", c = "clustered", n = "numerous",
      s = "scattered", v = "several", y = "solitary"
    )
  )

  decoded <- shrooms_sm
  for (column in names(code_tables)) {
    lookup <- code_tables[[column]]
    decoded[[column]] <- plyr::mapvalues(
      decoded[[column]],
      from = names(lookup),
      to = unname(lookup)
    )
  }
  decoded
})

# Preview the recoded table
head(shrooms_sm, n = 6L)
## # A tibble: 6 x 5
##   class     cap_shape bruises odor    population
##   <chr>     <chr>     <chr>   <chr>   <chr>     
## 1 poisonous convex    bruises pungent scattered 
## 2 edible    convex    bruises almond  numerous  
## 3 edible    bell      bruises anise   numerous  
## 4 poisonous convex    bruises pungent scattered 
## 5 edible    convex    no      none    abundant  
## 6 edible    convex    bruises almond  numerous