library(readr)
library(ggplot2)

Mushrooms Data


The Mushrooms data is a well-known dataset in the data science community, which makes it a good dataset to use for comparative benchmarking. For example, if someone were working to build a better decision tree algorithm (or other predictive classifier) to analyze categorical data, this dataset could be useful. (https://archive.ics.uci.edu/ml/datasets/Mushroom)

Display original dataset columns.

# The UCI file has no header row. Without `col_names = FALSE`, read_csv()
# consumes the first observation as column names (losing one record and
# producing duplicated names) and guesses column types from the data, which
# turns the "t"/"f" codes into logicals and causes parsing failures.
# Every column is a one-letter categorical code, so read them all as character.
# NOTE: the output recorded further down in this document was produced before
# this change; re-run the document to refresh it.
mushrooms <- read_csv(
  "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data",
  col_names = FALSE,
  col_types = cols(.default = col_character())
)
## Warning: Duplicated column names deduplicated: 'p' => 'p_1' [6], 'n' =>
## 'n_1' [9], 'e' => 'e_1' [12], 's' => 's_1' [13], 's' => 's_2' [14], 'w' =>
## 'w_1' [16], 'p' => 'p_2' [17], 'w' => 'w_2' [18], 'p' => 'p_3' [20], 'k' =>
## 'k_1' [21], 's' => 's_3' [22]
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   t = col_logical(),
##   f = col_logical()
## )
## See spec(...) for full column specifications.
## Warning: 210 parsing failures.
##  row col           expected actual                                                                                       file
## 6038   f 1/0/T/F/TRUE/FALSE      a 'https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data'
## 6040   f 1/0/T/F/TRUE/FALSE      a 'https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data'
## 6375   f 1/0/T/F/TRUE/FALSE      a 'https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data'
## 6424   f 1/0/T/F/TRUE/FALSE      a 'https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data'
## 6434   f 1/0/T/F/TRUE/FALSE      a 'https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data'
## .... ... .................. ...... ..........................................................................................
## See problems(...) for more details.


1. Use the summary function to gain an overview of the data set. Then display attributes.

# Per-column overview of the raw import (length, class and mode, or value
# counts for columns that were parsed as logical).
summary(mushrooms)
##       p                  x                  s            
##  Length:8123        Length:8123        Length:8123       
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##       n                 t               p_1                f          
##  Length:8123        Mode :logical   Length:8123        Mode :logical  
##  Class :character   FALSE:4748      Class :character   FALSE:7913     
##  Mode  :character   TRUE :3375      Mode  :character   NA's :210      
##       c                 n_1                 k            
##  Length:8123        Length:8123        Length:8123       
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##       e                 e_1                s_1           
##  Length:8123        Length:8123        Length:8123       
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##      s_2                 w                 w_1           
##  Length:8123        Length:8123        Length:8123       
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##      p_2                w_2                 o            
##  Length:8123        Length:8123        Length:8123       
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##      p_3                k_1                s_3           
##  Length:8123        Length:8123        Length:8123       
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##       u            
##  Length:8123       
##  Class :character  
##  Mode  :character


Subsetting Data


2. Create a new data frame with a subset of the columns

# Select class, bruises, gill size and habitat. In the UCI attribute order
# these are columns 1, 5, 9 and 23; columns 2-4 hold cap shape, cap surface and
# cap colour, which do not match the names assigned in the next step.
subsetMushrooms <- mushrooms[c(1, 5, 9, 23)]


3. Create new column names for the new data frame.

# Give the four selected columns descriptive names, then render the result as
# an interactive table showing ten rows per page.
subsetMushrooms <- setNames(
  subsetMushrooms,
  c("class", "bruises", "gill-size", "habitat")
)
DT::datatable(subsetMushrooms, options = list(pageLength = 10))


4. Use the summary function to create an overview of your new data frame and print attributes.

# Per-column overview of the renamed four-column subset.
summary(subsetMushrooms)
##     class             bruises           gill-size        
##  Length:8123        Length:8123        Length:8123       
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##    habitat         
##  Length:8123       
##  Class :character  
##  Mode  :character


5. Replace abbreviations with meaningful names (e.g., e = edible).

# Keep the rows whose class is one of the two known codes and spell the codes
# out. `%in%` replaces the earlier `class == "e" | class <= "p"`, where `<=`
# was a string-ordering comparison rather than an equality test.
# Only the `class` column is recoded: assigning through `newdata == "e"` on the
# whole data frame would also overwrite any "e" or "p" found in the other
# columns, where those letters are unrelated codes.
newdata <- subset(subsetMushrooms, class %in% c("e", "p"))
newdata$class[newdata$class == "e"] <- "Edible"
newdata$class[newdata$class == "p"] <- "Poisonous"


6. See examples of all of steps 1-5 above.

# Render the relabelled data as an interactive table, ten rows per page.
DT::datatable(newdata, options = list(pageLength = 10))  


Plotting

library(tidyverse)
## -- Attaching packages --------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v tibble  2.0.1     v dplyr   0.7.8
## v tidyr   0.8.2     v stringr 1.3.1
## v purrr   0.2.5     v forcats 0.3.0
## -- Conflicts ------------------------------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
#library(ggplot2)
#library(dplyr)
#factorMushrooms <- mushrooms[c(1, 3, 9, 23)]
#factorMushrooms <- read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data")

# Rebuild the working subset for plotting from columns 1, 3, 9 and 23 of the
# raw import, naming them class, capsurface, gillsize and habitat.
subsetMushrooms <- setNames(
  mushrooms[c(1, 3, 9, 23)],
  c("class", "capsurface", "gillsize", "habitat")
)

# Overview of the subset while its columns are still character vectors.
summary(subsetMushrooms)
##     class            capsurface          gillsize        
##  Length:8123        Length:8123        Length:8123       
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##    habitat         
##  Length:8123       
##  Class :character  
##  Mode  :character
# Convert the four subset columns to factors so that summary() reports
# per-level counts and levels() can list the codes.
subsetMushrooms <- mutate(
  subsetMushrooms,
  class = as.factor(class),
  capsurface = as.factor(capsurface),
  gillsize = as.factor(gillsize),
  habitat = as.factor(habitat)
)

# Overview of the subset after the factor conversion.
summary(subsetMushrooms)
##  class    capsurface gillsize habitat 
##  e:4208   f:2320     b:5612   d:3148  
##  p:3915   g:   4     n:2511   g:2148  
##           s:2555              l: 832  
##           y:3244              m: 292  
##                               p:1144  
##                               u: 367  
##                               w: 192
# List the level codes of each factor column; the `##` lines are the recorded
# output of the call directly above them.
levels(subsetMushrooms$class)
## [1] "e" "p"
levels(subsetMushrooms$capsurface)
## [1] "f" "g" "s" "y"
levels(subsetMushrooms$gillsize)
## [1] "b" "n"
levels(subsetMushrooms$habitat)
## [1] "d" "g" "l" "m" "p" "u" "w"
# Set up factors
#df <- data.frame(a = factorMushrooms$class, factorMushrooms$bruises)

# Plot class against habitat. Both variables are categorical, so geom_point()
# would stack every observation on a handful of grid positions and hide how
# many mushrooms each combination contains; geom_count() sizes each point by
# the number of observations at that position.
ggplot(data = subsetMushrooms) +
  geom_count(mapping = aes(x = class, y = habitat))

#subsetMushrooms <- subsetMushrooms[c(1, 3, 9, 23)]

# library() stops with an error when the package is missing, whereas require()
# only returns FALSE and lets the script carry on without it.
library(data.table)
## Loading required package: data.table
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## The following object is masked from 'package:purrr':
## 
##     transpose
# Tabulate how many mushrooms fall into each combination of the four factors
# and show the table interactively, ten rows per page.
counts <- data.frame(table(subsetMushrooms))
DT::datatable(counts, options = list(pageLength = 10))

# plot() on a data frame draws a scatterplot matrix of its columns.
plot(subsetMushrooms)

# Spine plot of cap surface within each class, built from the observations.
# `counts` holds one row per level combination, so plotting its `class` and
# `capsurface` columns directly would ignore `Freq` and give every combination
# the same weight.
plot(
  subsetMushrooms$class, subsetMushrooms$capsurface,
  xlab = "class", ylab = "capsurface"
)


Please email kleber.perez@live.com with any suggestions.