mushrooms

Load the data (in CSV format) from the URL. Print summary and check data.

library(entropy)
library(data.table)
library(datasets)
# Download the UCI mushroom data set. The file has no header row, so the
# columns are auto-named V1..V23.
# stringsAsFactors = TRUE is passed explicitly: since R 4.0.0 read.csv() keeps
# strings as character by default, but the factor summary printed below and
# the later relabelling of the selected columns both expect factor columns.
# Note: "?" is kept as an ordinary level (see V12 below), not converted to NA.
mdata <- read.csv(
  url("https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"),
  header = FALSE,
  stringsAsFactors = TRUE
)
str(mdata)

## 'data.frame':    8124 obs. of  23 variables:
##  $ V1 : Factor w/ 2 levels "e","p": 2 1 1 2 1 1 1 1 2 1 ...
##  $ V2 : Factor w/ 6 levels "b","c","f","k",..: 6 6 1 6 6 6 1 1 6 1 ...
##  $ V3 : Factor w/ 4 levels "f","g","s","y": 3 3 3 4 3 4 3 4 4 3 ...
##  $ V4 : Factor w/ 10 levels "b","c","e","g",..: 5 10 9 9 4 10 9 9 9 10 ...
##  $ V5 : Factor w/ 2 levels "f","t": 2 2 2 2 1 2 2 2 2 2 ...
##  $ V6 : Factor w/ 9 levels "a","c","f","l",..: 7 1 4 7 6 1 1 4 7 1 ...
##  $ V7 : Factor w/ 2 levels "a","f": 2 2 2 2 2 2 2 2 2 2 ...
##  $ V8 : Factor w/ 2 levels "c","w": 1 1 1 1 2 1 1 1 1 1 ...
##  $ V9 : Factor w/ 2 levels "b","n": 2 1 1 2 1 1 1 1 2 1 ...
##  $ V10: Factor w/ 12 levels "b","e","g","h",..: 5 5 6 6 5 6 3 6 8 3 ...
##  $ V11: Factor w/ 2 levels "e","t": 1 1 1 1 2 1 1 1 1 1 ...
##  $ V12: Factor w/ 5 levels "?","b","c","e",..: 4 3 3 4 4 3 3 3 4 3 ...
##  $ V13: Factor w/ 4 levels "f","k","s","y": 3 3 3 3 3 3 3 3 3 3 ...
##  $ V14: Factor w/ 4 levels "f","k","s","y": 3 3 3 3 3 3 3 3 3 3 ...
##  $ V15: Factor w/ 9 levels "b","c","e","g",..: 8 8 8 8 8 8 8 8 8 8 ...
##  $ V16: Factor w/ 9 levels "b","c","e","g",..: 8 8 8 8 8 8 8 8 8 8 ...
##  $ V17: Factor w/ 1 level "p": 1 1 1 1 1 1 1 1 1 1 ...
##  $ V18: Factor w/ 4 levels "n","o","w","y": 3 3 3 3 3 3 3 3 3 3 ...
##  $ V19: Factor w/ 3 levels "n","o","t": 2 2 2 2 2 2 2 2 2 2 ...
##  $ V20: Factor w/ 5 levels "e","f","l","n",..: 5 5 5 5 1 5 5 5 5 5 ...
##  $ V21: Factor w/ 9 levels "b","h","k","n",..: 3 4 4 3 4 3 3 4 3 3 ...
##  $ V22: Factor w/ 6 levels "a","c","n","s",..: 4 3 3 4 1 3 3 4 5 4 ...
##  $ V23: Factor w/ 7 levels "d","g","l","m",..: 6 2 4 6 2 2 4 4 2 4 ...

The description in the file agaricus-lepiota.names gives disjunctive rules for identifying poisonous mushrooms; the features used in those rules are taken here as the most important ones. (Alternatively, a relative comparison of feature entropies would yield similar conclusions.) Select columns based on the description in that file, and give descriptive names to the selected columns (features).

Create a subset of the original dataframe that includes only the columns selected.

# Keep the class label (V1) plus the four selected features:
# V6 odor, V21 spore print color, V14 stalk surface below ring, V23 habitat.
df <- mdata[, c(1, 6, 21, 14, 23)]
colnames(df) <- c(
  "Edible", "Odor", "Spore-print-color",
  "Stalk-surface-below-ring", "Habitat"
)

# Explicit code -> label tables, one per column. Mapping by code (instead of
# assigning levels() positionally) does not depend on the alphabetical order
# of the raw codes and works whether the columns were read as factors or as
# character vectors. The entries are listed in sorted-code order so the
# resulting level order is the same as with the positional assignment.
level_labels <- list(
  "Edible" = c(e = "Edible", p = "Poisonous"),
  "Odor" = c(
    a = "almond", c = "creosote", f = "foul", l = "anise", m = "musty",
    n = "none", p = "pungent", s = "spicy", y = "fishy"
  ),
  "Spore-print-color" = c(
    b = "buff", h = "chocolate", k = "black", n = "brown", o = "orange",
    r = "green", u = "purple", w = "white", y = "yellow"
  ),
  "Stalk-surface-below-ring" = c(
    f = "fibrous", k = "silky", s = "smooth", y = "scaly"
  ),
  "Habitat" = c(
    d = "woods", g = "grasses", l = "leaves", m = "meadows", p = "paths",
    u = "urban", w = "waste"
  )
)

df[names(level_labels)] <- Map(
  function(values, labels) {
    relabelled <- factor(
      values,
      levels = names(labels),
      labels = unname(labels)
    )
    # A code missing from the table would silently turn into NA; fail instead.
    stopifnot(identical(is.na(relabelled), is.na(values)))
    relabelled
  },
  df[names(level_labels)],
  level_labels
)

str(df)

## 'data.frame':    8124 obs. of  5 variables:
##  $ Edible                  : Factor w/ 2 levels "Edible","Poisonous": 2 1 1 2 1 1 1 1 2 1 ...
##  $ Odor                    : Factor w/ 9 levels "almond","creosote",..: 7 1 4 7 6 1 1 4 7 1 ...
##  $ Spore-print-color       : Factor w/ 9 levels "buff","chocolate",..: 3 4 4 3 4 3 3 4 3 3 ...
##  $ Stalk-surface-below-ring: Factor w/ 4 levels "fibrous","silky",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ Habitat                 : Factor w/ 7 levels "woods","grasses",..: 6 2 4 6 2 2 4 4 2 4 ...

# Preview the first six rows of the relabelled subset.
head(df, n = 6L)

##      Edible    Odor Spore-print-color Stalk-surface-below-ring Habitat
## 1 Poisonous pungent             black                   smooth   urban
## 2    Edible  almond             brown                   smooth grasses
## 3    Edible   anise             brown                   smooth meadows
## 4 Poisonous pungent             black                   smooth   urban
## 5    Edible    none             brown                   smooth grasses
## 6    Edible  almond             black                   smooth grasses

mushrooms

Vikas Sinha

September 3, 2017