DATA 607 Assignment 1

Loading Libraries

library(data.table)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)
library(readr)

Read Mushroom Data Set

From the Data Dictionary: This data set includes descriptions of hypothetical samples corresponding to 23 species of gilled mushrooms in the Agaricus and Lepiota Family (pp. 500-525). Each species is identified as definitely edible, definitely poisonous, or of unknown edibility and not recommended. This latter class was combined with the poisonous one. The Guide clearly states that there is no simple rule for determining the edibility of a mushroom; no rule like "leaflets three, let it be" for Poisonous Oak and Ivy.

mushroomURL <- "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"

# Read the mushroom data as a comma-separated file with no header row.
# read.csv() already returns a data.frame, so no extra conversion is needed.
# stringsAsFactors = FALSE keeps text columns as character rather than factor
# (this is only the default since R 4.0); the code-to-label replacement later
# in this file assigns new strings into these columns.
mushroomData <- read.csv(
  mushroomURL,
  header = FALSE,
  sep = ",",
  stringsAsFactors = FALSE
)

Let’s take a look at it.

# Sanity-check the shape of the freshly read data: 23 columns and 8124 rows.
ncol(mushroomData)
## [1] 23
nrow(mushroomData)
## [1] 8124
# Preview the first rows. The file was read without a header row, so the
# columns carry the default names V1..V23 and hold single-letter codes.
head(mushroomData)
##   V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V20 V21
## 1  p  x  s  n  t  p  f  c  n   k   e   e   s   s   w   w   p   w   o   p   k
## 2  e  x  s  y  t  a  f  c  b   k   e   c   s   s   w   w   p   w   o   p   n
## 3  e  b  s  w  t  l  f  c  b   n   e   c   s   s   w   w   p   w   o   p   n
## 4  p  x  y  w  t  p  f  c  n   n   e   e   s   s   w   w   p   w   o   p   k
## 5  e  x  s  g  f  n  f  w  b   k   t   e   s   s   w   w   p   w   o   e   n
## 6  e  x  y  y  t  a  f  c  b   n   e   c   s   s   w   w   p   w   o   p   k
##   V22 V23
## 1   s   u
## 2   n   g
## 3   n   m
## 4   s   u
## 5   a   g
## 6   n   g
# dim() reports rows and columns together (same figures as nrow/ncol above).
dim(mushroomData)
## [1] 8124   23
# Frequency of each code in the first column, which holds two values: e and p.
table(mushroomData$V1)
## 
##    e    p 
## 4208 3916
# Distinct codes in the first column, in order of first appearance.
unique(mushroomData$V1)
## [1] "p" "e"

Creation of a vector to store all header information

# Descriptive names for the 23 columns, listed in the same order as the
# columns of the raw data file.
headers <- c(
  "edib-or-poison",
  "cap-shape",
  "cap-surface",
  "cap-color",
  "bruises?",
  "odor",
  "gill-attachment",
  "gill-spacing",
  "gill-size",
  "gill-color",
  "stalk-shape",
  "stalk-root",
  "stalk-surface-above-ring",
  "stalk-surface-below-ring",
  "stalk-color-above-ring",
  "stalk-color-below-ring",
  "veil-type",
  "veil-color",
  "ring-number",
  "ring-type",
  "spore-print-color",
  "population",
  "habitat"
)

Create data frame to hold all relabeling information

# Lookup table that maps single-letter codes to readable labels.
# Each line of the matrix input is one triple: column name, code, label.
# Converting the unnamed three-column matrix to a data frame yields the
# default column names X1 (column name), X2 (code) and X3 (label).
relabels <- data.frame(
  matrix(
    c(
      "edib-or-poison", "e", "edible",
      "edib-or-poison", "p", "poisonous",
      "odor", "a", "almond",
      "odor", "l", "anise",
      "odor", "c", "creosote",
      "odor", "y", "fishy",
      "odor", "f", "foul",
      "odor", "m", "musty",
      "odor", "n", "none",
      "odor", "p", "pungent",
      "odor", "s", "spicy",
      "cap-color", "n", "brown",
      "cap-color", "b", "buff",
      "cap-color", "c", "cinnamon",
      "cap-color", "g", "gray",
      "cap-color", "r", "green",
      "cap-color", "p", "pink",
      "cap-color", "u", "purple",
      "cap-color", "e", "red",
      "cap-color", "w", "white",
      "cap-color", "y", "yellow",
      "population", "a", "abundant",
      "population", "c", "clustered",
      "population", "n", "numerous",
      "population", "s", "scattered",
      "population", "v", "several",
      "population", "y", "solitary",
      "habitat", "g", "grasses",
      "habitat", "l", "leaves",
      "habitat", "m", "meadows",
      "habitat", "p", "paths",
      "habitat", "u", "urban",
      "habitat", "w", "waste",
      "habitat", "d", "woods"
    ),
    ncol = 3,
    byrow = TRUE
  ),
  stringsAsFactors = FALSE
)

Added headers to mushroom data

# Rename the leading columns of mushroomData with the descriptive names in
# `headers`, in one vectorized assignment. seq_along() yields an empty index
# when `headers` is empty, whereas 1:length(headers) would iterate over
# c(1, 0).
names(mushroomData)[seq_along(headers)] <- headers

Subset the mushroom dataset to include 5 columns, adding population and habitat.

# Keep only the five columns of interest, in this order; drop = FALSE makes
# sure the result stays a data frame.
mushroomData <- mushroomData[
  ,
  c("edib-or-poison", "odor", "cap-color", "population", "habitat"),
  drop = FALSE
]

Re-label values in the mushroom dataset

Creation of a loop to update the data based on its corresponding values.

# Walk through the lookup table one row at a time and, in the column named by
# X1, overwrite every value equal to the code in X2 with the label in X3.
# seq_len(nrow(...)) gives an empty sequence for an empty table, whereas
# 1:length(...) would iterate over c(1, 0).
for (i in seq_len(nrow(relabels))) {
  column <- relabels$X1[i]
  code <- relabels$X2[i]
  label <- relabels$X3[i]

  # which() keeps only the TRUE positions, so NA comparisons are skipped.
  hits <- which(mushroomData[[column]] == code)
  mushroomData[[column]][hits] <- label
}