Data Set Title: Mushroom Database
Sources: (a) Mushroom records drawn from The Audubon Society Field Guide to North American Mushrooms (1981). G. H. Lincoff (Pres.), New York: Alfred A. Knopf (b) Donor: Jeff Schlimmer (c) Date: 27 April 1987 (d) Data Set: https://archive.ics.uci.edu/ml/datasets/Mushroom.
In the following R Markdown document I convert the single-letter codes in the Mushroom data set into their full attribute names.
The actual data set is found here: https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data
I chose the first six columns of the data set and transformed their values according to the data dictionary (https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.names).
I produced the conversion using three methods: the first is basic, whereas the second and third use the plyr package.
The method below is the least elegant, but it can be accomplished without a package:
# Method 1: recode the first six columns using base R only (no packages).
#
# The raw file is comma-separated with no header row. The single-letter codes
# are read as character strings so they can be overwritten in place and only
# converted to factors once every code has been replaced.
f <- file("https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data", open = "r")
# Close the connection as soon as the read finishes, even if it fails, so a
# download or parse error does not leave an open connection behind.
data <- tryCatch(
  read.table(f, sep = ",", header = FALSE, stringsAsFactors = FALSE),
  finally = close(f)
)
newdata <- data[1:6]
y <- data.frame(newdata)
colnames(y) <- c(
  "CLASS", "cap-shape", "cap-surface", "cap-color", "bruises?", "odor"
)

# The replacements below run one after another on the same column. That is
# safe only because no full label is itself a one-letter code, so a value
# that has already been replaced can never match a later comparison.

# CLASS
y$CLASS[y$CLASS == "p"] <- "poisonous"
y$CLASS[y$CLASS == "e"] <- "edible"
y$CLASS <- as.factor(y$CLASS)

# cap-shape
y$`cap-shape`[y$`cap-shape` == "b"] <- "bell"
y$`cap-shape`[y$`cap-shape` == "c"] <- "conical"
y$`cap-shape`[y$`cap-shape` == "x"] <- "convex"
y$`cap-shape`[y$`cap-shape` == "f"] <- "flat"
y$`cap-shape`[y$`cap-shape` == "k"] <- "knobbed"
# The data dictionary spells this level "sunken" (previously mistyped "suken").
y$`cap-shape`[y$`cap-shape` == "s"] <- "sunken"
y$`cap-shape` <- as.factor(y$`cap-shape`)

# cap-surface
y$`cap-surface`[y$`cap-surface` == "f"] <- "fibrous"
y$`cap-surface`[y$`cap-surface` == "g"] <- "grooves"
y$`cap-surface`[y$`cap-surface` == "y"] <- "scaly"
y$`cap-surface`[y$`cap-surface` == "s"] <- "smooth"
y$`cap-surface` <- as.factor(y$`cap-surface`)

# cap-color
y$`cap-color`[y$`cap-color` == "n"] <- "brown"
y$`cap-color`[y$`cap-color` == "b"] <- "buff"
y$`cap-color`[y$`cap-color` == "c"] <- "cinnamon"
y$`cap-color`[y$`cap-color` == "g"] <- "gray"
y$`cap-color`[y$`cap-color` == "r"] <- "green"
y$`cap-color`[y$`cap-color` == "p"] <- "pink"
y$`cap-color`[y$`cap-color` == "u"] <- "purple"
y$`cap-color`[y$`cap-color` == "e"] <- "red"
y$`cap-color`[y$`cap-color` == "w"] <- "white"
y$`cap-color`[y$`cap-color` == "y"] <- "yellow"
y$`cap-color` <- as.factor(y$`cap-color`)

# bruises?
y$`bruises?`[y$`bruises?` == "t"] <- "bruises"
y$`bruises?`[y$`bruises?` == "f"] <- "no"
y$`bruises?` <- as.factor(y$`bruises?`)

# odor
y$odor[y$odor == "a"] <- "almond"
y$odor[y$odor == "l"] <- "anise"
y$odor[y$odor == "c"] <- "creosote"
y$odor[y$odor == "y"] <- "fishy"
y$odor[y$odor == "f"] <- "foul"
y$odor[y$odor == "m"] <- "musty"
y$odor[y$odor == "n"] <- "none"
y$odor[y$odor == "p"] <- "pungent"
y$odor[y$odor == "s"] <- "spicy"
y$odor <- as.factor(y$odor)
The below is provided for verification of the variable conversion
summary(y)
## CLASS cap-shape cap-surface cap-color
## edible :4208 bell : 452 fibrous:2320 brown :2284
## poisonous:3916 conical: 4 grooves: 4 gray :1840
## convex :3656 scaly :3244 red :1500
## flat :3152 smooth :2556 yellow :1072
## knobbed: 828 white :1040
## suken : 32 buff : 168
## (Other): 220
## bruises? odor
## bruises:3376 none :3528
## no :4748 foul :2160
## fishy : 576
## spicy : 576
## almond : 400
## anise : 400
## (Other): 484
str(y)
## 'data.frame': 8124 obs. of 6 variables:
## $ CLASS : Factor w/ 2 levels "edible","poisonous": 2 1 1 2 1 1 1 1 2 1 ...
## $ cap-shape : Factor w/ 6 levels "bell","conical",..: 3 3 1 3 3 3 1 1 3 1 ...
## $ cap-surface: Factor w/ 4 levels "fibrous","grooves",..: 4 4 4 3 4 3 4 3 3 4 ...
## $ cap-color : Factor w/ 10 levels "brown","buff",..: 1 10 9 9 4 10 9 9 9 10 ...
## $ bruises? : Factor w/ 2 levels "bruises","no": 1 1 1 1 2 1 1 1 1 1 ...
## $ odor : Factor w/ 9 levels "almond","anise",..: 8 1 2 8 7 1 1 2 8 1 ...
head(y)
## CLASS cap-shape cap-surface cap-color bruises? odor
## 1 poisonous convex smooth brown bruises pungent
## 2 edible convex smooth yellow bruises almond
## 3 edible bell smooth white bruises anise
## 4 poisonous convex scaly white bruises pungent
## 5 edible convex smooth gray no none
## 6 edible convex scaly yellow bruises almond
The method below uses mapvalues from the plyr package, which I personally preferred:
# Method 2: recode the first six columns with plyr::mapvalues(), which takes
# the old codes and the new labels as two parallel vectors.
library(plyr)

# The raw file is comma-separated with no header row; codes are read as
# character strings and converted to factors after recoding.
f <- file("https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data", open = "r")
# Close the connection as soon as the read finishes, even if it fails, so a
# download or parse error does not leave an open connection behind.
data <- tryCatch(
  read.table(f, sep = ",", header = FALSE, stringsAsFactors = FALSE),
  finally = close(f)
)
newdata <- data[1:6]
t <- data.frame(newdata)
colnames(t) <- c(
  "CLASS", "cap-shape", "cap-surface", "cap-color", "bruises?", "odor"
)

t$CLASS <- as.factor(mapvalues(
  t$CLASS,
  c("p", "e"),
  c("poisonous", "edible")
))
# The data dictionary spells the "s" level "sunken" (previously mistyped
# "suken").
t$`cap-shape` <- as.factor(mapvalues(
  t$`cap-shape`,
  c("b", "c", "x", "f", "k", "s"),
  c("bell", "conical", "convex", "flat", "knobbed", "sunken")
))
t$`cap-surface` <- as.factor(mapvalues(
  t$`cap-surface`,
  c("f", "g", "y", "s"),
  c("fibrous", "grooves", "scaly", "smooth")
))
t$`cap-color` <- as.factor(mapvalues(
  t$`cap-color`,
  c("n", "b", "c", "g", "r", "p", "u", "e", "w", "y"),
  c("brown", "buff", "cinnamon", "gray", "green",
    "pink", "purple", "red", "white", "yellow")
))
t$`bruises?` <- as.factor(mapvalues(
  t$`bruises?`,
  c("t", "f"),
  c("bruises", "no")
))
t$odor <- as.factor(mapvalues(
  t$odor,
  c("a", "l", "c", "y", "f", "m", "n", "p", "s"),
  c("almond", "anise", "creosote", "fishy", "foul",
    "musty", "none", "pungent", "spicy")
))
The below is provided for verification of the variable conversion
summary(t)
## CLASS cap-shape cap-surface cap-color
## edible :4208 bell : 452 fibrous:2320 brown :2284
## poisonous:3916 conical: 4 grooves: 4 gray :1840
## convex :3656 scaly :3244 red :1500
## flat :3152 smooth :2556 yellow :1072
## knobbed: 828 white :1040
## suken : 32 buff : 168
## (Other): 220
## bruises? odor
## bruises:3376 none :3528
## no :4748 foul :2160
## fishy : 576
## spicy : 576
## almond : 400
## anise : 400
## (Other): 484
str(t)
## 'data.frame': 8124 obs. of 6 variables:
## $ CLASS : Factor w/ 2 levels "edible","poisonous": 2 1 1 2 1 1 1 1 2 1 ...
## $ cap-shape : Factor w/ 6 levels "bell","conical",..: 3 3 1 3 3 3 1 1 3 1 ...
## $ cap-surface: Factor w/ 4 levels "fibrous","grooves",..: 4 4 4 3 4 3 4 3 3 4 ...
## $ cap-color : Factor w/ 10 levels "brown","buff",..: 1 10 9 9 4 10 9 9 9 10 ...
## $ bruises? : Factor w/ 2 levels "bruises","no": 1 1 1 1 2 1 1 1 1 1 ...
## $ odor : Factor w/ 9 levels "almond","anise",..: 8 1 2 8 7 1 1 2 8 1 ...
head(t)
## CLASS cap-shape cap-surface cap-color bruises? odor
## 1 poisonous convex smooth brown bruises pungent
## 2 edible convex smooth yellow bruises almond
## 3 edible bell smooth white bruises anise
## 4 poisonous convex scaly white bruises pungent
## 5 edible convex smooth gray no none
## 6 edible convex scaly yellow bruises almond
The below method uses the revalue function from the plyr package and is the clearest conversion method
# Method 3: recode the first six columns with plyr::revalue(), which takes a
# single named vector of the form c("old code" = "new label").
library(plyr)

# The raw file is comma-separated with no header row; codes are read as
# character strings and converted to factors after recoding.
f <- file("https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data", open = "r")
# Close the connection as soon as the read finishes, even if it fails, so a
# download or parse error does not leave an open connection behind.
data <- tryCatch(
  read.table(f, sep = ",", header = FALSE, stringsAsFactors = FALSE),
  finally = close(f)
)
newdata <- data[1:6]
k <- data.frame(newdata)
colnames(k) <- c(
  "CLASS", "cap-shape", "cap-surface", "cap-color", "bruises?", "odor"
)

k$CLASS <- as.factor(revalue(
  k$CLASS,
  c("p" = "poisonous", "e" = "edible")
))
# The data dictionary spells the "s" level "sunken" (previously mistyped
# "suken").
k$`cap-shape` <- as.factor(revalue(
  k$`cap-shape`,
  c("b" = "bell", "c" = "conical", "x" = "convex",
    "f" = "flat", "k" = "knobbed", "s" = "sunken")
))
k$`cap-surface` <- as.factor(revalue(
  k$`cap-surface`,
  c("f" = "fibrous", "g" = "grooves", "y" = "scaly", "s" = "smooth")
))
k$`cap-color` <- as.factor(revalue(
  k$`cap-color`,
  c("n" = "brown", "b" = "buff", "c" = "cinnamon", "g" = "gray",
    "r" = "green", "p" = "pink", "u" = "purple", "e" = "red",
    "w" = "white", "y" = "yellow")
))
k$`bruises?` <- as.factor(revalue(
  k$`bruises?`,
  c("t" = "bruises", "f" = "no")
))
k$odor <- as.factor(revalue(
  k$odor,
  c("a" = "almond", "l" = "anise", "c" = "creosote", "y" = "fishy",
    "f" = "foul", "m" = "musty", "n" = "none", "p" = "pungent",
    "s" = "spicy")
))
The below is provided for verification of the variable conversion
summary(k)
## CLASS cap-shape cap-surface cap-color
## edible :4208 bell : 452 fibrous:2320 brown :2284
## poisonous:3916 conical: 4 grooves: 4 gray :1840
## convex :3656 scaly :3244 red :1500
## flat :3152 smooth :2556 yellow :1072
## knobbed: 828 white :1040
## suken : 32 buff : 168
## (Other): 220
## bruises? odor
## bruises:3376 none :3528
## no :4748 foul :2160
## fishy : 576
## spicy : 576
## almond : 400
## anise : 400
## (Other): 484
str(k)
## 'data.frame': 8124 obs. of 6 variables:
## $ CLASS : Factor w/ 2 levels "edible","poisonous": 2 1 1 2 1 1 1 1 2 1 ...
## $ cap-shape : Factor w/ 6 levels "bell","conical",..: 3 3 1 3 3 3 1 1 3 1 ...
## $ cap-surface: Factor w/ 4 levels "fibrous","grooves",..: 4 4 4 3 4 3 4 3 3 4 ...
## $ cap-color : Factor w/ 10 levels "brown","buff",..: 1 10 9 9 4 10 9 9 9 10 ...
## $ bruises? : Factor w/ 2 levels "bruises","no": 1 1 1 1 2 1 1 1 1 1 ...
## $ odor : Factor w/ 9 levels "almond","anise",..: 8 1 2 8 7 1 1 2 8 1 ...
head(k)
## CLASS cap-shape cap-surface cap-color bruises? odor
## 1 poisonous convex smooth brown bruises pungent
## 2 edible convex smooth yellow bruises almond
## 3 edible bell smooth white bruises anise
## 4 poisonous convex scaly white bruises pungent
## 5 edible convex smooth gray no none
## 6 edible convex scaly yellow bruises almond