library(repmis)
library(pander)
library(stringr)
## Load the UCI mushroom dataset (agaricus-lepiota.data).
## The file has no header row: every line, including the first, is a record of
## one-letter codes. With the default header handling the first record is taken
## as the column names and silently dropped from the data, so the header is
## switched off explicitly. Columns are addressed by position afterwards.
mushrooms <- source_data(
  "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data",
  header = FALSE
)
## Downloading data from: https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data
## SHA-1 hash of the downloaded data file is:
## 7277e7bc03888ea5684b2281a62e0a9350caa00c
## Print the first rows of the raw data as a grid-style table.
## pander selects the table format through `style`; `type` is not one of its
## arguments, so the requested grid format was never applied.
pander(head(mushrooms), style = "grid", caption = "Sample of Original Dataset")
Sample of Original Dataset
| e |
x |
s |
y |
t |
a |
f |
c |
b |
k |
e |
c |
s |
s |
w |
w |
p |
w |
o |
p |
n |
n |
g |
| e |
b |
s |
w |
t |
l |
f |
c |
b |
n |
e |
c |
s |
s |
w |
w |
p |
w |
o |
p |
n |
n |
m |
| p |
x |
y |
w |
t |
p |
f |
c |
n |
n |
e |
e |
s |
s |
w |
w |
p |
w |
o |
p |
k |
s |
u |
| e |
x |
s |
g |
f |
n |
f |
w |
b |
k |
t |
e |
s |
s |
w |
w |
p |
w |
o |
e |
n |
a |
g |
| e |
x |
y |
y |
t |
a |
f |
c |
b |
n |
e |
c |
s |
s |
w |
w |
p |
w |
o |
p |
k |
n |
g |
| e |
b |
s |
w |
t |
a |
f |
c |
b |
g |
e |
c |
s |
s |
w |
w |
p |
w |
o |
p |
k |
n |
m |
## Keep five columns of the raw data, picked by position, and give them
## readable names.
sub <- mushrooms[, c(1, 4, 6, 18, 23)]
colnames(sub) <- c("Poisonous", "cap_color", "Odor", "Veil_color", "Habitat")

## Spell out the one-letter class codes. Every value is looked up in `haves`
## in a single pass; values that match no code are left untouched.
haves <- c('e', 'p')
wants <- c('edible', 'poisonous')
code_pos <- match(sub[["Poisonous"]], haves)
known <- !is.na(code_pos)
sub[["Poisonous"]][known] <- wants[code_pos[known]]
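## I tried to make this a function, and believe I was very close, but could not make it work.
## The cause is not really a global/local problem: R changes a local copy of `data` inside the
## function, and because the for loop is the last expression the function returns NULL, so the
## changes are lost. Returning `data` as the last line and assigning the result, e.g.
## sub <- convert(sub, "Poisonous"), makes it work.
##convert <- function ( data, col, have_vector = haves, want_vector = wants){
## len <- length(have_vector)
## for (i in 1:len){
## data[,col][data[, col] == have_vector[i]] <- want_vector[i]
## }
## data
##}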
## Cap colour: replace the one-letter codes with colour names.
## The mapping is pasted from the data dictionary as "name=code" pairs. A single
## regular expression with two capture groups extracts both halves of every
## pair, so the codes (haves) and the names (wants) come from the same text and
## cannot get out of step with each other.
string <- "brown=n,buff=b,cinnamon=c,gray=g,green=r, pink=p,purple=u,red=e,white=w,yellow=y"
dict <- str_match_all(string, pattern = "([[:alpha:]]{2,16})=([[:alpha:]])")[[1]]
wants <- dict[, 2]  # first capture group: the full name
haves <- dict[, 3]  # second capture group: the one-letter code
## Look every value up once; values that match no code are left untouched.
code_pos <- match(sub[["cap_color"]], haves)
known <- !is.na(code_pos)
sub[["cap_color"]][known] <- wants[code_pos[known]]
## Odor: replace the one-letter codes with odor names.
## Codes (haves) and names (wants) are both extracted from the "name=code"
## pairs of the data-dictionary text, so the two vectors always line up.
string <- "almond=a,anise=l,creosote=c,fishy=y,foul=f, musty=m,none=n,pungent=p,spicy=s"
dict <- str_match_all(string, pattern = "([[:alpha:]]{2,16})=([[:alpha:]])")[[1]]
wants <- dict[, 2]  # first capture group: the full name
haves <- dict[, 3]  # second capture group: the one-letter code
## Look every value up once; values that match no code are left untouched.
code_pos <- match(sub[["Odor"]], haves)
known <- !is.na(code_pos)
sub[["Odor"]][known] <- wants[code_pos[known]]
## Veil colour: replace the one-letter codes with colour names.
## Codes (haves) and names (wants) are both extracted from the "name=code"
## pairs of the data-dictionary text, so the two vectors always line up.
string <- "brown=n,orange=o,white=w,yellow=y"
dict <- str_match_all(string, pattern = "([[:alpha:]]{2,16})=([[:alpha:]])")[[1]]
wants <- dict[, 2]  # first capture group: the full name
haves <- dict[, 3]  # second capture group: the one-letter code
## Look every value up once; values that match no code are left untouched.
code_pos <- match(sub[["Veil_color"]], haves)
known <- !is.na(code_pos)
sub[["Veil_color"]][known] <- wants[code_pos[known]]
## Habitat: replace the one-letter codes with habitat names.
## The pasted dictionary text starts with the label "habitat:". Extracting
## every word of two or more letters picked that label up as the first name,
## which shifted the whole mapping by one (g -> "habitat", l -> "grasses",
## m -> "leaves", ...). Matching only complete "name=code" pairs skips the
## label, because it is followed by ":" rather than "=", and keeps the codes
## (haves) and names (wants) aligned.
string <- "habitat: grasses=g,leaves=l,meadows=m,paths=p, urban=u,waste=w,woods=d"
dict <- str_match_all(string, pattern = "([[:alpha:]]{2,16})=([[:alpha:]])")[[1]]
wants <- dict[, 2]  # first capture group: the full name
haves <- dict[, 3]  # second capture group: the one-letter code
## Look every value up once; values that match no code are left untouched.
code_pos <- match(sub[["Habitat"]], haves)
known <- !is.na(code_pos)
sub[["Habitat"]][known] <- wants[code_pos[known]]
## Print the first 20 recoded rows as a grid-style table.
## pander selects the table format through `style`; `type` is not one of its
## arguments, so the requested grid format was never applied.
pander(head(sub, 20), style = "grid", caption = "Sample of Transformed Dataset")
Sample of Transformed Dataset
| edible |
yellow |
almond |
white |
habitat |
| edible |
white |
anise |
white |
leaves |
| poisonous |
white |
pungent |
white |
paths |
| edible |
gray |
none |
white |
habitat |
| edible |
yellow |
almond |
white |
habitat |
| edible |
white |
almond |
white |
leaves |
| edible |
white |
anise |
white |
leaves |
| poisonous |
white |
pungent |
white |
habitat |
| edible |
yellow |
almond |
white |
leaves |
| edible |
yellow |
anise |
white |
habitat |
| edible |
yellow |
almond |
white |
leaves |
| edible |
yellow |
almond |
white |
habitat |
| poisonous |
white |
pungent |
white |
paths |
| edible |
brown |
none |
white |
habitat |
| edible |
gray |
none |
white |
paths |
| edible |
white |
none |
white |
habitat |
| poisonous |
brown |
pungent |
white |
habitat |
| poisonous |
white |
pungent |
white |
paths |
| poisonous |
brown |
pungent |
white |
paths |
| edible |
yellow |
almond |
white |
leaves |