The habitat and cap-color columns were picked because, taken together, they predict the edible/poisonous class with 100% accuracy when habitat is leaves and cap-color is white.
The spore-print-color column was picked because it predicts the class with 99.41% accuracy when its value is green.
require(plyr)
## Loading required package: plyr
## Warning: package 'plyr' was built under R version 3.3.1
require(curl)
## Loading required package: curl
## Warning: package 'curl' was built under R version 3.3.1
# Fetch the comma-separated mushroom data from GitHub through a curl
# connection and parse it into a data frame.
df <- read.table(
  curl("https://raw.githubusercontent.com/sjv1030/Data607-Lab1/master/agaricus-lepiota.data"),
  sep = ","
)
# Report how many rows and columns were read.
c(nrow(df), ncol(df))
## [1] 8124 23
# Keep only the class column (V1) for the rows coded "e" and count them.
# which() discards NA comparisons, the same way subset() treats them.
df_e <- df[which(df[["V1"]] == "e"), 1, drop = FALSE]
nrow(df_e)
## [1] 4208
# Keep only the class column (V1) for the rows coded "p" and count them.
# which() discards NA comparisons, the same way subset() treats them.
df_p <- df[which(df[["V1"]] == "p"), 1, drop = FALSE]
nrow(df_p)
## [1] 3916
# Pull out columns 1, 21, 23 and 4 of the raw data (in that order) and give
# them descriptive names.
ans <- df[, c(1, 21, 23, 4)]
names(ans) <- c("e_p", "spore_print_color", "habitat", "cap_color")

# Expand the one-letter codes of each kept column into readable labels.
# Every `from` code lines up positionally with the label in `to`.
ans[["e_p"]] <- mapvalues(
  ans[["e_p"]],
  from = c("e", "p"),
  to = c("edible", "poisonous")
)

ans[["spore_print_color"]] <- mapvalues(
  ans[["spore_print_color"]],
  from = c("k", "n", "b", "h", "r", "o", "u", "w", "y"),
  to = c(
    "black", "brown", "buff", "chocolate", "green",
    "orange", "purple", "white", "yellow"
  )
)

ans[["habitat"]] <- mapvalues(
  ans[["habitat"]],
  from = c("g", "l", "m", "p", "u", "w", "d"),
  to = c("grasses", "leaves", "meadows", "path", "urban", "waste", "woods")
)

ans[["cap_color"]] <- mapvalues(
  ans[["cap_color"]],
  from = c("n", "b", "r", "u", "w", "y", "c", "g", "p", "e"),
  to = c(
    "brown", "buff", "green", "purple", "white",
    "yellow", "cinnamon", "gray", "pink", "red"
  )
)
Top of dataset:
head(ans, n = 6L) # print the first six rows of the recoded data
## e_p spore_print_color habitat cap_color
## 1 poisonous black urban brown
## 2 edible brown grasses yellow
## 3 edible brown meadows white
## 4 poisonous black urban white
## 5 edible brown grasses gray
## 6 edible black grasses yellow
Bottom of dataset:
tail(ans, n = 6L) # print the last six rows of the recoded data
## e_p spore_print_color habitat cap_color
## 8119 poisonous white woods brown
## 8120 edible buff leaves brown
## 8121 edible buff leaves brown
## 8122 edible buff leaves brown
## 8123 poisonous white leaves brown
## 8124 edible orange leaves brown