---
title: "Wk1 Hui Gracie Han 607 Mushroom Data Explortn"
author: "Hui (Gracie) Han"
date: "September 2, 2018"
output: html_document
---
Load data
# Read the raw mushroom records straight from the UCI repository. The file
# has no header row, so read.table() assigns the default names V1, V2, ...
MushroomSite <- paste0(
  "https://archive.ics.uci.edu/ml/machine-learning-databases/",
  "mushroom/agaricus-lepiota.data"
)
MushroomData <- read.table(MushroomSite, header = FALSE, sep = ",")
Explore the dataset (dimensions, column names), then compare it with the UCI description of the mushroom data
# Size of the raw data: number of rows, then number of columns.
dim(MushroomData)
## [1] 8124 23
# Preview the first six records.
head(MushroomData, n = 6L)
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V20
## 1 p x s n t p f c n k e e s s w w p w o p
## 2 e x s y t a f c b k e c s s w w p w o p
## 3 e b s w t l f c b n e c s s w w p w o p
## 4 p x y w t p f c n n e e s s w w p w o p
## 5 e x s g f n f w b k t e s s w w p w o e
## 6 e x y y t a f c b n e c s s w w p w o p
## V21 V22 V23
## 1 k s u
## 2 n n g
## 3 n n m
## 4 k s u
## 5 n a g
## 6 k n g
# Column names that read.table() generated for the header-less file.
names(MushroomData)
## [1] "V1" "V2" "V3" "V4" "V5" "V6" "V7" "V8" "V9" "V10" "V11"
## [12] "V12" "V13" "V14" "V15" "V16" "V17" "V18" "V19" "V20" "V21" "V22"
## [23] "V23"
Give the data frame a shorter name, so that the code that follows is easier to write
# Short alias for the raw data frame, used by the exploration steps below.
M <- MushroomData
# Confirm the alias holds the same records by previewing its first six rows.
head(M, n = 6L)
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V20
## 1 p x s n t p f c n k e e s s w w p w o p
## 2 e x s y t a f c b k e c s s w w p w o p
## 3 e b s w t l f c b n e c s s w w p w o p
## 4 p x y w t p f c n n e e s s w w p w o p
## 5 e x s g f n f w b k t e s s w w p w o e
## 6 e x y y t a f c b n e c s s w w p w o p
## V21 V22 V23
## 1 k s u
## 2 n n g
## 3 n n m
## 4 k s u
## 5 n a g
## 6 k n g
Find the distinct values in each column, so that they are easy to eyeball and compare with the information on the UCI website
# Frequency of every distinct code, one column at a time. The calls are kept
# separate (rather than looped) so each result can be read next to its column.
table(M[["V1"]])
##
## e p
## 4208 3916
table(M[["V2"]])
##
## b c f k s x
## 452 4 3152 828 32 3656
table(M[["V3"]])
##
## f g s y
## 2320 4 2556 3244
table(M[["V4"]])
##
## b c e g n p r u w y
## 168 44 1500 1840 2284 144 16 16 1040 1072
table(M[["V5"]])
##
## f t
## 4748 3376
table(M[["V6"]])
##
## a c f l m n p s y
## 400 192 2160 400 36 3528 256 576 576
table(M[["V7"]])
##
## a f
## 210 7914
table(M[["V8"]])
##
## c w
## 6812 1312
table(M[["V9"]])
##
## b n
## 5612 2512
table(M[["V10"]])
##
## b e g h k n o p r u w y
## 1728 96 752 732 408 1048 64 1492 24 492 1202 86
table(M[["V11"]])
##
## e t
## 3516 4608
# Note: V12 contains a "?" code (2480 rows) alongside the letter codes.
table(M[["V12"]])
##
## ? b c e r
## 2480 3776 556 1120 192
table(M[["V13"]])
##
## f k s y
## 552 2372 5176 24
table(M[["V14"]])
##
## f k s y
## 600 2304 4936 284
table(M[["V15"]])
##
## b c e g n o p w y
## 432 36 96 576 448 192 1872 4464 8
table(M[["V16"]])
##
## b c e g n o p w y
## 432 36 96 576 512 192 1872 4384 24
# Note: V17 holds the single value "p" in all 8124 rows.
table(M[["V17"]])
##
## p
## 8124
table(M[["V18"]])
##
## n o w y
## 96 96 7924 8
table(M[["V19"]])
##
## n o t
## 36 7488 600
table(M[["V20"]])
##
## e f l n p
## 2776 48 1296 36 3968
table(M[["V21"]])
##
## b h k n o r u w y
## 48 1632 1872 1968 48 72 48 2388 48
table(M[["V22"]])
##
## a c n s v y
## 384 340 400 1248 4040 1712
table(M[["V23"]])
##
## d g l m p u w
## 3148 2148 832 292 1144 368 192
Subset the data using base R bracket indexing, to get the columns of interest
# Keep the first ten columns (V1-V10) and the last three (V21-V23); the
# column names are generated instead of being typed out one by one.
Msubset <- M[, paste0("V", c(1:10, 21:23))]
Rename the columns of interest above
# Replace the generated V* names with descriptive ones. The names are listed
# in the same order as the columns were selected (V1-V10, then V21-V23).
colnames(Msubset) <- c(
  "edibility",
  "capShape",
  "capSurface",
  "capColor",
  "bruises",
  "odor",
  "gillAttachment",
  "gillSpacing",
  "gillSize",
  "gillColor",
  "sporePrintColor",
  "population",
  "habitat"
)
# Preview the renamed subset.
head(Msubset, n = 6L)
## edibility capShape capSurface capColor bruises odor gillAttachment
## 1 p x s n t p f
## 2 e x s y t a f
## 3 e b s w t l f
## 4 p x y w t p f
## 5 e x s g f n f
## 6 e x y y t a f
## gillSpacing gillSize gillColor sporePrintColor population habitat
## 1 c n k k s u
## 2 c b k n n g
## 3 c b n n n m
## 4 c n n k s u
## 5 w b k n a g
## 6 c b n k n g
Recode the single-letter attribute codes into meaningful descriptions
library (dplyr)
## Warning: package 'dplyr' was built under R version 3.3.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Recode the single-letter attribute codes of every selected column into
# descriptive labels, following the attribute list published with the UCI
# mushroom data. Each column is recoded from its own values only, so one
# mutate() call is equivalent to chaining a separate mutate() per column.
Msubset <- Msubset %>%
  mutate(
    edibility = recode(edibility, e = "edible", p = "poisonous"),
    capShape = recode(
      capShape,
      b = "bell", c = "conical", x = "convex",
      f = "flat", k = "knobbed", s = "sunken"
    ),
    capSurface = recode(
      capSurface,
      f = "fibrous", g = "grooves", y = "scaly", s = "smooth"
    ),
    capColor = recode(
      capColor,
      n = "brown", b = "buff", c = "cinnamon", g = "gray", r = "green",
      p = "pink", u = "purple", e = "red", w = "white", y = "yellow"
    ),
    bruises = recode(bruises, t = "bruises", f = "none"),
    odor = recode(
      odor,
      a = "almond", l = "anise", c = "creosote", y = "fishy", f = "foul",
      m = "musty", n = "none", p = "pungent", s = "spicy"
    ),
    # The UCI attribute list calls level "a" of gill-attachment "attached";
    # the previous label "attachment" repeated the attribute name instead.
    gillAttachment = recode(
      gillAttachment,
      a = "attached", d = "descending", f = "free", n = "notched"
    ),
    gillSpacing = recode(
      gillSpacing,
      c = "close", w = "crowded", d = "distant"
    ),
    gillSize = recode(gillSize, b = "broad", n = "narrow"),
    gillColor = recode(
      gillColor,
      k = "black", n = "brown", b = "buff", h = "chocolate", g = "gray",
      r = "green", o = "orange", p = "pink", u = "purple", e = "red",
      w = "white", y = "yellow"
    ),
    sporePrintColor = recode(
      sporePrintColor,
      k = "black", n = "brown", b = "buff", h = "chocolate", r = "green",
      o = "orange", u = "purple", w = "white", y = "yellow"
    ),
    population = recode(
      population,
      a = "abundant", c = "clustered", n = "numerous",
      s = "scattered", v = "several", y = "solitary"
    ),
    habitat = recode(
      habitat,
      g = "grasses", l = "leaves", m = "meadows", p = "paths",
      u = "urban", w = "waste", d = "woods"
    )
  )
## Warning: package 'bindrcpp' was built under R version 3.3.3
# Preview the first six rows to confirm the codes were replaced by labels.
head(Msubset, n = 6L)
## edibility capShape capSurface capColor bruises odor gillAttachment
## 1 poisonous convex smooth brown bruises pungent free
## 2 edible convex smooth yellow bruises almond free
## 3 edible bell smooth white bruises anise free
## 4 poisonous convex scaly white bruises pungent free
## 5 edible convex smooth gray none none free
## 6 edible convex scaly yellow bruises almond free
## gillSpacing gillSize gillColor sporePrintColor population habitat
## 1 close narrow black black scattered urban
## 2 close broad black brown numerous grasses
## 3 close broad brown brown numerous meadows
## 4 close narrow brown black scattered urban
## 5 crowded broad black brown abundant grasses
## 6 close broad brown black numerous grasses