—Wk1 Hui Gracie Han 607 Mushroom Data Explortn

---
title: "Wk1 Hui Gracie Han 607 Mushroom Data Explortn"
author: "Hui (Gracie) Han"
date: "September 2, 2018"
output: html_document
---

Load data

# Read the comma-separated mushroom records directly from the UCI repository.
# header = FALSE: the first line of the file is treated as data, not as names.
MushroomSite <- "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"
MushroomData <- read.table(MushroomSite, header = FALSE, sep = ",")

Explore the dataset (dimensions, column names), then compare it with the UCI description of the mushroom data

# Number of rows and columns in the imported data frame.
dim(MushroomData)
## [1] 8124   23
# First six records, to see what the raw single-letter codes look like.
head(MushroomData, n = 6L)
##   V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V20
## 1  p  x  s  n  t  p  f  c  n   k   e   e   s   s   w   w   p   w   o   p
## 2  e  x  s  y  t  a  f  c  b   k   e   c   s   s   w   w   p   w   o   p
## 3  e  b  s  w  t  l  f  c  b   n   e   c   s   s   w   w   p   w   o   p
## 4  p  x  y  w  t  p  f  c  n   n   e   e   s   s   w   w   p   w   o   p
## 5  e  x  s  g  f  n  f  w  b   k   t   e   s   s   w   w   p   w   o   e
## 6  e  x  y  y  t  a  f  c  b   n   e   c   s   s   w   w   p   w   o   p
##   V21 V22 V23
## 1   k   s   u
## 2   n   n   g
## 3   n   n   m
## 4   k   s   u
## 5   n   a   g
## 6   k   n   g
# Column labels; the import assigned the default names V1 ... V23.
names(MushroomData)
##  [1] "V1"  "V2"  "V3"  "V4"  "V5"  "V6"  "V7"  "V8"  "V9"  "V10" "V11"
## [12] "V12" "V13" "V14" "V15" "V16" "V17" "V18" "V19" "V20" "V21" "V22"
## [23] "V23"

Give the data frame a shorter name, so that it is easier to type in the code below

# Bind the full data frame to a one-letter alias used by the steps below.
M <- MushroomData
# Confirm the alias shows the same first six records.
head(M, n = 6L)
##   V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V20
## 1  p  x  s  n  t  p  f  c  n   k   e   e   s   s   w   w   p   w   o   p
## 2  e  x  s  y  t  a  f  c  b   k   e   c   s   s   w   w   p   w   o   p
## 3  e  b  s  w  t  l  f  c  b   n   e   c   s   s   w   w   p   w   o   p
## 4  p  x  y  w  t  p  f  c  n   n   e   e   s   s   w   w   p   w   o   p
## 5  e  x  s  g  f  n  f  w  b   k   t   e   s   s   w   w   p   w   o   e
## 6  e  x  y  y  t  a  f  c  b   n   e   c   s   s   w   w   p   w   o   p
##   V21 V22 V23
## 1   k   s   u
## 2   n   n   g
## 3   n   n   m
## 4   k   s   u
## 5   n   a   g
## 6   k   n   g

Find the distinct values in each column, so that they are easy to eyeball and compare with the information on the UCI website

# Tabulate the distinct values of every column of M, one column at a time.
# A single loop over names(M) replaces one hand-written table() call per
# column, so no column name is hard-coded and no column can be skipped by
# accident. print() is needed because results are not auto-printed inside a
# for loop; M[[col]] extracts the column by its name held in `col`.
for (col in names(M)) {
  print(table(M[[col]]))
}
# Output: one frequency table per column, in column order V1 ... V23.
## 
##    e    p 
## 4208 3916
## 
##    b    c    f    k    s    x 
##  452    4 3152  828   32 3656
## 
##    f    g    s    y 
## 2320    4 2556 3244
## 
##    b    c    e    g    n    p    r    u    w    y 
##  168   44 1500 1840 2284  144   16   16 1040 1072
## 
##    f    t 
## 4748 3376
## 
##    a    c    f    l    m    n    p    s    y 
##  400  192 2160  400   36 3528  256  576  576
## 
##    a    f 
##  210 7914
## 
##    c    w 
## 6812 1312
## 
##    b    n 
## 5612 2512
## 
##    b    e    g    h    k    n    o    p    r    u    w    y 
## 1728   96  752  732  408 1048   64 1492   24  492 1202   86
## 
##    e    t 
## 3516 4608
## 
##    ?    b    c    e    r 
## 2480 3776  556 1120  192
## 
##    f    k    s    y 
##  552 2372 5176   24
## 
##    f    k    s    y 
##  600 2304 4936  284
## 
##    b    c    e    g    n    o    p    w    y 
##  432   36   96  576  448  192 1872 4464    8
## 
##    b    c    e    g    n    o    p    w    y 
##  432   36   96  576  512  192 1872 4384   24
## 
##    p 
## 8124
## 
##    n    o    w    y 
##   96   96 7924    8
## 
##    n    o    t 
##   36 7488  600
## 
##    e    f    l    n    p 
## 2776   48 1296   36 3968
## 
##    b    h    k    n    o    r    u    w    y 
##   48 1632 1872 1968   48   72   48 2388   48
## 
##    a    c    n    s    v    y 
##  384  340  400 1248 4040 1712
## 
##    d    g    l    m    p    u    w 
## 3148 2148  832  292 1144  368  192

Subset the data using base R bracket indexing, to keep only the columns of interest

 # Keep columns V1-V10 and V21-V23 of M; every row is retained.
 Msubset <- M[, c(
   "V1", "V2", "V3", "V4", "V5", "V6", "V7", "V8", "V9", "V10",
   "V21", "V22", "V23"
 )]

Rename the columns of interest above

# Swap the generic V* labels for descriptive attribute names. The names are
# assigned by position, so their order must match the column order of Msubset.
colnames(Msubset) <- c(
  "edibility", "capShape", "capSurface", "capColor", "bruises", "odor",
  "gillAttachment", "gillSpacing", "gillSize", "gillColor",
  "sporePrintColor", "population", "habitat"
)

# Verify the new column names on the first six records.
head(Msubset, n = 6L)
##   edibility capShape capSurface capColor bruises odor gillAttachment
## 1         p        x          s        n       t    p              f
## 2         e        x          s        y       t    a              f
## 3         e        b          s        w       t    l              f
## 4         p        x          y        w       t    p              f
## 5         e        x          s        g       f    n              f
## 6         e        x          y        y       t    a              f
##   gillSpacing gillSize gillColor sporePrintColor population habitat
## 1           c        n         k               k          s       u
## 2           c        b         k               n          n       g
## 3           c        b         n               n          n       m
## 4           c        n         n               k          s       u
## 5           w        b         k               n          a       g
## 6           c        b         n               k          n       g

Recode the single-letter values in each column into meaningful descriptions

library(dplyr)
## Warning: package 'dplyr' was built under R version 3.3.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Replace the single-letter codes in every column of Msubset with readable
# labels. Each recode() reads and rewrites only its own column, so all
# thirteen can share one mutate() call instead of a chain of separate ones.
# Codes listed here that do not occur in the data are simply unused.
Msubset <- Msubset %>%
  mutate(
    edibility = recode(edibility, e = "edible", p = "poisonous"),
    capSurface = recode(
      capSurface,
      f = "fibrous", g = "grooves", y = "scaly", s = "smooth"
    ),
    capShape = recode(
      capShape,
      b = "bell", c = "conical", x = "convex", f = "flat", k = "knobbed",
      s = "sunken"
    ),
    capColor = recode(
      capColor,
      n = "brown", b = "buff", c = "cinnamon", g = "gray", r = "green",
      p = "pink", u = "purple", e = "red", w = "white", y = "yellow"
    ),
    bruises = recode(bruises, t = "bruises", f = "none"),
    odor = recode(
      odor,
      a = "almond", l = "anise", c = "creosote", y = "fishy", f = "foul",
      m = "musty", n = "none", p = "pungent", s = "spicy"
    ),
    # "a" is labelled "attached" (previously "attachment") so that it reads
    # as an adjective like its sibling labels.
    gillAttachment = recode(
      gillAttachment,
      a = "attached", d = "descending", f = "free", n = "notched"
    ),
    gillSpacing = recode(
      gillSpacing,
      c = "close", w = "crowded", d = "distant"
    ),
    gillSize = recode(gillSize, b = "broad", n = "narrow"),
    gillColor = recode(
      gillColor,
      k = "black", n = "brown", b = "buff", h = "chocolate", g = "gray",
      r = "green", o = "orange", p = "pink", u = "purple", e = "red",
      w = "white", y = "yellow"
    ),
    sporePrintColor = recode(
      sporePrintColor,
      k = "black", n = "brown", b = "buff", h = "chocolate", r = "green",
      o = "orange", u = "purple", w = "white", y = "yellow"
    ),
    population = recode(
      population,
      a = "abundant", c = "clustered", n = "numerous", s = "scattered",
      v = "several", y = "solitary"
    ),
    habitat = recode(
      habitat,
      g = "grasses", l = "leaves", m = "meadows", p = "paths", u = "urban",
      w = "waste", d = "woods"
    )
  )
## Warning: package 'bindrcpp' was built under R version 3.3.3
# Check the recoded values on the first six records.
head(Msubset)
##   edibility capShape capSurface capColor bruises    odor gillAttachment
## 1 poisonous   convex     smooth    brown bruises pungent           free
## 2    edible   convex     smooth   yellow bruises  almond           free
## 3    edible     bell     smooth    white bruises   anise           free
## 4 poisonous   convex      scaly    white bruises pungent           free
## 5    edible   convex     smooth     gray    none    none           free
## 6    edible   convex      scaly   yellow bruises  almond           free
##   gillSpacing gillSize gillColor sporePrintColor population habitat
## 1       close   narrow     black           black  scattered   urban
## 2       close    broad     black           brown   numerous grasses
## 3       close    broad     brown           brown   numerous meadows
## 4       close   narrow     brown           black  scattered   urban
## 5     crowded    broad     black           brown   abundant grasses
## 6       close    broad     brown           black   numerous grasses