IS607 Week1 Assignment

The task is to study the dataset and the associated description of the data (i.e. “data dictionary”). You may need to look around a bit, but it’s there! You should take the data, and create a data frame with a subset of the columns in the dataset. You should include the column that indicates edible or poisonous and three or four other columns. You should also add meaningful column names and replace the abbreviations used in the data.

Get the DATA

# Location of the raw mushroom records (comma-separated, no header row).
theurl <- "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"

# Read every record. Because header = FALSE, the columns receive the default
# names V1..V23 until they are renamed.
thedata <- read.table(
  theurl,
  sep = ",",
  header = FALSE
)

# Preview the first six rows.
head(thedata)
##   V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V20
## 1  p  x  s  n  t  p  f  c  n   k   e   e   s   s   w   w   p   w   o   p
## 2  e  x  s  y  t  a  f  c  b   k   e   c   s   s   w   w   p   w   o   p
## 3  e  b  s  w  t  l  f  c  b   n   e   c   s   s   w   w   p   w   o   p
## 4  p  x  y  w  t  p  f  c  n   n   e   e   s   s   w   w   p   w   o   p
## 5  e  x  s  g  f  n  f  w  b   k   t   e   s   s   w   w   p   w   o   e
## 6  e  x  y  y  t  a  f  c  b   n   e   c   s   s   w   w   p   w   o   p
##   V21 V22 V23
## 1   k   s   u
## 2   n   n   g
## 3   n   n   m
## 4   k   s   u
## 5   n   a   g
## 6   k   n   g

Add Descriptive Column Names

# Column names as assigned by read.table() before renaming.
names(thedata)
##  [1] "V1"  "V2"  "V3"  "V4"  "V5"  "V6"  "V7"  "V8"  "V9"  "V10" "V11"
## [12] "V12" "V13" "V14" "V15" "V16" "V17" "V18" "V19" "V20" "V21" "V22"
## [23] "V23"
# Give all 23 columns descriptive names, in file order. The one-letter
# prefixes presumably follow the UCI data dictionary (c = cap, g = gill,
# s = stalk, v = veil, r = ring) -- confirm against agaricus-lepiota.names.
colnames(thedata) <- c(
  "classes",
  "cshape", "csurface", "ccolor",
  "bruises?", "odor",
  "gattachment", "gspacing", "gsize", "gcolor",
  "sshape", "sroot",
  "ssurfacearing", "ssurfacebring",
  "scoloraring", "scolorbring",
  "vtype", "vcolor",
  "rnumber", "rtype",
  "sprintcolor", "population", "habitat"
)
# Confirm the new names.
names(thedata)
##  [1] "classes"       "cshape"        "csurface"      "ccolor"       
##  [5] "bruises?"      "odor"          "gattachment"   "gspacing"     
##  [9] "gsize"         "gcolor"        "sshape"        "sroot"        
## [13] "ssurfacearing" "ssurfacebring" "scoloraring"   "scolorbring"  
## [17] "vtype"         "vcolor"        "rnumber"       "rtype"        
## [21] "sprintcolor"   "population"    "habitat"

Replace abbreviated data

library(plyr)

# Tables shared by the "above ring" / "below ring" stalk columns.
stalk_surface_labels <- c(f = "fibrous", y = "scaly", k = "silky",
                          s = "smooth")
stalk_color_labels <- c(n = "brown", b = "buff", c = "cinnamon", g = "gray",
                        o = "orange", p = "pink", e = "red", w = "white",
                        y = "yellow")

# Code -> label lookup tables, one entry per column of `thedata`.
# Element names are the single-letter codes stored in the file; values are
# the labels that replace them. Transcribed from the UCI data dictionary.
code_labels <- list(
  classes = c(e = "edible", p = "poisonous"),
  cshape = c(b = "bell", c = "conical", x = "convex", f = "flat",
             k = "knobbed", s = "sunken"),
  csurface = c(f = "fibrous", g = "grooves", y = "scaly", s = "smooth"),
  ccolor = c(n = "brown", b = "buff", c = "cinnamon", g = "gray",
             r = "green", p = "pink", u = "purple", e = "red",
             w = "white", y = "yellow"),
  # The column is literally named "bruises?". Writing `thedata$bruises`
  # would read it through partial matching but assign to a NEW column
  # called "bruises", so the full name is used here.
  "bruises?" = c(t = "bruises", f = "no"),
  odor = c(a = "almond", l = "anise", c = "creosote", y = "fishy",
           f = "foul", m = "musty", n = "none", p = "pungent",
           s = "spicy"),
  gattachment = c(a = "attached", d = "descending", f = "free",
                  n = "notched"),
  gspacing = c(c = "close", w = "crowded", d = "distant"),
  gsize = c(b = "broad", n = "narrow"),
  gcolor = c(k = "black", n = "brown", b = "buff", h = "chocolate",
             g = "gray", r = "green", o = "orange", p = "pink",
             u = "purple", e = "red", w = "white", y = "yellow"),
  sshape = c(e = "enlarging", t = "tapering"),
  sroot = c(b = "bulbous", c = "club", u = "cup", e = "equal",
            z = "rhizomorphs", r = "rooted", "?" = "missing"),
  ssurfacearing = stalk_surface_labels,
  ssurfacebring = stalk_surface_labels,
  scoloraring = stalk_color_labels,
  scolorbring = stalk_color_labels,
  vtype = c(p = "partial", u = "universal"),
  # Veil colour codes are n/o/w/y (brown/orange/white/yellow); there is no
  # "b" code for this attribute.
  vcolor = c(n = "brown", o = "orange", w = "white", y = "yellow"),
  rnumber = c(n = "none", o = "one", t = "two"),
  rtype = c(c = "cobwebby", e = "evanescent", f = "flaring", l = "large",
            n = "none", p = "pendant", s = "sheathing", z = "zone"),
  sprintcolor = c(k = "black", n = "brown", b = "buff", h = "chocolate",
                  r = "green", o = "orange", u = "purple", w = "white",
                  y = "yellow"),
  population = c(a = "abundant", c = "clustered", n = "numerous",
                 s = "scattered", v = "several", y = "solitary"),
  habitat = c(g = "grasses", l = "leaves", m = "meadows", p = "paths",
              u = "urban", w = "waste", d = "woods")
)

# Fail early if a lookup table refers to a column that does not exist.
stopifnot(all(names(code_labels) %in% names(thedata)))

# Replace the codes column by column. mapvalues() leaves values without a
# matching code untouched and prints a message naming dictionary codes that
# never occur in the data (e.g. gill attachment "d" and "n"); those
# messages are informational.
for (column in names(code_labels)) {
  labels <- code_labels[[column]]
  thedata[[column]] <- mapvalues(
    thedata[[column]],
    from = names(labels),
    to = unname(labels)
  )
}

head(thedata)
##     classes cshape csurface ccolor bruises?    odor gattachment gspacing
## 1 poisonous convex   smooth  brown  bruises pungent        free    close
## 2    edible convex   smooth yellow  bruises  almond        free    close
## 3    edible   bell   smooth  white  bruises   anise        free    close
## 4 poisonous convex    scaly  white  bruises pungent        free    close
## 5    edible convex   smooth   gray       no    none        free  crowded
## 6    edible convex    scaly yellow  bruises  almond        free    close
##    gsize gcolor    sshape sroot ssurfacearing ssurfacebring scoloraring
## 1 narrow  black enlarging equal        smooth        smooth       white
## 2  broad  black enlarging  club        smooth        smooth       white
## 3  broad  brown enlarging  club        smooth        smooth       white
## 4 narrow  brown enlarging equal        smooth        smooth       white
## 5  broad  black  tapering equal        smooth        smooth       white
## 6  broad  brown enlarging  club        smooth        smooth       white
##   scolorbring   vtype vcolor rnumber      rtype sprintcolor population
## 1       white partial  white     one    pendant       black  scattered
## 2       white partial  white     one    pendant       brown   numerous
## 3       white partial  white     one    pendant       brown   numerous
## 4       white partial  white     one    pendant       black  scattered
## 5       white partial  white     one evanescent       brown   abundant
## 6       white partial  white     one    pendant       black   numerous
##   habitat
## 1   urban
## 2 grasses
## 3 meadows
## 4   urban
## 5 grasses
## 6 grasses

Getting Subset of data for Classes = Edible

# Keep ALL edible mushrooms with a handful of descriptive columns.
# Wrapping subset() in head() stored only the first six rows; the preview
# belongs in the separate head() call below.
thedataedible <- subset(
  thedata,
  classes == "edible",
  select = c("classes", "sshape", "rnumber", "vcolor", "population",
             "odor", "gspacing", "habitat")
)
head(thedataedible)
##   classes    sshape rnumber vcolor population   odor gspacing habitat
## 2  edible enlarging     one  white   numerous almond    close grasses
## 3  edible enlarging     one  white   numerous  anise    close meadows
## 5  edible  tapering     one  white   abundant   none  crowded grasses
## 6  edible enlarging     one  white   numerous almond    close grasses
## 7  edible enlarging     one  white   numerous almond    close meadows
## 8  edible enlarging     one  white  scattered  anise    close meadows

Getting Subset of data for Classes = Poisonous

# Keep ALL poisonous mushrooms with a handful of descriptive columns.
# Wrapping subset() in head() stored only the first six rows; the preview
# belongs in the separate head() call below.
thedatapoisonous <- subset(
  thedata,
  classes == "poisonous",
  select = c("classes", "sshape", "rnumber", "vcolor", "population",
             "odor", "gspacing", "habitat")
)
head(thedatapoisonous)
##      classes    sshape rnumber vcolor population    odor gspacing habitat
## 1  poisonous enlarging     one  white  scattered pungent    close   urban
## 4  poisonous enlarging     one  white  scattered pungent    close   urban
## 9  poisonous enlarging     one  white    several pungent    close grasses
## 14 poisonous enlarging     one  white    several pungent    close   urban
## 18 poisonous enlarging     one  white  scattered pungent    close grasses
## 19 poisonous enlarging     one  white  scattered pungent    close   urban