Requirements:

You should take the data, and create a data frame with a subset of the columns in the dataset.

You should include the column that indicates edible or poisonous and three or four other columns.

You should also add meaningful column names and replace the abbreviations used in the data — for example, in the appropriate column, “e” might become “edible.” Your deliverable is the R code to perform these transformation tasks.

Attribute Information

https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.names
 0. classes  :                edible=e, poisonous=p
 1. cap-shape:                bell=b,conical=c,convex=x,flat=f,
                              knobbed=k,sunken=s
 2. cap-surface:              fibrous=f,grooves=g,scaly=y,smooth=s
 3. cap-color:                brown=n,buff=b,cinnamon=c,gray=g,green=r,
                              pink=p,purple=u,red=e,white=w,yellow=y
 4. bruises?:                 bruises=t,no=f
 5. odor:                     almond=a,anise=l,creosote=c,fishy=y,foul=f,
                              musty=m,none=n,pungent=p,spicy=s
 6. gill-attachment:          attached=a,descending=d,free=f,notched=n
 :: only a, f
 7. gill-spacing:             close=c,crowded=w,distant=d
 8. gill-size:                broad=b,narrow=n
 9. gill-color:               black=k,brown=n,buff=b,chocolate=h,gray=g,
                              green=r,orange=o,pink=p,purple=u,red=e,
                              white=w,yellow=y
10. stalk-shape:              enlarging=e,tapering=t
11. stalk-root:               bulbous=b,club=c,cup=u,equal=e,
                              rhizomorphs=z,rooted=r,missing=?
12. stalk-surface-above-ring: fibrous=f,scaly=y,silky=k,smooth=s
13. stalk-surface-below-ring: fibrous=f,scaly=y,silky=k,smooth=s
14. stalk-color-above-ring:   brown=n,buff=b,cinnamon=c,gray=g,orange=o,
                              pink=p,red=e,white=w,yellow=y
15. stalk-color-below-ring:   brown=n,buff=b,cinnamon=c,gray=g,orange=o,
                              pink=p,red=e,white=w,yellow=y
16. veil-type:                partial=p,universal=u
17. veil-color:               brown=n,orange=o,white=w,yellow=y
18. ring-number:              none=n,one=o,two=t
19. ring-type:                cobwebby=c,evanescent=e,flaring=f,large=l,
                              none=n,pendant=p,sheathing=s,zone=z
20. spore-print-color:        black=k,brown=n,buff=b,chocolate=h,green=r,
                              orange=o,purple=u,white=w,yellow=y
21. population:               abundant=a,clustered=c,numerous=n,
                              scattered=s,several=v,solitary=y
22. habitat:                  grasses=g,leaves=l,meadows=m,paths=p,
                              urban=u,waste=w,woods=d
                              

1. Load packages

library(RCurl)
## Loading required package: bitops
library(plyr)
library(data.table)

2. Load data

# Read the raw dataset directly from the UCI repository; each element of
# dt.original is one comma-separated record.
dt.original <- readLines("https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data")

# Leave dt.original untouched and work on a copy held in a data frame. The
# column is named explicitly so it can be referenced by its exact name instead
# of relying on partial matching of `$` against an auto-generated name.
dt <- data.frame(record = dt.original, stringsAsFactors = FALSE)

# Split every record into its fields on the literal separator ",".
dt <- strsplit(dt[["record"]], ",", fixed = TRUE)

# Combine the list elements into one data frame (columns V1, V2, ...) and
# convert it to a data.table.
dt <- ldply(dt)
dt <- as.data.table(dt)

3. Pre-processing the dataset

3.1 Tabulate each column, replace the abbreviated values according to the attribute information, and give the columns meaningful names.

Based on the logical rules that showed significance in the classification results, I recoded and kept the following columns: odor, spore-print-color, stalk-surface-below-ring, habitat, population, and cap-color. For each column the abbreviations are replaced by their definitions, followed by a frequency table of the result.
0. classes: edible=e, poisonous=p
# Count the raw class codes before recoding.
table(dt[["V1"]])
## 
##    e    p 
## 4208 3916
# "e" becomes "edible"; every other code is labelled "poisonous".
is_edible <- dt[["V1"]] == "e"
dt$V1 <- ifelse(is_edible, yes = "edible", no = "poisonous")
setnames(dt, old = "V1", new = "classes")
table(dt[["classes"]])
## 
##    edible poisonous 
##      4208      3916
  1. cap-shape: bell=b,conical=c,convex=x,flat=f,knobbed=k,sunken=s
# Count the raw cap-shape codes before recoding.
table(dt[["V2"]])
## 
##    b    c    f    k    s    x 
##  452    4 3152  828   32 3656
# Map each one-letter code to its full name; values that are not a known
# code are left untouched.
shape_names <- c(
  b = "bell", c = "conical", x = "convex",
  f = "flat", k = "knobbed", s = "sunken"
)
is_coded <- dt$V2 %in% names(shape_names)
dt$V2[is_coded] <- unname(shape_names[dt$V2[is_coded]])

setnames(dt, old = "V2", new = "cap.shape")
table(dt[["cap.shape"]])
## 
##    bell conical  convex    flat knobbed  sunken 
##     452       4    3656    3152     828      32
  2. cap-surface: fibrous=f,grooves=g,scaly=y,smooth=s
# Count the raw cap-surface codes before recoding.
table(dt[["V3"]])
## 
##    f    g    s    y 
## 2320    4 2556 3244
# Map each one-letter code to its full name; values that are not a known
# code are left untouched.
surface_names <- c(f = "fibrous", g = "grooves", s = "smooth", y = "scaly")
is_coded <- dt$V3 %in% names(surface_names)
dt$V3[is_coded] <- unname(surface_names[dt$V3[is_coded]])

setnames(dt, old = "V3", new = "cap.surface")
table(dt[["cap.surface"]])
## 
## fibrous grooves   scaly  smooth 
##    2320       4    3244    2556
3. cap-color: brown=n,buff=b,cinnamon=c,gray=g,green=r,pink=p,purple=u,red=e,white=w,yellow=y
# Count the raw cap-color codes before recoding.
table(dt[["V4"]])
## 
##    b    c    e    g    n    p    r    u    w    y 
##  168   44 1500 1840 2284  144   16   16 1040 1072
# Map each one-letter code to its full name; values that are not a known
# code are left untouched.
cap_color_names <- c(
  b = "buff", c = "cinnamon", e = "red", g = "gray", n = "brown",
  p = "pink", r = "green", u = "purple", w = "white", y = "yellow"
)
is_coded <- dt$V4 %in% names(cap_color_names)
dt$V4[is_coded] <- unname(cap_color_names[dt$V4[is_coded]])

setnames(dt, old = "V4", new = "cap.color")
table(dt[["cap.color"]])
## 
##    brown     buff cinnamon     gray    green     pink   purple      red 
##     2284      168       44     1840       16      144       16     1500 
##    white   yellow 
##     1040     1072
  4. bruises?: bruises=t,no=f
# Count the raw bruises codes before recoding.
table(dt[["V5"]])
## 
##    f    t 
## 4748 3376
# "f" becomes "no_bruises"; every other code is labelled "bruises".
has_no_bruises <- dt[["V5"]] == "f"
dt$V5 <- ifelse(has_no_bruises, yes = "no_bruises", no = "bruises")
setnames(dt, old = "V5", new = "bruises")
table(dt[["bruises"]])
## 
##    bruises no_bruises 
##       3376       4748
5. odor: almond=a,anise=l,creosote=c,fishy=y,foul=f,musty=m,none=n,pungent=p,spicy=s
# Count the raw odor codes before recoding.
table(dt[["V6"]])
## 
##    a    c    f    l    m    n    p    s    y 
##  400  192 2160  400   36 3528  256  576  576
# Map each one-letter code to its full name; values that are not a known
# code are left untouched.
odor_names <- c(
  a = "almond", c = "creosote", f = "foul", l = "anise", m = "musty",
  n = "none", p = "pungent", s = "spicy", y = "fishy"
)
is_coded <- dt$V6 %in% names(odor_names)
dt$V6[is_coded] <- unname(odor_names[dt$V6[is_coded]])

setnames(dt, old = "V6", new = "odor")
table(dt[["odor"]])
## 
##   almond    anise creosote    fishy     foul    musty     none  pungent 
##      400      400      192      576     2160       36     3528      256 
##    spicy 
##      576
  6. gill-attachment: attached=a,descending=d,free=f,notched=n
# Frequency of each raw code in column V7.
table(dt[["V7"]])
## 
##    a    f 
##  210 7914
  7. gill-spacing: close=c,crowded=w,distant=d
# Frequency of each raw code in column V8.
table(dt[["V8"]])
## 
##    c    w 
## 6812 1312
  8. gill-size: broad=b,narrow=n
# Frequency of each raw code in column V9.
table(dt[["V9"]])
## 
##    b    n 
## 5612 2512
  9. gill-color: black=k,brown=n,buff=b,chocolate=h,gray=g, green=r,orange=o,pink=p,purple=u,red=e, white=w,yellow=y
# Frequency of each raw code in column V10.
table(dt[["V10"]])
## 
##    b    e    g    h    k    n    o    p    r    u    w    y 
## 1728   96  752  732  408 1048   64 1492   24  492 1202   86
  10. stalk-shape: enlarging=e,tapering=t
# Frequency of each raw code in column V11.
table(dt[["V11"]])
## 
##    e    t 
## 3516 4608
  11. stalk-root: bulbous=b,club=c,cup=u,equal=e,rhizomorphs=z,rooted=r,missing=? In this column 2480 values are coded "?" (missing).
# Frequency of each raw code in column V12; "?" is the code for a missing value.
table(dt[["V12"]])
## 
##    ?    b    c    e    r 
## 2480 3776  556 1120  192
  12. stalk-surface-above-ring: fibrous=f,scaly=y,silky=k,smooth=s
# Frequency of each raw code in column V13.
table(dt[["V13"]])
## 
##    f    k    s    y 
##  552 2372 5176   24
13. stalk-surface-below-ring: fibrous=f,scaly=y,silky=k,smooth=s
# Count the raw stalk-surface-below-ring codes before recoding.
table(dt[["V14"]])
## 
##    f    k    s    y 
##  600 2304 4936  284
# Map each one-letter code to its full name; values that are not a known
# code are left untouched.
stalk_surface_names <- c(f = "fibrous", k = "silky", s = "smooth", y = "scaly")
is_coded <- dt$V14 %in% names(stalk_surface_names)
dt$V14[is_coded] <- unname(stalk_surface_names[dt$V14[is_coded]])

setnames(dt, old = "V14", new = "stalk.surface.below.ring")
table(dt[["stalk.surface.below.ring"]])
## 
## fibrous   scaly   silky  smooth 
##     600     284    2304    4936
  14. stalk-color-above-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o,pink=p,red=e,white=w,yellow=y
# Frequency of each raw code in column V15.
table(dt[["V15"]])
## 
##    b    c    e    g    n    o    p    w    y 
##  432   36   96  576  448  192 1872 4464    8
  15. stalk-color-below-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o, pink=p,red=e,white=w,yellow=y
# Frequency of each raw code in column V16.
table(dt[["V16"]])
## 
##    b    c    e    g    n    o    p    w    y 
##  432   36   96  576  512  192 1872 4384   24
  16. veil-type: partial=p,universal=u
# Frequency of each raw code in column V17.
table(dt[["V17"]])
## 
##    p 
## 8124
  17. veil-color: brown=n,orange=o,white=w,yellow=y
# Frequency of each raw code in column V18.
table(dt[["V18"]])
## 
##    n    o    w    y 
##   96   96 7924    8
  18. ring-number: none=n,one=o,two=t
# Frequency of each raw code in column V19.
table(dt[["V19"]])
## 
##    n    o    t 
##   36 7488  600
  19. ring-type: cobwebby=c,evanescent=e,flaring=f,large=l, none=n,pendant=p,sheathing=s,zone=z
# Frequency of each raw code in column V20.
table(dt[["V20"]])
## 
##    e    f    l    n    p 
## 2776   48 1296   36 3968
20. spore-print-color: black=k,brown=n,buff=b,chocolate=h,green=r,orange=o,purple=u,white=w,yellow=y
# Count the raw spore-print-color codes before recoding.
table(dt[["V21"]])
## 
##    b    h    k    n    o    r    u    w    y 
##   48 1632 1872 1968   48   72   48 2388   48
# Map each one-letter code to its full name; values that are not a known
# code are left untouched.
spore_color_names <- c(
  b = "buff", h = "chocolate", k = "black", n = "brown", o = "orange",
  r = "green", u = "purple", w = "white", y = "yellow"
)
is_coded <- dt$V21 %in% names(spore_color_names)
dt$V21[is_coded] <- unname(spore_color_names[dt$V21[is_coded]])

setnames(dt, old = "V21", new = "spore.print.color")
table(dt[["spore.print.color"]])
## 
##     black     brown      buff chocolate     green    orange    purple 
##      1872      1968        48      1632        72        48        48 
##     white    yellow 
##      2388        48
21. population: abundant=a,clustered=c,numerous=n,scattered=s,several=v,solitary=y
# Count the raw population codes before recoding.
table(dt[["V22"]])
## 
##    a    c    n    s    v    y 
##  384  340  400 1248 4040 1712
# Map each one-letter code to its full name; values that are not a known
# code are left untouched.
population_names <- c(
  a = "abundant", c = "clustered", n = "numerous",
  s = "scattered", v = "several", y = "solitary"
)
is_coded <- dt$V22 %in% names(population_names)
dt$V22[is_coded] <- unname(population_names[dt$V22[is_coded]])

setnames(dt, old = "V22", new = "population")
table(dt[["population"]])
## 
##  abundant clustered  numerous scattered   several  solitary 
##       384       340       400      1248      4040      1712
22. habitat: grasses=g,leaves=l,meadows=m,paths=p,urban=u,waste=w,woods=d
# Count the raw habitat codes before recoding.
table(dt[["V23"]])
## 
##    d    g    l    m    p    u    w 
## 3148 2148  832  292 1144  368  192
# Map each one-letter code to its full name; values that are not a known
# code are left untouched.
habitat_names <- c(
  d = "woods", g = "grasses", l = "leaves", m = "meadows",
  p = "paths", u = "urban", w = "waste"
)
is_coded <- dt$V23 %in% names(habitat_names)
dt$V23[is_coded] <- unname(habitat_names[dt$V23[is_coded]])

setnames(dt, old = "V23", new = "habitat")
table(dt[["habitat"]])
## 
## grasses  leaves meadows   paths   urban   waste   woods 
##    2148     832     292    1144     368     192    3148

3.2 Subset the data

# Keep the class label plus the six recoded predictor columns, in this order.
dt.subset <- dt[
  ,
  c(
    "classes", "odor", "spore.print.color", "stalk.surface.below.ring",
    "habitat", "population", "cap.color"
  ),
  with = FALSE
]
head(dt.subset, n = 10)
##       classes    odor spore.print.color stalk.surface.below.ring habitat
##  1: poisonous pungent             black                   smooth   urban
##  2:    edible  almond             brown                   smooth grasses
##  3:    edible   anise             brown                   smooth meadows
##  4: poisonous pungent             black                   smooth   urban
##  5:    edible    none             brown                   smooth grasses
##  6:    edible  almond             black                   smooth grasses
##  7:    edible  almond             black                   smooth meadows
##  8:    edible   anise             brown                   smooth meadows
##  9: poisonous pungent             black                   smooth grasses
## 10:    edible  almond             black                   smooth meadows
##     population cap.color
##  1:  scattered     brown
##  2:   numerous    yellow
##  3:   numerous     white
##  4:  scattered     white
##  5:   abundant      gray
##  6:   numerous    yellow
##  7:   numerous     white
##  8:  scattered     white
##  9:    several     white
## 10:  scattered    yellow

4. View the final dataset

# Check for missing values in the dataset. Besides NA, also look for "?",
# the code the attribute information uses for a missing value, which is.na()
# alone cannot detect.
cells <- as.matrix(dt.subset)
table(is.na(cells) | cells == "?")
## 
## FALSE 
## 56868
# Show the structure (classes, dimensions, column types) of the final dataset.
str(object = dt.subset)
## Classes 'data.table' and 'data.frame':   8124 obs. of  7 variables:
##  $ classes                 : chr  "poisonous" "edible" "edible" "poisonous" ...
##  $ odor                    : chr  "pungent" "almond" "anise" "pungent" ...
##  $ spore.print.color       : chr  "black" "brown" "brown" "black" ...
##  $ stalk.surface.below.ring: chr  "smooth" "smooth" "smooth" "smooth" ...
##  $ habitat                 : chr  "urban" "grasses" "meadows" "urban" ...
##  $ population              : chr  "scattered" "numerous" "numerous" "scattered" ...
##  $ cap.color               : chr  "brown" "yellow" "white" "white" ...
##  - attr(*, ".internal.selfref")=<externalptr>
# Print the first 10 rows of the final dataset.
head(x = dt.subset, n = 10)
##       classes    odor spore.print.color stalk.surface.below.ring habitat
##  1: poisonous pungent             black                   smooth   urban
##  2:    edible  almond             brown                   smooth grasses
##  3:    edible   anise             brown                   smooth meadows
##  4: poisonous pungent             black                   smooth   urban
##  5:    edible    none             brown                   smooth grasses
##  6:    edible  almond             black                   smooth grasses
##  7:    edible  almond             black                   smooth meadows
##  8:    edible   anise             brown                   smooth meadows
##  9: poisonous pungent             black                   smooth grasses
## 10:    edible  almond             black                   smooth meadows
##     population cap.color
##  1:  scattered     brown
##  2:   numerous    yellow
##  3:   numerous     white
##  4:  scattered     white
##  5:   abundant      gray
##  6:   numerous    yellow
##  7:   numerous     white
##  8:  scattered     white
##  9:    several     white
## 10:  scattered    yellow
# Write the final dataset to mushroom.csv in the current working directory.
write.csv(x = dt.subset, file = "mushroom.csv")