You should take the data, and create a data frame with a subset of the columns in the dataset.
You should include the column that indicates edible or poisonous and three or four other columns.
You should also add meaningful column names and replace the abbreviations used in the data — for example, in the appropriate column, “e” might become “edible.” Your deliverable is the R code to perform these transformation tasks.
0. classes : edible=e, poisonous=p
1. cap-shape: bell=b,conical=c,convex=x,flat=f,
knobbed=k,sunken=s
2. cap-surface: fibrous=f,grooves=g,scaly=y,smooth=s
3. cap-color: brown=n,buff=b,cinnamon=c,gray=g,green=r,
pink=p,purple=u,red=e,white=w,yellow=y
4. bruises?: bruises=t,no=f
5. odor: almond=a,anise=l,creosote=c,fishy=y,foul=f,
musty=m,none=n,pungent=p,spicy=s
6. gill-attachment: attached=a,descending=d,free=f,notched=n
(note: only the codes a and f actually occur in this dataset)
7. gill-spacing: close=c,crowded=w,distant=d
8. gill-size: broad=b,narrow=n
9. gill-color: black=k,brown=n,buff=b,chocolate=h,gray=g,
green=r,orange=o,pink=p,purple=u,red=e,
white=w,yellow=y
10. stalk-shape: enlarging=e,tapering=t
11. stalk-root: bulbous=b,club=c,cup=u,equal=e,
rhizomorphs=z,rooted=r,missing=?
12. stalk-surface-above-ring: fibrous=f,scaly=y,silky=k,smooth=s
13. stalk-surface-below-ring: fibrous=f,scaly=y,silky=k,smooth=s
14. stalk-color-above-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o,
pink=p,red=e,white=w,yellow=y
15. stalk-color-below-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o,
pink=p,red=e,white=w,yellow=y
16. veil-type: partial=p,universal=u
17. veil-color: brown=n,orange=o,white=w,yellow=y
18. ring-number: none=n,one=o,two=t
19. ring-type: cobwebby=c,evanescent=e,flaring=f,large=l,
none=n,pendant=p,sheathing=s,zone=z
20. spore-print-color: black=k,brown=n,buff=b,chocolate=h,green=r,
orange=o,purple=u,white=w,yellow=y
21. population: abundant=a,clustered=c,numerous=n,
scattered=s,several=v,solitary=y
22. habitat: grasses=g,leaves=l,meadows=m,paths=p,
urban=u,waste=w,woods=d
library(RCurl)
## Loading required package: bitops
library(plyr)
library(data.table)
# Read the raw UCI mushroom records straight from the URL, one string per row.
dt.original <- readLines("https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data")
# Keep dt.original untouched and split every record on the literal "," separator.
# The lines are split directly: wrapping them in a data frame and reading them
# back with `dt$dt` only worked through `$` partial matching of the column
# name `dt.original`.
dt <- strsplit(dt.original, ",", fixed = TRUE)
# Stack the per-record character vectors into a data frame (columns V1, V2, ...)
# and convert it to a data.table.
dt <- ldply(dt)
dt <- as.data.table(dt)
# The attribute list has 23 entries (class + 22 features) and the code below
# refers to V1..V23, so stop early if the download did not parse as expected.
stopifnot(ncol(dt) == 23L)
# V1 is the class code; tabulate the raw codes before recoding.
table(dt[["V1"]])
##
## e p
## 4208 3916
# Spell the class out: "e" becomes "edible", any other code "poisonous".
dt$V1 <- ifelse(dt[["V1"]] != "e", "poisonous", "edible")
setnames(dt, old = "V1", new = "classes")
table(dt[["classes"]])
##
## edible poisonous
## 4208 3916
# V2 is the cap shape; inspect the raw single-letter codes first.
table(dt[["V2"]])
##
## b c f k s x
## 452 4 3152 828 32 3656
# Translate every code through a lookup table; codes without an entry are
# passed through unchanged.
dt$V2 <- local({
  shape_labels <- c(
    b = "bell", c = "conical", x = "convex",
    f = "flat", k = "knobbed", s = "sunken"
  )
  raw_codes <- dt[["V2"]]
  label_pos <- match(raw_codes, names(shape_labels))
  ifelse(is.na(label_pos), raw_codes, shape_labels[label_pos])
})
setnames(dt, old = "V2", new = "cap.shape")
table(dt[["cap.shape"]])
##
## bell conical convex flat knobbed sunken
## 452 4 3656 3152 828 32
# V3 is the cap surface; look at the raw codes before relabelling.
table(dt[["V3"]])
##
## f g s y
## 2320 4 2556 3244
# Swap each known code for its full label in one pass; anything not in the
# lookup table is left as it is.
dt$V3 <- local({
  surface_labels <- c(f = "fibrous", g = "grooves", s = "smooth", y = "scaly")
  recoded <- dt[["V3"]]
  label_pos <- match(recoded, names(surface_labels))
  known <- !is.na(label_pos)
  recoded[known] <- surface_labels[label_pos[known]]
  recoded
})
setnames(dt, old = "V3", new = "cap.surface")
table(dt[["cap.surface"]])
##
## fibrous grooves scaly smooth
## 2320 4 3244 2556
# V4 is the cap color; tabulate the raw codes first.
table(dt[["V4"]])
##
## b c e g n p r u w y
## 168 44 1500 1840 2284 144 16 16 1040 1072
# Map each color code to its name via a lookup table; codes that are not
# listed stay untouched.
dt$V4 <- local({
  color_labels <- c(
    b = "buff", c = "cinnamon", e = "red", g = "gray", n = "brown",
    p = "pink", r = "green", u = "purple", w = "white", y = "yellow"
  )
  recoded <- dt[["V4"]]
  label_pos <- match(recoded, names(color_labels))
  known <- !is.na(label_pos)
  recoded[known] <- color_labels[label_pos[known]]
  recoded
})
setnames(dt, old = "V4", new = "cap.color")
table(dt[["cap.color"]])
##
## brown buff cinnamon gray green pink purple red
## 2284 168 44 1840 16 144 16 1500
## white yellow
## 1040 1072
# V5 flags bruising; check the raw codes before recoding.
table(dt[["V5"]])
##
## f t
## 4748 3376
# "f" means no bruises; every other code is labelled "bruises".
dt$V5 <- ifelse(dt[["V5"]] != "f", "bruises", "no_bruises")
setnames(dt, old = "V5", new = "bruises")
table(dt[["bruises"]])
##
## bruises no_bruises
## 3376 4748
# V6 is the odor; tabulate the raw codes first.
table(dt[["V6"]])
##
## a c f l m n p s y
## 400 192 2160 400 36 3528 256 576 576
# Replace each odor code with its full name using a lookup table; codes that
# are not listed are kept as they are.
dt$V6 <- local({
  odor_labels <- c(
    a = "almond", c = "creosote", f = "foul", l = "anise", m = "musty",
    n = "none", p = "pungent", s = "spicy", y = "fishy"
  )
  recoded <- dt[["V6"]]
  label_pos <- match(recoded, names(odor_labels))
  known <- !is.na(label_pos)
  recoded[known] <- odor_labels[label_pos[known]]
  recoded
})
setnames(dt, old = "V6", new = "odor")
table(dt[["odor"]])
##
## almond anise creosote fishy foul musty none pungent
## 400 400 192 576 2160 36 3528 256
## spicy
## 576
# Frequency tables for columns that are only inspected, not recoded. Column
# names follow the attribute list at the top of the file (attribute k is
# stored in column V(k + 1)).
# V7: gill-attachment
table(dt[["V7"]])
##
## a f
## 210 7914
# V8: gill-spacing
table(dt[["V8"]])
##
## c w
## 6812 1312
# V9: gill-size
table(dt[["V9"]])
##
## b n
## 5612 2512
# V10: gill-color
table(dt[["V10"]])
##
## b e g h k n o p r u w y
## 1728 96 752 732 408 1048 64 1492 24 492 1202 86
# V11: stalk-shape
table(dt[["V11"]])
##
## e t
## 3516 4608
# V12: stalk-root ("?" is the code for a missing value)
table(dt[["V12"]])
##
## ? b c e r
## 2480 3776 556 1120 192
# V13: stalk-surface-above-ring
table(dt[["V13"]])
##
## f k s y
## 552 2372 5176 24
# V14 is the stalk surface below the ring; check the raw codes first.
table(dt[["V14"]])
##
## f k s y
## 600 2304 4936 284
# Relabel the known codes through a lookup table; other values are left alone.
dt$V14 <- local({
  surface_labels <- c(f = "fibrous", k = "silky", s = "smooth", y = "scaly")
  recoded <- dt[["V14"]]
  label_pos <- match(recoded, names(surface_labels))
  known <- !is.na(label_pos)
  recoded[known] <- surface_labels[label_pos[known]]
  recoded
})
setnames(dt, old = "V14", new = "stalk.surface.below.ring")
table(dt[["stalk.surface.below.ring"]])
##
## fibrous scaly silky smooth
## 600 284 2304 4936
# Frequency tables for columns that are only inspected, not recoded. Column
# names follow the attribute list at the top of the file (attribute k is
# stored in column V(k + 1)).
# V15: stalk-color-above-ring
table(dt[["V15"]])
##
## b c e g n o p w y
## 432 36 96 576 448 192 1872 4464 8
# V16: stalk-color-below-ring
table(dt[["V16"]])
##
## b c e g n o p w y
## 432 36 96 576 512 192 1872 4384 24
# V17: veil-type
table(dt[["V17"]])
##
## p
## 8124
# V18: veil-color
table(dt[["V18"]])
##
## n o w y
## 96 96 7924 8
# V19: ring-number
table(dt[["V19"]])
##
## n o t
## 36 7488 600
# V20: ring-type
table(dt[["V20"]])
##
## e f l n p
## 2776 48 1296 36 3968
# V21 is the spore print color; tabulate the raw codes first.
table(dt[["V21"]])
##
## b h k n o r u w y
## 48 1632 1872 1968 48 72 48 2388 48
# Map each color code to its name with a lookup table; codes that are not
# listed stay untouched.
dt$V21 <- local({
  color_labels <- c(
    b = "buff", h = "chocolate", k = "black", n = "brown", o = "orange",
    r = "green", u = "purple", w = "white", y = "yellow"
  )
  recoded <- dt[["V21"]]
  label_pos <- match(recoded, names(color_labels))
  known <- !is.na(label_pos)
  recoded[known] <- color_labels[label_pos[known]]
  recoded
})
setnames(dt, old = "V21", new = "spore.print.color")
table(dt[["spore.print.color"]])
##
## black brown buff chocolate green orange purple
## 1872 1968 48 1632 72 48 48
## white yellow
## 2388 48
# V22 is the population type; look at the raw codes before relabelling.
table(dt[["V22"]])
##
## a c n s v y
## 384 340 400 1248 4040 1712
# Replace each code with its full label via a lookup table; values without an
# entry are kept unchanged.
dt$V22 <- local({
  population_labels <- c(
    a = "abundant", c = "clustered", n = "numerous",
    s = "scattered", v = "several", y = "solitary"
  )
  recoded <- dt[["V22"]]
  label_pos <- match(recoded, names(population_labels))
  known <- !is.na(label_pos)
  recoded[known] <- population_labels[label_pos[known]]
  recoded
})
setnames(dt, old = "V22", new = "population")
table(dt[["population"]])
##
## abundant clustered numerous scattered several solitary
## 384 340 400 1248 4040 1712
# V23 is the habitat; tabulate the raw codes first.
table(dt[["V23"]])
##
## d g l m p u w
## 3148 2148 832 292 1144 368 192
# Translate each habitat code through a lookup table; codes that are not
# listed are left as they are.
dt$V23 <- local({
  habitat_labels <- c(
    d = "woods", g = "grasses", l = "leaves", m = "meadows",
    p = "paths", u = "urban", w = "waste"
  )
  recoded <- dt[["V23"]]
  label_pos <- match(recoded, names(habitat_labels))
  known <- !is.na(label_pos)
  recoded[known] <- habitat_labels[label_pos[known]]
  recoded
})
setnames(dt, old = "V23", new = "habitat")
table(dt[["habitat"]])
##
## grasses leaves meadows paths urban waste woods
## 2148 832 292 1144 368 192 3148
# Keep the class label plus six recoded attributes as the final subset, then
# preview its first ten rows.
dt.subset <- dt[
  ,
  c(
    "classes", "odor", "spore.print.color", "stalk.surface.below.ring",
    "habitat", "population", "cap.color"
  ),
  with = FALSE
]
head(dt.subset, n = 10)
## classes odor spore.print.color stalk.surface.below.ring habitat
## 1: poisonous pungent black smooth urban
## 2: edible almond brown smooth grasses
## 3: edible anise brown smooth meadows
## 4: poisonous pungent black smooth urban
## 5: edible none brown smooth grasses
## 6: edible almond black smooth grasses
## 7: edible almond black smooth meadows
## 8: edible anise brown smooth meadows
## 9: poisonous pungent black smooth grasses
## 10: edible almond black smooth meadows
## population cap.color
## 1: scattered brown
## 2: numerous yellow
## 3: numerous white
## 4: scattered white
## 5: abundant gray
## 6: numerous yellow
## 7: numerous white
## 8: scattered white
## 9: several white
## 10: scattered yellow
# Count NA vs non-NA cells across every column of the subset.
table(unlist(lapply(dt.subset, is.na), use.names = FALSE))
##
## FALSE
## 56868
# Show the column types and a preview of the values.
utils::str(dt.subset)
## Classes 'data.table' and 'data.frame': 8124 obs. of 7 variables:
## $ classes : chr "poisonous" "edible" "edible" "poisonous" ...
## $ odor : chr "pungent" "almond" "anise" "pungent" ...
## $ spore.print.color : chr "black" "brown" "brown" "black" ...
## $ stalk.surface.below.ring: chr "smooth" "smooth" "smooth" "smooth" ...
## $ habitat : chr "urban" "grasses" "meadows" "urban" ...
## $ population : chr "scattered" "numerous" "numerous" "scattered" ...
## $ cap.color : chr "brown" "yellow" "white" "white" ...
## - attr(*, ".internal.selfref")=<externalptr>
# Print the first 10 rows of the subset.
head(dt.subset, n = 10)
## classes odor spore.print.color stalk.surface.below.ring habitat
## 1: poisonous pungent black smooth urban
## 2: edible almond brown smooth grasses
## 3: edible anise brown smooth meadows
## 4: poisonous pungent black smooth urban
## 5: edible none brown smooth grasses
## 6: edible almond black smooth grasses
## 7: edible almond black smooth meadows
## 8: edible anise brown smooth meadows
## 9: poisonous pungent black smooth grasses
## 10: edible almond black smooth meadows
## population cap.color
## 1: scattered brown
## 2: numerous yellow
## 3: numerous white
## 4: scattered white
## 5: abundant gray
## 6: numerous yellow
## 7: numerous white
## 8: scattered white
## 9: several white
## 10: scattered yellow
# Write the recoded subset to mushroom.csv in the working directory.
# row.names = FALSE keeps the file to the seven data columns; the default
# would prepend an unnamed column of row numbers.
write.csv(dt.subset, "mushroom.csv", row.names = FALSE)