Take the data loaded from the Mushroom Dataset and create a data frame with a subset of the columns in the dataset. Include the column that indicates edible or poisonous and three or four other columns. Finally, you should also add meaningful column names and replace the abbreviations used in the data.
1.- Locating the data file of the Mushroom Dataset.
# URL of the UCI Mushroom data file (agaricus-lepiota.data).
data <- "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"
2.- Loading the data file into a data frame
# Read the comma-separated mushroom file. It has no header row, so R assigns
# the default column names V1..V23.
# stringsAsFactors = TRUE keeps every column a factor on R >= 4.0 (where the
# default became FALSE); the per-level counts printed by summary() depend on
# the columns being factors.
datafile <- read.table(
  file = data,
  sep = ",",
  header = FALSE,
  stringsAsFactors = TRUE
)
# Preview the first six rows.
head(datafile)
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V20
## 1 p x s n t p f c n k e e s s w w p w o p
## 2 e x s y t a f c b k e c s s w w p w o p
## 3 e b s w t l f c b n e c s s w w p w o p
## 4 p x y w t p f c n n e e s s w w p w o p
## 5 e x s g f n f w b k t e s s w w p w o e
## 6 e x y y t a f c b n e c s s w w p w o p
## V21 V22 V23
## 1 k s u
## 2 n n g
## 3 n n m
## 4 k s u
## 5 n a g
## 6 k n g
The dimensions and summary of the data frame are as follows:
# Print the number of rows and columns of the loaded data frame.
dim(datafile)
## [1] 8124 23
# Print a per-column summary of the raw data (still using the default V* names).
summary(datafile)
## V1 V2 V3 V4 V5 V6
## e:4208 b: 452 f:2320 n :2284 f:4748 n :3528
## p:3916 c: 4 g: 4 g :1840 t:3376 f :2160
## f:3152 s:2556 e :1500 s : 576
## k: 828 y:3244 y :1072 y : 576
## s: 32 w :1040 a : 400
## x:3656 b : 168 l : 400
## (Other): 220 (Other): 484
## V7 V8 V9 V10 V11 V12 V13
## a: 210 c:6812 b:5612 b :1728 e:3516 ?:2480 f: 552
## f:7914 w:1312 n:2512 p :1492 t:4608 b:3776 k:2372
## w :1202 c: 556 s:5176
## n :1048 e:1120 y: 24
## g : 752 r: 192
## h : 732
## (Other):1170
## V14 V15 V16 V17 V18 V19
## f: 600 w :4464 w :4384 p:8124 n: 96 n: 36
## k:2304 p :1872 p :1872 o: 96 o:7488
## s:4936 g : 576 g : 576 w:7924 t: 600
## y: 284 n : 448 n : 512 y: 8
## b : 432 b : 432
## o : 192 o : 192
## (Other): 140 (Other): 156
## V20 V21 V22 V23
## e:2776 w :2388 a: 384 d:3148
## f: 48 n :1968 c: 340 g:2148
## l:1296 k :1872 n: 400 l: 832
## n: 36 h :1632 s:1248 m: 292
## p:3968 r : 72 v:4040 p:1144
## b : 48 y:1712 u: 368
## (Other): 144 w: 192
3.- Rename the columns of the data frame
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Replace the default V1..V23 names with descriptive attribute names, given
# in the same order as the columns of the data frame.
colnames(datafile) <- c(
  "class",
  "capshape", "capsurface", "capcolor",
  "bruises", "odor",
  "gillattachment", "gillspacing", "gillsize", "gillcolor",
  "stalkshape", "stalkroot",
  "stalksurface_above_ring", "stalksurface_below_ring",
  "stalkcolor_above_ring", "stalkcolor_below_ring",
  "veiltype", "veilcolor",
  "ringnumber", "ringtype",
  "sporeprint_color", "population", "habitat"
)
4.- Subsetting the data frame
# Keep the class column plus four descriptive attributes.
datafile <- select(
  datafile,
  class, capsurface, capcolor, population, habitat
)
# Count the occurrences of each code in every retained column.
# dnn = "mushroom" keeps the "mushroom" header line in each printed table.
lapply(datafile, function(values) table(values, dnn = "mushroom"))
## $class
## mushroom
## e p
## 4208 3916
##
## $capsurface
## mushroom
## f g s y
## 2320 4 2556 3244
##
## $capcolor
## mushroom
## b c e g n p r u w y
## 168 44 1500 1840 2284 144 16 16 1040 1072
##
## $population
## mushroom
## a c n s v y
## 384 340 400 1248 4040 1712
##
## $habitat
## mushroom
## d g l m p u w
## 3148 2148 832 292 1144 368 192
5.- Recoding the variables of the data frame
# Replace the single-letter codes in each retained column with descriptive
# labels. Each recode() call maps code = "label" for that column.
datafile <- datafile %>%
  mutate(
    class = recode(class, p = "poison", e = "edible"),
    capsurface = recode(
      capsurface,
      f = "fibrous", g = "grooves", y = "scaly", s = "smooth"
    ),
    # Code c is "cinnamon" in the dataset's attribute description.
    capcolor = recode(
      capcolor,
      b = "buff", c = "cinnamon", g = "gray", n = "brown", p = "pink",
      r = "green", u = "purple", e = "red", w = "white", y = "yellow"
    ),
    population = recode(
      population,
      a = "abundant", c = "clustered", n = "numerous",
      s = "scattered", v = "several", y = "solitary"
    ),
    habitat = recode(
      habitat,
      g = "grasses", l = "leaves", m = "meadows", p = "paths",
      u = "urban", w = "waste", d = "woods"
    )
  )
# Print a per-column summary of the recoded subset.
summary(datafile)
## class capsurface capcolor population
## edible:4208 fibrous:2320 brown :2284 abundant : 384
## poison:3916 grooves: 4 gray :1840 clustered: 340
## smooth :2556 red :1500 numerous : 400
## scaly :3244 yellow :1072 scattered:1248
## white :1040 several :4040
## buff : 168 solitary :1712
## (Other): 220
## habitat
## woods :3148
## grasses:2148
## leaves : 832
## meadows: 292
## paths :1144
## urban : 368
## waste : 192