## Read the mushroom data from the URL. Since the data has no header row, the
## 'header=FALSE' argument is used.
library(plyr)
## Download the comma-separated file; header = FALSE keeps its first record as
## data instead of consuming it as column names.
msrmDS <- read.csv(
  file = "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data",
  header = FALSE,
  sep = ","
)
## Size of the full dataset (rows, columns)
dim(msrmDS)
## [1] 8124 23
## i.e. the dataset has 8124 rows and 23 variables (columns)
## Create a subset of the data with 1000 randomly selected rows and the
## 1st, 2nd, 6th and 20th through 23rd columns. The first column indicates
## whether a mushroom is edible or poisonous.
## A fixed seed makes the random draw repeatable from run to run; change or
## remove it to obtain a different subset.
set.seed(1234)
## sample.int(n, size) draws `size` distinct row indices from 1..n (sampling
## without replacement is the default).
msrmDS <- msrmDS[sample.int(nrow(msrmDS), 1000), c(1, 2, 6, 20:23)]
## View() opens a data viewer window, so only call it when a user is present;
## this keeps the script runnable in batch mode.
if (interactive()) {
  View(msrmDS)
}
## Name the seven retained columns after the original data dictionary.
names(msrmDS) <- c(
  "classes", "cap-shape", "odor", "ring-type",
  "spore-print-color", "population", "habitat"
)
## View() opens a data viewer window, so only call it when a user is present;
## this keeps the script runnable in batch mode.
if (interactive()) {
  View(msrmDS)
}
## TRANSFORMATION OF DATA
## mapvalues() and revalue(), both from the 'plyr' package, achieve the same
## result, so both functions are used below to compare them.
## Recode the "classes" variable: mapvalues() replaces each code in `from`
## with the label at the same position in `to`.
msrmDS[["classes"]] <- mapvalues(
  msrmDS[["classes"]],
  from = c("e", "p"),
  to = c("edible", "poisonous")
)
## Recode the "cap-shape" variable: revalue() takes a named vector whose names
## are the old codes and whose values are the new labels.
msrmDS[["cap-shape"]] <- revalue(
  msrmDS[["cap-shape"]],
  replace = c(
    b = "bell",
    c = "conical",
    x = "convex",
    f = "flat",
    k = "knobbed",
    s = "sunken"
  )
)
## Recode the "odor" variable from single-letter codes to descriptive labels.
msrmDS[["odor"]] <- revalue(
  msrmDS[["odor"]],
  replace = c(
    a = "almond",
    l = "anise",
    c = "creosote",
    y = "fishy",
    f = "foul",
    m = "musty",
    n = "none",
    p = "pungent",
    s = "spicy"
  )
)
## Recode the "ring-type" variable. The mapping lists every possible code for
## this column, even though the sampled subset may not contain all of them.
msrmDS[["ring-type"]] <- revalue(
  msrmDS[["ring-type"]],
  replace = c(
    c = "cobwebby",
    e = "evanescent",
    f = "flaring",
    l = "large",
    n = "none",
    p = "pendant",
    s = "sheathing",
    z = "zone"
  )
)
## Message printed for the codes missing from the subset:
## The following `from` values were not present in `x`: c, s, z
## Recode the "spore-print-color" variable from codes to colour names.
msrmDS[["spore-print-color"]] <- revalue(
  msrmDS[["spore-print-color"]],
  replace = c(
    k = "black",
    n = "brown",
    b = "buff",
    h = "chocolate",
    r = "green",
    o = "orange",
    u = "purple",
    w = "white",
    y = "yellow"
  )
)
## Recode the "population" variable. Code 'y' is labelled "solitary"
## (previously misspelled "soletary").
msrmDS$population <- revalue(
  msrmDS$population,
  c(
    "a" = "abundant", "c" = "clustered",
    "n" = "numerous", "s" = "scattered",
    "v" = "several", "y" = "solitary"
  )
)
## Recode the "habitat" variable. Code 'w' is labelled "waste"
## (previously misspelled "waster").
msrmDS$habitat <- revalue(
  msrmDS$habitat,
  c(
    "g" = "grasses", "l" = "leaves",
    "m" = "meadows", "p" = "paths",
    "u" = "urban", "w" = "waste", "d" = "woods"
  )
)
## Show the first rows of the recoded subset.
head(msrmDS)
## Example output (the rows shown depend on which rows were sampled earlier):
##        classes cap-shape    odor  ring-type spore-print-color population
## 7450    edible   knobbed    none    pendant             white   solitary
## 29      edible      flat    none    pendant             black   solitary
## 1778 poisonous      flat pungent    pendant             brown  scattered
## 99      edible      bell  almond    pendant             brown  scattered
## 1357    edible    convex    none evanescent             black  scattered
## 1415    edible      flat    none evanescent             brown   abundant
##      habitat
## 7450   paths
## 29     urban
## 1778   urban
## 99   grasses
## 1357 grasses
## 1415 grasses