Mushroom Dataset

Loading the data from the source (UCI Machine Learning Repository)

# Fetch the mushroom data set and read it into `df`.
# The download is skipped when a local copy already exists, so re-running the
# script does not hit the network again (delete the file to force a refresh).
data_url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"
data_file <- "agaricus-lepiota.data"
if (!file.exists(data_file)) {
  download.file(data_url, data_file)
}
# The file has no header row, so the columns get the default names V1, V2, ...
# Values are kept as character strings; they are converted to factors later.
df <- read.table(data_file, header = FALSE, sep = ",", stringsAsFactors = FALSE)

Understanding the structure of the data in order to compare it with the data dictionary and map the column names to headings

# Preview the first six rows of the raw data.
head(df)
##   V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V20
## 1  p  x  s  n  t  p  f  c  n   k   e   e   s   s   w   w   p   w   o   p
## 2  e  x  s  y  t  a  f  c  b   k   e   c   s   s   w   w   p   w   o   p
## 3  e  b  s  w  t  l  f  c  b   n   e   c   s   s   w   w   p   w   o   p
## 4  p  x  y  w  t  p  f  c  n   n   e   e   s   s   w   w   p   w   o   p
## 5  e  x  s  g  f  n  f  w  b   k   t   e   s   s   w   w   p   w   o   e
## 6  e  x  y  y  t  a  f  c  b   n   e   c   s   s   w   w   p   w   o   p
##   V21 V22 V23
## 1   k   s   u
## 2   n   n   g
## 3   n   n   m
## 4   k   s   u
## 5   n   a   g
## 6   k   n   g
# Number of rows and columns in the raw data.
dim(df)
## [1] 8124   23
# Column names (the defaults assigned on import).
names(df)
##  [1] "V1"  "V2"  "V3"  "V4"  "V5"  "V6"  "V7"  "V8"  "V9"  "V10" "V11"
## [12] "V12" "V13" "V14" "V15" "V16" "V17" "V18" "V19" "V20" "V21" "V22"
## [23] "V23"

Getting a subset of the data frame and substituting the column names

# Keep columns V2, V3, V4 and V1 (in that order) and give them readable
# names in a single assignment.
dfnew <- df[, c("V2", "V3", "V4", "V1"), drop = FALSE]
names(dfnew) <- c("shape", "surface", "color", "classes")
# At this point all four columns are still plain character vectors.
summary(dfnew)
##     shape             surface             color          
##  Length:8124        Length:8124        Length:8124       
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##    classes         
##  Length:8124       
##  Class :character  
##  Mode  :character
# Show the first six rows of the renamed subset.
head(dfnew, n = 6L)
##   shape surface color classes
## 1     x       s     n       p
## 2     x       s     y       e
## 3     b       s     w       e
## 4     x       y     w       p
## 5     x       s     g       e
## 6     x       y     y       e

Replacing the single-letter codes with the descriptive labels defined in the data dictionary

# Recode the class column: "e" -> "edible", "p" -> "poisonous".
# Any other value is left unchanged; the result is stored as a factor.
dfnew$classes <- local({
  class_labels <- c(e = "edible", p = "poisonous")
  codes <- as.character(dfnew$classes)
  known <- codes %in% names(class_labels)
  codes[known] <- unname(class_labels[codes[known]])
  as.factor(codes)
})

# Recode the shape column from single-letter codes to descriptive labels.
# Codes missing from the lookup are left unchanged; the result is a factor.
dfnew$shape <- local({
  shape_labels <- c(
    b = "bell",
    c = "conical",
    f = "flat",
    k = "knobbed",
    s = "sunken",
    x = "convex"
  )
  codes <- as.character(dfnew$shape)
  known <- codes %in% names(shape_labels)
  codes[known] <- unname(shape_labels[codes[known]])
  as.factor(codes)
})

# Recode the surface column from single-letter codes to descriptive labels.
# Codes missing from the lookup are left unchanged; the result is a factor.
dfnew$surface <- local({
  surface_labels <- c(
    f = "fibrous",
    g = "grooves",
    s = "smooth",
    y = "scaly"
  )
  codes <- as.character(dfnew$surface)
  known <- codes %in% names(surface_labels)
  codes[known] <- unname(surface_labels[codes[known]])
  as.factor(codes)
})

# Recode the color column from single-letter codes to descriptive labels.
# The lookup covers all ten color codes listed in the data dictionary; the
# previous version omitted c, p, r and u, which therefore stayed as raw
# letters in the factor. Codes missing from the lookup are left unchanged,
# and the result is stored as a factor.
dfnew$color <- local({
  color_labels <- c(
    n = "brown",
    b = "buff",
    c = "cinnamon",
    g = "gray",
    r = "green",
    p = "pink",
    u = "purple",
    e = "red",
    w = "white",
    y = "yellow"
  )
  codes <- as.character(dfnew$color)
  known <- codes %in% names(color_labels)
  codes[known] <- unname(color_labels[codes[known]])
  as.factor(codes)
})

Then display a summary of the new data frame

# Per-column level counts. For a factor with many levels, summary() lists
# only the most frequent ones and lumps the remainder into "(Other)".
summary(dfnew)
##      shape         surface         color           classes    
##  bell   : 452   fibrous:2320   brown  :2284   edible   :4208  
##  conical:   4   grooves:   4   gray   :1840   poisonous:3916  
##  convex :3656   scaly  :3244   red    :1500                   
##  flat   :3152   smooth :2556   yellow :1072                   
##  knobbed: 828                  white  :1040                   
##  sunken :  32                  buff   : 168                   
##                                (Other): 220