R Markdown

Load library to scrape data from University of California Irvine website

library(curl)
## Warning: package 'curl' was built under R version 3.3.3

Load the original data from the website into the variable originalDataCSV

# Location of the raw mushroom data file on the UCI repository
data_url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"
original <- curl(data_url)
# readLines() opens an unopened connection only for the duration of the call
# and closes it again, but it does not destroy the connection object.
# close() in `finally` releases it whether or not the download succeeds.
originalDataCSV <- tryCatch(
  readLines(original),
  finally = close(original)
)

Store the original CSV data in an R data frame

# Parse the downloaded lines as CSV; header = FALSE makes R assign the
# default column names V1, V2, ...
originalData <- read.csv(
  text = originalDataCSV,
  header = FALSE
)

See first few rows of data set

# Preview the first six rows of the raw data
head(originalData, n = 6L)
##   V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V20
## 1  p  x  s  n  t  p  f  c  n   k   e   e   s   s   w   w   p   w   o   p
## 2  e  x  s  y  t  a  f  c  b   k   e   c   s   s   w   w   p   w   o   p
## 3  e  b  s  w  t  l  f  c  b   n   e   c   s   s   w   w   p   w   o   p
## 4  p  x  y  w  t  p  f  c  n   n   e   e   s   s   w   w   p   w   o   p
## 5  e  x  s  g  f  n  f  w  b   k   t   e   s   s   w   w   p   w   o   e
## 6  e  x  y  y  t  a  f  c  b   n   e   c   s   s   w   w   p   w   o   p
##   V21 V22 V23
## 1   k   s   u
## 2   n   n   g
## 3   n   n   m
## 4   k   s   u
## 5   n   a   g
## 6   k   n   g

Create a character vector with names for all attributes/columns

# Column names for the data set, in column order.
# NOTE: "cap-shape" is the only hyphenated (non-syntactic) name, so it has to
# be back-quoted when accessed with `$`.
atributes <- c(
  "classes",
  "cap-shape", "capsurface", "capcolor",
  "bruises", "odor",
  "gillattachment", "gillspacing", "gillsize", "gillcolor",
  "stalkshape", "stalkroot",
  "stalksurfaceabovering", "stalksurfacebelowring",
  "stalkcolorabovering", "stalkcolorbelowring",
  "veiltype", "veilcolor",
  "ringnumber", "ringtype",
  "sporeprintcolor", "population", "habitat"
)

Assign column names to the data set

# Replace the default V1, V2, ... headers with the attribute names,
# then preview the relabelled data
names(originalData) <- atributes
head(originalData, n = 6L)
##   classes cap-shape capsurface capcolor bruises odor gillattachment
## 1       p         x          s        n       t    p              f
## 2       e         x          s        y       t    a              f
## 3       e         b          s        w       t    l              f
## 4       p         x          y        w       t    p              f
## 5       e         x          s        g       f    n              f
## 6       e         x          y        y       t    a              f
##   gillspacing gillsize gillcolor stalkshape stalkroot
## 1           c        n         k          e         e
## 2           c        b         k          e         c
## 3           c        b         n          e         c
## 4           c        n         n          e         e
## 5           w        b         k          t         e
## 6           c        b         n          e         c
##   stalksurfaceabovering stalksurfacebelowring stalkcolorabovering
## 1                     s                     s                   w
## 2                     s                     s                   w
## 3                     s                     s                   w
## 4                     s                     s                   w
## 5                     s                     s                   w
## 6                     s                     s                   w
##   stalkcolorbelowring veiltype veilcolor ringnumber ringtype
## 1                   w        p         w          o        p
## 2                   w        p         w          o        p
## 3                   w        p         w          o        p
## 4                   w        p         w          o        p
## 5                   w        p         w          o        e
## 6                   w        p         w          o        p
##   sporeprintcolor population habitat
## 1               k          s       u
## 2               n          n       g
## 3               n          n       m
## 4               k          s       u
## 5               n          a       g
## 6               k          n       g

Subset the data frame to include the classes column (edible or not) and four other columns. Selected were: odor, gill-size, stalk-shape and habitat

# Keep the class label plus the four attributes used in the analysis
selectedData <- originalData[
  ,
  c("classes", "odor", "gillsize", "stalkshape", "habitat"),
  drop = FALSE
]
head(selectedData, n = 6L)
##   classes odor gillsize stalkshape habitat
## 1       p    p        n          e       u
## 2       e    a        b          e       g
## 3       e    l        b          e       m
## 4       p    p        n          e       u
## 5       e    n        b          t       g
## 6       e    a        b          e       g

Next we replace the abbreviations used in the dataset with meaningful names.

# Replace every abbreviation in `codes` with its descriptive label.
#   codes  - character vector or factor of abbreviations
#   lookup - named character vector: names are the abbreviations,
#            values are the labels
# Returns a character vector of the same length as `codes`.
# Matching is on the whole value (not on substrings), so an already expanded
# label can never be rewritten by another rule. Abbreviations without an
# entry in `lookup`, and NA, are returned unchanged.
recodeColumn <- function(codes, lookup) {
  codes <- as.character(codes)
  labels <- unname(lookup[codes])
  unmapped <- is.na(labels)
  labels[unmapped] <- codes[unmapped]
  labels
}

# Abbreviation -> label tables, one per selected column
labelMaps <- list(
  classes = c(e = "edible", p = "poisonous"),
  odor = c(
    a = "almond", l = "anise", c = "creosote", y = "fishy", f = "foul",
    m = "musty", n = "none", p = "pungent", s = "spicy"
  ),
  gillsize = c(b = "broad", n = "narrow"),
  stalkshape = c(e = "enlarging", t = "tapering"),
  habitat = c(
    g = "grasses", l = "leaves", m = "meadows", p = "paths",
    u = "urban", w = "waste", d = "woods"
  )
)

# The per-column results are kept as one-column data frames under their
# original names so that any code still referring to them keeps working.
classes <- data.frame(
  classes = recodeColumn(selectedData$classes, labelMaps$classes)
)
odor <- data.frame(
  odor = recodeColumn(selectedData$odor, labelMaps$odor)
)
gillsize <- data.frame(
  gillsize = recodeColumn(selectedData$gillsize, labelMaps$gillsize)
)
stalkshape <- data.frame(
  stalkshape = recodeColumn(selectedData$stalkshape, labelMaps$stalkshape)
)
habitat <- data.frame(
  habitat = recodeColumn(selectedData$habitat, labelMaps$habitat)
)

# Column names are set explicitly so they do not depend on how data.frame()
# derives names from its arguments.
finalData <- data.frame(classes, odor, gillsize, stalkshape, habitat)
colnames(finalData) <- c("classes", "odor", "gillsize", "stalkshape", "habitat")
head(finalData)
##     classes    odor gillsize stalkshape habitat
## 1 poisonous pungent   narrow  enlarging   urban
## 2    edible  almond    broad  enlarging grasses
## 3    edible   anise    broad  enlarging meadows
## 4 poisonous pungent   narrow  enlarging   urban
## 5    edible    none    broad   tapering grasses
## 6    edible  almond    broad  enlarging grasses
# Show the last six rows of the recoded data
tail(finalData, n = 6L)
##        classes  odor gillsize stalkshape habitat
## 8119 poisonous  foul   narrow   tapering   woods
## 8120    edible  none    broad  enlarging  leaves
## 8121    edible  none    broad  enlarging  leaves
## 8122    edible  none    broad  enlarging  leaves
## 8123 poisonous fishy   narrow   tapering  leaves
## 8124    edible  none    broad  enlarging  leaves