Load the curl library, used to download the data from the University of California, Irvine website
library(curl)
## Warning: package 'curl' was built under R version 3.3.3
Load the original data from the website into the variable originalDataCSV
# Download the raw data file; readLines() returns one string per line.
data_url <- 'https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data'
original <- curl(data_url)
originalDataCSV <- readLines(original)
# readLines() opens the connection for the duration of the call and closes it
# again, but does not destroy it. Destroy it explicitly so it does not linger
# until the garbage collector warns about an unused connection.
close(original)
Store the original CSV data in an R data frame
# Parse the downloaded lines as CSV. The file has no header row, so the
# columns receive the default names V1, V2, ...
originalData <- read.csv(header = FALSE, text = originalDataCSV)
See first few rows of data set
# Preview the first six rows of the freshly parsed data.
head(originalData, n = 6L)
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V20
## 1 p x s n t p f c n k e e s s w w p w o p
## 2 e x s y t a f c b k e c s s w w p w o p
## 3 e b s w t l f c b n e c s s w w p w o p
## 4 p x y w t p f c n n e e s s w w p w o p
## 5 e x s g f n f w b k t e s s w w p w o e
## 6 e x y y t a f c b n e c s s w w p w o p
## V21 V22 V23
## 1 k s u
## 2 n n g
## 3 n n m
## 4 k s u
## 5 n a g
## 6 k n g
Create a vector with names for all attributes/columns
# Column names for the 23 fields of the data set, in file order.
# NOTE(review): "cap-shape" is the only hyphenated (non-syntactic) name here;
# it is kept as-is so the resulting column names do not change.
atributes <- c(
  "classes",
  "cap-shape", "capsurface", "capcolor",
  "bruises", "odor",
  "gillattachment", "gillspacing", "gillsize", "gillcolor",
  "stalkshape", "stalkroot",
  "stalksurfaceabovering", "stalksurfacebelowring",
  "stalkcolorabovering", "stalkcolorbelowring",
  "veiltype", "veilcolor",
  "ringnumber", "ringtype",
  "sporeprintcolor", "population", "habitat"
)
Assign the column names to the data set
# Replace the default V1..V23 headers with the descriptive attribute names,
# then preview the first six rows to confirm the relabelling.
names(originalData) <- atributes
head(originalData, n = 6L)
## classes cap-shape capsurface capcolor bruises odor gillattachment
## 1 p x s n t p f
## 2 e x s y t a f
## 3 e b s w t l f
## 4 p x y w t p f
## 5 e x s g f n f
## 6 e x y y t a f
## gillspacing gillsize gillcolor stalkshape stalkroot
## 1 c n k e e
## 2 c b k e c
## 3 c b n e c
## 4 c n n e e
## 5 w b k t e
## 6 c b n e c
## stalksurfaceabovering stalksurfacebelowring stalkcolorabovering
## 1 s s w
## 2 s s w
## 3 s s w
## 4 s s w
## 5 s s w
## 6 s s w
## stalkcolorbelowring veiltype veilcolor ringnumber ringtype
## 1 w p w o p
## 2 w p w o p
## 3 w p w o p
## 4 w p w o p
## 5 w p w o e
## 6 w p w o p
## sporeprintcolor population habitat
## 1 k s u
## 2 n n g
## 3 n n m
## 4 k s u
## 5 n a g
## 6 k n g
Subset the data frame to include the classes column (edible or poisonous) and four other columns. The selected columns are: odor, gill-size, stalk-shape and habitat
# Keep the class label plus the four attributes used in the rest of the
# analysis; drop = FALSE guarantees the result stays a data frame.
selectedData <- originalData[
  ,
  c("classes", "odor", "gillsize", "stalkshape", "habitat"),
  drop = FALSE
]
head(selectedData, n = 6L)
## classes odor gillsize stalkshape habitat
## 1 p p n e u
## 2 e a b e g
## 3 e l b e m
## 4 p p n e u
## 5 e n b t g
## 6 e a b e g
Next we replace the abbreviations used in the dataset with meaningful names.
# Translate the single-letter codes of every selected column into readable
# labels.
#
# Each column has a named lookup vector (code -> label). Codes are matched
# exactly and in a single pass per column, so a label can never be rewritten
# by a later substitution (a risk with chained substring replacement).
codeLabels <- list(
  classes = c(e = "edible", p = "poisonous"),
  odor = c(
    a = "almond", l = "anise", c = "creosote", y = "fishy", f = "foul",
    m = "musty", n = "none", p = "pungent", s = "spicy"
  ),
  gillsize = c(b = "broad", n = "narrow"),
  stalkshape = c(e = "enlarging", t = "tapering"),
  habitat = c(
    g = "grasses", l = "leaves", m = "meadows", p = "paths", u = "urban",
    w = "waste", d = "woods"
  )
)

# Map a vector of codes to labels using a named lookup vector.
#   codes:  character or factor vector of abbreviations.
#   labels: named character vector; names are codes, values are labels.
# Returns a character vector of the same length. Codes without an entry in
# `labels` are returned unchanged and NA stays NA.
decodeColumn <- function(codes, labels) {
  codes <- as.character(codes)
  decoded <- unname(labels[codes])
  unknown <- is.na(decoded)
  decoded[unknown] <- codes[unknown]
  decoded
}

# Decode all columns at once; the list names become the column names.
finalData <- data.frame(
  Map(decodeColumn, selectedData[names(codeLabels)], codeLabels)
)

# Per-column results, kept as one-column data frames for any later code that
# refers to them by these names.
classes <- finalData["classes"]
odor <- finalData["odor"]
gillsize <- finalData["gillsize"]
stalkshape <- finalData["stalkshape"]
habitat <- finalData["habitat"]

head(finalData)
## classes odor gillsize stalkshape habitat
## 1 poisonous pungent narrow enlarging urban
## 2 edible almond broad enlarging grasses
## 3 edible anise broad enlarging meadows
## 4 poisonous pungent narrow enlarging urban
## 5 edible none broad tapering grasses
## 6 edible almond broad enlarging grasses
# Inspect the last six rows as a check that the decoding reached the end of
# the data set.
tail(finalData, n = 6L)
## classes odor gillsize stalkshape habitat
## 8119 poisonous foul narrow tapering woods
## 8120 edible none broad enlarging leaves
## 8121 edible none broad enlarging leaves
## 8122 edible none broad enlarging leaves
## 8123 poisonous fishy narrow tapering leaves
## 8124 edible none broad enlarging leaves