Pull Data
The first step is to pull the data from the site and then put it in a data frame.
# Location of the raw, header-less, comma-separated mushroom data set
url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"
# Read the file straight from the URL; keep every column as plain character
mushrooms <- read.table(
  file = url,
  header = FALSE,
  sep = ",",
  stringsAsFactors = FALSE
)
# Report the number of rows and columns that were read
dim(mushrooms)
## [1] 8124 23
# Preview the first six rows of the raw data
head(mushrooms, n = 6L)
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V20
## 1 p x s n t p f c n k e e s s w w p w o p
## 2 e x s y t a f c b k e c s s w w p w o p
## 3 e b s w t l f c b n e c s s w w p w o p
## 4 p x y w t p f c n n e e s s w w p w o p
## 5 e x s g f n f w b k t e s s w w p w o e
## 6 e x y y t a f c b n e c s s w w p w o p
## V21 V22 V23
## 1 k s u
## 2 n n g
## 3 n n m
## 4 k s u
## 5 n a g
## 6 k n g
Add Columns
Now that we have the data in the data frame we need to add the column names.
I copied and pasted the attribute information into a txt file in my working directory.
# Read the pipe-delimited attribute description file (with a header row)
# from the working directory into its own data frame
mushattr <- read.table(
  file = "mushroomattr.txt",
  header = TRUE,
  sep = "|",
  stringsAsFactors = FALSE
)
# Preview the first rows of the attribute table
head(mushattr)
## Index Attribute
## 1 0 class
## 2 1 cap-shape
## 3 2 cap-surface
## 4 3 cap-color
## 5 4 bruises?
## 6 5 odor
## Information
## 1 edible=e,poisonous=p
## 2 bell=b,conical=c,convex=x,flat=f,knobbed=k,sunken=s
## 3 fibrous=f,grooves=g,scaly=y,smooth=s
## 4 brown=n,buff=b,cinnamon=c,gray=g,green=r,pink=p,purple=u,red=e,white=w,yellow=y
## 5 bruises=t,no=f
## 6 almond=a,anise=l,creosote=c,fishy=y,foul=f,musty=m,none=n,pungent=p,spicy=s
# Name the columns of the mushroom data after the entries in the
# Attribute column of the attribute data frame.
# Guard first: assigning a vector of names that is shorter than the number
# of columns would silently leave the remaining columns named NA.
stopifnot(length(mushattr$Attribute) == ncol(mushrooms))
colnames(mushrooms) <- mushattr$Attribute
# Preview the data again, now with descriptive column names
head(mushrooms)
## class cap-shape cap-surface cap-color bruises? odor gill-attachment
## 1 p x s n t p f
## 2 e x s y t a f
## 3 e b s w t l f
## 4 p x y w t p f
## 5 e x s g f n f
## 6 e x y y t a f
## gill-spacing gill-size gill-color stalk-shape stalk-root
## 1 c n k e e
## 2 c b k e c
## 3 c b n e c
## 4 c n n e e
## 5 w b k t e
## 6 c b n e c
## stalk-surface-above-ring stalk-surface-below-ring stalk-color-above-ring
## 1 s s w
## 2 s s w
## 3 s s w
## 4 s s w
## 5 s s w
## 6 s s w
## stalk-color-below-ring veil-type veil-color ring-number ring-type
## 1 w p w o p
## 2 w p w o p
## 3 w p w o p
## 4 w p w o p
## 5 w p w o e
## 6 w p w o p
## spore-print-color population habitat
## 1 k s u
## 2 n n g
## 3 n n m
## 4 k s u
## 5 n a g
## 6 k n g
Create Subset
# Keep all rows but only the four columns of interest;
# drop = FALSE guarantees the result stays a data frame
mushroomsub <- mushrooms[
  ,
  c("class", "odor", "ring-number", "population"),
  drop = FALSE
]
# Preview the subset
head(mushroomsub)
## class odor ring-number population
## 1 p p o s
## 2 e a o n
## 3 e l o n
## 4 p p o s
## 5 e n o a
## 6 e a o n
Rename Values in Subset
# Replace the single-letter codes in the subset with descriptive labels using
# revalue() from the plyr package.
# Each column is recoded with ONE call that receives the complete
# code = label mapping, so every code in a column is translated in a single
# step and a label produced by one replacement can never be re-matched by a
# later one. Values without an entry in the mapping are left unchanged.
library(plyr)

mushroomsub$class <- revalue(
  mushroomsub$class,
  c("e" = "edible", "p" = "poisonous")
)

mushroomsub$odor <- revalue(
  mushroomsub$odor,
  c(
    "a" = "almond",
    "l" = "anise",
    "c" = "creosote",
    "y" = "fishy",
    "f" = "foul",
    "m" = "musty",
    "n" = "none",
    "p" = "pungent",
    "s" = "spicy"
  )
)

# Backticks are required because the column name contains a hyphen
mushroomsub$`ring-number` <- revalue(
  mushroomsub$`ring-number`,
  c("n" = "none", "o" = "one", "t" = "two")
)

mushroomsub$population <- revalue(
  mushroomsub$population,
  c(
    "a" = "abundant",
    "c" = "clustered",
    "n" = "numerous",
    "s" = "scattered",
    "v" = "several",
    "y" = "solitary"
  )
)

# Preview the recoded subset
head(mushroomsub)
## class odor ring-number population
## 1 poisonous pungent one scattered
## 2 edible almond one numerous
## 3 edible anise one numerous
## 4 poisonous pungent one scattered
## 5 edible none one abundant
## 6 edible almond one numerous