Let’s first load the Mushroom dataset into an R data frame.
# Location of the raw mushroom data file in the UCI repository.
data_url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"
# Read the file as comma-separated text without a header row, so R assigns the
# default column names V1, V2, ...; stringsAsFactors = FALSE keeps the
# single-letter codes as character strings rather than factors.
mushroom_df <- read.table(
  data_url,
  sep = ",",
  header = FALSE,
  stringsAsFactors = FALSE
)
# View(mushroom_df)  # uncomment to browse the raw data interactively
Based on the rules in the data dictionary, we care about the following attributes: odor (V6), spore-print-color (V21), stalk-surface-below-ring (V14), stalk-color-above-ring (V15), habitat (V23), cap-color (V4), and population (V22).
Therefore, these attributes will form our subset
Create the mushroom data frame subset (mushroom_df_sub) and name the columns
# Keep the class column (1) plus the seven attribute columns of interest, and
# give each selected column a descriptive name. The i-th name below labels the
# i-th column position in the index vector.
mushroom_df_sub <- setNames(
  mushroom_df[c(1, 6, 4, 15, 14, 21, 22, 23)],
  c(
    "Mushroom_class",
    "odor",
    "cap-color",
    "stalk-color-above-ring",
    "stalk-surface-below-ring",
    "spore-print-color",
    "population",
    "habitat"
  )
)
# Show the first 10 rows of the subset
head(mushroom_df_sub, n = 10)
## Mushroom_class odor cap-color stalk-color-above-ring
## 1 p p n w
## 2 e a y w
## 3 e l w w
## 4 p p w w
## 5 e n g w
## 6 e a y w
## 7 e a w w
## 8 e l w w
## 9 p p w w
## 10 e a y w
## stalk-surface-below-ring spore-print-color population habitat
## 1 s k s u
## 2 s n n g
## 3 s n n m
## 4 s k s u
## 5 s n a g
## 6 s k n g
## 7 s k n m
## 8 s n s m
## 9 s k v g
## 10 s k s m
Transform Mushroom class field to a more descriptive format
First, however, let’s look at the distinct values of Mushroom_class to make sure that we are transforming all of the possible values
# List the distinct class codes so that each one gets a label below
unique(mushroom_df_sub[["Mushroom_class"]])
## [1] "p" "e"
# Swap each single-letter class code for its descriptive label; values that
# match neither code are left as they are.
mushroom_df_sub$Mushroom_class <- replace(
  mushroom_df_sub$Mushroom_class,
  mushroom_df_sub$Mushroom_class == "p",
  "poisonous"
)
mushroom_df_sub$Mushroom_class <- replace(
  mushroom_df_sub$Mushroom_class,
  mushroom_df_sub$Mushroom_class == "e",
  "edible"
)
Transform odor field to a more descriptive format
# Distinct odor codes present in the data; each needs an entry in the lookup
unique(mushroom_df_sub[["odor"]])
## [1] "p" "a" "l" "n" "f" "c" "y" "s" "m"
# Named character vector: the name is the odor code, the value its description
lookup_odor <- c(
  a = "almond", l = "anise", c = "creosote", y = "fishy", f = "foul",
  m = "musty", n = "none", p = "pungent", s = "spicy"
)
# Indexing a named vector by name returns one element per requested name, so
# the same name may be requested more than once
lookup_odor[c("a", "l", "l")]
## a l l
## "almond" "anise" "anise"
# Preview the lookup applied to the odor codes (column values) of the first
# 10 rows
lookup_odor[head(mushroom_df_sub[["odor"]], n = 10)]
## p a l p n a a
## "pungent" "almond" "anise" "pungent" "none" "almond" "almond"
## l p a
## "anise" "pungent" "almond"
# Overwrite the odor column with the looked-up descriptions, dropping the
# code names that the lookup result carries
mushroom_df_sub[["odor"]] <- unname(lookup_odor[mushroom_df_sub[["odor"]]])
Transform the cap-color column
# Distinct cap-color codes present in the data; every one of them must have a
# description in the lookup below
unique(mushroom_df_sub[["cap-color"]])
## [1] "n" "y" "w" "g" "e" "p" "b" "u" "c" "r"
# Named character vector: the name is the cap-color code, the value its
# description
lookup_cap.color <- c(
  n = "brown", b = "buff", c = "cinnamon", g = "gray", r = "green",
  p = "pink", u = "purple", e = "red", w = "white", y = "yellow"
)
# Replace each code with its description, dropping the code names carried by
# the lookup result
mushroom_df_sub[["cap-color"]] <- unname(
  lookup_cap.color[mushroom_df_sub[["cap-color"]]]
)
Transform rest of columns
# Named character vector: the name is the spore-print-color code, the value
# its description
lookup_cap.sporeprintcolor <- c(
  k = "black", n = "brown", b = "buff", h = "chocolate", r = "green",
  o = "orange", u = "purple", w = "white", y = "yellow"
)
# Replace each code with its description, dropping the code names carried by
# the lookup result
mushroom_df_sub[["spore-print-color"]] <- unname(
  lookup_cap.sporeprintcolor[mushroom_df_sub[["spore-print-color"]]]
)
# Distinct stalk-color-above-ring codes present in the data
unique(mushroom_df_sub[["stalk-color-above-ring"]])
## [1] "w" "g" "p" "n" "b" "e" "o" "c" "y"
# Named character vector: the name is the stalk color code, the value its
# description
lookup_stalk_c_abR <- c(
  n = "brown", b = "buff", c = "cinnamon", g = "gray", o = "orange",
  p = "pink", e = "red", w = "white", y = "yellow"
)
# Replace each code with its description, dropping the code names carried by
# the lookup result
mushroom_df_sub[["stalk-color-above-ring"]] <- unname(
  lookup_stalk_c_abR[mushroom_df_sub[["stalk-color-above-ring"]]]
)
# Named character vector: the name is the stalk-surface-below-ring code, the
# value its description
lookup_stalk_sbr <- c(
  f = "fibrous",
  y = "scaly",
  k = "silky",
  s = "smooth"
)
# Replace each code with its description, dropping the code names carried by
# the lookup result
mushroom_df_sub[["stalk-surface-below-ring"]] <- unname(
  lookup_stalk_sbr[mushroom_df_sub[["stalk-surface-below-ring"]]]
)
# Named character vector: the name is the population code, the value its
# description
lookup_population <- c(
  a = "abundant",
  c = "clustered",
  n = "numerous",
  s = "scattered",
  v = "several",
  y = "solitary"
)
# Replace each code with its description, dropping the code names carried by
# the lookup result
mushroom_df_sub[["population"]] <- unname(
  lookup_population[mushroom_df_sub[["population"]]]
)
# Named character vector: the name is the habitat code, the value its
# description
lookup_habitat <- c(
  g = "grasses",
  l = "leaves",
  m = "meadows",
  p = "paths",
  u = "urban",
  w = "waste",
  d = "woods"
)
# Replace each code with its description, dropping the code names carried by
# the lookup result
mushroom_df_sub[["habitat"]] <- unname(
  lookup_habitat[mushroom_df_sub[["habitat"]]]
)
Display the top 20 rows of the final transformed data frame
# Print the first 20 rows of the data frame
head(mushroom_df_sub, n = 20)
## Mushroom_class odor cap-color stalk-color-above-ring
## 1 poisonous pungent brown white
## 2 edible almond yellow white
## 3 edible anise white white
## 4 poisonous pungent white white
## 5 edible none gray white
## 6 edible almond yellow white
## 7 edible almond white white
## 8 edible anise white white
## 9 poisonous pungent white white
## 10 edible almond yellow white
## 11 edible anise yellow white
## 12 edible almond yellow white
## 13 edible almond yellow white
## 14 poisonous pungent white white
## 15 edible none brown white
## 16 edible none gray white
## 17 edible none white white
## 18 poisonous pungent brown white
## 19 poisonous pungent white white
## 20 poisonous pungent brown white
## stalk-surface-below-ring spore-print-color population habitat
## 1 smooth black scattered urban
## 2 smooth brown numerous grasses
## 3 smooth brown numerous meadows
## 4 smooth black scattered urban
## 5 smooth brown abundant grasses
## 6 smooth black numerous grasses
## 7 smooth black numerous meadows
## 8 smooth brown scattered meadows
## 9 smooth black several grasses
## 10 smooth black scattered meadows
## 11 smooth brown numerous grasses
## 12 smooth black scattered meadows
## 13 smooth brown scattered grasses
## 14 smooth brown several urban
## 15 fibrous black abundant grasses
## 16 smooth brown solitary urban
## 17 smooth brown abundant grasses
## 18 smooth black scattered grasses
## 19 smooth brown scattered urban
## 20 smooth brown scattered urban