Let’s first load the Mushroom dataset into an R data frame.

# Location of the raw mushroom data file (comma-separated, no header row).
data_url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"

# Read every column as character. The recoding steps later in this file index
# named lookup vectors with the column values, which is only correct when the
# columns are character vectors, so the type is pinned here rather than left
# to read.table()'s type inference. With header = FALSE the columns receive
# the default names V1, V2, ...
mushroom_df <- read.table(
  file = data_url,
  header = FALSE,
  sep = ",",
  colClasses = "character",
  stringsAsFactors = FALSE
)

# View(mushroom_df)  # uncomment to browse the full data frame interactively

Based on the rules in the data dictionary, we care about the following attributes: odor (V6), spore-print-color (V21), stalk-surface-below-ring (V14), stalk-color-above-ring (V15), habitat (V23), cap-color (V4), and population (V22).

Therefore, these attributes, together with the class label in V1, will form our subset.

Create the mushroom data frame subset (mushroom_df_sub) and name its columns

# Keep the class label (column 1) plus the seven attributes of interest.
# The positions below line up one-to-one with the names assigned afterwards.
mushroom_df_sub <- mushroom_df[, c(1, 6, 4, 15, 14, 21, 22, 23)]
names(mushroom_df_sub) <- c(
  "Mushroom_class",
  "odor",
  "cap-color",
  "stalk-color-above-ring",
  "stalk-surface-below-ring",
  "spore-print-color",
  "population",
  "habitat"
)

# Preview the first 10 rows of the subset
head(mushroom_df_sub, n = 10)
##    Mushroom_class odor cap-color stalk-color-above-ring
## 1               p    p         n                      w
## 2               e    a         y                      w
## 3               e    l         w                      w
## 4               p    p         w                      w
## 5               e    n         g                      w
## 6               e    a         y                      w
## 7               e    a         w                      w
## 8               e    l         w                      w
## 9               p    p         w                      w
## 10              e    a         y                      w
##    stalk-surface-below-ring spore-print-color population habitat
## 1                         s                 k          s       u
## 2                         s                 n          n       g
## 3                         s                 n          n       m
## 4                         s                 k          s       u
## 5                         s                 n          a       g
## 6                         s                 k          n       g
## 7                         s                 k          n       m
## 8                         s                 n          s       m
## 9                         s                 k          v       g
## 10                        s                 k          s       m

Transform Mushroom class field to a more descriptive format

First, let’s list the distinct values of Mushroom_class so that we know we are transforming every possible value

# List the distinct class codes to confirm that each one is recoded below.
unique(mushroom_df_sub$Mushroom_class)
## [1] "p" "e"
# Replace each single-letter class code with its descriptive label. Rows whose
# value matches neither code are left untouched.
mushroom_df_sub$Mushroom_class[mushroom_df_sub$Mushroom_class == "p"] <-
  "poisonous"
mushroom_df_sub$Mushroom_class[mushroom_df_sub$Mushroom_class == "e"] <-
  "edible"

Transform odor field to a more descriptive format

# Distinct odor codes present in the data; each needs an entry in the lookup.
unique(mushroom_df_sub$odor)
## [1] "p" "a" "l" "n" "f" "c" "y" "s" "m"
# Named character vector: the names are the odor codes, the values are the
# descriptions they stand for.
lookup_odor <- c(
  a = "almond", l = "anise", c = "creosote", y = "fishy", f = "foul",
  m = "musty", n = "none", p = "pungent", s = "spicy"
)

# Indexing by name returns one element per requested name, so the same name
# may appear more than once in the index.
lookup_odor[c("a", "l", "l")]
##        a        l        l 
## "almond"  "anise"  "anise"
# Index the lookup with the codes stored in the odor column to get the
# matching description for every row (first 10 shown).
head(lookup_odor[mushroom_df_sub$odor], n = 10)
##         p         a         l         p         n         a         a 
## "pungent"  "almond"   "anise" "pungent"    "none"  "almond"  "almond" 
##         l         p         a 
##   "anise" "pungent"  "almond"
# Overwrite the odor column with those descriptions, dropping the code names.
mushroom_df_sub$odor <- unname(lookup_odor[mushroom_df_sub$odor])

Transform the cap-color column

# Distinct cap-color codes in the data; confirm each has a description below.
unique(mushroom_df_sub[["cap-color"]])
##  [1] "n" "y" "w" "g" "e" "p" "b" "u" "c" "r"
# Code -> description mapping for cap color.
lookup_cap.color <- c(
  n = "brown", b = "buff", c = "cinnamon", g = "gray", r = "green",
  p = "pink", u = "purple", e = "red", w = "white", y = "yellow"
)

# Swap each code for its description, dropping the code names.
mushroom_df_sub[["cap-color"]] <-
  unname(lookup_cap.color[mushroom_df_sub[["cap-color"]]])

Transform rest of columns

# Each remaining column is recoded the same way: a named lookup vector maps the
# single-letter code to its description. Indexing a named vector with a name it
# does not contain yields NA, so every recode is preceded by a check that the
# lookup covers all values present in the column. The same check stops an
# accidental second run from overwriting already-recoded values with NA.

# spore-print-color
lookup_cap.sporeprintcolor <- c(
  k = "black", n = "brown", b = "buff", h = "chocolate", r = "green",
  o = "orange", u = "purple", w = "white", y = "yellow"
)
stopifnot(all(
  mushroom_df_sub$`spore-print-color` %in% names(lookup_cap.sporeprintcolor)
))
mushroom_df_sub$`spore-print-color` <-
  unname(lookup_cap.sporeprintcolor[mushroom_df_sub$`spore-print-color`])

# stalk-color-above-ring
unique(mushroom_df_sub$`stalk-color-above-ring`)
## [1] "w" "g" "p" "n" "b" "e" "o" "c" "y"
lookup_stalk_c_abR <- c(
  n = "brown", b = "buff", c = "cinnamon", g = "gray", o = "orange",
  p = "pink", e = "red", w = "white", y = "yellow"
)
stopifnot(all(
  mushroom_df_sub$`stalk-color-above-ring` %in% names(lookup_stalk_c_abR)
))
mushroom_df_sub$`stalk-color-above-ring` <-
  unname(lookup_stalk_c_abR[mushroom_df_sub$`stalk-color-above-ring`])

# stalk-surface-below-ring
lookup_stalk_sbr <- c(f = "fibrous", y = "scaly", k = "silky", s = "smooth")
stopifnot(all(
  mushroom_df_sub$`stalk-surface-below-ring` %in% names(lookup_stalk_sbr)
))
mushroom_df_sub$`stalk-surface-below-ring` <-
  unname(lookup_stalk_sbr[mushroom_df_sub$`stalk-surface-below-ring`])

# population
lookup_population <- c(
  a = "abundant", c = "clustered", n = "numerous",
  s = "scattered", v = "several", y = "solitary"
)
stopifnot(all(mushroom_df_sub$population %in% names(lookup_population)))
mushroom_df_sub$population <-
  unname(lookup_population[mushroom_df_sub$population])

# habitat
lookup_habitat <- c(
  g = "grasses", l = "leaves", m = "meadows", p = "paths",
  u = "urban", w = "waste", d = "woods"
)
stopifnot(all(mushroom_df_sub$habitat %in% names(lookup_habitat)))
mushroom_df_sub$habitat <- unname(lookup_habitat[mushroom_df_sub$habitat])

Display the first 20 rows of the transformed data frame

 # Preview the first 20 rows of the recoded data frame.
 head(mushroom_df_sub, n = 20)
##    Mushroom_class    odor cap-color stalk-color-above-ring
## 1       poisonous pungent     brown                  white
## 2          edible  almond    yellow                  white
## 3          edible   anise     white                  white
## 4       poisonous pungent     white                  white
## 5          edible    none      gray                  white
## 6          edible  almond    yellow                  white
## 7          edible  almond     white                  white
## 8          edible   anise     white                  white
## 9       poisonous pungent     white                  white
## 10         edible  almond    yellow                  white
## 11         edible   anise    yellow                  white
## 12         edible  almond    yellow                  white
## 13         edible  almond    yellow                  white
## 14      poisonous pungent     white                  white
## 15         edible    none     brown                  white
## 16         edible    none      gray                  white
## 17         edible    none     white                  white
## 18      poisonous pungent     brown                  white
## 19      poisonous pungent     white                  white
## 20      poisonous pungent     brown                  white
##    stalk-surface-below-ring spore-print-color population habitat
## 1                    smooth             black  scattered   urban
## 2                    smooth             brown   numerous grasses
## 3                    smooth             brown   numerous meadows
## 4                    smooth             black  scattered   urban
## 5                    smooth             brown   abundant grasses
## 6                    smooth             black   numerous grasses
## 7                    smooth             black   numerous meadows
## 8                    smooth             brown  scattered meadows
## 9                    smooth             black    several grasses
## 10                   smooth             black  scattered meadows
## 11                   smooth             brown   numerous grasses
## 12                   smooth             black  scattered meadows
## 13                   smooth             brown  scattered grasses
## 14                   smooth             brown    several   urban
## 15                  fibrous             black   abundant grasses
## 16                   smooth             brown   solitary   urban
## 17                   smooth             brown   abundant grasses
## 18                   smooth             black  scattered grasses
## 19                   smooth             brown  scattered   urban
## 20                   smooth             brown  scattered   urban