Mushrooms Dataset. A famous—if slightly moldy—dataset about mushrooms can be found in the UCI repository here: https://archive.ics.uci.edu/ml/datasets/Mushroom. The fact that this is such a well-known dataset in the data science community makes it a good dataset to use for comparative benchmarking. For example, if someone was working to build a better decision tree algorithm (or other predictive classifier) to analyze categorical data, this dataset could be useful. A typical problem (which is beyond the scope of this assignment!) is to answer the question, “Which other attribute or attributes are the best predictors of whether a particular mushroom is poisonous or edible?”
#loading required libraries
library(tidyverse)
## -- Attaching packages ----------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 2.2.1 v purrr 0.2.4
## v tibble 1.4.1 v dplyr 0.7.4
## v tidyr 0.7.2 v stringr 1.2.0
## v readr 1.1.1 v forcats 0.2.0
## -- Conflicts -------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(curl)
##
## Attaching package: 'curl'
## The following object is masked from 'package:readr':
##
## parse_date
# Import the data directly from the UCI repository.
mush_data_url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"
get_mush_data <- curl(mush_data_url)
# agaricus-lepiota.data has no header row. Reading it with header = TRUE
# consumes the first record as column names (that is where names such as
# "p", "x" and "p.1" came from) and silently loses one observation.
# Read it with header = FALSE and supply those same placeholder names
# explicitly, so the rename() step further down keeps working unchanged.
# NOTE: row counts quoted in later comments and rendered output were produced
# with the old header = TRUE call; they grow by one record after re-running.
raw_col_names <- c(
  "p", "x", "s", "n", "t", "p.1", "f", "c", "n.1", "k", "e", "e.1",
  "s.1", "s.2", "w", "w.1", "p.2", "w.2", "o", "p.3", "k.1", "s.3", "u"
)
mushroom <- read.csv(
  get_mush_data,
  header = FALSE,
  sep = ",",
  col.names = raw_col_names,
  stringsAsFactors = FALSE
)
# Preview the first rows of the raw data frame.
head(mushroom)
## p x s n t p.1 f c n.1 k e e.1 s.1 s.2 w w.1 p.2 w.2 o p.3 k.1 s.3 u
## 1 e x s y t a f c b k e c s s w w p w o p n n g
## 2 e b s w t l f c b n e c s s w w p w o p n n m
## 3 p x y w t p f c n n e e s s w w p w o p k s u
## 4 e x s g f n f w b k t e s s w w p w o e n a g
## 5 e x y y t a f c b n e c s s w w p w o p k n g
## 6 e b s w t a f c b g e c s s w w p w o p k n m
# Shape of the raw data frame: rows are observations, columns are variables
# (features/attributes). The first column holds the class label.
c(nrow(mushroom), ncol(mushroom))
## [1] 8123 23
# Names of the variables (features/attributes).
names(mushroom)
## [1] "p" "x" "s" "n" "t" "p.1" "f" "c" "n.1" "k" "e"
## [12] "e.1" "s.1" "s.2" "w" "w.1" "p.2" "w.2" "o" "p.3" "k.1" "s.3"
## [23] "u"
# Replace the placeholder column names with the attribute names taken from
# the dataset's data dictionary (new_name = old_name).
mush_df <- mushroom %>%
  rename(
    class = p,
    cap_shape = x,
    cap_surface = s,
    cap_color = n,
    bruises = t,
    odor = p.1,
    gill_attachment = f,
    gill_spacing = c,
    gill_size = n.1,
    gill_color = k,
    stalk_shape = e,
    stalk_root = e.1,
    stalk_surface_above_ring = s.1,
    stalk_surface_below_ring = s.2,
    stalk_color_above_ring = w,
    stalk_color_below_ring = w.1,
    veil_type = p.2,
    veil_color = w.2,
    ring_number = o,
    ring_type = p.3,
    spore_print_color = k.1,
    population = s.3,
    habitat = u
  )
# Renaming must leave the shape of the data untouched.
dim(mush_df)
## [1] 8123 23
# Preview the renamed data frame.
head(mush_df)
## class cap_shape cap_surface cap_color bruises odor gill_attachment
## 1 e x s y t a f
## 2 e b s w t l f
## 3 p x y w t p f
## 4 e x s g f n f
## 5 e x y y t a f
## 6 e b s w t a f
## gill_spacing gill_size gill_color stalk_shape stalk_root
## 1 c b k e c
## 2 c b n e c
## 3 c n n e e
## 4 w b k t e
## 5 c b n e c
## 6 c b g e c
## stalk_surface_above_ring stalk_surface_below_ring stalk_color_above_ring
## 1 s s w
## 2 s s w
## 3 s s w
## 4 s s w
## 5 s s w
## 6 s s w
## stalk_color_below_ring veil_type veil_color ring_number ring_type
## 1 w p w o p
## 2 w p w o p
## 3 w p w o p
## 4 w p w o e
## 5 w p w o p
## 6 w p w o p
## spore_print_color population habitat
## 1 n n g
## 2 n n m
## 3 k s u
## 4 n a g
## 5 k n g
## 6 k n m
# Confirm the column names of the renamed data frame.
names(mush_df)
## [1] "class" "cap_shape"
## [3] "cap_surface" "cap_color"
## [5] "bruises" "odor"
## [7] "gill_attachment" "gill_spacing"
## [9] "gill_size" "gill_color"
## [11] "stalk_shape" "stalk_root"
## [13] "stalk_surface_above_ring" "stalk_surface_below_ring"
## [15] "stalk_color_above_ring" "stalk_color_below_ring"
## [17] "veil_type" "veil_color"
## [19] "ring_number" "ring_type"
## [21] "spore_print_color" "population"
## [23] "habitat"
# Show the first 10 observations by passing n to head().
mush_df %>% head(n = 10)
## class cap_shape cap_surface cap_color bruises odor gill_attachment
## 1 e x s y t a f
## 2 e b s w t l f
## 3 p x y w t p f
## 4 e x s g f n f
## 5 e x y y t a f
## 6 e b s w t a f
## 7 e b y w t l f
## 8 p x y w t p f
## 9 e b s y t a f
## 10 e x y y t l f
## gill_spacing gill_size gill_color stalk_shape stalk_root
## 1 c b k e c
## 2 c b n e c
## 3 c n n e e
## 4 w b k t e
## 5 c b n e c
## 6 c b g e c
## 7 c b n e c
## 8 c n p e e
## 9 c b g e c
## 10 c b g e c
## stalk_surface_above_ring stalk_surface_below_ring
## 1 s s
## 2 s s
## 3 s s
## 4 s s
## 5 s s
## 6 s s
## 7 s s
## 8 s s
## 9 s s
## 10 s s
## stalk_color_above_ring stalk_color_below_ring veil_type veil_color
## 1 w w p w
## 2 w w p w
## 3 w w p w
## 4 w w p w
## 5 w w p w
## 6 w w p w
## 7 w w p w
## 8 w w p w
## 9 w w p w
## 10 w w p w
## ring_number ring_type spore_print_color population habitat
## 1 o p n n g
## 2 o p n n m
## 3 o p k s u
## 4 o e n a g
## 5 o p k n g
## 6 o p k n m
## 7 o p n s m
## 8 o p k v g
## 9 o p k s m
## 10 o p n n g
# Keep an arbitrary handful of attributes alongside the class label.
mush_subset <- mush_df %>%
  select(class, cap_shape, veil_color, population, habitat)
# First 10 observations of the subset just created.
mush_subset %>% head(n = 10)
## class cap_shape veil_color population habitat
## 1 e x w n g
## 2 e b w n m
## 3 p x w s u
## 4 e x w a g
## 5 e x w n g
## 6 e b w n m
## 7 e b w s m
## 8 p x w v g
## 9 e b w s m
## 10 e x w n g
# Decode the single-letter codes of the selected attributes into the labels
# given in the dataset's attribute information. Each named vector maps
# code -> label; indexing it by the column performs the lookup, and a code
# without an entry becomes NA.
mush_df_val <- mush_subset %>%
  mutate(
    class = unname(c(e = "edible", p = "poisonous")[class]),
    cap_shape = unname(c(
      b = "bell", c = "conical", f = "flat",
      x = "convex", k = "knobbed", s = "sunken"
    )[cap_shape]),
    veil_color = unname(c(
      n = "brown", o = "orange", w = "white", y = "yellow"
    )[veil_color]),
    population = unname(c(
      a = "abundant", c = "clustered", n = "numerous",
      s = "scattered", v = "several", y = "solitary"
    )[population]),
    habitat = unname(c(
      g = "grasses", l = "leaves", m = "meadows", p = "paths",
      u = "urban", w = "waste", d = "woods"
    )[habitat])
  )
# The recoding must not drop any observation.
dim(mush_df_val)
## [1] 8123 5
# First 10 observations of the decoded subset.
mush_df_val %>% head(n = 10)
## class cap_shape veil_color population habitat
## 1 edible convex white numerous grasses
## 2 edible bell white numerous meadows
## 3 poisonous convex white scattered urban
## 4 edible convex white abundant grasses
## 5 edible convex white numerous grasses
## 6 edible bell white numerous meadows
## 7 edible bell white scattered meadows
## 8 poisonous convex white several grasses
## 9 edible bell white scattered meadows
## 10 edible convex white numerous grasses
# Inspect the structure (column types and sample values) with str().
mush_df_val %>% str()
## 'data.frame': 8123 obs. of 5 variables:
## $ class : chr "edible" "edible" "poisonous" "edible" ...
## $ cap_shape : chr "convex" "bell" "convex" "convex" ...
## $ veil_color: chr "white" "white" "white" "white" ...
## $ population: chr "numerous" "numerous" "scattered" "abundant" ...
## $ habitat : chr "grasses" "meadows" "urban" "grasses" ...
# Total number of observations.
summarize(mush_df_val, n = n())
## n
## 1 8123
# Number of observations in each class.
sum(mush_df_val$class == "edible")
## [1] 4208
sum(mush_df_val$class == "poisonous")
## [1] 3915
# Class distribution - edible: share of all observations, as a percentage.
ed_class <- sum(mush_df_val$class == "edible") / nrow(mush_df_val)
ed_cls_dist <- ed_class * 100
# Format the percentage already stored in ed_cls_dist instead of recomputing it.
sprintf("Edible Class : %.1f%%", ed_cls_dist)
## [1] "Edible Class : 51.8%"
# Class distribution - poisonous: share of all observations, as a percentage.
pos_class <- sum(mush_df_val$class == "poisonous") / nrow(mush_df_val)
pos_cls_dist <- pos_class * 100
pos_cls_dist
## [1] 48.19648
print(sprintf("Poisonous Class : %.1f%%", pos_cls_dist))
## [1] "Poisonous Class : 48.2%"
# Spread of the population categories within each habitat, largest first.
# factor() turns the population labels of each habitat group into integer
# codes, so pop_sd is the standard deviation of those codes: a rough
# indicator of how mixed a habitat is, not a measurement on a numeric scale.
mush_df_val %>%
  group_by(habitat) %>%
  summarize(pop_sd = sd(as.numeric(factor(population)))) %>%
  arrange(desc(pop_sd))
## # A tibble: 7 x 2
## habitat pop_sd
## <chr> <dbl>
## 1 grasses 1.23
## 2 meadows 0.681
## 3 urban 0.666
## 4 woods 0.583
## 5 paths 0.500
## 6 leaves 0.342
## 7 waste 0
# Check for duplicated observations. duplicated() is evaluated on the full
# 23-attribute data frame, so the resulting mask is applied to that same
# frame. (Indexing the 5-column mush_df_val with it only worked because both
# frames share the same rows in the same order.) The reported n is the number
# of duplicated rows.
count(mush_df[duplicated(mush_df), ])
## # A tibble: 1 x 1
## n
## <int>
## 1 0
# Observations per (class, population) combination, to see how the two relate.
mush_df_val %>% count(class, population)
## # A tibble: 10 x 3
## class population n
## <chr> <chr> <int>
## 1 edible abundant 384
## 2 edible clustered 288
## 3 edible numerous 400
## 4 edible scattered 880
## 5 edible several 1192
## 6 edible solitary 1064
## 7 poisonous clustered 52
## 8 poisonous scattered 367
## 9 poisonous several 2848
## 10 poisonous solitary 648
# Tabulate NA vs. non-NA cells across the whole data frame.
table(is.na(mush_df))
##
## FALSE
## 186829
# No NA shows up, yet the data dictionary documents missing values for
# attribute #11 (stalk_root): they are encoded as "?". Convert that marker
# into a real NA and tabulate again.
data.st.na <- mush_df %>%
  mutate(stalk_root = na_if(stalk_root, "?"))
table(is.na(data.st.na))
##
## FALSE TRUE
## 184349 2480
# Drill further: observations per stalk_root value (NA gets its own row).
data.st.na %>% count(stalk_root)
## # A tibble: 5 x 2
## stalk_root n
## <chr> <int>
## 1 b 3776
## 2 c 556
## 3 e 1119
## 4 r 192
## 5 <NA> 2480
# Dimensions before removing incomplete observations.
dim(data.st.na)
## [1] 8123 23
# Drop every observation that contains at least one NA.
data.mush.new <- drop_na(data.st.na)
# Dimensions after the drop.
dim(data.mush.new)
## [1] 5643 23
# Number of observations removed, derived from the data frames themselves
# rather than from hard-coded row counts, so it stays correct if the input
# changes.
nrow(data.st.na) - nrow(data.mush.new)
## [1] 2480
## [1] 2480
# Relationship between habitat and cap color: keep only the relevant columns,
# then list the observations with habitat code "l" (leaves) and cap color
# code "w".
data.ht.cc <- data.mush.new %>%
  select(class, habitat, population, cap_color)
data.ht.cc %>% filter(habitat == "l" & cap_color == "w")
## class habitat population cap_color
## 1 p l c w
## 2 p l c w
## 3 p l c w
## 4 p l c w
## 5 p l c w
## 6 p l c w
## 7 p l c w
## 8 p l c w
# Aggregate by habitat: number of observations and number of distinct
# classes found in each habitat.
datahc <- mush_df_val %>%
  group_by(habitat) %>%
  summarize(count = n(), n_distinct(class))
# `n_distinct(class)` is a count of different class values, not a class code:
# 2 means the habitat contains both edible and poisonous mushrooms, 1 means
# it contains only one of the two classes.
datahc
## # A tibble: 7 x 3
## habitat count `n_distinct(class)`
## <chr> <int> <int>
## 1 grasses 2148 2
## 2 leaves 832 2
## 3 meadows 292 2
## 4 paths 1144 2
## 5 urban 367 2
## 6 waste 192 1
## 7 woods 3148 2
# In the habitat summary, "waste" is the only habitat with a single class;
# isolate its observations in the decoded data set to look closer.
data.hb.waste <- mush_df_val %>%
  filter(habitat == "waste")
# Tally the class/habitat/population combinations present in that subset.
data.hb.waste %>% count(class, habitat, population)
## # A tibble: 1 x 4
## class habitat population n
## <chr> <chr> <chr> <int>
## 1 edible waste clustered 192
# Cross-check: edible mushrooms found outside the "waste" habitat ...
data.hb.cl <- mush_df_val %>%
  filter(habitat != "waste" & class == "edible")
sum(data.hb.cl[["class"]] == "edible")
## [1] 4016
# ... versus every edible mushroom in the data set.
sum(mush_df_val[["class"]] == "edible")
## [1] 4208
# The difference should match the count obtained for the "waste" subset.
sum(mush_df_val[["class"]] == "edible") - sum(data.hb.cl[["class"]] == "edible")
## [1] 192
## [1] 192
# How population categories are spread within each habitat, largest first.
# The population labels of each habitat group are converted to integer factor
# codes and pop_sd is the standard deviation of those codes, so read it as a
# rough mix indicator rather than a numeric measurement.
mush_df_val %>%
  group_by(habitat) %>%
  summarize(pop_sd = sd(as.numeric(factor(population)))) %>%
  arrange(desc(pop_sd))
## # A tibble: 7 x 2
## habitat pop_sd
## <chr> <dbl>
## 1 grasses 1.23
## 2 meadows 0.681
## 3 urban 0.666
## 4 woods 0.583
## 5 paths 0.500
## 6 leaves 0.342
## 7 waste 0
# Another group_by()/summarise() example: number of observations per class.
mush_df_val %>%
  group_by(class) %>%
  summarise(count = n())
## # A tibble: 2 x 2
## class count
## <chr> <int>
## 1 edible 4208
## 2 poisonous 3915
# Total number of observations: summarise() over the ungrouped data frame.
summarise(mush_df_val, count = n())
## count
## 1 8123
# Mushroom population and habitat: bar chart of observations per population
# category, stacked by habitat, with the total count printed above each bar.
# Columns are referenced by bare name inside aes(); writing mush_df_val$col
# there bypasses the plot's data argument and misbehaves once the plot is
# faceted or reused with other data.
# ..count.. is kept because the ggplot2 version attached by this report
# (2.2.1) predates after_stat().
ggplot(mush_df_val, mapping = aes(x = population)) +
  geom_bar(aes(fill = habitat)) +
  geom_text(
    stat = "count",
    aes(label = ..count.., y = ..count..),
    vjust = -0.2
  ) +
  xlab("Mushroom population") +
  labs(fill = "habitat") +
  ggtitle("Mushroom Population and habitat")
# Mushroom habitat and class: bar chart of observations per habitat, stacked
# by class, with the total count printed above each bar.
ggplot(mush_df_val, mapping = aes(x = habitat)) +
  geom_bar(aes(fill = class)) +
  geom_text(
    stat = "count",
    aes(label = ..count.., y = ..count..),
    vjust = -0.2
  ) +
  xlab("Mushroom habitat") +
  labs(fill = "class") +
  ggtitle("Mushroom habitat and class")
# Density plot over habitat, with one outline colour per class.
# NOTE(review): habitat and population are categorical here; density
# estimates are normally drawn for continuous variables -- confirm this is
# the intended chart type.
ggplot(
  data = mush_df_val,
  mapping = aes(x = habitat, colour = class)
) +
  geom_density(alpha = 0.1)
# Density plot over population, outlined by class and filled by population.
ggplot(
  data = mush_df_val,
  mapping = aes(x = population, colour = class, fill = population)
) +
  geom_density(alpha = 0.1)