DATA607 - Week 1 Assignment

Data Analysis for mushroom dataset

Mushrooms Dataset. A famous—if slightly moldy—dataset about mushrooms can be found in the UCI repository here: https://archive.ics.uci.edu/ml/datasets/Mushroom. The fact that this is such a well-known dataset in the data science community makes it a good dataset to use for comparative benchmarking. For example, if someone was working to build a better decision tree algorithm (or other predictive classifier) to analyze categorical data, this dataset could be useful. A typical problem (which is beyond the scope of this assignment!) is to answer the question, “Which other attribute or attributes are the best predictors of whether a particular mushroom is poisonous or edible?”

Your task is to study the dataset and the associated description of the data (i.e. “data dictionary”). You may need to look around a bit, but it’s there! You should take the data, and create a data frame with a subset of the columns in the dataset. You should include the column that indicates edible or poisonous and three or four other columns. You should also add meaningful column names and replace the abbreviations used in the data—for example, in the appropriate column, “e” might become “edible.” Your deliverable is the R code to perform these transformation tasks.

Environment set up

#loading required libraries
library(tidyverse)

## -- Attaching packages ----------------------------------------------------------------------- tidyverse 1.2.1 --

## v ggplot2 2.2.1     v purrr   0.2.4
## v tibble  1.4.1     v dplyr   0.7.4
## v tidyr   0.7.2     v stringr 1.2.0
## v readr   1.1.1     v forcats 0.2.0

## -- Conflicts -------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(curl)

## 
## Attaching package: 'curl'

## The following object is masked from 'package:readr':
## 
##     parse_date

Data Acquisition

# Import the raw data directly from the source (uci.edu).
mush_data_url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"
get_mush_data <- curl(mush_data_url)
# Read the comma-separated file through the curl connection into a data frame.
# Strings are kept as character vectors (FALSE is spelled out because `F` is an
# ordinary, reassignable variable).
# NOTE(review): the raw file has no header row, so header = TRUE turns the
# first record into the column names (p, x, s, n, ...) and leaves 8123 rows;
# the UCI description lists 8124 records. The rename step further down relies
# on these generated names, so switching to header = FALSE must be done
# together with that step.
mushroom <- read.csv(get_mush_data, header = TRUE, sep = ",", stringsAsFactors = FALSE)
# view the first rows of the data frame
head(mushroom)

##   p x s n t p.1 f c n.1 k e e.1 s.1 s.2 w w.1 p.2 w.2 o p.3 k.1 s.3 u
## 1 e x s y t   a f c   b k e   c   s   s w   w   p   w o   p   n   n g
## 2 e b s w t   l f c   b n e   c   s   s w   w   p   w o   p   n   n m
## 3 p x y w t   p f c   n n e   e   s   s w   w   p   w o   p   k   s u
## 4 e x s g f   n f w   b k t   e   s   s w   w   p   w o   e   n   a g
## 5 e x y y t   a f c   b n e   c   s   s w   w   p   w o   p   k   n g
## 6 e b s w t   a f c   b g e   c   s   s w   w   p   w o   p   k   n m

#view dimensions of data frame
# number of observations = 8123
# number of variables (features or attributes) = 23
# please note that the first attribute is the class (label)
dim(mushroom)

## [1] 8123   23

#variable names (features or attributes)
colnames(mushroom)

##  [1] "p"   "x"   "s"   "n"   "t"   "p.1" "f"   "c"   "n.1" "k"   "e"  
## [12] "e.1" "s.1" "s.2" "w"   "w.1" "p.2" "w.2" "o"   "p.3" "k.1" "s.3"
## [23] "u"

Data Wrangling

# Give every column the attribute name listed in the data dictionary.
# Each line maps new_name = current_name, in the column order of `mushroom`.
mush_df <- mushroom %>%
  rename(
    class = p,
    cap_shape = x,
    cap_surface = s,
    cap_color = n,
    bruises = t,
    odor = p.1,
    gill_attachment = f,
    gill_spacing = c,
    gill_size = n.1,
    gill_color = k,
    stalk_shape = e,
    stalk_root = e.1,
    stalk_surface_above_ring = s.1,
    stalk_surface_below_ring = s.2,
    stalk_color_above_ring = w,
    stalk_color_below_ring = w.1,
    veil_type = p.2,
    veil_color = w.2,
    ring_number = o,
    ring_type = p.3,
    spore_print_color = k.1,
    population = s.3,
    habitat = u
  )

#checking dimensions
dim(mush_df)

## [1] 8123   23

# view the new mush_df dataset
head(mush_df)

##   class cap_shape cap_surface cap_color bruises odor gill_attachment
## 1     e         x           s         y       t    a               f
## 2     e         b           s         w       t    l               f
## 3     p         x           y         w       t    p               f
## 4     e         x           s         g       f    n               f
## 5     e         x           y         y       t    a               f
## 6     e         b           s         w       t    a               f
##   gill_spacing gill_size gill_color stalk_shape stalk_root
## 1            c         b          k           e          c
## 2            c         b          n           e          c
## 3            c         n          n           e          e
## 4            w         b          k           t          e
## 5            c         b          n           e          c
## 6            c         b          g           e          c
##   stalk_surface_above_ring stalk_surface_below_ring stalk_color_above_ring
## 1                        s                        s                      w
## 2                        s                        s                      w
## 3                        s                        s                      w
## 4                        s                        s                      w
## 5                        s                        s                      w
## 6                        s                        s                      w
##   stalk_color_below_ring veil_type veil_color ring_number ring_type
## 1                      w         p          w           o         p
## 2                      w         p          w           o         p
## 3                      w         p          w           o         p
## 4                      w         p          w           o         e
## 5                      w         p          w           o         p
## 6                      w         p          w           o         p
##   spore_print_color population habitat
## 1                 n          n       g
## 2                 n          n       m
## 3                 k          s       u
## 4                 n          a       g
## 5                 k          n       g
## 6                 k          n       m

#confirming the columns names in the new mush_df dataset
colnames(mush_df)

##  [1] "class"                    "cap_shape"               
##  [3] "cap_surface"              "cap_color"               
##  [5] "bruises"                  "odor"                    
##  [7] "gill_attachment"          "gill_spacing"            
##  [9] "gill_size"                "gill_color"              
## [11] "stalk_shape"              "stalk_root"              
## [13] "stalk_surface_above_ring" "stalk_surface_below_ring"
## [15] "stalk_color_above_ring"   "stalk_color_below_ring"  
## [17] "veil_type"                "veil_color"              
## [19] "ring_number"              "ring_type"               
## [21] "spore_print_color"        "population"              
## [23] "habitat"

#using head parameters to view the first 10 observations
head(mush_df,10)

##    class cap_shape cap_surface cap_color bruises odor gill_attachment
## 1      e         x           s         y       t    a               f
## 2      e         b           s         w       t    l               f
## 3      p         x           y         w       t    p               f
## 4      e         x           s         g       f    n               f
## 5      e         x           y         y       t    a               f
## 6      e         b           s         w       t    a               f
## 7      e         b           y         w       t    l               f
## 8      p         x           y         w       t    p               f
## 9      e         b           s         y       t    a               f
## 10     e         x           y         y       t    l               f
##    gill_spacing gill_size gill_color stalk_shape stalk_root
## 1             c         b          k           e          c
## 2             c         b          n           e          c
## 3             c         n          n           e          e
## 4             w         b          k           t          e
## 5             c         b          n           e          c
## 6             c         b          g           e          c
## 7             c         b          n           e          c
## 8             c         n          p           e          e
## 9             c         b          g           e          c
## 10            c         b          g           e          c
##    stalk_surface_above_ring stalk_surface_below_ring
## 1                         s                        s
## 2                         s                        s
## 3                         s                        s
## 4                         s                        s
## 5                         s                        s
## 6                         s                        s
## 7                         s                        s
## 8                         s                        s
## 9                         s                        s
## 10                        s                        s
##    stalk_color_above_ring stalk_color_below_ring veil_type veil_color
## 1                       w                      w         p          w
## 2                       w                      w         p          w
## 3                       w                      w         p          w
## 4                       w                      w         p          w
## 5                       w                      w         p          w
## 6                       w                      w         p          w
## 7                       w                      w         p          w
## 8                       w                      w         p          w
## 9                       w                      w         p          w
## 10                      w                      w         p          w
##    ring_number ring_type spore_print_color population habitat
## 1            o         p                 n          n       g
## 2            o         p                 n          n       m
## 3            o         p                 k          s       u
## 4            o         e                 n          a       g
## 5            o         p                 k          n       g
## 6            o         p                 k          n       m
## 7            o         p                 n          s       m
## 8            o         p                 k          v       g
## 9            o         p                 k          s       m
## 10           o         p                 n          n       g

# Keep the class label plus four arbitrarily chosen attributes.
mush_subset <- mush_df %>%
  select(class, cap_shape, veil_color, population, habitat)

#view first 10 observations of new subset which we just created
head(mush_subset,10)

##    class cap_shape veil_color population habitat
## 1      e         x          w          n       g
## 2      e         b          w          n       m
## 3      p         x          w          s       u
## 4      e         x          w          a       g
## 5      e         x          w          n       g
## 6      e         b          w          n       m
## 7      e         b          w          s       m
## 8      p         x          w          v       g
## 9      e         b          w          s       m
## 10     e         x          w          n       g

# Replace the one-letter codes in each selected column with the full labels
# from the attribute information. case_when() yields NA for any code that is
# not listed.
mush_df_val <- mush_subset %>%
  mutate(
    class = case_when(
      class == "e" ~ "edible",
      class == "p" ~ "poisonous"
    ),
    cap_shape = case_when(
      cap_shape == "b" ~ "bell",
      cap_shape == "c" ~ "conical",
      cap_shape == "f" ~ "flat",
      cap_shape == "x" ~ "convex",
      cap_shape == "k" ~ "knobbed",
      cap_shape == "s" ~ "sunken"
    ),
    veil_color = case_when(
      veil_color == "n" ~ "brown",
      veil_color == "o" ~ "orange",
      veil_color == "w" ~ "white",
      veil_color == "y" ~ "yellow"
    ),
    population = case_when(
      population == "a" ~ "abundant",
      population == "c" ~ "clustered",
      population == "n" ~ "numerous",
      population == "s" ~ "scattered",
      population == "v" ~ "several",
      population == "y" ~ "solitary"
    ),
    habitat = case_when(
      habitat == "g" ~ "grasses",
      habitat == "l" ~ "leaves",
      habitat == "m" ~ "meadows",
      habitat == "p" ~ "paths",
      habitat == "u" ~ "urban",
      habitat == "w" ~ "waste",
      habitat == "d" ~ "woods"
    )
  )

#checking dimensions and making sure that no observation is lost after the data transformation
dim(mush_df_val)

## [1] 8123    5

#view the first 10 observations of the transformed subset
head(mush_df_val,10)

##        class cap_shape veil_color population habitat
## 1     edible    convex      white   numerous grasses
## 2     edible      bell      white   numerous meadows
## 3  poisonous    convex      white  scattered   urban
## 4     edible    convex      white   abundant grasses
## 5     edible    convex      white   numerous grasses
## 6     edible      bell      white   numerous meadows
## 7     edible      bell      white  scattered meadows
## 8  poisonous    convex      white    several grasses
## 9     edible      bell      white  scattered meadows
## 10    edible    convex      white   numerous grasses

#view data frame details using str function
str(mush_df_val)

## 'data.frame':    8123 obs. of  5 variables:
##  $ class     : chr  "edible" "edible" "poisonous" "edible" ...
##  $ cap_shape : chr  "convex" "bell" "convex" "convex" ...
##  $ veil_color: chr  "white" "white" "white" "white" ...
##  $ population: chr  "numerous" "numerous" "scattered" "abundant" ...
##  $ habitat   : chr  "grasses" "meadows" "urban" "grasses" ...

#summarize the dataset using count
summarize(mush_df_val,n=n())

##      n
## 1 8123

sum(mush_df_val$class=='edible')

## [1] 4208

sum(mush_df_val$class=='poisonous')

## [1] 3915

# class distribution - edible
# ed_class is the fraction of rows labelled "edible"; ed_cls_dist is the same
# value as a percentage and is what gets formatted (previously the percentage
# was stored but recomputed inline in sprintf()).
ed_class <- sum(mush_df_val$class == "edible") / nrow(mush_df_val)
ed_cls_dist <- ed_class * 100
sprintf("Edible Class : %.1f%%", ed_cls_dist)

## [1] "Edible Class : 51.8%"

#class distribution - poisonous
# pos_class is the fraction of rows labelled 'poisonous'; pos_cls_dist is that fraction as a percentage
pos_class <- sum(mush_df_val$class=='poisonous')/nrow(mush_df_val)
pos_cls_dist <- pos_class*100
pos_cls_dist

## [1] 48.19648

print(sprintf('Poisonous Class : %.1f%%',pos_cls_dist))

## [1] "Poisonous Class : 48.2%"

# Per habitat: convert population labels to integer factor codes, take their
# standard deviation, and list habitats from largest to smallest value.
# NOTE(review): population is a nominal character column; as.factor() numbers
# the labels alphabetically within each habitat group, so pop_sd depends on
# label spelling rather than on any real ordering. A value of 0 only tells us
# that the habitat has a single population category. Consider
# n_distinct(population) or a count table instead - confirm intent.
mush_df_val %>%
 group_by(habitat) %>%
 summarize(pop_sd=sd(as.numeric(as.factor(population)))) %>%
 arrange(desc(pop_sd))

## # A tibble: 7 x 2
##   habitat pop_sd
##   <chr>    <dbl>
## 1 grasses  1.23 
## 2 meadows  0.681
## 3 urban    0.666
## 4 woods    0.583
## 5 paths    0.500
## 6 leaves   0.342
## 7 waste    0

# Check whether the dataset contains duplicate observations.
# duplicated() flags rows of the full 23-column data frame that repeat an
# earlier row; the flagged rows are taken from that same data frame (the
# original indexed the 5-column subset with a mask built on mush_df) and
# counted. A count of 0 means no duplicates.
count(mush_df[duplicated(mush_df), ])

## # A tibble: 1 x 1
##       n
##   <int>
## 1     0

#find counts for each class & population combination and view the relationships
count(mush_df_val,class,population)

## # A tibble: 10 x 3
##    class     population     n
##    <chr>     <chr>      <int>
##  1 edible    abundant     384
##  2 edible    clustered    288
##  3 edible    numerous     400
##  4 edible    scattered    880
##  5 edible    several     1192
##  6 edible    solitary    1064
##  7 poisonous clustered     52
##  8 poisonous scattered    367
##  9 poisonous several     2848
## 10 poisonous solitary     648

#check whether any NA (not available) values are present in the dataset
table(is.na(mush_df))

## 
##  FALSE 
## 186829

# The data dictionary states that attribute #11 (stalk_root) has missing
# values, recorded as "?". Turn those markers into NA so they can be counted.
data.st.na <- mush_df %>%
  mutate(stalk_root = na_if(stalk_root, "?"))

table(is.na(data.st.na))

## 
##  FALSE   TRUE 
## 184349   2480

#drill further
count(data.st.na,stalk_root)

## # A tibble: 5 x 2
##   stalk_root     n
##   <chr>      <int>
## 1 b           3776
## 2 c            556
## 3 e           1119
## 4 r            192
## 5 <NA>        2480

#check dimensions
dim(data.st.na)

## [1] 8123   23

#now drop NA values from the dataset
data.mush.new <- drop_na(data.st.na)

#now check new dataset without NA's (NA observations got dropped)
dim(data.mush.new)

## [1] 5643   23

# Number of observations removed by drop_na(): rows before minus rows after,
# computed from the data frames themselves rather than hard-coded totals.
nrow(data.st.na) - nrow(data.mush.new)

## [1] 2480

Explore Further…

# Look at habitat vs. cap color in the NA-free data: keep four columns, then
# show the rows whose habitat code is 'l' and whose cap color code is 'w'.
data.ht.cc <- data.mush.new %>%
  select(class, habitat, population, cap_color)
data.ht.cc %>%
  filter(habitat == "l", cap_color == "w")

##   class habitat population cap_color
## 1     p       l          c         w
## 2     p       l          c         w
## 3     p       l          c         w
## 4     p       l          c         w
## 5     p       l          c         w
## 6     p       l          c         w
## 7     p       l          c         w
## 8     p       l          c         w

It turns out that all observations with habitat ‘leaves’ and cap color ‘white’ are poisonous and have a ‘clustered’ population.

# Aggregate by habitat: number of observations and number of distinct classes in each habitat
datahc <-mush_df_val %>%
 group_by(habitat) %>%
 summarize(count=n(),n_distinct(class)) # second column: how many different class values occur in the habitat

# In the result, `n_distinct(class)` is a count, not a class code:
# 2 means the habitat contains both edible and poisonous mushrooms,
# 1 means every mushroom in that habitat belongs to a single class.
datahc

## # A tibble: 7 x 3
##   habitat count `n_distinct(class)`
##   <chr>   <int>               <int>
## 1 grasses  2148                   2
## 2 leaves    832                   2
## 3 meadows   292                   2
## 4 paths    1144                   2
## 5 urban     367                   2
## 6 waste     192                   1
## 7 woods    3148                   2

# From the result above, waste is the only habitat that contains a single class
# let's look at those rows in the transformed dataset to see which class it is
data.hb.waste <-filter(mush_df_val, habitat=='waste')
# count rows per class/habitat/population combination within the waste habitat
count(data.hb.waste, class, habitat, population)

## # A tibble: 1 x 4
##   class  habitat population     n
##   <chr>  <chr>   <chr>      <int>
## 1 edible waste   clustered    192

# Edible mushrooms found in any habitat other than waste, and how many there are.
data.hb.cl <- mush_df_val %>%
  filter(habitat != "waste", class == "edible")
sum(data.hb.cl[["class"]] == "edible")

## [1] 4016

sum(mush_df_val$class=='edible')

## [1] 4208

#matches the above calculated count 192
sum(mush_df_val$class=='edible') - sum(data.hb.cl$class=='edible')

## [1] 192

# Spread of the population categories within each habitat: standard deviation
# of the per-habitat factor codes, sorted from largest to smallest.
mush_df_val %>%
  group_by(habitat) %>%
  summarise(
    pop_sd = sd(as.numeric(as.factor(population)))
  ) %>%
  arrange(desc(pop_sd))

## # A tibble: 7 x 2
##   habitat pop_sd
##   <chr>    <dbl>
## 1 grasses  1.23 
## 2 meadows  0.681
## 3 urban    0.666
## 4 woods    0.583
## 5 paths    0.500
## 6 leaves   0.342
## 7 waste    0

# Number of observations in each class, via group_by() + summarise().
mush_df_val %>%
  group_by(class) %>%
  summarise(count = n())

## # A tibble: 2 x 2
##   class     count
##   <chr>     <int>
## 1 edible     4208
## 2 poisonous  3915

#total number of observations (summarize without any grouping returns a single row)
mush_df_val %>%
 summarize(count=n())

##   count
## 1  8123

EDA (Exploratory Data Analysis)

Bar Plots

# Mushroom population and habitat: stacked bar chart of observation counts per
# population category, with each bar split by habitat.
# Columns are referenced by bare name inside aes() instead of
# mush_df_val$column, so the mappings are evaluated against the data frame
# passed to ggplot().
ggplot(mush_df_val, mapping = aes(x = population)) +
  geom_bar(aes(fill = habitat)) +
  # print the total count just above each bar
  geom_text(
    stat = "count",
    aes(label = ..count.., y = ..count..),
    vjust = -0.2
  ) +
  xlab("Mushroom population") +
  labs(fill = "habitat") +
  ggtitle("Mushroom Population and habitat")

# Mushroom habitat and class: stacked bar chart of observation counts per
# habitat, with each bar split into edible and poisonous.
# Columns are referenced by bare name inside aes() instead of
# mush_df_val$column, so the mappings are evaluated against the data frame
# passed to ggplot().
ggplot(mush_df_val, mapping = aes(x = habitat)) +
  geom_bar(aes(fill = class)) +
  # print the total count just above each bar
  geom_text(
    stat = "count",
    aes(label = ..count.., y = ..count..),
    vjust = -0.2
  ) +
  xlab("Mushroom habitat") +
  labs(fill = "class") +
  ggtitle("Mushroom habitat and class")

Density Plots

#density plot with habitat on the x axis, one outline colour per class
# NOTE(review): habitat and population are character columns (see the str()
# output above); a density estimate is normally drawn for a continuous
# variable, so a dodged bar chart may express this comparison better - confirm intent.
ggplot(data=mush_df_val, aes(habitat,colour=class))+
    geom_density(alpha=0.1)

#density plot with population on the x axis, outline coloured by class and fill by population
ggplot(data=mush_df_val, aes(population,colour=class,fill=population))+
    geom_density(alpha=0.1)