The RPubs link is: http://rpubs.com/salmaeng/week1_607

In this study, we want to investigate which habitats edible and poisonous mushrooms mainly live in. We also want to know which species exist according to either gill_color or odor.

Importing the dataset as a dataframe

library(RCurl)
## Loading required package: bitops
library(plyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(htmlTable)
library(magrittr)
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
library(ggplot2)
# Download the raw mushroom data from the UCI repository as one text string.
mushroom_df <- getURL(
  "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data",
  header = FALSE
)
# Parse the downloaded text as CSV without a header row. read.csv() already
# returns a data frame, so no extra data.frame() wrapper is needed. Columns are
# read as factors explicitly so that summary() reports per-level counts on
# R >= 4.0 as well, where stringsAsFactors defaults to FALSE.
frame <- read.csv(text = mushroom_df, header = FALSE, stringsAsFactors = TRUE)
# Show the data frame dimensions (rows, columns).
dim(frame)
## [1] 8124   23

Note: the kableExtra library was used to format the output data frame tables.

# Render only the first 10 rows of the data frame as a styled table inside a
# scroll box.
frame %>%
  head(10) %>%
  kable() %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "responsive")
  ) %>%
  scroll_box(width = "100%", height = "300px")
V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V20 V21 V22 V23
p x s n t p f c n k e e s s w w p w o p k s u
e x s y t a f c b k e c s s w w p w o p n n g
e b s w t l f c b n e c s s w w p w o p n n m
p x y w t p f c n n e e s s w w p w o p k s u
e x s g f n f w b k t e s s w w p w o e n a g
e x y y t a f c b n e c s s w w p w o p k n g
e b s w t a f c b g e c s s w w p w o p k n m
e b y w t l f c b n e c s s w w p w o p n s m
p x y w t p f c n p e e s s w w p w o p k v g
e b s y t a f c b g e c s s w w p w o p k s m

Renaming the columns to make the data easier to study, following the attribute names in the UCI repository

A table with the variable names was created and uploaded to my personal GitHub account. That table was then imported as a data frame.

# Read the tab-separated attribute table (with a header row); the first column
# is used as row names on import.
mush_df <- read.table(
  "https://raw.githubusercontent.com/salma71/MSDS_2019/master/Fall2019/aquisition%26management/week_1/attributes_name.txt",
  header = TRUE,
  sep = "\t",
  row.names = 1
)
# Copy the row names into a leading "Variable" column, then reset the row
# names to the default numbering.
mush_df <- cbind(Variable = row.names(mush_df), mush_df)
row.names(mush_df) <- NULL
# Print the attribute table.
mush_df
##                    Variable
## 1          edible_poisonous
## 2                 cap_shape
## 3               cap_surface
## 4                 cap_color
## 5                   bruises
## 6                      odor
## 7           gill_attachment
## 8              gill_spacing
## 9                 gill_size
## 10               gill_color
## 11              stalk_shape
## 12               stalk_root
## 13 stalk_surface_above_ring
## 14 stalk_surface_below_ring
## 15   stalk_color_above_ring
## 16   stalk_color_below_ring
## 17                veil_type
## 18               veil_color
## 19              ring_number
## 20                ring_type
## 21        spore_print_color
## 22               population
## 23                  habitat
##                                                                                               Values
## 1                                                                               poisonous=p,edible=e
## 2                                                bell=b,conical=c,convex=x,flat=f,knobbed=k,sunken=s
## 3                                                               fibrous=f,grooves=g,scaly=y,smooth=s
## 4                    brown=n,buff=b,cinnamon=c,gray=g,green=r,pink=p,purple=u,red=e,white=w,yellow=y
## 5                                                                                     bruises=t,no=f
## 6                        almond=a,anise=l,creosote=c,fishy=y,foul=f,musty=m,none=n,pungent=p,spicy=s
## 7                                                           attached=a,descending=d,free=f,notched=n
## 8                                                                        close=c,crowded=w,distant=d
## 9                                                                                   broad=b,narrow=n
## 10 black=k,brown=n,buff=b,chocolate=h,gray=g,green=r,orange=o,pink=p,purple=u,red=e,white=w,yellow=y
## 11                                                                            enlarging=e,tapering=t
## 12                                   bulbous=b,club=c,cup=u,equal=e,rhizomorphs=z,rooted=r,missing=?
## 13                                                                fibrous=f,scaly=y,silky=k,smooth=s
## 14                                                                fibrous=f,scaly=y,silky=k,smooth=s
## 15                           brown=n,buff=b,cinnamon=c,gray=g,orange=o,pink=p,red=e,white=w,yellow=y
## 16                           brown=n,buff=b,cinnamon=c,gray=g,orange=o,pink=p,red=e,white=w,yellow=y
## 17                                                                             partial=p,universal=u
## 18                                                                 brown=n,orange=o,white=w,yellow=y
## 19                                                                                none=n,one=o,two=t
## 20                     cobwebby=c,evanescent=e,flaring=f,large=l,none=n,pendant=p,sheathing=s,zone=z
## 21                     black=k,brown=n,buff=b,chocolate=h,green=r,orange=o,purple=u,white=w,yellow=y
## 22                                abundant=a,clustered=c,numerous=n,scattered=s,several=v,solitary=y
## 23                                      grasses=g,leaves=l,meadows=m,paths=p,urban=u,waste=w,woods=d

Now, rename the columns of frame using the Variable column of mush_df, and display the styled result.

# Use the attribute names from mush_df as the column names of the data.
colnames(frame) <- mush_df[["Variable"]]
# Render the first rows of the renamed data frame as a styled table inside a
# scroll box.
frame %>%
  head() %>%
  kable() %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "responsive")
  ) %>%
  scroll_box(width = "100%", height = "300px")
edible_poisonous cap_shape cap_surface cap_color bruises odor gill_attachment gill_spacing gill_size gill_color stalk_shape stalk_root stalk_surface_above_ring stalk_surface_below_ring stalk_color_above_ring stalk_color_below_ring veil_type veil_color ring_number ring_type spore_print_color population habitat
p x s n t p f c n k e e s s w w p w o p k s u
e x s y t a f c b k e c s s w w p w o p n n g
e b s w t l f c b n e c s s w w p w o p n n m
p x y w t p f c n n e e s s w w p w o p k s u
e x s g f n f w b k t e s s w w p w o e n a g
e x y y t a f c b n e c s s w w p w o p k n g

Then take a subset of the data that contains the edible/poisonous attribute plus five other attributes to compare (cap_color, odor, gill_color, population, and habitat).

# Keep the class label plus the five attributes used in the analysis. Columns
# are selected by name rather than by position (1, 4, 6, 10, 22, 23) so the
# selection documents itself and does not silently pick the wrong columns if
# the column order ever changes.
mushroom_sub_df <- subset(
  frame,
  select = c(
    "edible_poisonous", "cap_color", "odor",
    "gill_color", "population", "habitat"
  )
)
# Render the first rows of the subset as a styled table inside a scroll box.
mushroom_sub_df %>%
  head() %>%
  kable() %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "responsive")
  ) %>%
  scroll_box(width = "100%", height = "300px")
edible_poisonous cap_color odor gill_color population habitat
p n p k s u
e y a k n g
e w l n n m
p w p n s u
e g n k a g
e y a n n g

Replace the letter codes with descriptive labels, following the key that accompanies the dataset, for better readability

# Replace the single-letter codes in each column with descriptive labels.
# In every revalue() mapping the name is the old code and the element is the
# new label.
mushroom_sub_df <- transform(
  mushroom_sub_df,
  edible_poisonous = revalue(
    edible_poisonous,
    c("p" = "poisonous", "e" = "edible")
  ),
  cap_color = revalue(
    cap_color,
    c(
      "n" = "brown", "b" = "buff", "c" = "cinnamon", "g" = "gray",
      "r" = "green", "p" = "pink", "u" = "purple", "e" = "red",
      "w" = "white", "y" = "yellow"
    )
  ),
  gill_color = revalue(
    gill_color,
    c(
      "k" = "black", "n" = "brown", "b" = "buff", "h" = "chocolate",
      "g" = "gray", "r" = "green", "o" = "orange", "p" = "pink",
      "u" = "purple", "e" = "red", "w" = "white", "y" = "yellow"
    )
  ),
  odor = revalue(
    odor,
    c(
      "a" = "almond", "l" = "anise", "c" = "creosote", "y" = "fishy",
      "f" = "foul", "m" = "musty", "n" = "none", "p" = "pungent",
      "s" = "spicy"
    )
  ),
  population = revalue(
    population,
    c(
      "a" = "abundant", "c" = "clustered", "n" = "numerous",
      "s" = "scattered", "v" = "several", "y" = "solitary"
    )
  ),
  habitat = revalue(
    habitat,
    c(
      "g" = "grasses", "l" = "leaves", "m" = "meadows", "p" = "paths",
      "u" = "urban", "w" = "waste", "d" = "woods"
    )
  )
)

# Render the first rows of the relabelled subset as a styled table inside a
# scroll box.
mushroom_sub_df %>%
  head() %>%
  kable() %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "responsive")
  ) %>%
  scroll_box(width = "100%", height = "300px")
edible_poisonous cap_color odor gill_color population habitat
poisonous brown pungent black scattered urban
edible yellow almond black numerous grasses
edible white anise brown numerous meadows
poisonous white pungent brown scattered urban
edible gray none black abundant grasses
edible yellow almond brown numerous grasses

Getting the summary of the subset dataset

# Summarise every column of the subset and render the result as a styled
# table inside a scroll box.
mushroom_sub_df %>%
  summary() %>%
  kable() %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "responsive")
  ) %>%
  scroll_box(width = "100%", height = "300px")
edible_poisonous cap_color odor gill_color population habitat
edible :4208 brown :2284 none :3528 buff :1728 abundant : 384 woods :3148
poisonous:3916 gray :1840 foul :2160 pink :1492 clustered: 340 grasses:2148
NA red :1500 spicy : 576 white :1202 numerous : 400 leaves : 832
NA yellow :1072 fishy : 576 brown :1048 scattered:1248 meadows: 292
NA white :1040 almond : 400 gray : 752 several :4040 paths :1144
NA buff : 168 anise : 400 chocolate: 732 solitary :1712 urban : 368
NA (Other): 220 (Other): 484 (Other) :1170 NA waste : 192

Visualizing the dataset using graphics and charts

# Base plot: mushroom class on the x axis, habitat on the y axis.
c1 <- ggplot(
  data = mushroom_sub_df,
  mapping = aes(x = edible_poisonous, y = habitat)
)
# Jittered points coloured by class, so overlapping observations in each
# class/habitat cell spread out into a visible cloud.
c1 +
  geom_jitter(aes(colour = edible_poisonous)) +
  labs(title = "The majority of habitat per each mushrom type")

### As shown in the jitter plot, we can conclude that both grasses and woods are the most suitable environments for growing both edible and poisonous mushrooms. However, paths are more favorable for the poisonous type. On the other hand, meadows are more suitable for growing edible mushrooms.

In the following analysis, I will go into the details of each type to get insight into the quantities for each type.

# Keep only the poisonous mushrooms whose cap colour is brown.
mush_pois_df <- subset(
  mushroom_sub_df,
  edible_poisonous == "poisonous" & cap_color == "brown"
)
# Count plot: one point per cap_color/habitat combination, with both point
# size and colour mapped to the number of observations (n).
c <- ggplot(mush_pois_df, aes(x = cap_color, y = habitat))
c +
  geom_count(aes(color = ..n.., size = ..n..)) +
  guides(color = "legend") +
  ggtitle("Habitat environment for the brown poisonous mushroom")

# Keep only the edible mushrooms whose cap colour is brown.
mush_ed_df <- subset(
  mushroom_sub_df,
  edible_poisonous == "edible" & cap_color == "brown"
)
# Count plot: one point per cap_color/habitat combination, with both point
# size and colour mapped to the number of observations (n).
c <- ggplot(mush_ed_df, aes(x = cap_color, y = habitat))
c +
  geom_count(aes(color = ..n.., size = ..n..)) +
  guides(color = "legend") +
  ggtitle("Habitat environment for the brown edible mushroom")

For edible mushrooms with a brown cap_color, it seems that they can live in all places - 6 places, with numbers ranging from 100 in the leaves to 500 in the woods. However, the poisonous type lives in leaves, paths, and woods, in smaller numbers - around 300.