The RPubs link is: http://rpubs.com/salmaeng/week1_607
library(RCurl)
## Loading required package: bitops
library(plyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(htmlTable)
library(magrittr)
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
library(ggplot2)
# Import the mushroom dataset from the UCI repository as a data frame.
# NOTE(review): header = FALSE on getURL() is forwarded to libcurl rather than
# to read.csv() -- presumably it keeps HTTP response headers out of the
# downloaded text; confirm against the RCurl documentation.
mushroom_df <- getURL("https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data", header = FALSE)
# The raw file has no header row, so the columns get the default names V1..V23.
# stringsAsFactors = TRUE is spelled out because R >= 4.0 reads text columns as
# character by default, while the per-level counts that summary() reports
# further down require factor columns.
frame <- read.csv(text = mushroom_df, header = FALSE, stringsAsFactors = TRUE)
# Show the data frame dimensions (rows, columns).
dim(frame)
## [1] 8124 23
# Preview only the first 10 rows of the data frame as a styled table inside a
# fixed-height scroll box.
frame %>%
  head(10) %>%
  kable() %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "responsive")
  ) %>%
  scroll_box(width = "100%", height = "300px")
V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | V10 | V11 | V12 | V13 | V14 | V15 | V16 | V17 | V18 | V19 | V20 | V21 | V22 | V23 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
p | x | s | n | t | p | f | c | n | k | e | e | s | s | w | w | p | w | o | p | k | s | u |
e | x | s | y | t | a | f | c | b | k | e | c | s | s | w | w | p | w | o | p | n | n | g |
e | b | s | w | t | l | f | c | b | n | e | c | s | s | w | w | p | w | o | p | n | n | m |
p | x | y | w | t | p | f | c | n | n | e | e | s | s | w | w | p | w | o | p | k | s | u |
e | x | s | g | f | n | f | w | b | k | t | e | s | s | w | w | p | w | o | e | n | a | g |
e | x | y | y | t | a | f | c | b | n | e | c | s | s | w | w | p | w | o | p | k | n | g |
e | b | s | w | t | a | f | c | b | g | e | c | s | s | w | w | p | w | o | p | k | n | m |
e | b | y | w | t | l | f | c | b | n | e | c | s | s | w | w | p | w | o | p | n | s | m |
p | x | y | w | t | p | f | c | n | p | e | e | s | s | w | w | p | w | o | p | k | v | g |
e | b | s | y | t | a | f | c | b | g | e | c | s | s | w | w | p | w | o | p | k | s | m |
# Read the attribute dictionary: a tab-separated file with a header row whose
# first column is consumed as row names by read.table().
mush_df <- read.table(
  "https://raw.githubusercontent.com/salma71/MSDS_2019/master/Fall2019/aquisition%26management/week_1/attributes_name.txt",
  header = TRUE,
  sep = "\t",
  row.names = 1
)
# Turn the row names back into an ordinary leading column named "Variable",
# then drop them so the rows are numbered 1..n again.
mush_df <- cbind(Variable = rownames(mush_df), mush_df)
rownames(mush_df) <- NULL
# Print the dictionary (attribute name + coded values).
mush_df
## Variable
## 1 edible_poisonous
## 2 cap_shape
## 3 cap_surface
## 4 cap_color
## 5 bruises
## 6 odor
## 7 gill_attachment
## 8 gill_spacing
## 9 gill_size
## 10 gill_color
## 11 stalk_shape
## 12 stalk_root
## 13 stalk_surface_above_ring
## 14 stalk_surface_below_ring
## 15 stalk_color_above_ring
## 16 stalk_color_below_ring
## 17 veil_type
## 18 veil_color
## 19 ring_number
## 20 ring_type
## 21 spore_print_color
## 22 population
## 23 habitat
## Values
## 1 poisonous=p,edible=e
## 2 bell=b,conical=c,convex=x,flat=f,knobbed=k,sunken=s
## 3 fibrous=f,grooves=g,scaly=y,smooth=s
## 4 brown=n,buff=b,cinnamon=c,gray=g,green=r,pink=p,purple=u,red=e,white=w,yellow=y
## 5 bruises=t,no=f
## 6 almond=a,anise=l,creosote=c,fishy=y,foul=f,musty=m,none=n,pungent=p,spicy=s
## 7 attached=a,descending=d,free=f,notched=n
## 8 close=c,crowded=w,distant=d
## 9 broad=b,narrow=n
## 10 black=k,brown=n,buff=b,chocolate=h,gray=g,green=r,orange=o,pink=p,purple=u,red=e,white=w,yellow=y
## 11 enlarging=e,tapering=t
## 12 bulbous=b,club=c,cup=u,equal=e,rhizomorphs=z,rooted=r,missing=?
## 13 fibrous=f,scaly=y,silky=k,smooth=s
## 14 fibrous=f,scaly=y,silky=k,smooth=s
## 15 brown=n,buff=b,cinnamon=c,gray=g,orange=o,pink=p,red=e,white=w,yellow=y
## 16 brown=n,buff=b,cinnamon=c,gray=g,orange=o,pink=p,red=e,white=w,yellow=y
## 17 partial=p,universal=u
## 18 brown=n,orange=o,white=w,yellow=y
## 19 none=n,one=o,two=t
## 20 cobwebby=c,evanescent=e,flaring=f,large=l,none=n,pendant=p,sheathing=s,zone=z
## 21 black=k,brown=n,buff=b,chocolate=h,green=r,orange=o,purple=u,white=w,yellow=y
## 22 abundant=a,clustered=c,numerous=n,scattered=s,several=v,solitary=y
## 23 grasses=g,leaves=l,meadows=m,paths=p,urban=u,waste=w,woods=d
# Replace the default V1..V23 column names with the attribute names taken from
# the dictionary, in the same order.
names(frame) <- mush_df[["Variable"]]
# Preview the first rows with the new column names.
frame %>%
  head() %>%
  kable() %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "responsive")
  ) %>%
  scroll_box(width = "100%", height = "300px")
edible_poisonous | cap_shape | cap_surface | cap_color | bruises | odor | gill_attachment | gill_spacing | gill_size | gill_color | stalk_shape | stalk_root | stalk_surface_above_ring | stalk_surface_below_ring | stalk_color_above_ring | stalk_color_below_ring | veil_type | veil_color | ring_number | ring_type | spore_print_color | population | habitat |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
p | x | s | n | t | p | f | c | n | k | e | e | s | s | w | w | p | w | o | p | k | s | u |
e | x | s | y | t | a | f | c | b | k | e | c | s | s | w | w | p | w | o | p | n | n | g |
e | b | s | w | t | l | f | c | b | n | e | c | s | s | w | w | p | w | o | p | n | n | m |
p | x | y | w | t | p | f | c | n | n | e | e | s | s | w | w | p | w | o | p | k | s | u |
e | x | s | g | f | n | f | w | b | k | t | e | s | s | w | w | p | w | o | e | n | a | g |
e | x | y | y | t | a | f | c | b | n | e | c | s | s | w | w | p | w | o | p | k | n | g |
# Keep six columns, selected by position: edible_poisonous (1), cap_color (4),
# odor (6), gill_color (10), population (22) and habitat (23).
# drop = FALSE guarantees the result stays a data frame.
mushroom_sub_df <- frame[, c(1, 4, 6, 10, 22, 23), drop = FALSE]
# Preview the reduced data frame.
mushroom_sub_df %>%
  head() %>%
  kable() %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "responsive")
  ) %>%
  scroll_box(width = "100%", height = "300px")
edible_poisonous | cap_color | odor | gill_color | population | habitat |
---|---|---|---|---|---|
p | n | p | k | s | u |
e | y | a | k | n | g |
e | w | l | n | n | m |
p | w | p | n | s | u |
e | g | n | k | a | g |
e | y | a | n | n | g |
# Replace the single-letter codes in every kept column with readable labels.
# Each mapping is a named vector of the form code = label and is applied with
# plyr::revalue(); the columns are independent of one another.
mushroom_sub_df <- transform(
  mushroom_sub_df,
  edible_poisonous = revalue(
    edible_poisonous,
    c("p" = "poisonous", "e" = "edible")
  ),
  cap_color = revalue(
    cap_color,
    c(
      "n" = "brown", "b" = "buff", "c" = "cinnamon", "g" = "gray",
      "r" = "green", "p" = "pink", "u" = "purple", "e" = "red",
      "w" = "white", "y" = "yellow"
    )
  ),
  gill_color = revalue(
    gill_color,
    c(
      "k" = "black", "n" = "brown", "b" = "buff", "h" = "chocolate",
      "g" = "gray", "r" = "green", "o" = "orange", "p" = "pink",
      "u" = "purple", "e" = "red", "w" = "white", "y" = "yellow"
    )
  ),
  odor = revalue(
    odor,
    c(
      "a" = "almond", "l" = "anise", "c" = "creosote", "y" = "fishy",
      "f" = "foul", "m" = "musty", "n" = "none", "p" = "pungent",
      "s" = "spicy"
    )
  ),
  population = revalue(
    population,
    c(
      "a" = "abundant", "c" = "clustered", "n" = "numerous",
      "s" = "scattered", "v" = "several", "y" = "solitary"
    )
  ),
  habitat = revalue(
    habitat,
    c(
      "g" = "grasses", "l" = "leaves", "m" = "meadows", "p" = "paths",
      "u" = "urban", "w" = "waste", "d" = "woods"
    )
  )
)
# Preview the recoded data frame.
mushroom_sub_df %>%
  head() %>%
  kable() %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "responsive")
  ) %>%
  scroll_box(width = "100%", height = "300px")
edible_poisonous | cap_color | odor | gill_color | population | habitat |
---|---|---|---|---|---|
poisonous | brown | pungent | black | scattered | urban |
edible | yellow | almond | black | numerous | grasses |
edible | white | anise | brown | numerous | meadows |
poisonous | white | pungent | brown | scattered | urban |
edible | gray | none | black | abundant | grasses |
edible | yellow | almond | brown | numerous | grasses |
# Summarise every column of the recoded subset and render the result as a
# styled table inside a fixed-height scroll box.
mushroom_sub_df %>%
  summary() %>%
  kable() %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "responsive")
  ) %>%
  scroll_box(width = "100%", height = "300px")
edible_poisonous | cap_color | odor | gill_color | population | habitat | |
---|---|---|---|---|---|---|
edible :4208 | brown :2284 | none :3528 | buff :1728 | abundant : 384 | woods :3148 | |
poisonous:3916 | gray :1840 | foul :2160 | pink :1492 | clustered: 340 | grasses:2148 | |
NA | red :1500 | spicy : 576 | white :1202 | numerous : 400 | leaves : 832 | |
NA | yellow :1072 | fishy : 576 | brown :1048 | scattered:1248 | meadows: 292 | |
NA | white :1040 | almond : 400 | gray : 752 | several :4040 | paths :1144 | |
NA | buff : 168 | anise : 400 | chocolate: 732 | solitary :1712 | urban : 368 | |
NA | (Other): 220 | (Other): 484 | (Other) :1170 | NA | waste : 192 |
# Jitter plot of habitat (y) against edibility class (x), with the points
# coloured by edibility class. Both axes are categorical, so geom_jitter()
# spreads the otherwise overlapping points apart.
c1 <- ggplot(mushroom_sub_df, aes(x = edible_poisonous, y = habitat))
c1 +
  geom_jitter(aes(colour = edible_poisonous)) +
  # Title spelling corrected ("mushrom" -> "mushroom").
  labs(title = "The majority of habitat per each mushroom type")
### As shown in the jitter plot, we can conclude that grasses and woods are the most common habitats for both edible and poisonous mushrooms. However, paths are more favorable to the poisonous type. On the other hand, meadows are more suitable for edible mushrooms.
# Restrict the data to poisonous mushrooms with a brown cap.
mush_pois_df <- subset(
  mushroom_sub_df,
  edible_poisonous == "poisonous" & cap_color == "brown"
)
# Count plot: one point per habitat, with both point size and colour mapped to
# the number of observations (..n..) in that habitat.
# NOTE(review): the variable name `c` shadows base::c(); kept as-is in case
# code outside this view refers to it.
c <- ggplot(mush_pois_df, aes(x = cap_color, y = habitat))
c +
  geom_count(aes(color = ..n.., size = ..n..)) +
  guides(color = "legend") +
  ggtitle("Habitat environment for the brown poisonous mushroom")
# Restrict the data to edible mushrooms with a brown cap.
mush_ed_df <- subset(
  mushroom_sub_df,
  edible_poisonous == "edible" & cap_color == "brown"
)
# Count plot: one point per habitat, with both point size and colour mapped to
# the number of observations (..n..) in that habitat.
# NOTE(review): the variable name `c` shadows base::c(); kept as-is in case
# code outside this view refers to it.
c <- ggplot(mush_ed_df, aes(x = cap_color, y = habitat))
c +
  geom_count(aes(color = ..n.., size = ..n..)) +
  guides(color = "legend") +
  ggtitle("Habitat environment for the brown edible mushroom")