Load necessary libraries
library(plyr)
library(htmlTable)
library(magrittr)
library(kableExtra)
library(corrplot)
## corrplot 0.84 loaded
Note: the ‘kableExtra’ package is used to format tabular output in a scrollable (horizontal & vertical) box.
Loaded the mushroom data from its GitHub location into a data frame and displayed the first few rows of the loaded data
# Read the raw mushroom records straight from GitHub. The file has no header
# row, fields are comma-separated, "?" is read as NA, and quote handling is
# switched off.
mashroom_df <- read.csv(
  "https://raw.githubusercontent.com/soumya2g/R-CUNY-MSDS/master/DATA-607/Source%20Files/Mashroom%20Data/agaricus-lepiota.data.txt",
  header = FALSE,
  sep = ",",
  na.strings = "?",
  quote = ""
)

# Preview the first rows as a styled table inside a scrollable box.
mashroom_df %>%
  head() %>%
  kable() %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "responsive")
  ) %>%
  scroll_box(width = "100%", height = "300px")
V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | V10 | V11 | V12 | V13 | V14 | V15 | V16 | V17 | V18 | V19 | V20 | V21 | V22 | V23 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
p | x | s | n | t | p | f | c | n | k | e | e | s | s | w | w | p | w | o | p | k | s | u |
e | x | s | y | t | a | f | c | b | k | e | c | s | s | w | w | p | w | o | p | n | n | g |
e | b | s | w | t | l | f | c | b | n | e | c | s | s | w | w | p | w | o | p | n | n | m |
p | x | y | w | t | p | f | c | n | n | e | e | s | s | w | w | p | w | o | p | k | s | u |
e | x | s | g | f | n | f | w | b | k | t | e | s | s | w | w | p | w | o | e | n | a | g |
e | x | y | y | t | a | f | c | b | n | e | c | s | s | w | w | p | w | o | p | k | n | g |
Loaded the mushroom data dictionary from its GitHub location into a data frame and displayed it
# Read the tab-separated data dictionary. The first column of the file is
# taken as the row names (row.names = 1).
mash_dictionary_df <- read.table(
  "https://raw.githubusercontent.com/soumya2g/R-CUNY-MSDS/master/DATA-607/Source%20Files/Mashroom%20Data/Data_Dictionary.txt",
  header = TRUE,
  sep = "\t",
  row.names = 1
)

# Promote the row names to a regular leading column called `Variable`, then
# drop the row names so the rows are numbered again.
mash_dictionary_df <- cbind(
  Variable = row.names(mash_dictionary_df),
  mash_dictionary_df
)
row.names(mash_dictionary_df) <- NULL

# Print the dictionary.
mash_dictionary_df
Variable | Values |
---|---|
edible_poisonous | poisonous=p,edible=e |
cap_shape | bell=b,conical=c,convex=x,flat=f,knobbed=k,sunken=s |
cap_surface | fibrous=f,grooves=g,scaly=y,smooth=s |
cap_color | brown=n,buff=b,cinnamon=c,gray=g,green=r,pink=p,purple=u,red=e,white=w,yellow=y |
bruises | bruises=t,no=f |
odor | almond=a,anise=l,creosote=c,fishy=y,foul=f,musty=m,none=n,pungent=p,spicy=s |
gill_attachment | attached=a,descending=d,free=f,notched=n |
gill_spacing | close=c,crowded=w,distant=d |
gill_size | broad=b,narrow=n |
gill_color | black=k,brown=n,buff=b,chocolate=h,gray=g,green=r,orange=o,pink=p,purple=u,red=e,white=w,yellow=y |
stalk_shape | enlarging=e,tapering=t |
stalk_root | bulbous=b,club=c,cup=u,equal=e,rhizomorphs=z,rooted=r,missing=? |
stalk_surface_above_ring | fibrous=f,scaly=y,silky=k,smooth=s |
stalk_surface_below_ring | fibrous=f,scaly=y,silky=k,smooth=s |
stalk_color_above_ring | brown=n,buff=b,cinnamon=c,gray=g,orange=o,pink=p,red=e,white=w,yellow=y |
stalk_color_below_ring | brown=n,buff=b,cinnamon=c,gray=g,orange=o,pink=p,red=e,white=w,yellow=y |
veil_type | partial=p,universal=u |
veil_color | brown=n,orange=o,white=w,yellow=y |
ring_number | none=n,one=o,two=t |
ring_type | cobwebby=c,evanescent=e,flaring=f,large=l,none=n,pendant=p,sheathing=s,zone=z |
spore_print_color | black=k,brown=n,buff=b,chocolate=h,green=r,orange=o,purple=u,white=w,yellow=y |
population | abundant=a,clustered=c,numerous=n,scattered=s,several=v,solitary=y |
habitat | grasses=g,leaves=l,meadows=m,paths=p,urban=u,waste=w,woods=d |
Renamed the columns of the base mushroom data frame using the ‘Variable’ column of the mushroom data dictionary
# Label the raw V1..V23 columns with the variable names listed in the data
# dictionary (assigned positionally, in dictionary row order).
colnames(mashroom_df) <- mash_dictionary_df$Variable

# Preview the renamed data as a styled table inside a scrollable box.
mashroom_df %>%
  head() %>%
  kable() %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "responsive")
  ) %>%
  scroll_box(width = "100%", height = "300px")
edible_poisonous | cap_shape | cap_surface | cap_color | bruises | odor | gill_attachment | gill_spacing | gill_size | gill_color | stalk_shape | stalk_root | stalk_surface_above_ring | stalk_surface_below_ring | stalk_color_above_ring | stalk_color_below_ring | veil_type | veil_color | ring_number | ring_type | spore_print_color | population | habitat |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
p | x | s | n | t | p | f | c | n | k | e | e | s | s | w | w | p | w | o | p | k | s | u |
e | x | s | y | t | a | f | c | b | k | e | c | s | s | w | w | p | w | o | p | n | n | g |
e | b | s | w | t | l | f | c | b | n | e | c | s | s | w | w | p | w | o | p | n | n | m |
p | x | y | w | t | p | f | c | n | n | e | e | s | s | w | w | p | w | o | p | k | s | u |
e | x | s | g | f | n | f | w | b | k | t | e | s | s | w | w | p | w | o | e | n | a | g |
e | x | y | y | t | a | f | c | b | n | e | c | s | s | w | w | p | w | o | p | k | n | g |
Replaced the coded values in all columns of the mushroom data frame with meaningful values based on the data dictionary, and displayed the first few rows
edible_poisonous | cap_shape | cap_surface | cap_color | bruises | odor | gill_attachment | gill_spacing | gill_size | gill_color | stalk_shape | stalk_root | stalk_surface_above_ring | stalk_surface_below_ring | stalk_color_above_ring | stalk_color_below_ring | veil_type | veil_color | ring_number | ring_type | spore_print_color | population | habitat |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
poisonous | convex | smooth | brown | bruises | pungent | free | close | narrow | black | enlarging | equal | smooth | smooth | white | white | partial | white | one | pendant | black | scattered | urban |
edible | convex | smooth | yellow | bruises | almond | free | close | broad | black | enlarging | club | smooth | smooth | white | white | partial | white | one | pendant | brown | numerous | grasses |
edible | bell | smooth | white | bruises | anise | free | close | broad | brown | enlarging | club | smooth | smooth | white | white | partial | white | one | pendant | brown | numerous | meadows |
poisonous | convex | scaly | white | bruises | pungent | free | close | narrow | brown | enlarging | equal | smooth | smooth | white | white | partial | white | one | pendant | black | scattered | urban |
edible | convex | smooth | gray | no | none | free | crowded | broad | black | tapering | equal | smooth | smooth | white | white | partial | white | one | evanescent | brown | abundant | grasses |
edible | convex | scaly | yellow | bruises | almond | free | close | broad | brown | enlarging | club | smooth | smooth | white | white | partial | white | one | pendant | black | numerous | grasses |
Selected a subset of the following six columns, including the ‘edible_poisonous’ attribute, from the base mushroom data frame: edible_poisonous, cap_color, odor, spore_print_color, population, habitat
# Keep all rows but only columns 1, 4, 6, 21, 22 and 23, i.e.
# edible_poisonous, cap_color, odor, spore_print_color, population, habitat.
sub_mashroom_df <- mashroom_df[, c(1, 4, 6, 21, 22, 23), drop = FALSE]

# Preview the reduced data as a styled table inside a scrollable box.
sub_mashroom_df %>%
  head() %>%
  kable() %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "responsive")
  ) %>%
  scroll_box(width = "100%", height = "300px")
edible_poisonous | cap_color | odor | spore_print_color | population | habitat |
---|---|---|---|---|---|
poisonous | brown | pungent | black | scattered | urban |
edible | yellow | almond | brown | numerous | grasses |
edible | white | anise | brown | numerous | meadows |
poisonous | white | pungent | black | scattered | urban |
edible | gray | none | brown | abundant | grasses |
edible | yellow | almond | black | numerous | grasses |
Generated a summary of the mushroom data analysis subset
# Per-column summary of the analysis subset, rendered as a styled table inside
# a scrollable box.
sub_mashroom_df %>%
  summary() %>%
  kable() %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "responsive")
  ) %>%
  scroll_box(width = "100%", height = "300px")
edible_poisonous | cap_color | odor | spore_print_color | population | habitat | |
---|---|---|---|---|---|---|
edible :4208 | brown :2284 | none :3528 | white :2388 | abundant : 384 | woods :3148 | |
poisonous:3916 | gray :1840 | foul :2160 | brown :1968 | clustered: 340 | grasses:2148 | |
NA | red :1500 | spicy : 576 | black :1872 | numerous : 400 | leaves : 832 | |
NA | yellow :1072 | fishy : 576 | chocolate:1632 | scattered:1248 | meadows: 292 | |
NA | white :1040 | almond : 400 | green : 72 | several :4040 | paths :1144 | |
NA | buff : 168 | anise : 400 | buff : 48 | solitary :1712 | urban : 368 | |
NA | (Other): 220 | (Other): 484 | (Other) : 144 | NA | waste : 192 |
Because every variable in the data set is categorical, I associated each of them with a numeric ID so that further association/correlation analysis could identify their impact on the ‘poisonous’ or ‘edible’ classification.
# Encode each categorical column as an integer ID equal to the position of its
# label in the explicit vector given for that column (e.g. "poisonous" -> 1,
# "edible" -> 2).
#
# match() is applied to the character labels on purpose: calling as.integer()
# on a factor returns the factor's internal level codes, so the previous
# as.integer(mapvalues(<factor>, from, to)) form ignored the from/to mapping
# and produced IDs that depended on the factor's level order instead.
# A label that is not listed for its column yields NA.
sub_mashroom_df$edible_poisonous_id <- match(
  as.character(sub_mashroom_df$edible_poisonous),
  c("poisonous", "edible")
)
sub_mashroom_df$cap_color_id <- match(
  as.character(sub_mashroom_df$cap_color),
  c("brown", "buff", "cinnamon", "gray", "green", "pink", "purple", "red",
    "white", "yellow")
)
sub_mashroom_df$odor_id <- match(
  as.character(sub_mashroom_df$odor),
  c("almond", "anise", "creosote", "fishy", "foul", "musty", "none",
    "pungent", "spicy")
)
sub_mashroom_df$spore_print_color_id <- match(
  as.character(sub_mashroom_df$spore_print_color),
  c("black", "brown", "buff", "chocolate", "green", "orange", "purple",
    "white", "yellow")
)
sub_mashroom_df$population_id <- match(
  as.character(sub_mashroom_df$population),
  c("abundant", "clustered", "numerous", "scattered", "several", "solitary")
)
sub_mashroom_df$habitat_id <- match(
  as.character(sub_mashroom_df$habitat),
  c("grasses", "leaves", "meadows", "paths", "urban", "waste", "woods")
)

# Show the structure of the data frame, including the new *_id columns.
str(sub_mashroom_df)
## 'data.frame': 8124 obs. of 12 variables:
## $ edible_poisonous : Factor w/ 2 levels "edible","poisonous": 2 1 1 2 1 1 1 1 2 1 ...
## $ cap_color : Factor w/ 10 levels "buff","cinnamon",..: 5 10 9 9 4 10 9 9 9 10 ...
## $ odor : Factor w/ 9 levels "almond","creosote",..: 7 1 4 7 6 1 1 4 7 1 ...
## $ spore_print_color : Factor w/ 9 levels "buff","chocolate",..: 3 4 4 3 4 3 3 4 3 3 ...
## $ population : Factor w/ 6 levels "abundant","clustered",..: 4 3 3 4 1 3 3 4 5 4 ...
## $ habitat : Factor w/ 7 levels "woods","grasses",..: 6 2 4 6 2 2 4 4 2 4 ...
## $ edible_poisonous_id : int 2 1 1 2 1 1 1 1 2 1 ...
## $ cap_color_id : int 5 10 9 9 4 10 9 9 9 10 ...
## $ odor_id : int 7 1 4 7 6 1 1 4 7 1 ...
## $ spore_print_color_id: int 3 4 4 3 4 3 3 4 3 3 ...
## $ population_id : int 4 3 3 4 1 3 3 4 5 4 ...
## $ habitat_id : int 6 2 4 6 2 2 4 4 2 4 ...
Generated a correlation plot for the selected variables to capture any possible association/relationship between ‘edible_poisonous’ and the other variables
# Keep only columns 7-12 (the integer *_id columns) for the correlation matrix.
df <- sub_mashroom_df[, 7:12, drop = FALSE]

# Pairwise correlation matrix of the ID columns (cor() default method).
M <- cor(df)

# Draw the upper triangle, with variables ordered by hierarchical clustering.
# NOTE(review): sig.level / insig appear to take effect only when a p-value
# matrix is passed via p.mat, which is not done here - confirm against the
# corrplot documentation.
corrplot(
  M,
  type = "upper",
  order = "hclust",
  sig.level = 0.01,
  insig = "blank"
)
Based on the above correlation plot and the sample hypotheses shared by previous researchers, I chose to further analyze the association between the ‘habitat’, ‘cap_color’ and ‘edible_poisonous’ variables -
I performed row-based subsetting of the data frame to filter for ‘poisonous’ mushrooms with cap_color ‘white’ -
# Rows for poisonous mushrooms whose cap colour is white.
poison_mashroom_df <- subset(
  sub_mashroom_df,
  edible_poisonous == "poisonous" & cap_color == "white"
)

# Count the rows per habitat and convert the counts to proportions of the
# filtered total.
poison_habitat <- table(poison_mashroom_df$habitat)
poison_habitat_ratio <- prop.table(poison_habitat)

# Bar plot of the habitat proportions.
barplot(
  poison_habitat_ratio,
  main = "Habitat Distribution for Poisonous Mashrooms"
)
I performed row-based subsetting of the data frame to filter for ‘edible’ mushrooms with cap_color ‘white’ -
# Rows for edible mushrooms whose cap colour is white.
edible_mashroom_df <- subset(
  sub_mashroom_df,
  edible_poisonous == "edible" & cap_color == "white"
)

# Count the rows per habitat and convert the counts to proportions of the
# filtered total.
edible_habitat <- table(edible_mashroom_df$habitat)
edible_habitat_ratio <- prop.table(edible_habitat)

# Bar plot of the habitat proportions.
barplot(
  edible_habitat_ratio,
  main = "Habitat Distribution for Edible Mashrooms"
)
Based on the above two bar plots, I am adding another hypothesis: mushrooms with habitat ‘urban’ and cap_color ‘white’ can also be considered ‘poisonous’ with a high degree of accuracy.