Packages used:
library(ggplot2)
Read the mushrooms CSV file from its raw GitHub URL:
# Read the mushrooms CSV from the raw GitHub URL.
# header = FALSE: the file has no column-name row, so the first line is data.
# stringsAsFactors = TRUE: load every column as a factor on R >= 4.0 as well
# (where the default became FALSE), so that str() and summary() report factor
# levels and per-level counts.
mushrooms <- read.csv(
  file = "https://raw.githubusercontent.com/miachen410/Mushrooms/master/mushrooms.csv",
  header = FALSE,
  stringsAsFactors = TRUE
)
Assign column names to mushrooms data frame:
# Label the 23 columns, in file order. The hyphenated names are not syntactic
# R identifiers, so they must be quoted or back-ticked when used with `$`.
mushrooms <- setNames(
  mushrooms,
  c(
    "edibility",
    "cap-shape", "cap-surface", "cap-color",
    "bruises", "odor",
    "gill-attachment", "gill-spacing", "gill-size", "gill-color",
    "stalk-shape", "stalk-root",
    "stalk-surface-above-ring", "stalk-surface-below-ring",
    "stalk-color-above-ring", "stalk-color-below-ring",
    "veil-type", "veil-color",
    "ring-number", "ring-type",
    "spore-print-color", "population", "habitat"
  )
)
Change the first column name from “edibility” to “class”:
# Rename every column called "edibility" to "class"; other names are untouched.
names(mushrooms) <- replace(
  names(mushrooms),
  names(mushrooms) == "edibility",
  "class"
)
Look at the first 6 rows of the data using head() function:
# Preview the first six rows (six is head()'s default, spelled out here).
head(mushrooms, n = 6L)
## class cap-shape cap-surface cap-color bruises odor gill-attachment
## 1 p x s n t p f
## 2 e x s y t a f
## 3 e b s w t l f
## 4 p x y w t p f
## 5 e x s g f n f
## 6 e x y y t a f
## gill-spacing gill-size gill-color stalk-shape stalk-root
## 1 c n k e e
## 2 c b k e c
## 3 c b n e c
## 4 c n n e e
## 5 w b k t e
## 6 c b n e c
## stalk-surface-above-ring stalk-surface-below-ring stalk-color-above-ring
## 1 s s w
## 2 s s w
## 3 s s w
## 4 s s w
## 5 s s w
## 6 s s w
## stalk-color-below-ring veil-type veil-color ring-number ring-type
## 1 w p w o p
## 2 w p w o p
## 3 w p w o p
## 4 w p w o p
## 5 w p w o e
## 6 w p w o p
## spore-print-color population habitat
## 1 k s u
## 2 n n g
## 3 n n m
## 4 k s u
## 5 n a g
## 6 k n g
Look at the structure of the variables using str() function:
# Compact overview: dimensions plus the type and first values of each column.
str(object = mushrooms)
## 'data.frame': 8124 obs. of 23 variables:
## $ class : Factor w/ 2 levels "e","p": 2 1 1 2 1 1 1 1 2 1 ...
## $ cap-shape : Factor w/ 6 levels "b","c","f","k",..: 6 6 1 6 6 6 1 1 6 1 ...
## $ cap-surface : Factor w/ 4 levels "f","g","s","y": 3 3 3 4 3 4 3 4 4 3 ...
## $ cap-color : Factor w/ 10 levels "b","c","e","g",..: 5 10 9 9 4 10 9 9 9 10 ...
## $ bruises : Factor w/ 2 levels "f","t": 2 2 2 2 1 2 2 2 2 2 ...
## $ odor : Factor w/ 9 levels "a","c","f","l",..: 7 1 4 7 6 1 1 4 7 1 ...
## $ gill-attachment : Factor w/ 2 levels "a","f": 2 2 2 2 2 2 2 2 2 2 ...
## $ gill-spacing : Factor w/ 2 levels "c","w": 1 1 1 1 2 1 1 1 1 1 ...
## $ gill-size : Factor w/ 2 levels "b","n": 2 1 1 2 1 1 1 1 2 1 ...
## $ gill-color : Factor w/ 12 levels "b","e","g","h",..: 5 5 6 6 5 6 3 6 8 3 ...
## $ stalk-shape : Factor w/ 2 levels "e","t": 1 1 1 1 2 1 1 1 1 1 ...
## $ stalk-root : Factor w/ 5 levels "?","b","c","e",..: 4 3 3 4 4 3 3 3 4 3 ...
## $ stalk-surface-above-ring: Factor w/ 4 levels "f","k","s","y": 3 3 3 3 3 3 3 3 3 3 ...
## $ stalk-surface-below-ring: Factor w/ 4 levels "f","k","s","y": 3 3 3 3 3 3 3 3 3 3 ...
## $ stalk-color-above-ring : Factor w/ 9 levels "b","c","e","g",..: 8 8 8 8 8 8 8 8 8 8 ...
## $ stalk-color-below-ring : Factor w/ 9 levels "b","c","e","g",..: 8 8 8 8 8 8 8 8 8 8 ...
## $ veil-type : Factor w/ 1 level "p": 1 1 1 1 1 1 1 1 1 1 ...
## $ veil-color : Factor w/ 4 levels "n","o","w","y": 3 3 3 3 3 3 3 3 3 3 ...
## $ ring-number : Factor w/ 3 levels "n","o","t": 2 2 2 2 2 2 2 2 2 2 ...
## $ ring-type : Factor w/ 5 levels "e","f","l","n",..: 5 5 5 5 1 5 5 5 5 5 ...
## $ spore-print-color : Factor w/ 9 levels "b","h","k","n",..: 3 4 4 3 4 3 3 4 3 3 ...
## $ population : Factor w/ 6 levels "a","c","n","s",..: 4 3 3 4 1 3 3 4 5 4 ...
## $ habitat : Factor w/ 7 levels "d","g","l","m",..: 6 2 4 6 2 2 4 4 2 4 ...
Look at a brief summary of the dataset: how are data distributed in each category?
# Per-column summary of the whole data set.
summary(object = mushrooms)
## class cap-shape cap-surface cap-color bruises odor
## e:4208 b: 452 f:2320 n :2284 f:4748 n :3528
## p:3916 c: 4 g: 4 g :1840 t:3376 f :2160
## f:3152 s:2556 e :1500 s : 576
## k: 828 y:3244 y :1072 y : 576
## s: 32 w :1040 a : 400
## x:3656 b : 168 l : 400
## (Other): 220 (Other): 484
## gill-attachment gill-spacing gill-size gill-color stalk-shape
## a: 210 c:6812 b:5612 b :1728 e:3516
## f:7914 w:1312 n:2512 p :1492 t:4608
## w :1202
## n :1048
## g : 752
## h : 732
## (Other):1170
## stalk-root stalk-surface-above-ring stalk-surface-below-ring
## ?:2480 f: 552 f: 600
## b:3776 k:2372 k:2304
## c: 556 s:5176 s:4936
## e:1120 y: 24 y: 284
## r: 192
##
##
## stalk-color-above-ring stalk-color-below-ring veil-type veil-color
## w :4464 w :4384 p:8124 n: 96
## p :1872 p :1872 o: 96
## g : 576 g : 576 w:7924
## n : 448 n : 512 y: 8
## b : 432 b : 432
## o : 192 o : 192
## (Other): 140 (Other): 156
## ring-number ring-type spore-print-color population habitat
## n: 36 e:2776 w :2388 a: 384 d:3148
## o:7488 f: 48 n :1968 c: 340 g:2148
## t: 600 l:1296 k :1872 n: 400 l: 832
## n: 36 h :1632 s:1248 m: 292
## p:3968 r : 72 v:4040 p:1144
## b : 48 y:1712 u: 368
## (Other): 144 w: 192
Replace the abbreviations with full terms using gsub() (edible = e, poisonous = p), and assign the result back to the corresponding column in the data frame:
# Expand the one-letter class codes into full labels.
# The patterns are anchored (^...$) so only a value that is exactly the code
# is replaced, never a letter inside a longer string; this is the same anchored
# form used for the odor, spore-print-color and population recodes.
# gsub() returns a character vector, so `class` is character after this step.
mushrooms$class <- gsub("^e$", "edible", mushrooms$class)
mushrooms$class <- gsub("^p$", "poisonous", mushrooms$class)
Let’s look at some traits of edible mushrooms:
# Keep only the rows whose class is "edible" (which() drops NA comparisons,
# as subset() does), then summarise every column for that group.
edible <- mushrooms[which(mushrooms$class == "edible"), , drop = FALSE]
summary(object = edible)
## class cap-shape cap-surface cap-color bruises
## Length:4208 b: 404 f:1560 n :1264 f:1456
## Class :character c: 0 g: 0 g :1032 t:2752
## Mode :character f:1596 s:1144 w : 720
## k: 228 y:1504 e : 624
## s: 32 y : 400
## x:1948 p : 56
## (Other): 112
## odor gill-attachment gill-spacing gill-size gill-color
## n :3408 a: 192 c:3008 b:3920 w :956
## a : 400 f:4016 w:1200 n: 288 n :936
## l : 400 p :852
## c : 0 u :444
## f : 0 k :344
## m : 0 g :248
## (Other): 0 (Other):428
## stalk-shape stalk-root stalk-surface-above-ring stalk-surface-below-ring
## e:1616 ?: 720 f: 408 f: 456
## t:2592 b:1920 k: 144 k: 144
## c: 512 s:3640 s:3400
## e: 864 y: 16 y: 208
## r: 192
##
##
## stalk-color-above-ring stalk-color-below-ring veil-type veil-color
## w :2752 w :2704 p:4208 n: 96
## g : 576 g : 576 o: 96
## p : 576 p : 576 w:4016
## o : 192 o : 192 y: 0
## e : 96 e : 96
## n : 16 n : 64
## (Other): 0 (Other): 0
## ring-number ring-type spore-print-color population habitat
## n: 0 e:1008 n :1744 a: 384 d:1880
## o:3680 f: 48 k :1648 c: 288 g:1408
## t: 528 l: 0 w : 576 n: 400 l: 240
## n: 0 b : 48 s: 880 m: 256
## p:3152 h : 48 v:1192 p: 136
## o : 48 y:1064 u: 96
## (Other): 96 w: 192
Then look at traits of poisonous mushrooms and compare those with edible mushrooms:
# Keep only the rows whose class is "poisonous" (which() drops NA comparisons,
# as subset() does), then summarise every column for that group.
poisonous <- mushrooms[which(mushrooms$class == "poisonous"), , drop = FALSE]
summary(object = poisonous)
## class cap-shape cap-surface cap-color bruises
## Length:3916 b: 48 f: 760 n :1020 f:3292
## Class :character c: 4 g: 4 e : 876 t: 624
## Mode :character f:1556 s:1412 g : 808
## k: 600 y:1740 y : 672
## s: 0 w : 320
## x:1708 b : 120
## (Other): 100
## odor gill-attachment gill-spacing gill-size gill-color
## f :2160 a: 18 c:3804 b:1692 b :1728
## s : 576 f:3898 w: 112 n:2224 p : 640
## y : 576 h : 528
## p : 256 g : 504
## c : 192 w : 246
## n : 120 n : 112
## (Other): 36 (Other): 158
## stalk-shape stalk-root stalk-surface-above-ring stalk-surface-below-ring
## e:1900 ?:1760 f: 144 f: 144
## t:2016 b:1856 k:2228 k:2160
## c: 44 s:1536 s:1536
## e: 256 y: 8 y: 76
## r: 0
##
##
## stalk-color-above-ring stalk-color-below-ring veil-type veil-color
## w :1712 w :1680 p:3916 n: 0
## p :1296 p :1296 o: 0
## b : 432 n : 448 w:3908
## n : 432 b : 432 y: 8
## c : 36 c : 36
## y : 8 y : 24
## (Other): 0 (Other): 0
## ring-number ring-type spore-print-color population habitat
## n: 36 e:1768 w :1812 a: 0 d:1268
## o:3808 f: 0 h :1584 c: 52 g: 740
## t: 72 l:1296 k : 224 n: 0 l: 592
## n: 36 n : 224 s: 368 m: 36
## p: 816 r : 72 v:2848 p:1008
## b : 0 y: 648 u: 272
## (Other): 0 w: 0
odor: almond=a,anise=l,creosote=c,fishy=y,foul=f,musty=m,none=n,pungent=p,spicy=s
# Expand the one-letter odor codes into full labels.
# Each entry maps an anchored pattern to its replacement; the replacements are
# applied one after another, in the order listed, to the whole column.
# gsub() returns character, so the column is character afterwards.
mushrooms$odor <- local({
  odor_labels <- c(
    "^a$" = "almond",
    "^l$" = "anise",
    "^c$" = "creosote",
    "^y$" = "fishy",
    "^f$" = "foul",
    "^m$" = "musty",
    "^n$" = "none",
    "^p$" = "pungent",
    "^s$" = "spicy"
  )
  Reduce(
    function(values, pattern) gsub(pattern, odor_labels[[pattern]], values),
    names(odor_labels),
    mushrooms$odor
  )
})
spore-print-color: black=k,brown=n,buff=b,chocolate=h,green=r,orange=o,purple=u,white=w,yellow=y
# Expand the one-letter spore-print-color codes into full labels.
# Each entry maps an anchored pattern to its replacement; the replacements are
# applied one after another, in the order listed, to the whole column.
# The column name is not a syntactic identifier, hence the [[ "..." ]] access.
mushrooms[["spore-print-color"]] <- local({
  spore_labels <- c(
    "^k$" = "black",
    "^n$" = "brown",
    "^b$" = "buff",
    "^h$" = "chocolate",
    "^r$" = "green",
    "^o$" = "orange",
    "^u$" = "purple",
    "^w$" = "white",
    "^y$" = "yellow"
  )
  Reduce(
    function(values, pattern) gsub(pattern, spore_labels[[pattern]], values),
    names(spore_labels),
    mushrooms[["spore-print-color"]]
  )
})
population: abundant=a,clustered=c,numerous=n,scattered=s,several=v,solitary=y
# Expand the one-letter population codes into full labels.
# Each entry maps an anchored pattern to its replacement; the replacements are
# applied one after another, in the order listed, to the whole column.
mushrooms$population <- local({
  population_labels <- c(
    "^a$" = "abundant",
    "^c$" = "clustered",
    "^n$" = "numerous",
    "^s$" = "scattered",
    "^v$" = "several",
    "^y$" = "solitary"
  )
  Reduce(
    function(values, pattern) gsub(pattern, population_labels[[pattern]], values),
    names(population_labels),
    mushrooms$population
  )
})
Create a subset containing only “class”, “odor”, “spore-print-color” and “population” for analysis:
# Narrow the data to the four columns used in the analysis (all rows kept),
# then summarise them.
analysis <- mushrooms[
  ,
  c("class", "odor", "spore-print-color", "population"),
  drop = FALSE
]
summary(object = analysis)
## class odor spore-print-color
## Length:8124 Length:8124 Length:8124
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
## population
## Length:8124
## Class :character
## Mode :character
# Check the column types of the analysis subset.
str(object = analysis)
## 'data.frame': 8124 obs. of 4 variables:
## $ class : chr "poisonous" "edible" "edible" "poisonous" ...
## $ odor : chr "pungent" "almond" "anise" "pungent" ...
## $ spore-print-color: chr "black" "brown" "brown" "black" ...
## $ population : chr "scattered" "numerous" "numerous" "scattered" ...
Change variable types to factor:
# Convert the four analysis columns to factors in one pass, so that summary()
# reports per-level counts.
analysis[c("class", "odor", "spore-print-color", "population")] <- lapply(
  analysis[c("class", "odor", "spore-print-color", "population")],
  as.factor
)
summary(object = analysis)
## class odor spore-print-color population
## edible :4208 none :3528 white :2388 abundant : 384
## poisonous:3916 foul :2160 brown :1968 clustered: 340
## fishy : 576 black :1872 numerous : 400
## spicy : 576 chocolate:1632 scattered:1248
## almond : 400 green : 72 several :4040
## anise : 400 buff : 48 solitary :1712
## (Other): 484 (Other) : 144
# Stacked bar chart of row counts per odor, split and coloured by class.
odor <- analysis[, c("class", "odor"), drop = FALSE]
a <- ggplot(data = odor, mapping = aes(x = odor, fill = class, color = class)) +
  geom_bar(position = "stack")
a
# Stacked bar chart of row counts per spore print color, split and coloured by
# class. The column name needs backticks inside aes() because of the hyphens.
spore <- analysis[, c("class", "spore-print-color"), drop = FALSE]
b <- ggplot(
  data = spore,
  mapping = aes(x = `spore-print-color`, fill = class, color = class)
) +
  geom_bar(position = "stack")
b
# Stacked bar chart of row counts per population type, split and coloured by
# class.
# NOTE(review): the plot object is named `c`, the same name as base R's c();
# the name is kept here so that any later reference to it keeps working.
population <- analysis[, c("class", "population"), drop = FALSE]
c <- ggplot(
  data = population,
  mapping = aes(x = population, fill = class, color = class)
) +
  geom_bar(position = "stack")
c