In collaboration with Bryce O’Connor in Part 1
# Load the UCI mushroom data (agaricus-lepiota.data) into `mushroom` and
# print a per-column summary.
#
# The file is comma separated and has no header row, so it is downloaded and
# parsed exactly once with read.csv(header = FALSE) and the column names are
# supplied here. read.csv()'s default check.names = TRUE turns the hyphens
# into dots ("cap-shape" becomes cap.shape).
#
# "?" is intentionally NOT passed as na.strings: it stays an ordinary factor
# level of stalk.root, so the rows with an unknown stalk root are kept.
mushroom <- read.csv(
  file = "http://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data",
  header = FALSE,
  sep = ",",
  strip.white = TRUE,
  stringsAsFactors = TRUE,
  col.names = c(
    "class", "cap-shape", "cap-surface", "cap-color", "bruises",
    "odor", "gill-attachment", "gill-spacing", "gill-size",
    "gill-color", "stalk-shape", "stalk-root", "stalk-surface-above-ring",
    "stalk-surface-below-ring", "stalk-color-above-ring",
    "stalk-color-below-ring", "veil-type", "veil-color", "ring-number",
    "ring-type", "spore-print-color", "population", "habitat"
  )
)
summary(mushroom)
## class cap.shape cap.surface cap.color bruises odor
## e:4208 b: 452 f:2320 n :2284 f:4748 n :3528
## p:3916 c: 4 g: 4 g :1840 t:3376 f :2160
## f:3152 s:2556 e :1500 s : 576
## k: 828 y:3244 y :1072 y : 576
## s: 32 w :1040 a : 400
## x:3656 b : 168 l : 400
## (Other): 220 (Other): 484
## gill.attachment gill.spacing gill.size gill.color stalk.shape stalk.root
## a: 210 c:6812 b:5612 b :1728 e:3516 ?:2480
## f:7914 w:1312 n:2512 p :1492 t:4608 b:3776
## w :1202 c: 556
## n :1048 e:1120
## g : 752 r: 192
## h : 732
## (Other):1170
## stalk.surface.above.ring stalk.surface.below.ring stalk.color.above.ring
## f: 552 f: 600 w :4464
## k:2372 k:2304 p :1872
## s:5176 s:4936 g : 576
## y: 24 y: 284 n : 448
## b : 432
## o : 192
## (Other): 140
## stalk.color.below.ring veil.type veil.color ring.number ring.type
## w :4384 p:8124 n: 96 n: 36 e:2776
## p :1872 o: 96 o:7488 f: 48
## g : 576 w:7924 t: 600 l:1296
## n : 512 y: 8 n: 36
## b : 432 p:3968
## o : 192
## (Other): 156
## spore.print.color population habitat
## w :2388 a: 384 d:3148
## n :1968 c: 340 g:2148
## k :1872 n: 400 l: 832
## h :1632 s:1248 m: 292
## r : 72 v:4040 p:1144
## b : 48 y:1712 u: 368
## (Other): 144 w: 192
# Tabulate how many factor levels each column of `mushroom` has.
# vapply() is used instead of sapply() so the result is guaranteed to be one
# numeric value per column; nlevels(x) is length(levels(x)).
# The names carried by the vapply() result become the row names of the table.
shroom.datalevels <- cbind.data.frame(
  Variable = names(mushroom),
  Total_Levels = vapply(
    mushroom,
    function(column) as.numeric(nlevels(column)),
    numeric(1)
  )
)
shroom.datalevels
## Variable Total_Levels
## class class 2
## cap.shape cap.shape 6
## cap.surface cap.surface 4
## cap.color cap.color 10
## bruises bruises 2
## odor odor 9
## gill.attachment gill.attachment 2
## gill.spacing gill.spacing 2
## gill.size gill.size 2
## gill.color gill.color 12
## stalk.shape stalk.shape 2
## stalk.root stalk.root 5
## stalk.surface.above.ring stalk.surface.above.ring 4
## stalk.surface.below.ring stalk.surface.below.ring 4
## stalk.color.above.ring stalk.color.above.ring 9
## stalk.color.below.ring stalk.color.below.ring 9
## veil.type veil.type 1
## veil.color veil.color 4
## ring.number ring.number 3
## ring.type ring.type 5
## spore.print.color spore.print.color 9
## population population 6
## habitat habitat 7
# Replace the single-letter codes of every factor in `mushroom` with readable
# labels.
#
# Each column has an explicit code -> label map, so a label is attached to the
# code it belongs to rather than to whatever position that code happens to
# occupy in levels(). An unexpected code stops with an error instead of
# silently shifting every label by one. The order of the resulting levels is
# the order of the existing levels, so the outcome is the same as assigning
# the labels positionally to the alphabetically sorted codes.
mushroom <- local({
  level_labels <- list(
    class = c(e = "edible", p = "poisonous"),
    cap.shape = c(
      b = "bell", c = "conical", f = "flat", k = "knobbed", s = "sunken",
      x = "convex"
    ),
    cap.surface = c(f = "fibrous", g = "grooves", s = "smooth", y = "scaly"),
    cap.color = c(
      b = "buff", c = "cinnamon", e = "red", g = "gray", n = "brown",
      p = "pink", r = "green", u = "purple", w = "white", y = "yellow"
    ),
    bruises = c(f = "bruisesno", t = "bruisesyes"),
    odor = c(
      a = "almond", c = "creosote", f = "foul", l = "anise", m = "musty",
      n = "nosmell", p = "pungent", s = "spicy", y = "fishy"
    ),
    gill.attachment = c(a = "attached", f = "free"),
    gill.spacing = c(c = "close", w = "crowded"),
    gill.size = c(b = "broad", n = "narrow"),
    gill.color = c(
      b = "buff", e = "red", g = "gray", h = "chocolate", k = "black",
      n = "brown", o = "orange", p = "pink", r = "green", u = "purple",
      w = "white", y = "yellow"
    ),
    stalk.shape = c(e = "enlarging", t = "tapering"),
    # "?" is the data set's code for an unknown stalk root.
    stalk.root = c(
      "?" = "missing", b = "bulbous", c = "club", e = "equal", r = "rooted"
    ),
    stalk.surface.above.ring = c(
      f = "fibrous", k = "silky", s = "smooth", y = "scaly"
    ),
    stalk.surface.below.ring = c(
      f = "fibrous", k = "silky", s = "smooth", y = "scaly"
    ),
    stalk.color.above.ring = c(
      b = "buff", c = "cinnamon", e = "red", g = "gray", n = "brown",
      o = "orange", p = "pink", w = "white", y = "yellow"
    ),
    stalk.color.below.ring = c(
      b = "buff", c = "cinnamon", e = "red", g = "gray", n = "brown",
      o = "orange", p = "pink", w = "white", y = "yellow"
    ),
    veil.type = c(p = "partial"),
    veil.color = c(n = "brown", o = "orange", w = "white", y = "yellow"),
    ring.number = c(n = "none", o = "one", t = "two"),
    ring.type = c(
      e = "evanescent", f = "flaring", l = "large", n = "none", p = "pendant"
    ),
    spore.print.color = c(
      b = "buff", h = "chocolate", k = "black", n = "brown", o = "orange",
      r = "green", u = "purple", w = "white", y = "yellow"
    ),
    population = c(
      a = "abundant", c = "clustered", n = "numerous", s = "scattered",
      v = "several", y = "solitary"
    ),
    habitat = c(
      d = "woods", g = "grasses", l = "leaves", m = "meadows", p = "paths",
      u = "urban", w = "waste"
    )
  )

  # Relabel one factor; `labels` is a named character vector (code = label).
  relabel <- function(column, labels, column_name) {
    if (!is.factor(column)) {
      stop(sprintf("Column '%s' is not a factor.", column_name), call. = FALSE)
    }
    codes <- levels(column)
    # Already carries the readable labels (e.g. the chunk was re-run): no-op.
    if (all(codes %in% labels)) {
      return(column)
    }
    unknown <- setdiff(codes, names(labels))
    if (length(unknown) > 0) {
      stop(
        sprintf(
          "Column '%s' has unexpected level(s): %s",
          column_name, paste(unknown, collapse = ", ")
        ),
        call. = FALSE
      )
    }
    levels(column) <- unname(labels[codes])
    column
  }

  for (column_name in names(level_labels)) {
    mushroom[[column_name]] <- relabel(
      mushroom[[column_name]],
      level_labels[[column_name]],
      column_name
    )
  }
  mushroom
})
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.6.2
## -- Attaching packages -------------------------------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.2.1 v purrr 0.3.3
## v tibble 2.1.3 v dplyr 0.8.3
## v tidyr 1.0.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## Warning: package 'ggplot2' was built under R version 3.6.2
## Warning: package 'tidyr' was built under R version 3.6.2
## Warning: package 'readr' was built under R version 3.6.2
## Warning: package 'purrr' was built under R version 3.6.2
## Warning: package 'dplyr' was built under R version 3.6.2
## Warning: package 'stringr' was built under R version 3.6.2
## Warning: package 'forcats' was built under R version 3.6.2
## -- Conflicts ----------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
#p <- ggplot(data = mushroom, aes(x=cap.shape, y=cap.surface, color=class))
#p + geom_bar(alpha=0.3) + scale_color_manual(breaks = c('edible','poisonous'),values=c('darkgreen','red'))
# What is he smoking? What does this mean? This code definitely does not make the same graph.
#Bryce and I worked together to make a plot that looks more like the picture on the blog
# Bar chart of cap-shape counts, with the bars for each class placed side by
# side (position "dodge") and filled by class.
ggplot(mushroom, aes(x = cap.shape, fill = class)) +
  geom_bar(position = "dodge")
# At this point I've decided to abandon this guy's code because this is nonsense.
# I tried this before I found out the graph was a mosaic plot, so this was my attempt to make the data work.
#habitat <- ggplot(mushroom, mapping = aes(x=population , y=habitat, fill=class))+
#geom_bin2d(binwidth = 0.2)
# It's close, but not what I was looking for.
OK, I found a link to this guy's blog, where he went through this same project but with the correct code. From this point on I was using code from this blog: #https://duttashi.github.io/blog/to-eat-or-not-to-eat/?utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+RStoriesDataSpeak+%28R+%E2%80%93+Stories+Data+Speak%29
library(vcd)
## Warning: package 'vcd' was built under R version 3.6.3
## Loading required package: grid
# Cross-tabulate habitat (rows) against class (columns).
table(mushroom[["habitat"]], mushroom[["class"]])
##
## edible poisonous
## woods 1880 1268
## grasses 1408 740
## leaves 240 592
## meadows 256 36
## paths 136 1008
## urban 96 272
## waste 192 0
# Shaded mosaic plot of habitat against class.
mosaicplot(
  ~ habitat + class,
  data = mushroom,
  main = "Bivariate data visualization",
  sub = "Relationship between mushroom habitat and class",
  xlab = "habitat",
  ylab = "class",
  shade = TRUE,
  cex.axis = 0.9,
  las = 2,
  off = 10,
  border = "chocolate"
)
# Cross-tabulate population (rows) against class (columns).
table(mushroom[["population"]], mushroom[["class"]])
##
## edible poisonous
## abundant 384 0
## clustered 288 52
## numerous 400 0
## scattered 880 368
## several 1192 2848
## solitary 1064 648
# Shaded mosaic plot of population against class.
mosaicplot(
  ~ population + class,
  data = mushroom,
  main = "Bivariate data visualization",
  sub = "Relationship between mushroom population and class",
  xlab = "population",
  ylab = "class",
  shade = TRUE,
  cex.axis = 0.9,
  las = 2,
  off = 10,
  border = "chocolate"
)
#Data Analysis
# Pearson chi-squared test of independence between cap shape and cap surface
# (no continuity correction requested).
chisq.test(
  x = mushroom$cap.shape,
  y = mushroom$cap.surface,
  correct = FALSE
)
## Warning in chisq.test(mushroom$cap.shape, mushroom$cap.surface, correct =
## FALSE): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mushroom$cap.shape and mushroom$cap.surface
## X-squared = 1011.5, df = 15, p-value < 2.2e-16
# Pearson chi-squared test of independence between habitat and odor
# (no continuity correction requested).
chisq.test(
  x = mushroom$habitat,
  y = mushroom$odor,
  correct = FALSE
)
## Warning in chisq.test(mushroom$habitat, mushroom$odor, correct = FALSE): Chi-
## squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mushroom$habitat and mushroom$odor
## X-squared = 6675.1, df = 48, p-value < 2.2e-16
library(GoodmanKruskal)
## Warning: package 'GoodmanKruskal' was built under R version 3.6.3
# Goodman-Kruskal tau association matrix for five selected variables,
# computed on a data frame holding only those columns and then plotted.
varset1 <- c("cap.shape", "cap.surface", "habitat", "odor", "class")
mushroomFrame1 <- mushroom[, varset1, drop = FALSE]
GKmatrix1 <- GKtauDataframe(mushroomFrame1)
plot(GKmatrix1, corrColors = "blue")
#make your own graphics
library(ggplot2)
library(treemapify)
## Warning: package 'treemapify' was built under R version 3.6.3
#shroom.tree <- (data=mushroom, aes(area = mushroom$cap.color, fill = mushroom$class))+
# geom_treemap()
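#For some reason that won't work, and for the life of me I can't figure out why.
#Likely cause (review note, to confirm): the call above has no function name before the opening parenthesis (`ggplot` is missing), and geom_treemap()'s `area` aesthetic needs a numeric value such as a count per cap color, not the factor itself.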
# Two views of cap color against class: a shaded mosaic plot ...
mosaicplot(
  ~ cap.color + class,
  data = mushroom,
  main = "Relationship between mushroom cap color and class",
  xlab = "cap color",
  ylab = "class",
  shade = TRUE,
  cex.axis = 0.9,
  las = 2,
  off = 10,
  border = "chocolate"
)
# ... and a bar chart of counts with the classes side by side.
ggplot(mushroom, aes(x = cap.color, fill = class)) +
  geom_bar(position = "dodge")
From these two graphics it is clear that cap color by itself is not a great indicator of whether a mushroom is edible.
#I couldn't get the method from RPubs to work, so I used a different method for creating trees
library(rpart)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 3.6.3
library(ggplot2)
library(tidyverse)
library (caTools)
## Warning: package 'caTools' was built under R version 3.6.3
# Split `mushroom` 70/30 into training and test sets, fit a classification
# tree for `class` on the training rows, and print its summary.
set.seed(1)
# sample.split() takes the vector of outcome labels (one entry per row) and
# returns one TRUE/FALSE flag per entry. Passing the whole data frame instead
# yields one flag per *column*, which subset() then recycles down the rows,
# giving a fixed repeating pattern rather than a random split.
shroom.sample <- sample.split(mushroom$class, SplitRatio = 0.70)
shroom.train <- subset(mushroom, shroom.sample)
shroom.test <- subset(mushroom, !shroom.sample)
shroom.tree <- rpart(
  class ~ .,
  data = shroom.train,
  control = rpart.control(cp = 0.01)
)
summary(shroom.tree)
## Call:
## rpart(formula = class ~ ., data = shroom.train, control = rpart.control(cp = 0.01))
## n= 5652
##
## CP nsplit rel error xerror xstd
## 1 0.96627566 0 1.00000000 1.00000000 0.013770998
## 2 0.01942815 1 0.03372434 0.03372434 0.003487272
## 3 0.01000000 2 0.01429619 0.01429619 0.002281310
##
## Variable importance
## odor spore.print.color gill.color
## 25 19 15
## stalk.surface.above.ring ring.type stalk.surface.below.ring
## 13 13 13
##
## Node number 1: 5652 observations, complexity param=0.9662757
## predicted class=edible expected loss=0.482661 P(node) =1
## class counts: 2924 2728
## probabilities: 0.517 0.483
## left son=2 (3016 obs) right son=3 (2636 obs)
## Primary splits:
## odor splits as LRRLRLRRR, improve=2644.2140, (0 missing)
## spore.print.color splits as LRLLLRLRL, improve=1559.6140, (0 missing)
## gill.color splits as RLRRLLLLRLLL, improve=1081.6470, (0 missing)
## stalk.surface.above.ring splits as LRLL, improve= 982.3427, (0 missing)
## stalk.surface.below.ring splits as LRLL, improve= 922.0275, (0 missing)
## Surrogate splits:
## spore.print.color splits as LRLLLLLRL, agree=0.864, adj=0.709, (0 split)
## gill.color splits as RLRRLLLLLLLL, agree=0.814, adj=0.601, (0 split)
## stalk.surface.above.ring splits as LRLL, agree=0.783, adj=0.534, (0 split)
## ring.type splits as RLRRL, agree=0.782, adj=0.532, (0 split)
## stalk.surface.below.ring splits as LRLL, agree=0.781, adj=0.530, (0 split)
##
## Node number 2: 3016 observations, complexity param=0.01942815
## predicted class=edible expected loss=0.03050398 P(node) =0.5336164
## class counts: 2924 92
## probabilities: 0.969 0.031
## left son=4 (2963 obs) right son=5 (53 obs)
## Primary splits:
## spore.print.color splits as LLLLLRLLL, improve=101.41390, (0 missing)
## stalk.color.below.ring splits as --LLLLLLR, improve= 43.56869, (0 missing)
## gill.color splits as -LLLLLLLRLLL, improve= 28.33862, (0 missing)
## cap.color splits as RLLLLRLLLL, improve= 21.36008, (0 missing)
## stalk.color.above.ring splits as --LLLLLLR, improve= 15.07876, (0 missing)
## Surrogate splits:
## gill.color splits as -LLLLLLLRLLL, agree=0.987, adj=0.283, (0 split)
##
## Node number 3: 2636 observations
## predicted class=poisonous expected loss=0 P(node) =0.4663836
## class counts: 0 2636
## probabilities: 0.000 1.000
##
## Node number 4: 2963 observations
## predicted class=edible expected loss=0.01316234 P(node) =0.5242392
## class counts: 2924 39
## probabilities: 0.987 0.013
##
## Node number 5: 53 observations
## predicted class=poisonous expected loss=0 P(node) =0.009377212
## class counts: 0 53
## probabilities: 0.000 1.000
# Draw the fitted tree with base graphics, then add the split labels.
plot(x = shroom.tree)
text(x = shroom.tree, pretty = 0)
# This is a very large tree with 35 nodes, and because I am using this method
# I don't think I can use the cross-validation method.
# NOTE(review): the summary printed for this fit lists 2 splits (5 nodes) -
# confirm the node count quoted above.
# Need to find the cp that results in the lowest error.
plotcp(x = shroom.tree)
printcp(x = shroom.tree)
##
## Classification tree:
## rpart(formula = class ~ ., data = shroom.train, control = rpart.control(cp = 0.01))
##
## Variables actually used in tree construction:
## [1] odor spore.print.color
##
## Root node error: 2728/5652 = 0.48266
##
## n= 5652
##
## CP nsplit rel error xerror xstd
## 1 0.966276 0 1.000000 1.000000 0.0137710
## 2 0.019428 1 0.033724 0.033724 0.0034873
## 3 0.010000 2 0.014296 0.014296 0.0022813
# Print the CP value from the row of the tree's cptable whose xerror is
# smallest. local() keeps the helper names out of the workspace.
local({
  cp_table <- shroom.tree$cptable
  best_row <- which.min(cp_table[, "xerror"])
  cp_table[best_row, "CP"]
})
## [1] 0.01
#I kept changing the cp until I found the one with the smallest xerror, which was .0005
# Fit a second tree on the training data with the smaller complexity
# parameter (cp = 0.0005) and draw it with rpart.plot().
shroom.tree.prune <- rpart(
  class ~ .,
  data = shroom.train,
  control = rpart.control(cp = 0.0005)
)
rpart.plot(
  shroom.tree.prune,
  extra = 104,
  box.palette = "GnBu",
  branch.lty = 3,
  shadow.col = "gray",
  nn = TRUE
)