In collaboration with Bryce O’Connor in Part 1

Part 1

# Load the UCI mushroom data set. The file is comma-separated and has no
# header row, so the 23 column names are supplied explicitly (read.csv turns
# the hyphens into dots).
# The earlier read.table(header = TRUE) call was removed: it consumed the
# first mushroom as column names, split on whitespace instead of commas, and
# its result was overwritten immediately by this call anyway.
# "?" is deliberately kept as an ordinary level of stalk.root (not NA); it is
# relabelled as "missing" later in the script.
mushroom <- read.csv(
  file = "http://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data",
  header = FALSE, sep = ",", strip.white = TRUE,
  stringsAsFactors = TRUE,
  col.names = c("class","cap-shape","cap-surface","cap-color","bruises",
                "odor","gill-attachment","gill-spacing","gill-size",
                "gill-color","stalk-shape","stalk-root","stalk-surface-above-ring",
                "stalk-surface-below-ring","stalk-color-above-ring","stalk-color-below-ring",
                "veil-type","veil-color","ring-number","ring-type","spore-print-color",
                "population","habitat")
)
# Per-column level counts, as a quick sanity check of the import.
summary(mushroom)
##  class    cap.shape cap.surface   cap.color    bruises       odor     
##  e:4208   b: 452    f:2320      n      :2284   f:4748   n      :3528  
##  p:3916   c:   4    g:   4      g      :1840   t:3376   f      :2160  
##           f:3152    s:2556      e      :1500            s      : 576  
##           k: 828    y:3244      y      :1072            y      : 576  
##           s:  32                w      :1040            a      : 400  
##           x:3656                b      : 168            l      : 400  
##                                 (Other): 220            (Other): 484  
##  gill.attachment gill.spacing gill.size   gill.color   stalk.shape stalk.root
##  a: 210          c:6812       b:5612    b      :1728   e:3516      ?:2480    
##  f:7914          w:1312       n:2512    p      :1492   t:4608      b:3776    
##                                         w      :1202               c: 556    
##                                         n      :1048               e:1120    
##                                         g      : 752               r: 192    
##                                         h      : 732                         
##                                         (Other):1170                         
##  stalk.surface.above.ring stalk.surface.below.ring stalk.color.above.ring
##  f: 552                   f: 600                   w      :4464          
##  k:2372                   k:2304                   p      :1872          
##  s:5176                   s:4936                   g      : 576          
##  y:  24                   y: 284                   n      : 448          
##                                                    b      : 432          
##                                                    o      : 192          
##                                                    (Other): 140          
##  stalk.color.below.ring veil.type veil.color ring.number ring.type
##  w      :4384           p:8124    n:  96     n:  36      e:2776   
##  p      :1872                     o:  96     o:7488      f:  48   
##  g      : 576                     w:7924     t: 600      l:1296   
##  n      : 512                     y:   8                 n:  36   
##  b      : 432                                            p:3968   
##  o      : 192                                                     
##  (Other): 156                                                     
##  spore.print.color population habitat 
##  w      :2388      a: 384     d:3148  
##  n      :1968      c: 340     g:2148  
##  k      :1872      n: 400     l: 832  
##  h      :1632      s:1248     m: 292  
##  r      :  72      v:4040     p:1144  
##  b      :  48      y:1712     u: 368  
##  (Other): 144                 w: 192
# Number of factor levels in every column of the data set.
# vapply() guarantees exactly one number per column (sapply() can silently
# change its return type), and nlevels() is the built-in for
# length(levels(x)). The column names carried by the vapply() result become
# the row names of the table.
shroom.datalevels <- cbind.data.frame(
  Variable = names(mushroom),
  Total_Levels = vapply(mushroom, function(x) as.numeric(nlevels(x)), numeric(1))
)
shroom.datalevels
##                                          Variable Total_Levels
## class                                       class            2
## cap.shape                               cap.shape            6
## cap.surface                           cap.surface            4
## cap.color                               cap.color           10
## bruises                                   bruises            2
## odor                                         odor            9
## gill.attachment                   gill.attachment            2
## gill.spacing                         gill.spacing            2
## gill.size                               gill.size            2
## gill.color                             gill.color           12
## stalk.shape                           stalk.shape            2
## stalk.root                             stalk.root            5
## stalk.surface.above.ring stalk.surface.above.ring            4
## stalk.surface.below.ring stalk.surface.below.ring            4
## stalk.color.above.ring     stalk.color.above.ring            9
## stalk.color.below.ring     stalk.color.below.ring            9
## veil.type                               veil.type            1
## veil.color                             veil.color            4
## ring.number                           ring.number            3
## ring.type                               ring.type            5
## spore.print.color               spore.print.color            9
## population                             population            6
## habitat                                   habitat            7
# Replace the single-letter UCI codes with readable labels.
# Every entry maps code -> label, so the relabelling is keyed on the code
# itself and no longer depends on the order in which R happened to store the
# factor levels (positional assignment mislabels silently if that order ever
# changes, e.g. when "?" is read as NA).
mushroom_labels <- list(
  class = c(e = "edible", p = "poisonous"),
  cap.shape = c(b = "bell", c = "conical", f = "flat", k = "knobbed",
                s = "sunken", x = "convex"),
  cap.surface = c(f = "fibrous", g = "grooves", s = "smooth", y = "scaly"),
  cap.color = c(b = "buff", c = "cinnamon", e = "red", g = "gray",
                n = "brown", p = "pink", r = "green", u = "purple",
                w = "white", y = "yellow"),
  bruises = c(f = "bruisesno", t = "bruisesyes"),
  odor = c(a = "almond", c = "creosote", f = "foul", l = "anise",
           m = "musty", n = "nosmell", p = "pungent", s = "spicy",
           y = "fishy"),
  gill.attachment = c(a = "attached", f = "free"),
  gill.spacing = c(c = "close", w = "crowded"),
  gill.size = c(b = "broad", n = "narrow"),
  gill.color = c(b = "buff", e = "red", g = "gray", h = "chocolate",
                 k = "black", n = "brown", o = "orange", p = "pink",
                 r = "green", u = "purple", w = "white", y = "yellow"),
  stalk.shape = c(e = "enlarging", t = "tapering"),
  stalk.root = c("?" = "missing", b = "bulbous", c = "club", e = "equal",
                 r = "rooted"),
  stalk.surface.above.ring = c(f = "fibrous", k = "silky", s = "smooth",
                               y = "scaly"),
  stalk.surface.below.ring = c(f = "fibrous", k = "silky", s = "smooth",
                               y = "scaly"),
  stalk.color.above.ring = c(b = "buff", c = "cinnamon", e = "red",
                             g = "gray", n = "brown", o = "orange",
                             p = "pink", w = "white", y = "yellow"),
  stalk.color.below.ring = c(b = "buff", c = "cinnamon", e = "red",
                             g = "gray", n = "brown", o = "orange",
                             p = "pink", w = "white", y = "yellow"),
  veil.type = c(p = "partial"),
  veil.color = c(n = "brown", o = "orange", w = "white", y = "yellow"),
  ring.number = c(n = "none", o = "one", t = "two"),
  ring.type = c(e = "evanescent", f = "flaring", l = "large", n = "none",
                p = "pendant"),
  spore.print.color = c(b = "buff", h = "chocolate", k = "black",
                        n = "brown", o = "orange", r = "green",
                        u = "purple", w = "white", y = "yellow"),
  population = c(a = "abundant", c = "clustered", n = "numerous",
                 s = "scattered", v = "several", y = "solitary"),
  habitat = c(d = "woods", g = "grasses", l = "leaves", m = "meadows",
              p = "paths", u = "urban", w = "waste")
)

for (label_column in names(mushroom_labels)) {
  code_map <- mushroom_labels[[label_column]]
  present_codes <- levels(mushroom[[label_column]])
  # Fail loudly instead of producing NA levels if the data contains a code
  # that has no label in the map.
  unmapped_codes <- setdiff(present_codes, names(code_map))
  if (length(unmapped_codes) > 0) {
    stop(
      "No label defined for code(s) ",
      paste(unmapped_codes, collapse = ", "),
      " in column ", label_column,
      call. = FALSE
    )
  }
  levels(mushroom[[label_column]]) <- unname(code_map[present_codes])
}

Data visualisation

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.6.2
## -- Attaching packages -------------------------------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.2.1     v purrr   0.3.3
## v tibble  2.1.3     v dplyr   0.8.3
## v tidyr   1.0.2     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0
## Warning: package 'ggplot2' was built under R version 3.6.2
## Warning: package 'tidyr' was built under R version 3.6.2
## Warning: package 'readr' was built under R version 3.6.2
## Warning: package 'purrr' was built under R version 3.6.2
## Warning: package 'dplyr' was built under R version 3.6.2
## Warning: package 'stringr' was built under R version 3.6.2
## Warning: package 'forcats' was built under R version 3.6.2
## -- Conflicts ----------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
#p <- ggplot(data = mushroom, aes(x=cap.shape, y=cap.surface, color=class))
#p + geom_bar(alpha=0.3) + scale_color_manual(breaks = c('edible','poisonous'),values=c('darkgreen','red'))
#I can't tell what the blog author intended here; this code definitely does not make the same graph
#Bryce and I worked together to make a plot that looks more like the picture on the blog
# Side-by-side bar chart of cap shape counts, one bar per class.
ggplot(mushroom, mapping = aes(x = cap.shape, fill = class)) +
  geom_bar(position = "dodge")

#At this point I've decided to abandon this guy's code because it is nonsense
#I tried this before I found out the graph was a mosaic plot, so this was my attempt to make the data work
#habitat <- ggplot(mushroom, mapping = aes(x=population , y=habitat, fill=class))+
#geom_bin2d(binwidth = 0.2) 
#its close but not what I was looking for

OK, I found a link to another blog whose author went through this same project, but with the correct code. From this point on I am using code from that blog: https://duttashi.github.io/blog/to-eat-or-not-to-eat/?utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+RStoriesDataSpeak+%28R+%E2%80%93+Stories+Data+Speak%29

library(vcd)
## Warning: package 'vcd' was built under R version 3.6.3
## Loading required package: grid
# Cross-tabulate habitat against class (the counts behind the mosaic plot below).
table(mushroom$habitat, mushroom$class) 
##          
##           edible poisonous
##   woods     1880      1268
##   grasses   1408       740
##   leaves     240       592
##   meadows    256        36
##   paths      136      1008
##   urban       96       272
##   waste      192         0
# Shaded mosaic plot of habitat against class.
mosaicplot(
  ~ habitat + class,
  data = mushroom,
  main = "Bivariate data visualization",
  sub = "Relationship between mushroom habitat and class",
  xlab = "habitat", ylab = "class",
  shade = TRUE, border = "chocolate",
  cex.axis = 0.9, las = 2, off = 10
)

# Cross-tabulate population against class (the counts behind the mosaic plot below).
table(mushroom$population, mushroom$class)
##            
##             edible poisonous
##   abundant     384         0
##   clustered    288        52
##   numerous     400         0
##   scattered    880       368
##   several     1192      2848
##   solitary    1064       648
# Shaded mosaic plot of population against class.
mosaicplot(
  ~ population + class,
  data = mushroom,
  main = "Bivariate data visualization",
  sub = "Relationship between mushroom population and class",
  xlab = "population", ylab = "class",
  shade = TRUE, border = "chocolate",
  cex.axis = 0.9, las = 2, off = 10
)

#Data Analysis

# Pearson chi-squared test of independence between cap shape and cap surface.
# `correct` only applies to 2-by-2 tables, so it has no effect on this table.
# R warns that the chi-squared approximation may be incorrect here; presumably
# some cells are very sparse (e.g. only 4 conical caps) -- TODO confirm by
# inspecting the expected counts.
chisq.test(mushroom$cap.shape, mushroom$cap.surface, correct = FALSE)
## Warning in chisq.test(mushroom$cap.shape, mushroom$cap.surface, correct =
## FALSE): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  mushroom$cap.shape and mushroom$cap.surface
## X-squared = 1011.5, df = 15, p-value < 2.2e-16
# Pearson chi-squared test of independence between habitat and odor.
# `correct` only applies to 2-by-2 tables, so it has no effect on this table.
# The same "approximation may be incorrect" warning is raised; presumably due
# to sparse cells -- TODO confirm by inspecting the expected counts.
chisq.test(mushroom$habitat, mushroom$odor, correct = FALSE)
## Warning in chisq.test(mushroom$habitat, mushroom$odor, correct = FALSE): Chi-
## squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  mushroom$habitat and mushroom$odor
## X-squared = 6675.1, df = 48, p-value < 2.2e-16
library(GoodmanKruskal)
## Warning: package 'GoodmanKruskal' was built under R version 3.6.3
# Goodman-Kruskal tau association matrix for a handful of variables,
# including the class label, drawn as a plot.
varset1 <- c("cap.shape", "cap.surface", "habitat", "odor", "class")
mushroomFrame1 <- mushroom[, varset1, drop = FALSE]
GKmatrix1 <- GKtauDataframe(mushroomFrame1)
plot(GKmatrix1, corrColors = "blue")

Part 2

#make your own graphics

library(ggplot2)
library(treemapify)
## Warning: package 'treemapify' was built under R version 3.6.3
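#shroom.tree <- (data=mushroom, aes(area = mushroom$cap.color, fill = mushroom$class))+
 # geom_treemap()
#That doesn't work for two reasons: the ggplot() call itself is missing before the
#opening parenthesis, and geom_treemap() needs a numeric `area`, while cap.color is a factor.
#Counting the mushrooms per cap color and class first should work, e.g.:
#cap.counts <- as.data.frame(table(cap.color = mushroom$cap.color, class = mushroom$class))
#ggplot(cap.counts, aes(area = Freq, fill = class, label = cap.color)) +
#  geom_treemap() + geom_treemap_text()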

# Shaded mosaic plot of cap color against class.
mosaicplot(
  ~ cap.color + class,
  data = mushroom,
  main = "Relationship between mushroom cap color and class",
  xlab = "cap color", ylab = "class",
  shade = TRUE, border = "chocolate",
  cex.axis = 0.9, las = 2, off = 10
)

# The same comparison as side-by-side bars, one bar per class.
ggplot(mushroom, mapping = aes(x = cap.color, fill = class)) +
  geom_bar(position = "dodge")

From these two graphics it is clear that cap color by itself is not a great indicator of whether a mushroom is edible.

Part 3

fit a classification model

#I couldn't get the method from RPubs to work, so I used a different method for creating trees
library(rpart)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 3.6.3
library(ggplot2)
library(tidyverse)
library (caTools)
## Warning: package 'caTools' was built under R version 3.6.3
# Fix the RNG so the train/test split is reproducible.
set.seed(1)

# sample.split() expects the vector of class labels, not the whole data frame.
# Given the data frame it returns one flag per *column* (23 values), which
# subset() then recycles down the rows -- a repeating pattern rather than a
# random 70/30 split of the mushrooms that keeps the class ratio.
shroom.sample <- sample.split(mushroom$class, SplitRatio = 0.70)
shroom.train <- subset(mushroom, shroom.sample)
shroom.test <- subset(mushroom, !shroom.sample)

# Fit a classification tree predicting class from every other column of the
# training set, with complexity parameter cp = 0.01.
shroom.tree <- rpart(
  class ~ .,
  data = shroom.train,
  control = rpart.control(cp = 0.01)
)
# Print the cp table, variable importance and per-node split details.
summary(shroom.tree)
## Call:
## rpart(formula = class ~ ., data = shroom.train, control = rpart.control(cp = 0.01))
##   n= 5652 
## 
##           CP nsplit  rel error     xerror        xstd
## 1 0.96627566      0 1.00000000 1.00000000 0.013770998
## 2 0.01942815      1 0.03372434 0.03372434 0.003487272
## 3 0.01000000      2 0.01429619 0.01429619 0.002281310
## 
## Variable importance
##                     odor        spore.print.color               gill.color 
##                       25                       19                       15 
## stalk.surface.above.ring                ring.type stalk.surface.below.ring 
##                       13                       13                       13 
## 
## Node number 1: 5652 observations,    complexity param=0.9662757
##   predicted class=edible     expected loss=0.482661  P(node) =1
##     class counts:  2924  2728
##    probabilities: 0.517 0.483 
##   left son=2 (3016 obs) right son=3 (2636 obs)
##   Primary splits:
##       odor                     splits as  LRRLRLRRR,    improve=2644.2140, (0 missing)
##       spore.print.color        splits as  LRLLLRLRL,    improve=1559.6140, (0 missing)
##       gill.color               splits as  RLRRLLLLRLLL, improve=1081.6470, (0 missing)
##       stalk.surface.above.ring splits as  LRLL,         improve= 982.3427, (0 missing)
##       stalk.surface.below.ring splits as  LRLL,         improve= 922.0275, (0 missing)
##   Surrogate splits:
##       spore.print.color        splits as  LRLLLLLRL,    agree=0.864, adj=0.709, (0 split)
##       gill.color               splits as  RLRRLLLLLLLL, agree=0.814, adj=0.601, (0 split)
##       stalk.surface.above.ring splits as  LRLL,         agree=0.783, adj=0.534, (0 split)
##       ring.type                splits as  RLRRL,        agree=0.782, adj=0.532, (0 split)
##       stalk.surface.below.ring splits as  LRLL,         agree=0.781, adj=0.530, (0 split)
## 
## Node number 2: 3016 observations,    complexity param=0.01942815
##   predicted class=edible     expected loss=0.03050398  P(node) =0.5336164
##     class counts:  2924    92
##    probabilities: 0.969 0.031 
##   left son=4 (2963 obs) right son=5 (53 obs)
##   Primary splits:
##       spore.print.color      splits as  LLLLLRLLL,    improve=101.41390, (0 missing)
##       stalk.color.below.ring splits as  --LLLLLLR,    improve= 43.56869, (0 missing)
##       gill.color             splits as  -LLLLLLLRLLL, improve= 28.33862, (0 missing)
##       cap.color              splits as  RLLLLRLLLL,   improve= 21.36008, (0 missing)
##       stalk.color.above.ring splits as  --LLLLLLR,    improve= 15.07876, (0 missing)
##   Surrogate splits:
##       gill.color splits as  -LLLLLLLRLLL, agree=0.987, adj=0.283, (0 split)
## 
## Node number 3: 2636 observations
##   predicted class=poisonous  expected loss=0  P(node) =0.4663836
##     class counts:     0  2636
##    probabilities: 0.000 1.000 
## 
## Node number 4: 2963 observations
##   predicted class=edible     expected loss=0.01316234  P(node) =0.5242392
##     class counts:  2924    39
##    probabilities: 0.987 0.013 
## 
## Node number 5: 53 observations
##   predicted class=poisonous  expected loss=0  P(node) =0.009377212
##     class counts:     0    53
##    probabilities: 0.000 1.000
# Draw the tree skeleton, then add the split labels on top of it.
plot(shroom.tree)
# pretty = 0 asks for the factor level names to be written out unabbreviated.
text(shroom.tree, pretty=0)

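#This is a very large tree with 35 nodes and because I am using this method I don't think I can use the cross validation method
#NOTE(review): the summary above actually shows a small tree (2 splits, 5 nodes), and the xerror column of the cp table is rpart's own cross-validated error
#need to find the cp that results in the lowest error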

# Plot the cross-validated error against cp for the fitted tree.
plotcp(shroom.tree)

# The same information as a table: one row per cp value in the tree's cptable.
printcp(shroom.tree)
## 
## Classification tree:
## rpart(formula = class ~ ., data = shroom.train, control = rpart.control(cp = 0.01))
## 
## Variables actually used in tree construction:
## [1] odor              spore.print.color
## 
## Root node error: 2728/5652 = 0.48266
## 
## n= 5652 
## 
##         CP nsplit rel error   xerror      xstd
## 1 0.966276      0  1.000000 1.000000 0.0137710
## 2 0.019428      1  0.033724 0.033724 0.0034873
## 3 0.010000      2  0.014296 0.014296 0.0022813
# CP value of the cptable row with the lowest xerror. Because the tree was
# grown with cp = .01, the table stops at 0.01, so this can never suggest a
# smaller cp than the one used for the fit.
shroom.tree$cptable[which.min(shroom.tree$cptable[, "xerror"]), "CP"]
## [1] 0.01
#I kept changing the cp until I found the one with the smallest xerror, which was .0005
# Despite the name, this is a fresh fit with a much smaller cp (0.0005), not a
# call to prune() on the earlier tree.
shroom.tree.prune <- rpart(
  class ~ .,
  data = shroom.train,
  control = rpart.control(cp = 0.0005)
)

# Draw the refitted tree. extra = 104 shows the per-class probabilities plus
# the percentage of observations in each node; nn = TRUE prints node numbers.
rpart.plot(
  shroom.tree.prune,
  extra = 104,
  nn = TRUE,
  box.palette = "GnBu",
  shadow.col = "gray",
  branch.lty = 3
)