Mycology Data
Part 1: R-blogger example
library(readr)
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
theURL<- "http://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"
mushroom.data<- read.csv(file = theURL, header = FALSE, sep = ",",strip.white = TRUE,
stringsAsFactors = TRUE,
col.names = c("class","cap-shape","cap-surface","cap-color","bruises",
"odor","gill-attachment","gill-spacing","gill-size",
"gill-color","stalk-shape","stalk-root","stalk-surface-above-ring",
"stalk-surface-below-ring","stalk-color-above-ring","stalk-color-below-ring",
"veil-type","veil-color","ring-number","ring-type","spore-print-color",
"population","habitat"))
mushroom.data.levels <- cbind.data.frame(
  Variable = names(mushroom.data),
  Total_Levels = sapply(mushroom.data, function(x) { as.numeric(length(levels(x))) })
)
print(mushroom.data.levels)
## Variable Total_Levels
## class class 2
## cap.shape cap.shape 6
## cap.surface cap.surface 4
## cap.color cap.color 10
## bruises bruises 2
## odor odor 9
## gill.attachment gill.attachment 2
## gill.spacing gill.spacing 2
## gill.size gill.size 2
## gill.color gill.color 12
## stalk.shape stalk.shape 2
## stalk.root stalk.root 5
## stalk.surface.above.ring stalk.surface.above.ring 4
## stalk.surface.below.ring stalk.surface.below.ring 4
## stalk.color.above.ring stalk.color.above.ring 9
## stalk.color.below.ring stalk.color.below.ring 9
## veil.type veil.type 1
## veil.color veil.color 4
## ring.number ring.number 3
## ring.type ring.type 5
## spore.print.color spore.print.color 9
## population population 6
## habitat habitat 7
mushroom.data$gill.attachment<- NULL
levels(mushroom.data$class)<- c("edible","poisonous")
levels(mushroom.data$cap.shape)<-c("bell","conical","flat","knobbed","sunken","convex")
levels(mushroom.data$cap.surface)<- c("fibrous","grooves","smooth","scaly")
levels(mushroom.data$cap.color)<- c("buff","cinnamon","red","gray","brown","pink","green","purple","white","yellow")
levels(mushroom.data$bruises)<- c("bruisesno","bruisesyes")
levels(mushroom.data$odor)<-c("almond","creosote","foul","anise","musty","nosmell","pungent","spicy","fishy")
levels(mushroom.data$gill.spacing)<- c("close","crowded")
levels(mushroom.data$gill.size)<-c("broad","narrow")
levels(mushroom.data$gill.color)<- c("buff","red","gray","chocolate","black","brown","orange","pink","green","purple","white","yellow")
levels(mushroom.data$stalk.shape)<- c("enlarging","tapering")
table(mushroom.data$stalk.root)
##
## ? b c e r
## 2480 3776 556 1120 192
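Before recoding the '?' level as "missing" below, it is worth quantifying how common it is (2480 of 8124 rows, roughly 30%); a quick sketch:
# proportion of each stalk.root code, including the '?' missing-value code
round(prop.table(table(mushroom.data$stalk.root)), 3)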
levels(mushroom.data$stalk.root)<- c("missing","bulbous","club","equal","rooted")
levels(mushroom.data$stalk.surface.above.ring)<-c("fibrous","silky","smooth","scaly")
levels(mushroom.data$stalk.surface.below.ring)<-c("fibrous","silky","smooth","scaly")
levels(mushroom.data$stalk.color.above.ring)<- c("buff","cinnamon","red","gray","brown", "orange","pink","white","yellow")
levels(mushroom.data$stalk.color.below.ring)<- c("buff","cinnamon","red","gray","brown", "orange","pink","white","yellow")
levels(mushroom.data$veil.type)<-c("partial")
levels(mushroom.data$veil.color)<- c("brown","orange","white","yellow")
levels(mushroom.data$ring.number)<-c("none","one","two")
levels(mushroom.data$ring.type)<- c("evanescent","flaring","large","none","pendant")
levels(mushroom.data$spore.print.color)<- c("buff","chocolate","black","brown","orange","green","purple","white","yellow")
levels(mushroom.data$population)<- c("abundant","clustered","numerous","scattered","several","solitary")
levels(mushroom.data$habitat)<-c("woods","grasses","leaves","meadows","paths","urban","waste")
p<- ggplot(data = mushroom.data, aes(x=cap.shape, y=cap.surface, color=class))
p + geom_jitter(alpha=0.3) + scale_color_manual(breaks = c('edible','poisonous'),values=c('darkgreen','red'))
p<- ggplot(data = mushroom.data, aes(x=population, y=habitat, color=class))
p + geom_jitter(alpha=0.3) +
scale_color_manual(breaks = c('edible','poisonous'),values=c('darkgreen','red'))
p<- ggplot(data = mushroom.data, aes(x=habitat, y=odor, color=class))
p + geom_jitter(alpha=0.3) +scale_color_manual(breaks = c('edible','poisonous'),values=c('darkgreen','red'))
chisq.test(mushroom.data$cap.shape, mushroom.data$cap.surface, correct = FALSE)
## Warning in chisq.test(mushroom.data$cap.shape, mushroom.data$cap.surface, : Chi-
## squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mushroom.data$cap.shape and mushroom.data$cap.surface
## X-squared = 1011.5, df = 15, p-value < 2.2e-16
chisq.test(mushroom.data$habitat, mushroom.data$odor, correct = FALSE)
## Warning in chisq.test(mushroom.data$habitat, mushroom.data$odor, correct =
## FALSE): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mushroom.data$habitat and mushroom.data$odor
## X-squared = 6675.1, df = 48, p-value < 2.2e-16
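Both warnings arise because some cells of these contingency tables are sparse. One way to double-check is chisq.test's built-in Monte Carlo option (the standard simulate.p.value argument of stats::chisq.test); a sketch:
# re-run the habitat/odor test with a simulated p-value to sidestep
# the sparse-cell approximation behind the warning
chisq.test(mushroom.data$habitat, mushroom.data$odor, simulate.p.value = TRUE, B = 2000)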
#install.packages("GoodmanKruskal")
library(GoodmanKruskal)
varset1<- c("cap.shape","cap.surface","habitat","odor","class")
mushroomFrame1<- subset(mushroom.data, select = varset1)
GKmatrix1<- GKtauDataframe(mushroomFrame1)
plot(GKmatrix1, corrColors = "blue")
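For a single pair of variables, the same package also exposes GKtau(), which reports the asymmetric forward and backward association; a minimal sketch, assuming GKtau() from the GoodmanKruskal package:
# asymmetric Goodman-Kruskal tau between odor and class
GKtau(mushroomFrame1$odor, mushroomFrame1$class)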
summary(mushroom.data)
## class cap.shape cap.surface cap.color
## edible :4208 bell : 452 fibrous:2320 brown :2284
## poisonous:3916 conical: 4 grooves: 4 gray :1840
## flat :3152 smooth :2556 red :1500
## knobbed: 828 scaly :3244 yellow :1072
## sunken : 32 white :1040
## convex :3656 buff : 168
## (Other): 220
## bruises odor gill.spacing gill.size gill.color
## bruisesno :4748 nosmell:3528 close :6812 broad :5612 buff :1728
## bruisesyes:3376 foul :2160 crowded:1312 narrow:2512 pink :1492
## spicy : 576 white :1202
## fishy : 576 brown :1048
## almond : 400 gray : 752
## anise : 400 chocolate: 732
## (Other): 484 (Other) :1170
## stalk.shape stalk.root stalk.surface.above.ring
## enlarging:3516 missing:2480 fibrous: 552
## tapering :4608 bulbous:3776 silky :2372
## club : 556 smooth :5176
## equal :1120 scaly : 24
## rooted : 192
##
##
## stalk.surface.below.ring stalk.color.above.ring stalk.color.below.ring
## fibrous: 600 white :4464 white :4384
## silky :2304 pink :1872 pink :1872
## smooth :4936 gray : 576 gray : 576
## scaly : 284 brown : 448 brown : 512
## buff : 432 buff : 432
## orange : 192 orange : 192
## (Other): 140 (Other): 156
## veil.type veil.color ring.number ring.type spore.print.color
## partial:8124 brown : 96 none: 36 evanescent:2776 white :2388
## orange: 96 one :7488 flaring : 48 brown :1968
## white :7924 two : 600 large :1296 black :1872
## yellow: 8 none : 36 chocolate:1632
## pendant :3968 green : 72
## buff : 48
## (Other) : 144
## population habitat
## abundant : 384 woods :3148
## clustered: 340 grasses:2148
## numerous : 400 leaves : 832
## scattered:1248 meadows: 292
## several :4040 paths :1144
## solitary :1712 urban : 368
## waste : 192
Part 2: Mushroom data graphic
The graphic does not render clearly in Markdown, so I copied it into a separate document, where I explain the visual and its relationship to the mushroom data.
library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ tibble 2.1.3 ✓ stringr 1.4.0
## ✓ tidyr 1.0.2 ✓ forcats 0.4.0
## ✓ purrr 0.3.3
## ── Conflicts ───────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(ggplot2)
library(rpart)
library(rattle)
## Rattle: A free graphical interface for data science with R.
## Version 5.3.0 Copyright (c) 2006-2018 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
URL <-"http://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"
mushroom <- read.csv(file=URL, header = F, sep=',')
names(mushroom) <- c('edible','cap_shape','cap_surface','cap_color','bruises','odor',
                     'gill_attach','gill_spacing','gill_size','gill_color','stalk_shape',
                     'stalk_root','stalk_surface_abo','stalk_surface_bel','stalk_color_abo',
                     'stalk_color_bel','veil_type','veil_color','ring_number','ring_type',
                     'spore_color','pop','habitat')
# flag rows where stalk_root carries the missing-value code '?'
missed <- ifelse(mushroom$stalk_root == '?', 1, 0)
mushroom$missed <- as.factor(missed)
train.m <- sample(8124,4062)
mush.train <- mushroom[train.m,]
mush.test <- mushroom[-train.m,]
mush.model <- rpart(edible~.,data=mush.train)
mush.model
## n= 4062
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 4062 1943 e (0.52166420 0.47833580)
## 2) odor=a,l,n 2182 63 e (0.97112741 0.02887259)
## 4) spore_color=b,h,k,n,o,u,w,y 2143 24 e (0.98880075 0.01119925) *
## 5) spore_color=r 39 0 p (0.00000000 1.00000000) *
## 3) odor=c,f,m,p,s,y 1880 0 p (0.00000000 1.00000000) *
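The rattle package loaded above is not otherwise used in this part; its fancyRpartPlot() draws an rpart tree with labeled, colored nodes, which is easier to read than the text dump. A quick sketch:
# render the fitted rpart tree; fancyRpartPlot() comes from rattle
fancyRpartPlot(mush.model)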
feature_distribution_plot <- function(data){
  # initialize the plot list
  plots <- list()
  # loop over all features except edibility
  for (i in 1:(ncol(data)-1))
  {
    # count the feature values per edibility classification
    summarized_data <- data %>% group_by(edible, .[,i+1]) %>% summarise(n = n())
    # rename the feature column (easier to handle in the code below)
    names(summarized_data)[2] <- "attr"
    # create the plot for the current feature
    plot <- summarized_data %>% ggplot(aes(attr, edible)) + geom_point(aes(size=n)) +
      xlab(names(data)[i+1]) + ylab("Edibility")
    # add the plot to the plot list
    plots[[i]] <- plot
  }
  # remove temporary variables
  rm(summarized_data, i, plot)
  # draw all the created plots in a grid with 3 columns
  grid.arrange(grobs=plots, ncol=3)
}
feature_distribution_plot(mush.train)
Part 3: Classification Model
This classification model was created for the mushroom dataset. The dataset was drawn from The Audubon Society Field Guide to North American Mushrooms (1981), G. H. Lincoff (Pres.), New York: Alfred A. Knopf, and can be found at: https://archive.ics.uci.edu/ml/datasets/mushroom. It includes 8,124 observations, each described by 22 attributes. A classification tree allows us to use this set of attributes to predict a qualitative response, in this case whether a mushroom is poisonous or edible (class).
library(ISLR)
library(tree)
## Registered S3 method overwritten by 'tree':
## method from
## print.tree cli
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
tree.mushroom<-tree(class~. -class, mushroom.data)
summary(tree.mushroom)
##
## Classification tree:
## tree(formula = class ~ . - class, data = mushroom.data)
## Variables actually used in tree construction:
## [1] "odor" "spore.print.color" "stalk.color.below.ring"
## [4] "stalk.root"
## Number of terminal nodes: 5
## Residual mean deviance: 0.01429 = 116 / 8119
## Misclassification error rate: 0.0009847 = 8 / 8124
plot(tree.mushroom)
text(tree.mushroom, pretty=0)
I fit a tree to the mushroom dataset, using all attributes except class itself to predict class (poisonous or edible). The unpruned tree uses odor, spore.print.color, stalk.color.below.ring, and stalk.root as the attributes for classifying whether a mushroom is poisonous or edible.
set.seed(2)
train<-sample(1:nrow(mushroom.data), 4062)
mushroom.test<-mushroom.data[-train, ]
dim(mushroom.test)
## [1] 4062   22
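As a sanity check on the split, the class balance should be similar in the two halves; a small sketch:
# class proportions in the training rows and the held-out test rows
prop.table(table(mushroom.data$class[train]))
prop.table(table(mushroom.test$class))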
Next, I want to develop less complex trees and train and test them to see whether they are accurate predictors for classifying mushrooms on the desired attribute. I used half the observations to train and half to test.
class.test<-mushroom.data$class[-train]
tree.mushroom<-tree(class~.-class, mushroom.data, subset=train)
tree.pred.all<-predict(tree.mushroom, newdata=mushroom.data, type="class")
length(tree.pred.all)
## [1] 8124
I created a model using the training data.
tree.pred<-tree.pred.all[-train]
cm<-table(tree.pred, class.test)
cm
## class.test
## tree.pred edible poisonous
## edible 2100 3
## poisonous 0 1959
I developed a confusion matrix for this model to calibrate its output and assess the outcomes of the predictions. On the test data, only 3 mushrooms were classified incorrectly: all 3 were predicted to be edible but were actually poisonous. I would suggest that this is the worse kind of error, since predicting that a mushroom is edible when it is actually poisonous is far more dangerous than the reverse. Of the remaining test mushrooms, 2,100 were correctly predicted edible and 1,959 correctly predicted poisonous.
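Since an edible prediction on a poisonous mushroom is the costly mistake, the rate of poisonous mushrooms slipping through as edible is worth isolating; a small sketch computed from the confusion matrix above:
# share of truly poisonous test mushrooms predicted edible
cm["edible", "poisonous"] / sum(cm[, "poisonous"])
With the matrix above this is 3/1962, roughly 0.15%.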
sum(diag(cm))/sum(cm)
## [1] 0.9992614
1-sum(diag(cm))/sum(cm)
## [1] 0.0007385524
I calculated the misclassification error to be 0.000738, which is very small. This suggests the model is very good at classifying mushrooms. I will continue developing models with varying numbers of nodes and complexities in order to find the best one.
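Because the same predict/table/error pattern repeats for each pruned tree below, a small helper keeps the comparisons consistent. The function name tree.test.error is my own, not part of the original analysis:
# helper (hypothetical): test-set misclassification error of a fitted tree,
# using the train indices and class.test defined above
tree.test.error <- function(fit) {
  pred <- predict(fit, newdata = mushroom.data, type = "class")[-train]
  mean(pred != class.test)
}
Applied to tree.mushroom, it reproduces the 0.000738 error computed above.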
set.seed(3)
cv.mushroom<-cv.tree(tree.mushroom, FUN=prune.misclass)
names(cv.mushroom)
## [1] "size" "dev" "k" "method"
par(mfrow=c(1,2))
plot(cv.mushroom$size, cv.mushroom$dev, type="b")
plot(cv.mushroom$k, cv.mushroom$dev, type="b")
This plot suggests that error decreases greatly as the tree grows from 1 to 2 terminal nodes. After that, there are only small decreases in error as the number of terminal nodes increases to 3 and 4. The companion plot of the cost-complexity parameter k likewise shows a large change between the 2- and 3-terminal-node trees.
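To read the minimizing size off cv.mushroom programmatically rather than by eye, a quick sketch:
# tree sizes whose cross-validated deviance is minimal
cv.mushroom$size[cv.mushroom$dev == min(cv.mushroom$dev)]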
par(mfrow=c(1,1))
prune.mushroom<-prune.misclass(tree.mushroom, best=2)
plot(prune.mushroom)
text(prune.mushroom, pretty=0)
I created a classification tree with 2 terminal nodes.
tree.pred2<-predict(prune.mushroom, newdata = mushroom.data, type="class")
test.pred<-tree.pred2[-train]
cm<-table(test.pred, class.test)
cm
## class.test
## test.pred edible poisonous
## edible 2100 54
## poisonous 0 1908
I developed a confusion matrix for this model to assess its predictions. On the test data, 54 mushrooms were classified incorrectly, all of them predicted to be edible while actually poisonous; 2,100 were correctly predicted edible and 1,908 correctly predicted poisonous.
sum(diag(cm))/sum(cm)
## [1] 0.9867061
1-sum(diag(cm))/sum(cm)
## [1] 0.01329394
I calculated the misclassification error to be 0.01329394, which is still quite small, suggesting this model is also very good at classifying mushrooms. Its error is, however, slightly higher than that of the model with 4 terminal nodes. I will now create a classification tree with 3 terminal nodes.
par(mfrow=c(1,1))
prune.mushroom2<-prune.misclass(tree.mushroom, best=3)
plot(prune.mushroom2)
text(prune.mushroom2, pretty=0)
I created a classification tree with 3 terminal nodes.
tree.pred3<-predict(prune.mushroom2, newdata = mushroom.data, type="class")
test.pred1<-tree.pred3[-train]
cm<-table(test.pred1, class.test)
cm
## class.test
## test.pred1 edible poisonous
## edible 2100 21
## poisonous 0 1941
I developed a confusion matrix for this model as well. On the test data, 21 mushrooms were classified incorrectly, all of them predicted to be edible while actually poisonous; 2,100 were correctly predicted edible and 1,941 correctly predicted poisonous.
sum(diag(cm))/sum(cm)
## [1] 0.9948301
1-sum(diag(cm))/sum(cm)
## [1] 0.005169867
I calculated the misclassification error to be 0.005169867, which is very small. This suggests this model is very good at classifying mushrooms. This error is slightly higher than the model with 4 terminal nodes and slightly lower than the model with 2 terminal nodes. I will now create a classification tree with 5 terminal nodes.
par(mfrow=c(1,1))
prune.mushroom4<-prune.misclass(tree.mushroom, best=5)
plot(prune.mushroom4)
text(prune.mushroom4, pretty=0)
I created a classification tree with 5 terminal nodes.
tree.pred5<-predict(prune.mushroom4, newdata = mushroom.data, type="class")
test.pred3<-tree.pred5[-train]
cm<-table(test.pred3, class.test)
cm
## class.test
## test.pred3 edible poisonous
## edible 2100 21
## poisonous 0 1941
Once again I developed a confusion matrix to assess the predictions. On the test data, 21 mushrooms were classified incorrectly, all predicted to be edible while actually poisonous; 2,100 were correctly predicted edible and 1,941 correctly predicted poisonous.
sum(diag(cm))/sum(cm)
## [1] 0.9948301
1-sum(diag(cm))/sum(cm)
## [1] 0.005169867
I calculated the misclassification error to be 0.005169867, which is very small. This suggests this model is very good at classifying mushrooms. This error is the same as for the tree with 3 terminal nodes.
Overall, the model with the lowest error has 4 terminal nodes, but it is only marginally more accurate than the models with 3 and 5 terminal nodes, which have exactly the same error rate. I would therefore suggest the 3-terminal-node model as the best classification tree: it is tied for second-most accurate while not being over-complex. The 4-terminal-node model, although it has the lowest misclassification error, carries a higher cost of complexity; the 3-node model balances low error against complexity. This model also suggests that odor and spore.print.color are the most important attributes for classifying whether a mushroom is edible or poisonous. Odor alone is clearly an important predictor of edibility: the tree with only 2 terminal nodes (splitting only on odor) produced just 54 misclassifications.
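To ground that last point, a cross-tabulation of odor against class on the full dataset makes the separation visible; a quick sketch:
# cross-tabulate odor against class; in this dataset only the
# 'nosmell' level should contain both edible and poisonous mushrooms
table(mushroom.data$odor, mushroom.data$class)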