Mycology Data
Part 1: R-blogger example
library(readr)
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
theURL<- "http://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"
mushroom.data<- read.csv(file = theURL, header = FALSE, sep = ",",strip.white = TRUE,
stringsAsFactors = TRUE,
col.names = c("class","cap-shape","cap-surface","cap-color","bruises",
"odor","gill-attachment","gill-spacing","gill-size",
"gill-color","stalk-shape","stalk-root","stalk-surface-above-ring",
"stalk-surface-below-ring","stalk-color-above-ring","stalk-color-below-ring",
"veil-type","veil-color","ring-number","ring-type","spore-print-color",
"population","habitat"))
mushroom.data.levels <- cbind.data.frame(
  Variable = names(mushroom.data),
  Total_Levels = sapply(mushroom.data, function(x) { as.numeric(length(levels(x))) })
)
print(mushroom.data.levels)
## Variable Total_Levels
## class class 2
## cap.shape cap.shape 6
## cap.surface cap.surface 4
## cap.color cap.color 10
## bruises bruises 2
## odor odor 9
## gill.attachment gill.attachment 2
## gill.spacing gill.spacing 2
## gill.size gill.size 2
## gill.color gill.color 12
## stalk.shape stalk.shape 2
## stalk.root stalk.root 5
## stalk.surface.above.ring stalk.surface.above.ring 4
## stalk.surface.below.ring stalk.surface.below.ring 4
## stalk.color.above.ring stalk.color.above.ring 9
## stalk.color.below.ring stalk.color.below.ring 9
## veil.type veil.type 1
## veil.color veil.color 4
## ring.number ring.number 3
## ring.type ring.type 5
## spore.print.color spore.print.color 9
## population population 6
## habitat habitat 7
mushroom.data$gill.attachment<- NULL
levels(mushroom.data$class)<- c("edible","poisonous")
levels(mushroom.data$cap.shape)<-c("bell","conical","flat","knobbed","sunken","convex")
levels(mushroom.data$cap.surface)<- c("fibrous","grooves","smooth","scaly")
levels(mushroom.data$cap.color)<- c("buff","cinnamon","red","gray","brown","pink","green","purple","white","yellow")
levels(mushroom.data$bruises)<- c("bruisesno","bruisesyes")
levels(mushroom.data$odor)<-c("almond","creosote","foul","anise","musty","nosmell","pungent","spicy","fishy")
levels(mushroom.data$gill.spacing)<- c("close","crowded")
levels(mushroom.data$gill.size)<-c("broad","narrow")
levels(mushroom.data$gill.color)<- c("buff","red","gray","chocolate","black","brown","orange","pink","green","purple","white","yellow")
levels(mushroom.data$stalk.shape)<- c("enlarging","tapering")
table(mushroom.data$stalk.root)
##
## ? b c e r
## 2480 3776 556 1120 192
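Before recoding the '?' level as "missing" below, it is worth quantifying how common it is (2480 of 8124 rows, roughly 30%); a quick sketch:
# proportion of each stalk.root code, including the '?' missing-value code
round(prop.table(table(mushroom.data$stalk.root)), 3)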
levels(mushroom.data$stalk.root)<- c("missing","bulbous","club","equal","rooted")
levels(mushroom.data$stalk.surface.above.ring)<-c("fibrous","silky","smooth","scaly")
levels(mushroom.data$stalk.surface.below.ring)<-c("fibrous","silky","smooth","scaly")
levels(mushroom.data$stalk.color.above.ring)<- c("buff","cinnamon","red","gray","brown", "orange","pink","white","yellow")
levels(mushroom.data$stalk.color.below.ring)<- c("buff","cinnamon","red","gray","brown", "orange","pink","white","yellow")
levels(mushroom.data$veil.type)<-c("partial")
levels(mushroom.data$veil.color)<- c("brown","orange","white","yellow")
levels(mushroom.data$ring.number)<-c("none","one","two")
levels(mushroom.data$ring.type)<- c("evanescent","flaring","large","none","pendant")
levels(mushroom.data$spore.print.color)<- c("buff","chocolate","black","brown","orange","green","purple","white","yellow")
levels(mushroom.data$population)<- c("abundant","clustered","numerous","scattered","several","solitary")
levels(mushroom.data$habitat)<-c("woods","grasses","leaves","meadows","paths","urban","waste")
p<- ggplot(data = mushroom.data, aes(x=cap.shape, y=cap.surface, color=class))
p + geom_jitter(alpha=0.3) + scale_color_manual(breaks = c('edible','poisonous'),values=c('darkgreen','red'))
p<- ggplot(data = mushroom.data, aes(x=population, y=habitat, color=class))
p + geom_jitter(alpha=0.3) +
scale_color_manual(breaks = c('edible','poisonous'),values=c('darkgreen','red'))
p<- ggplot(data = mushroom.data, aes(x=habitat, y=odor, color=class))
p + geom_jitter(alpha=0.3) +scale_color_manual(breaks = c('edible','poisonous'),values=c('darkgreen','red'))
chisq.test(mushroom.data$cap.shape, mushroom.data$cap.surface, correct = FALSE)
## Warning in chisq.test(mushroom.data$cap.shape, mushroom.data$cap.surface, : Chi-
## squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mushroom.data$cap.shape and mushroom.data$cap.surface
## X-squared = 1011.5, df = 15, p-value < 2.2e-16
chisq.test(mushroom.data$habitat, mushroom.data$odor, correct = FALSE)
## Warning in chisq.test(mushroom.data$habitat, mushroom.data$odor, correct =
## FALSE): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mushroom.data$habitat and mushroom.data$odor
## X-squared = 6675.1, df = 48, p-value < 2.2e-16
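Both warnings arise because some cells of these contingency tables are sparse. One way to double-check is chisq.test's built-in Monte Carlo option (the standard simulate.p.value argument of stats::chisq.test); a sketch:
# re-run the habitat/odor test with a simulated p-value to sidestep
# the sparse-cell approximation behind the warning
chisq.test(mushroom.data$habitat, mushroom.data$odor, simulate.p.value = TRUE, B = 2000)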
#install.packages("GoodmanKruskal")
library(GoodmanKruskal)
varset1<- c("cap.shape","cap.surface","habitat","odor","class")
mushroomFrame1<- subset(mushroom.data, select = varset1)
GKmatrix1<- GKtauDataframe(mushroomFrame1)
plot(GKmatrix1, corrColors = "blue")
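For a single pair of variables, the same package also exposes GKtau(), which reports the asymmetric forward and backward association; a minimal sketch, assuming GKtau() from the GoodmanKruskal package:
# asymmetric Goodman-Kruskal tau between odor and class
GKtau(mushroomFrame1$odor, mushroomFrame1$class)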
summary(mushroom.data)
## class cap.shape cap.surface cap.color
## edible :4208 bell : 452 fibrous:2320 brown :2284
## poisonous:3916 conical: 4 grooves: 4 gray :1840
## flat :3152 smooth :2556 red :1500
## knobbed: 828 scaly :3244 yellow :1072
## sunken : 32 white :1040
## convex :3656 buff : 168
## (Other): 220
## bruises odor gill.spacing gill.size gill.color
## bruisesno :4748 nosmell:3528 close :6812 broad :5612 buff :1728
## bruisesyes:3376 foul :2160 crowded:1312 narrow:2512 pink :1492
## spicy : 576 white :1202
## fishy : 576 brown :1048
## almond : 400 gray : 752
## anise : 400 chocolate: 732
## (Other): 484 (Other) :1170
## stalk.shape stalk.root stalk.surface.above.ring
## enlarging:3516 missing:2480 fibrous: 552
## tapering :4608 bulbous:3776 silky :2372
## club : 556 smooth :5176
## equal :1120 scaly : 24
## rooted : 192
##
##
## stalk.surface.below.ring stalk.color.above.ring stalk.color.below.ring
## fibrous: 600 white :4464 white :4384
## silky :2304 pink :1872 pink :1872
## smooth :4936 gray : 576 gray : 576
## scaly : 284 brown : 448 brown : 512
## buff : 432 buff : 432
## orange : 192 orange : 192
## (Other): 140 (Other): 156
## veil.type veil.color ring.number ring.type spore.print.color
## partial:8124 brown : 96 none: 36 evanescent:2776 white :2388
## orange: 96 one :7488 flaring : 48 brown :1968
## white :7924 two : 600 large :1296 black :1872
## yellow: 8 none : 36 chocolate:1632
## pendant :3968 green : 72
## buff : 48
## (Other) : 144
## population habitat
## abundant : 384 woods :3148
## clustered: 340 grasses:2148
## numerous : 400 leaves : 832
## scattered:1248 meadows: 292
## several :4040 paths :1144
## solitary :1712 urban : 368
## waste : 192
Part 2: Mushroom data graphic
The graphic does not render clearly in Markdown, so I copied it into a separate document, where I explain the visual and its relationship to the mushroom data.
library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ tibble 2.1.3 ✓ stringr 1.4.0
## ✓ tidyr 1.0.2 ✓ forcats 0.4.0
## ✓ purrr 0.3.3
## ── Conflicts ───────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(ggplot2)
library(rpart)
library(rattle)
## Rattle: A free graphical interface for data science with R.
## Version 5.3.0 Copyright (c) 2006-2018 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
URL <-"http://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"
mushroom <- read.csv(file=URL, header = F, sep=',')
names(mushroom) <- c('edible','cap_shape','cap_surface','cap_color','bruises','odor',
                     'gill_attach','gill_spacing','gill_size','gill_color','stalk_shape',
                     'stalk_root','stalk_surface_abo','stalk_surface_bel','stalk_color_abo',
                     'stalk_color_bel','veil_type','veil_color','ring_number','ring_type',
                     'spore_color','pop','habitat')
# flag rows where stalk_root carries the missing-value code '?'
missed <- ifelse(mushroom$stalk_root == '?', 1, 0)
mushroom$missed <- as.factor(missed)
train.m <- sample(8124,4062)
mush.train <- mushroom[train.m,]
mush.test <- mushroom[-train.m,]
mush.model <- rpart(edible~.,data=mush.train)
mush.model
## n= 4062
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 4062 1943 e (0.52166420 0.47833580)
## 2) odor=a,l,n 2182 63 e (0.97112741 0.02887259)
## 4) spore_color=b,h,k,n,o,u,w,y 2143 24 e (0.98880075 0.01119925) *
## 5) spore_color=r 39 0 p (0.00000000 1.00000000) *
## 3) odor=c,f,m,p,s,y 1880 0 p (0.00000000 1.00000000) *
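The rattle package loaded above is not otherwise used in this part; its fancyRpartPlot() draws an rpart tree with labeled, colored nodes, which is easier to read than the text dump. A quick sketch:
# render the fitted rpart tree; fancyRpartPlot() comes from rattle
fancyRpartPlot(mush.model)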
feature_distribution_plot <- function(data){
  # initialize the plot list
  plots <- list()
  # loop over all features except edibility
  for (i in 1:(ncol(data)-1))
  {
    # count the feature values per edibility classification
    summarized_data <- data %>% group_by(edible, .[,i+1]) %>% summarise(n = n())
    # rename the feature column (easier to handle in the code below)
    names(summarized_data)[2] <- "attr"
    # create the plot for the current feature
    plot <- summarized_data %>% ggplot(aes(attr, edible)) + geom_point(aes(size=n)) +
      xlab(names(data)[i+1]) + ylab("Edibility")
    # add the plot to the plot list
    plots[[i]] <- plot
  }
  # remove temporary variables
  rm(summarized_data, i, plot)
  # draw all the created plots in a grid with 3 columns
  grid.arrange(grobs=plots, ncol=3)
}
feature_distribution_plot(mush.train)
Part 3: Classification Model
This classification model was created for the mushroom dataset. The dataset was drawn from The Audubon Society Field Guide to North American Mushrooms (1981), G. H. Lincoff (Pres.), New York: Alfred A. Knopf, and can be found at: https://archive.ics.uci.edu/ml/datasets/mushroom. It includes 8,124 observations, each described by 22 attributes. A classification tree allows us to use this set of attributes to predict a qualitative response, in this case whether a mushroom is poisonous or edible (class).
library(ISLR)
library(tree)
## Registered S3 method overwritten by 'tree':
## method from
## print.tree cli
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
tree.mushroom<-tree(class~. -class, mushroom.data)
summary(tree.mushroom)
##
## Classification tree:
## tree(formula = class ~ . - class, data = mushroom.data)
## Variables actually used in tree construction:
## [1] "odor" "spore.print.color" "stalk.color.below.ring"
## [4] "stalk.root"
## Number of terminal nodes: 5
## Residual mean deviance: 0.01429 = 116 / 8119
## Misclassification error rate: 0.0009847 = 8 / 8124
plot(tree.mushroom)
text(tree.mushroom, pretty=0)
I fit a tree to the mushroom dataset, using all attributes except class itself to predict class (poisonous or edible). The unpruned tree uses odor, spore.print.color, stalk.color.below.ring, and stalk.root as the attributes for classifying whether a mushroom is poisonous or edible.
set.seed(2)
train<-sample(1:nrow(mushroom.data), 4062)
mushroom.test<-mushroom.data[-train, ]
dim(mushroom.test)
## [1] 4062   22
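As a sanity check on the split, the class balance should be similar in the two halves; a small sketch:
# class proportions in the training rows and the held-out test rows
prop.table(table(mushroom.data$class[train]))
prop.table(table(mushroom.test$class))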
Next, I want to develop less complex trees and train and test them to see whether they are accurate predictors for classifying mushrooms on the desired attribute. I used half the observations to train and half to test.
class.test<-mushroom.data$class[-train]
tree.mushroom<-tree(class~.-class, mushroom.data, subset=train)
tree.pred.all<-predict(tree.mushroom, newdata=mushroom.data, type="class")
length(tree.pred.all)
## [1] 8124
I created a model using the training data.
tree.pred<-tree.pred.all[-train]
cm<-table(tree.pred, class.test)
cm
## class.test
## tree.pred edible poisonous
## edible 2100 3
## poisonous 0 1959
I developed a confusion matrix for this model to calibrate its output and assess the outcomes of the predictions. On the test data, only 3 mushrooms were classified incorrectly: all 3 were predicted to be edible but were actually poisonous. I would suggest that this is the worse kind of error, since predicting that a mushroom is edible when it is actually poisonous is far more dangerous than the reverse. Of the remaining test mushrooms, 2,100 were correctly predicted edible and 1,959 correctly predicted poisonous.
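Since an edible prediction on a poisonous mushroom is the costly mistake, the rate of poisonous mushrooms slipping through as edible is worth isolating; a small sketch computed from the confusion matrix above:
# share of truly poisonous test mushrooms predicted edible
cm["edible", "poisonous"] / sum(cm[, "poisonous"])
With the matrix above this is 3/1962, roughly 0.15%.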
sum(diag(cm))/sum(cm)
## [1] 0.9992614
1-sum(diag(cm))/sum(cm)
## [1] 0.0007385524
I calculated the misclassification error to be 0.000738, which is very small. This suggests the model is very good at classifying mushrooms. I will continue developing models with varying numbers of nodes and complexities in order to find the best one.
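Because the same predict/table/error pattern repeats for each pruned tree below, a small helper keeps the comparisons consistent. The function name tree.test.error is my own, not part of the original analysis:
# helper (hypothetical): test-set misclassification error of a fitted tree,
# using the train indices and class.test defined above
tree.test.error <- function(fit) {
  pred <- predict(fit, newdata = mushroom.data, type = "class")[-train]
  mean(pred != class.test)
}
Applied to tree.mushroom, it reproduces the 0.000738 error computed above.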
set.seed(3)
cv.mushroom<-cv.tree(tree.mushroom, FUN=prune.misclass)
names(cv.mushroom)
## [1] "size" "dev" "k" "method"
par(mfrow=c(1,2))
plot(cv.mushroom$size, cv.mushroom$dev, type="b")
plot(cv.mushroom$k, cv.mushroom$dev, type="b")
This plot suggests that error decreases greatly as the tree grows from 1 to 2 terminal nodes. After that, there are only small decreases in error as the number of terminal nodes increases to 3 and 4. The companion plot of the cost-complexity parameter k likewise shows a large change between the 2- and 3-terminal-node trees.
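To read the minimizing size off cv.mushroom programmatically rather than by eye, a quick sketch:
# tree sizes whose cross-validated deviance is minimal
cv.mushroom$size[cv.mushroom$dev == min(cv.mushroom$dev)]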
par(mfrow=c(1,1))
prune.mushroom<-prune.misclass(tree.mushroom, best=2)
plot(prune.mushroom)
text(prune.mushroom, pretty=0)
I created a classification tree with 2 terminal nodes.
tree.pred2<-predict(prune.mushroom, newdata = mushroom.data, type="class")
test.pred<-tree.pred2[-train]
cm<-table(test.pred, class.test)
cm
## class.test
## test.pred edible poisonous
## edible 2100 54
## poisonous 0 1908
I developed a confusion matrix for this model to assess its predictions. On the test data, 54 mushrooms were classified incorrectly, all of them predicted to be edible while actually poisonous; 2,100 were correctly predicted edible and 1,908 correctly predicted poisonous.
sum(diag(cm))/sum(cm)
## [1] 0.9867061
1-sum(diag(cm))/sum(cm)
## [1] 0.01329394
I calculated the misclassification error to be 0.01329394, which is still quite small, suggesting this model is also very good at classifying mushrooms. Its error is, however, slightly higher than that of the model with 4 terminal nodes. I will now create a classification tree with 3 terminal nodes.
par(mfrow=c(1,1))
prune.mushroom2<-prune.misclass(tree.mushroom, best=3)
plot(prune.mushroom2)
text(prune.mushroom2, pretty=0)
I created a classification tree with 3 terminal nodes.
tree.pred3<-predict(prune.mushroom2, newdata = mushroom.data, type="class")
test.pred1<-tree.pred3[-train]
cm<-table(test.pred1, class.test)
cm
## class.test
## test.pred1 edible poisonous
## edible 2100 21
## poisonous 0 1941
I developed a confusion matrix for this model as well. On the test data, 21 mushrooms were classified incorrectly, all of them predicted to be edible while actually poisonous; 2,100 were correctly predicted edible and 1,941 correctly predicted poisonous.
sum(diag(cm))/sum(cm)
## [1] 0.9948301
1-sum(diag(cm))/sum(cm)
## [1] 0.005169867
I calculated the misclassification error to be 0.005169867, which is very small. This suggests this model is very good at classifying mushrooms. This error is slightly higher than the model with 4 terminal nodes and slightly lower than the model with 2 terminal nodes. I will now create a classification tree with 5 terminal nodes.
par(mfrow=c(1,1))
prune.mushroom4<-prune.misclass(tree.mushroom, best=5)
plot(prune.mushroom4)
text(prune.mushroom4, pretty=0)
I created a classification tree with 5 terminal nodes.
tree.pred5<-predict(prune.mushroom4, newdata = mushroom.data, type="class")
test.pred3<-tree.pred5[-train]
cm<-table(test.pred3, class.test)
cm
## class.test
## test.pred3 edible poisonous
## edible 2100 21
## poisonous 0 1941
Once again I developed a confusion matrix to assess the predictions. On the test data, 21 mushrooms were classified incorrectly, all predicted to be edible while actually poisonous; 2,100 were correctly predicted edible and 1,941 correctly predicted poisonous.
sum(diag(cm))/sum(cm)
## [1] 0.9948301
1-sum(diag(cm))/sum(cm)
## [1] 0.005169867
I calculated the misclassification error to be 0.005169867, which is very small. This suggests this model is very good at classifying mushrooms. This error is the same as for the tree with 3 terminal nodes.
Overall, the model with the lowest error has 4 terminal nodes, but it is only marginally more accurate than the models with 3 and 5 terminal nodes, which have exactly the same error rate. I would therefore suggest the 3-terminal-node model as the best classification tree: it is tied for second-most accurate while not being over-complex. The 4-terminal-node model, although it has the lowest misclassification error, carries a higher cost of complexity; the 3-node model balances low error against complexity. This model also suggests that odor and spore.print.color are the most important attributes for classifying whether a mushroom is edible or poisonous. Odor alone is clearly an important predictor of edibility: the tree with only 2 terminal nodes (splitting only on odor) produced just 54 misclassifications.
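To ground that last point, a cross-tabulation of odor against class on the full dataset makes the separation visible; a quick sketch:
# cross-tabulate odor against class; in this dataset only the
# 'nosmell' level should contain both edible and poisonous mushrooms
table(mushroom.data$odor, mushroom.data$class)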