# these are the required packages and some initial manipulation of the data
library(ggplot2)
library(e1071)
library(stringr)
library(rpart)
library(rattle)
## Rattle: A free graphical interface for data mining with R.
## Version 4.1.0 Copyright (c) 2006-2015 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
# Load the raw data; "mushrooms.csv" is read relative to the working directory.
mushroom <- read.csv("mushrooms.csv")
# Swap every "." in the column names for "_" (e.g. spore.print.color ->
# spore_print_color) so the names are easier to type and read.
colnames(mushroom) <- str_replace_all(colnames(mushroom), "\\.", "_")
# remove veil_type because all are partial, unnecessary column
mushroom$veil_type <- NULL
# Recode the label: "e" becomes "edible", anything else becomes "poisonous".
# ifelse() is already vectorized, so it is applied to the whole column at once
# rather than element by element through sapply(); this also avoids the stray
# element names sapply() attaches when it iterates over a character vector.
mushroom$class <- ifelse(mushroom$class == "e", "edible", "poisonous")

This analysis is on a dataset containing over 8,000 mushroom samples (8,124 records). Each sample is classified as either poisonous or edible and has various other characteristics that can be used in determining the edibility of the mushroom. Two primary machine learning techniques were used to model the mushroom dataset: Support Vector Machines and Decision Trees.

Model Creation

“80-20 split”

This next chunk of code splits the mushroom dataset into a training and a testing set. This allows us to build our model with the training set and test its accuracy with the testing set. This “80-20 split” is a traditional way of building and testing models.

set.seed(40) # setting the seed so that we get reproducible results
# Flag each row for training with probability 0.8, so the split is roughly
# (not exactly) 80/20. The flag column is left on `mushroom` itself.
mushroom[, "train"] <- ifelse(runif(nrow(mushroom)) < 0.8, 1, 0)
# separate training and test sets
trainset <- mushroom[mushroom$train == 1, ]
testset <- mushroom[mushroom$train == 0, ]
# Column index of the train flag. An exact name match is used so that a column
# whose name merely contains "train" can never be dropped by accident.
trainColNum <- which(names(trainset) == "train")
# remove train flag column from train and test sets
trainset <- trainset[, -trainColNum]
testset <- testset[, -trainColNum]

# Column index of the predicted variable in the dataset. The column is named
# "class" (lower case); matching is case-sensitive, so "Class" would find nothing.
typeColNum <- which(names(mushroom) == "class")

Decision Tree model

Now we will create a model and train it with our training subset of data. The decision tree model from the “rpart” package is a powerful, yet simple, tool. This type of model is best used for categorization.

# Fit a classification tree on the training rows using every other column as a
# predictor; the complexity parameter (cp) is set to 0.0005.
tree <- rpart(
  class ~ .,
  data = trainset,
  control = rpart.control(cp = 0.0005)
)

# Predicted class labels for the held-out test rows.
tree_pred <- predict(tree, newdata = testset, type = "class")
# Share of test rows predicted correctly (with this seed it happens to be 100%).
mean(tree_pred == testset$class)
## [1] 1
# Confusion matrix: predictions (rows) against true test labels (columns).
table(tree_pred, testset$class)
##            
## tree_pred   edible poisonous
##   edible       865         0
##   poisonous      0       797
# No mushrooms were misclassified in the test set.

# Same evaluation over the full data set (training and test rows together).
tree_pred_full <- predict(tree, newdata = mushroom, type = "class")
# Share of the full data set predicted correctly.
mean(tree_pred_full == mushroom$class)
## [1] 1
# Confusion matrix: predictions (rows) against true labels for the full data.
table(tree_pred_full, mushroom$class)
##               
## tree_pred_full edible poisonous
##      edible      4208         0
##      poisonous      0      3916

Decision Tree Visualized

The decision tree model is easily visualized. Here you can see how the model makes decisions and ultimately predicts whether a mushroom is poisonous or edible. This visual in particular was created using the rattle package.

Plot of odor vs. spore print color

Now that we have visualized our decision tree model, we can see that odor and then spore print color are two of the most significant identifiers in determining whether a mushroom is poisonous or edible. We can now plot these two characteristics against each other to see which combinations indicate a poisonous mushroom and which indicate an edible one.

odor: almond=a, anise=l, creosote=c, fishy=y, foul=f, musty=m, none=n, pungent=p, spicy=s

spore-print-color: black=k, brown=n, buff=b, chocolate=h, green=r, orange=o, purple=u, white=w, yellow=y

Support Vector Machine model

The next machine learning technique we will use is a Support Vector Machine (SVM), fitted with the aid of the “e1071” package. The SVM creates a hyperplane that separates the classes of a dataset. In this particular model, the hyperplane will be created with all of the columns available. As this is a many-dimensional hyperplane, we have no good way of visualizing this model. The decision boundary can be linear or radial, depending on how well the separation between the classes is defined. This model is purely for pedagogical purposes, as we have already achieved a perfect model with the decision tree and need go no further.

# Fit a C-classification SVM on the training rows using every other column as a
# predictor. The radial kernel was chosen because, per the original author,
# this data set does not have a linear plane that can separate the classes.
svm_model <- svm(
  class ~ .,
  data = trainset,
  type = "C-classification",
  kernel = "radial"
)

# Predictions on the rows the model was trained on.
pred_train <- predict(svm_model, newdata = trainset)
# Share of the training set predicted correctly by the SVM.
mean(pred_train == trainset$class)
## [1] 0.9972145
# Predictions on the held-out test rows.
pred_test <- predict(svm_model, newdata = testset)
# Share of the test set predicted correctly by the SVM.
mean(pred_test == testset$class)
## [1] 0.9969916
# Confusion matrix: SVM predictions (rows) against true test labels (columns).
table(pred_test, testset$class)
##            
## pred_test   edible poisonous
##   edible       865         5
##   poisonous      0       792