Activity 05: Naive Bayes Classification

I chose the mushroom data set from UCI Machine Learning Repository after running into issues trying to use the Hear Patients data set, specifically the “processed.cleveland.data”" file.

I adapted the tutorial in the resource link provided in the Activity 5 Post https://eight2late.wordpress.com/2015/11...n-using-r/ in order to generate the nb_multiple_runs function, so I didn’t elaborate on each step in the function definition.

#load in depencies
library(e1071)
library(gridExtra)

#read in data set from current working directory
mushrooms<-read.table(list.files()[4],sep=",")

#store a vector of column names
mush_names<-c("Class","cap-shape","cap-surface","cap-color","bruises","odor","gill-attachment","gill-spacing","gill-size","gill-color","stalk-shape","stalk-root","stalk-surface-above-ring","stalk-surface-below-ring","stalk-color-above-ring","stalk-color-below-ring","veil-type","veil-color","ring-number","ring-type","spore-print-color","population","habitat")

#supply column names 
colnames(mushrooms)<-mush_names

#preview data
head(mushrooms)
##   Class cap-shape cap-surface cap-color bruises odor gill-attachment
## 1     p         x           s         n       t    p               f
## 2     e         x           s         y       t    a               f
## 3     e         b           s         w       t    l               f
## 4     p         x           y         w       t    p               f
## 5     e         x           s         g       f    n               f
## 6     e         x           y         y       t    a               f
##   gill-spacing gill-size gill-color stalk-shape stalk-root
## 1            c         n          k           e          e
## 2            c         b          k           e          c
## 3            c         b          n           e          c
## 4            c         n          n           e          e
## 5            w         b          k           t          e
## 6            c         b          n           e          c
##   stalk-surface-above-ring stalk-surface-below-ring stalk-color-above-ring
## 1                        s                        s                      w
## 2                        s                        s                      w
## 3                        s                        s                      w
## 4                        s                        s                      w
## 5                        s                        s                      w
## 6                        s                        s                      w
##   stalk-color-below-ring veil-type veil-color ring-number ring-type
## 1                      w         p          w           o         p
## 2                      w         p          w           o         p
## 3                      w         p          w           o         p
## 4                      w         p          w           o         p
## 5                      w         p          w           o         e
## 6                      w         p          w           o         p
##   spore-print-color population habitat
## 1                 k          s       u
## 2                 n          n       g
## 3                 n          n       m
## 4                 k          s       u
## 5                 n          a       g
## 6                 k          n       g
#define nb_multiple_runs function 
nb_multiple_runs <- function(train_fraction,n){
  fraction_correct<-rep(NA,n)
  for (i in 1:n){
    mushrooms[,"train"]<-ifelse(runif(nrow(mushrooms))<train_fraction,1,0)
    trainColNum<-grep("train",names(mushrooms))
    trainMushrooms<-mushrooms[mushrooms$train==1,-trainColNum]
    testMushrooms<-mushrooms[mushrooms$train==0,-trainColNum]
    nb_model<-naiveBayes(Class~.,data=trainMushrooms)
    nb_test_predict<-predict(nb_model,testMushrooms[,-1])
    fraction_correct[i]<- mean(nb_test_predict==testMushrooms$Class)
  }
  return(fraction_correct)
}

#train on 80% of the dataset and repeat 20 times
fraction_correct_predictions <- nb_multiple_runs(0.80,20)

#summarize prediction results
summary(fraction_correct_predictions)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.9364  0.9393  0.9409  0.9420  0.9446  0.9505
#Standard Deviation
sd(fraction_correct_predictions)
## [1] 0.003860792
#summarize results in a single table
summaryTable<-as.data.frame(cbind(summary(fraction_correct_predictions),sd(fraction_correct_predictions)))

#change column 1 values into percent format
summaryTable[,1]<-paste0(summaryTable[,1]*100,"%")

#supply column names
colnames(summaryTable)<-c("accuracy","STD")

#generate table figure using gridExtra
grid.arrange(tableGrob(summaryTable))