I chose the mushroom data set from UCI Machine Learning Repository after running into issues trying to use the Hear Patients data set, specifically the “processed.cleveland.data”" file.
I adapted the tutorial in the resource link provided in the Activity 5 Post https://eight2late.wordpress.com/2015/11...n-using-r/ in order to generate the nb_multiple_runs function, so I didn’t elaborate on each step in the function definition.
#load in depencies
library(e1071)
library(gridExtra)
#read in data set from current working directory
mushrooms<-read.table(list.files()[4],sep=",")
#store a vector of column names
mush_names<-c("Class","cap-shape","cap-surface","cap-color","bruises","odor","gill-attachment","gill-spacing","gill-size","gill-color","stalk-shape","stalk-root","stalk-surface-above-ring","stalk-surface-below-ring","stalk-color-above-ring","stalk-color-below-ring","veil-type","veil-color","ring-number","ring-type","spore-print-color","population","habitat")
#supply column names
colnames(mushrooms)<-mush_names
#preview data
head(mushrooms)
## Class cap-shape cap-surface cap-color bruises odor gill-attachment
## 1 p x s n t p f
## 2 e x s y t a f
## 3 e b s w t l f
## 4 p x y w t p f
## 5 e x s g f n f
## 6 e x y y t a f
## gill-spacing gill-size gill-color stalk-shape stalk-root
## 1 c n k e e
## 2 c b k e c
## 3 c b n e c
## 4 c n n e e
## 5 w b k t e
## 6 c b n e c
## stalk-surface-above-ring stalk-surface-below-ring stalk-color-above-ring
## 1 s s w
## 2 s s w
## 3 s s w
## 4 s s w
## 5 s s w
## 6 s s w
## stalk-color-below-ring veil-type veil-color ring-number ring-type
## 1 w p w o p
## 2 w p w o p
## 3 w p w o p
## 4 w p w o p
## 5 w p w o e
## 6 w p w o p
## spore-print-color population habitat
## 1 k s u
## 2 n n g
## 3 n n m
## 4 k s u
## 5 n a g
## 6 k n g
#define nb_multiple_runs function
nb_multiple_runs <- function(train_fraction,n){
fraction_correct<-rep(NA,n)
for (i in 1:n){
mushrooms[,"train"]<-ifelse(runif(nrow(mushrooms))<train_fraction,1,0)
trainColNum<-grep("train",names(mushrooms))
trainMushrooms<-mushrooms[mushrooms$train==1,-trainColNum]
testMushrooms<-mushrooms[mushrooms$train==0,-trainColNum]
nb_model<-naiveBayes(Class~.,data=trainMushrooms)
nb_test_predict<-predict(nb_model,testMushrooms[,-1])
fraction_correct[i]<- mean(nb_test_predict==testMushrooms$Class)
}
return(fraction_correct)
}
#train on 80% of the dataset and repeat 20 times
fraction_correct_predictions <- nb_multiple_runs(0.80,20)
#summarize prediction results
summary(fraction_correct_predictions)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.9364 0.9393 0.9409 0.9420 0.9446 0.9505
#Standard Deviation
sd(fraction_correct_predictions)
## [1] 0.003860792
#summarize results in a single table
summaryTable<-as.data.frame(cbind(summary(fraction_correct_predictions),sd(fraction_correct_predictions)))
#change column 1 values into percent format
summaryTable[,1]<-paste0(summaryTable[,1]*100,"%")
#supply column names
colnames(summaryTable)<-c("accuracy","STD")
#generate table figure using gridExtra
grid.arrange(tableGrob(summaryTable))