abalone <- read.csv("abalone.csv", header =T, na.strings=c("","NA"))
suppressWarnings(suppressMessages(library(dplyr)))
#create an age column
train1 <- abalone %>%
mutate(age=case_when(
rings %in% 1:13 ~ "young",
rings %in% 14:30 ~ "old"
))
#convert AGE into factor
train1$age <- as.factor(train1$age)
set.seed(100)
#randomize data
train1 <- train1[sample(1:4177),]
# Take sample
# trainset = 60% , testset = 40% (any %)
trainset <- train1[1:2506, ]
testset <- train1[2507:4177, ]
# remove rings since i've already created an AGE FACTOR above
trainset1 <- trainset[c(-9)]
str(trainset1)
'data.frame': 2506 obs. of 9 variables:
$ sex : Factor w/ 3 levels "F","I","M": 2 2 1 2 1 3 3 2 3 2 ...
$ length : num 0.5 0.42 0.55 0.295 0.645 0.505 0.74 0.37 0.51 0.295 ...
$ diameter: num 0.375 0.3 0.415 0.225 0.51 0.385 0.535 0.27 0.405 0.225 ...
$ height : num 0.125 0.105 0.135 0.08 0.18 0.115 0.185 0.095 0.13 0.09 ...
$ weight : num 0.57 0.316 0.775 0.124 1.619 ...
$ shucked : num 0.259 0.1255 0.302 0.0485 0.7815 ...
$ viscera : num 0.124 0.07 0.179 0.032 0.322 ...
$ shell : num 0.157 0.103 0.26 0.04 0.468 ...
$ age : Factor w/ 2 levels "old","young": 2 2 1 2 2 2 2 2 2 2 ...
suppressWarnings(suppressMessages(library(naivebayes)))
bayes2 <- naive_bayes(age ~., data = trainset1)
bayes2
===================== Naive Bayes =====================
Call:
naive_bayes.formula(formula = age ~ ., data = trainset1)
A priori probabilities:
old young
0.1213089 0.8786911
Tables:
sex old young
F 0.4375000 0.3024523
I 0.1118421 0.3410536
M 0.4506579 0.3564941
length old young
mean 0.58437500 0.51701181
sd 0.07543124 0.12109177
diameter old young
mean 0.4627632 0.4015781
sd 0.0634677 0.0994117
height old young
mean 0.16828947 0.13650091
sd 0.02945380 0.04363673
weight old young
mean 1.1082434 0.7906989
sd 0.4268078 0.4799089
# ... and 3 more tables
old young
0.1213089 0.8786911
bayes2$tables
$sex
sex old young
F 0.4375000 0.3024523
I 0.1118421 0.3410536
M 0.4506579 0.3564941
$length
length old young
mean 0.58437500 0.51701181
sd 0.07543124 0.12109177
$diameter
diameter old young
mean 0.4627632 0.4015781
sd 0.0634677 0.0994117
$height
height old young
mean 0.16828947 0.13650091
sd 0.02945380 0.04363673
$weight
weight old young
mean 1.1082434 0.7906989
sd 0.4268078 0.4799089
$shucked
shucked old young
mean 0.4197533 0.3509986
sd 0.1717971 0.2243031
$viscera
viscera old young
mean 0.23354441 0.17352520
sd 0.09201213 0.10859887
$shell
shell old young
mean 0.3550789 0.2234444
sd 0.1420152 0.1294044
# compute the accuracy on the testset
pred <- predict(bayes2, testset)
mean(pred == testset$age)
[1] 0.6648713
table(pred,testset$age)
pred old young
old 129 503
young 57 982