This solution shows how the iris data can be explored and then used to train a naive Bayes classifier that predicts the species.
Load the required packages.
library('TSA')
## Warning: package 'TSA' was built under R version 3.4.3
## Loading required package: leaps
## Warning: package 'leaps' was built under R version 3.4.3
## Loading required package: locfit
## Warning: package 'locfit' was built under R version 3.4.3
## locfit 1.5-9.1 2013-03-22
## Loading required package: mgcv
## Warning: package 'mgcv' was built under R version 3.4.3
## Loading required package: nlme
## Warning: package 'nlme' was built under R version 3.4.3
## This is mgcv 1.8-23. For overview type 'help("mgcv-package")'.
## Loading required package: tseries
## Warning: package 'tseries' was built under R version 3.4.3
##
## Attaching package: 'TSA'
## The following objects are masked from 'package:stats':
##
## acf, arima
## The following object is masked from 'package:utils':
##
## tar
library('forecast')
## Warning: package 'forecast' was built under R version 3.4.3
##
## Attaching package: 'forecast'
## The following object is masked from 'package:nlme':
##
## getResponse
library('tseries')
library('ggplot2') # visualization
library('ggthemes') # visualization
## Warning: package 'ggthemes' was built under R version 3.4.3
library('scales') # visualization
library('dplyr') # data manipulation
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:nlme':
##
## collapse
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library('mice') # imputation
## Warning: package 'mice' was built under R version 3.4.3
## Loading required package: lattice
library('randomForest') # classification algorithm
## Warning: package 'randomForest' was built under R version 3.4.3
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library('rpart') # for decision tree
## Warning: package 'rpart' was built under R version 3.4.3
library('ROCR')
## Warning: package 'ROCR' was built under R version 3.4.3
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
# library('ROCR')
# library('randomForest')
# library('corrr')
# library('corrplot')
# library('glue')
# library('caTools')
# library('data.table')
# require("knitr")
# require("geosphere")
# require("gmapsdistance")
require("tidyr")
## Loading required package: tidyr
## Warning: package 'tidyr' was built under R version 3.4.3
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:mice':
##
## complete
library('corrplot')
## Warning: package 'corrplot' was built under R version 3.4.3
## corrplot 0.84 loaded
#source("distance.R")
library('car')
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library('caret')
## Warning: package 'caret' was built under R version 3.4.3
library('gclus')
## Loading required package: cluster
library('MASS')
## Warning: package 'MASS' was built under R version 3.4.3
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library('ggcorrplot')
## Warning: package 'ggcorrplot' was built under R version 3.4.3
library('cluster')
library('caTools')
## Warning: package 'caTools' was built under R version 3.4.3
library('rpart')
library('rpart.plot')
## Warning: package 'rpart.plot' was built under R version 3.4.3
library('rattle')
## Warning: package 'rattle' was built under R version 3.4.3
## Rattle: A free graphical interface for data science with R.
## Version 5.1.0 Copyright (c) 2006-2017 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
##
## Attaching package: 'rattle'
## The following object is masked from 'package:randomForest':
##
## importance
library('RColorBrewer')
library('data.table')
## Warning: package 'data.table' was built under R version 3.4.3
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
library('ROCR')
library('purrr')
## Warning: package 'purrr' was built under R version 3.4.3
##
## Attaching package: 'purrr'
## The following object is masked from 'package:data.table':
##
## transpose
## The following object is masked from 'package:caret':
##
## lift
## The following object is masked from 'package:car':
##
## some
## The following object is masked from 'package:scales':
##
## discard
library('tidyr')
library('ggplot2')
library('dummies')
## dummies-1.5.6 provided by Decision Patterns
library('corrplot')
library('usdm')
## Warning: package 'usdm' was built under R version 3.4.3
## Loading required package: sp
## Warning: package 'sp' was built under R version 3.4.3
## Loading required package: raster
##
## Attaching package: 'raster'
## The following object is masked from 'package:data.table':
##
## shift
## The following objects are masked from 'package:MASS':
##
## area, select
## The following object is masked from 'package:tidyr':
##
## extract
## The following object is masked from 'package:dplyr':
##
## select
## The following object is masked from 'package:nlme':
##
## getData
##
## Attaching package: 'usdm'
## The following object is masked from 'package:car':
##
## vif
## The following object is masked from 'package:nlme':
##
## Variogram
library('e1071')
## Warning: package 'e1071' was built under R version 3.4.3
##
## Attaching package: 'e1071'
## The following object is masked from 'package:raster':
##
## interpolate
## The following objects are masked from 'package:TSA':
##
## kurtosis, skewness
library('ElemStatLearn')
## Warning: package 'ElemStatLearn' was built under R version 3.4.3
Load the iris data and summarise it:
# Read the iris measurements from a CSV file given by a relative path, so it
# is resolved against the current working directory.
# stringsAsFactors = TRUE is set explicitly: since R 4.0 the default is FALSE,
# which would load `class` as character and make summary() report only its
# length instead of the per-class counts.
mydata <- read.csv("iris.csv", stringsAsFactors = TRUE)
# Per-column overview: quartiles for the numeric columns, counts for `class`.
summary(mydata)
## sepal.length sepal.width petal.length petal.width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.550 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.300 Median :1.300
## Mean :5.839 Mean :3.058 Mean :3.741 Mean :1.191
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## class
## setosa :51
## versicolor:50
## virginica :50
##
##
##
# Count how many observations fall into each class.
table(mydata[["class"]])
##
## setosa versicolor virginica
## 51 50 50
Split the data into training and validation sets (roughly 70% / 30%).
# Partition the data into training and validation sets.
# The seed is fixed so the same split is reproduced on every run and the
# samples stay comparable throughout.
set.seed(3451)
# Draw a label (1 or 2) for every row with probabilities 0.7 / 0.3:
# rows labelled 1 go to the training set, rows labelled 2 to validation.
pd <- sample(2, nrow(mydata), replace = TRUE, prob = c(0.7, 0.3))
train <- mydata[pd == 1, ]
val <- mydata[pd == 2, ]
# Scatter plot of sepal length against petal width for the training rows,
# coloured by class. ggplot() replaces the deprecated qplot(), and the
# aesthetics refer to bare column names of `train` instead of `train$...`
# vectors, which also gives clean axis and legend titles.
ggplot(train, aes(x = sepal.length, y = petal.width, colour = class)) +
  geom_point()
Keep only two predictors (sepal.length and petal.length) together with the class label.
# Reduce both sets to two predictors plus the label, selecting the columns
# by name (these are columns 1, 3 and 5 of the loaded data).
train.2fact <- train[, c("sepal.length", "petal.length", "class")]
val.2fact <- val[, c("sepal.length", "petal.length", "class")]
Fit a naive Bayes classifier on the two predictors.
# Fit a naive Bayes classifier on the reduced training set.
#   x: train.2fact without its third column (the class label), i.e. the two
#      predictor columns.
#   y: the class label of each training row.
# The call is left exactly as written because the fitted object echoes the
# call and the `y` expression in its printed output.
NB.1<-naiveBayes(x=train.2fact[-3], y=train.2fact$class)
# Confirm the class of the fitted object.
class(NB.1)
## [1] "naiveBayes"
# List the components stored in the fitted model.
summary(NB.1)
## Length Class Mode
## apriori 3 table numeric
## tables 2 -none- list
## levels 3 -none- character
## call 3 -none- call
# Print the a-priori class probabilities and the per-predictor conditional
# tables.
# NOTE(review): for numeric predictors the two table columns are presumably
# the per-class mean and standard deviation — confirm in the e1071 docs.
print(NB.1)
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = train.2fact[-3], y = train.2fact$class)
##
## A-priori probabilities:
## train.2fact$class
## setosa versicolor virginica
## 0.3454545 0.3272727 0.3272727
##
## Conditional probabilities:
## sepal.length
## train.2fact$class [,1] [,2]
## setosa 5.007895 0.3096481
## versicolor 5.841667 0.4912811
## virginica 6.605556 0.6973191
##
## petal.length
## train.2fact$class [,1] [,2]
## setosa 1.465789 0.1863973
## versicolor 4.177778 0.4691093
## virginica 5.580556 0.5746151
Predict the class for the validation data and compare it with the true labels.
# Predict the class of every validation row from the two predictors; the
# third column (the true label) is dropped before prediction.
y_pred <- predict(NB.1, newdata = val.2fact[-3])
val.2fact$SpeciesPredicted <- y_pred
# Sanity check: one true label and one prediction per validation row.
# The label column is named `class`. There is no `Species` column, and
# `val.2fact$Species` only returned a value because `$` partially matched it
# to `SpeciesPredicted`. `[[` matches names exactly, so it cannot silently
# pick up the wrong column.
length(val.2fact[["class"]])
## [1] 41
length(val.2fact[["SpeciesPredicted"]])
## [1] 41
# Confusion matrix of predicted vs. actual class.
# Predictions go in the rows and the true labels in the columns, which is the
# orientation caret's confusionMatrix() expects for a table input; with the
# axes swapped, sensitivity and positive predictive value would be exchanged.
# NOTE(review): output rendered before this fix compared the predictions with
# themselves, so a perfect score there says nothing about model quality;
# re-run this chunk to refresh the results.
CofusionIris <- table(
  predictedclass = val.2fact[["SpeciesPredicted"]],
  actualclass = val.2fact[["class"]]
)
CofusionIrismatrix <- confusionMatrix(CofusionIris)
print(CofusionIrismatrix)
## Confusion Matrix and Statistics
##
## predictedclass
## actualclass setosa versicolor virginica
## setosa 13 0 0
## versicolor 0 12 0
## virginica 0 0 16
##
## Overall Statistics
##
## Accuracy : 1
## 95% CI : (0.914, 1)
## No Information Rate : 0.3902
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 1.0000 1.0000
## Specificity 1.0000 1.0000 1.0000
## Pos Pred Value 1.0000 1.0000 1.0000
## Neg Pred Value 1.0000 1.0000 1.0000
## Prevalence 0.3171 0.2927 0.3902
## Detection Rate 0.3171 0.2927 0.3902
## Detection Prevalence 0.3171 0.2927 0.3902
## Balanced Accuracy 1.0000 1.0000 1.0000