data("iris")
head(iris)
str(iris)
'data.frame': 150 obs. of 5 variables:
$ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
$ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
$ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
$ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
$ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
any(is.na(iris))
[1] FALSE
summary(iris)
Sepal.Length Sepal.Width Petal.Length Petal.Width
Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
Median :5.800 Median :3.000 Median :4.350 Median :1.300
Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
Species
setosa :50
versicolor:50
virginica :50
library(psych)
pairs.panels(iris[1:4], gap=0,bg= c("blue", "green", "red")[iris$Species],pch=21)
data partition
set.seed(345)
ir<- sample(2,nrow(iris),replace = TRUE, prob = c(0.75,0.25))
train <- iris[ir == 1,]
test <- iris[ir == 2,]
Linear discriminant analysis
library(MASS)
lnr <- lda(Species~., train)
lnr
Call:
lda(Species ~ ., data = train)
Prior probabilities of groups:
setosa versicolor virginica
0.3008850 0.3451327 0.3539823
Group means:
Sepal.Length Sepal.Width Petal.Length Petal.Width
setosa 4.947059 3.391176 1.455882 0.2441176
versicolor 5.943590 2.797436 4.282051 1.3487179
virginica 6.630000 2.975000 5.592500 2.0375000
Coefficients of linear discriminants:
LD1 LD2
Sepal.Length 0.9525357 0.4570886
Sepal.Width 1.6182574 2.1122989
Petal.Length -2.3789015 -1.2138097
Petal.Width -2.5903620 2.9972039
Proportion of trace:
LD1 LD2
0.9917 0.0083
percentage seprations achived by the first discriminant function is 99.04% i.e; very high.
attributes(lnr)
$names
[1] "prior" "counts" "means" "scaling" "lev" "svd" "N"
[8] "call" "terms" "xlevels"
$class
[1] "lda"
lnr$prior
setosa versicolor virginica
0.3008850 0.3451327 0.3539823
lnr$counts
setosa versicolor virginica
34 39 40
lnr$scaling
LD1 LD2
Sepal.Length 0.9525357 0.4570886
Sepal.Width 1.6182574 2.1122989
Petal.Length -2.3789015 -1.2138097
Petal.Width -2.5903620 2.9972039
Stacked Histogram of discriminanr function values
prd <- predict(lnr, train)
ldahist(data = prd$x[,1], g = train$Species)
#install.packages("devtools")
library(devtools)
#install_github("fawda123/ggord")
library(ggord)
ggord(lnr, train$Species, ylim = c(-5, 5 ))
confusion matrix and accuracy on train data
p.train <- predict(lnr, train)$class
table1 <- table(predicted = p.train, Actual = train$Species)
table1
Actual
predicted setosa versicolor virginica
setosa 34 0 0
versicolor 0 37 1
virginica 0 2 39
sum(diag(table1)/sum(table1))
[1] 0.9734513
confusion matrix and accuracy on test data
p.test <- predict(lnr, test)$class
table2 <- table(predicted = p.test, Actual = test$Species)
table2
Actual
predicted setosa versicolor virginica
setosa 16 0 0
versicolor 0 11 0
virginica 0 0 10
sum(diag(table2)/sum(table2))
[1] 1