Load Library
library(klaR)
## Loading required package: MASS
library(psych)
library(MASS)
library(ggord)
## Warning: package 'ggord' was built under R version 4.1.3
library(devtools)
## Loading required package: usethis
Getting Data
# Load the built-in iris data set and show a compact summary of its columns.
data(iris)
str(object = iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
The iris dataset contains 150 observations of 5 variables.
Scatter Plot Matrix
# Scatterplot matrix of the four numeric measurements (columns 1 to 4);
# the point background colour is chosen per row from the species factor.
pairs.panels(
  iris[1:4],
  gap = 0,
  pch = 21,
  bg = c("red", "green", "blue")[iris$Species]
)
The resulting plot shows the pairwise scatter diagrams, the histogram of each variable, and the correlation values.
Data Partition
# Reproducible random split: every row of iris is independently labelled
# 1 (probability 0.6) or 2 (probability 0.4).
set.seed(123)
ind <- sample.int(
  2,
  size = nrow(iris),
  replace = TRUE,
  prob = c(0.6, 0.4)
)

# Rows labelled 1 form the training set, rows labelled 2 the testing set.
training <- iris[ind == 1, ]
testing <- iris[ind == 2, ]
Linear Discriminant Analysis
# Fit a linear discriminant model of Species on all remaining columns of the
# training set, then print the fitted object.
linear <- lda(Species ~ ., data = training)
print(linear)
## Call:
## lda(Species ~ ., data = training)
##
## Prior probabilities of groups:
## setosa versicolor virginica
## 0.3370787 0.3370787 0.3258427
##
## Group means:
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## setosa 4.946667 3.380000 1.443333 0.250000
## versicolor 5.943333 2.803333 4.240000 1.316667
## virginica 6.527586 2.920690 5.489655 2.048276
##
## Coefficients of linear discriminants:
## LD1 LD2
## Sepal.Length 0.3629008 0.05215114
## Sepal.Width 2.2276982 1.47580354
## Petal.Length -1.7854533 -1.60918547
## Petal.Width -3.9745504 4.10534268
##
## Proportion of trace:
## LD1 LD2
## 0.9932 0.0068
# List the component names stored in the fitted object and its class.
attributes(linear)
## $names
## [1] "prior" "counts" "means" "scaling" "lev" "svd" "N"
## [8] "call" "terms" "xlevels"
##
## $class
## [1] "lda"
# Prior probabilities of the groups (same values as in the printed model).
linear$prior
## setosa versicolor virginica
## 0.3370787 0.3370787 0.3258427
# Number of training observations per species (30 + 30 + 29 = 89).
linear$counts
## setosa versicolor virginica
## 30 30 29
# Coefficients of the linear discriminants LD1 and LD2 (same values as in
# the printed model).
linear$scaling
## LD1 LD2
## Sepal.Length 0.3629008 0.05215114
## Sepal.Width 2.2276982 1.47580354
## Petal.Length -1.7854533 -1.60918547
## Petal.Width -3.9745504 4.10534268
# Group means of each predictor per species (the "means" component; these
# are the values shown under "Group means" when the model is printed).
linear$means
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## setosa 4.946667 3.380000 1.443333 0.250000
## versicolor 5.943333 2.803333 4.240000 1.316667
## virginica 6.527586 2.920690 5.489655 2.048276
# Class labels of the response.
linear$lev
## [1] "setosa" "versicolor" "virginica"
# One value per discriminant (LD1, LD2); see ?lda for the exact definition.
linear$svd
## [1] 41.250027 3.413112
# Total number of observations in the fit (equals the sum of linear$counts).
linear$N
## [1] 89
# The matched call that produced the fit.
linear$call
## lda(formula = Species ~ ., data = training)
# Terms object of the model formula: Species is the response and the four
# measurements are the predictors.
linear$terms
## Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width
## attr(,"variables")
## list(Species, Sepal.Length, Sepal.Width, Petal.Length, Petal.Width)
## attr(,"factors")
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Species 0 0 0 0
## Sepal.Length 1 0 0 0
## Sepal.Width 0 1 0 0
## Petal.Length 0 0 1 0
## Petal.Width 0 0 0 1
## attr(,"term.labels")
## [1] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
## attr(,"order")
## [1] 1 1 1 1
## attr(,"intercept")
## [1] 1
## attr(,"response")
## [1] 1
## attr(,".Environment")
## <environment: R_GlobalEnv>
## attr(,"predvars")
## list(Species, Sepal.Length, Sepal.Width, Petal.Length, Petal.Width)
## attr(,"dataClasses")
## Species Sepal.Length Sepal.Width Petal.Length Petal.Width
## "factor" "numeric" "numeric" "numeric" "numeric"
# Empty here; presumably because none of the predictors is a factor (all
# four are numeric according to dataClasses) -- confirm against ?lda.
linear$xlevels
## named list()
# NOTE(review): "lda" is not one of the component names listed by
# attributes(linear), so this lookup simply returns NULL.
linear$lda
## NULL
Histogram
# Score the training rows with the fitted model; p$x holds the discriminant
# scores, one column per discriminant.
p <- predict(linear, newdata = training)

# Stacked histograms of the first discriminant, one panel per species.
ldahist(
  data = p$x[, 1],
  g = training$Species
)
These histograms are based on LD1. There is no visible overlap between the first and second species, or between the first and third species, but there is considerable overlap between the second and third species.
# Same histogram display, this time for the second discriminant.
ldahist(
  data = p$x[, 2],
  g = training$Species
)
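The LD2 histogram shows complete overlap between the species, which is not ideal: LD2 contributes very little to separating the groups, consistent with its proportion of trace of only 0.0068.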
Bi-Plot
# Biplot of the fitted model with observations grouped by species; ylim is
# forwarded to ggord unchanged.
ggord(
  linear,
  grp_in = training$Species,
  ylim = c(-10, 10)
)
The biplot is based on LD1 and LD2. Setosa is clearly separated from the other two species, while versicolor and virginica overlap somewhat. According to the arrows, sepal width and length are more important for setosa, whereas petal width and length are more important for versicolor and virginica.
Partition Plot
# Partition plots for the training data: first with the "lda" method, then
# with the "qda" method for comparison.
partimat(Species ~ ., data = training, method = "lda")
partimat(Species ~ ., data = training, method = "qda")
Confusion matrix and accuracy on training data
# Predicted class for every training row, cross-tabulated against the true
# species (rows = predicted, columns = actual).
p1 <- predict(linear, newdata = training)$class
tab <- table(Predicted = p1, Actual = training$Species)
print(tab)
## Actual
## Predicted setosa versicolor virginica
## setosa 30 0 0
## versicolor 0 30 0
## virginica 0 0 29
# Accuracy: correctly classified rows (the diagonal) over all rows.
sum(diag(tab)) / sum(tab)
## [1] 1
Confusion matrix and accuracy on test data
# Predicted class for every held-out row, cross-tabulated against the true
# species (rows = predicted, columns = actual).
p2 <- predict(linear, newdata = testing)$class
tab1 <- table(Predicted = p2, Actual = testing$Species)
print(tab1)
## Actual
## Predicted setosa versicolor virginica
## setosa 20 0 0
## versicolor 0 19 1
## virginica 0 1 20
# Accuracy: correctly classified rows (the diagonal) over all rows.
sum(diag(tab1)) / sum(tab1)
## [1] 0.9672131
CONCLUSION: The histograms and biplot provide useful insights and aid interpretation. If the group covariance matrices are not significantly different, linear discriminant analysis will perform similarly to quadratic discriminant analysis. LDA is not well suited to problems in which the classes are separated by non-linear boundaries.