#### Discriminant and Classification Analysis of iris data
# Load latticeExtra library, need it for trellis display
require(latticeExtra)
## Loading required package: latticeExtra
## Loading required package: RColorBrewer
## Loading required package: lattice
# Load klaR library which loads MASS library first then klaR
require(klaR)
## Loading required package: klaR
## Loading required package: MASS
search() # inspect search paths
## [1] ".GlobalEnv" "package:klaR" "package:MASS"
## [4] "package:latticeExtra" "package:lattice" "package:RColorBrewer"
## [7] "package:knitr" "package:stats" "package:graphics"
## [10] "package:grDevices" "package:utils" "package:datasets"
## [13] "package:methods" "Autoloads" "package:base"
### Start plots
# Get trellis graphics parameters from the theme "superpose.symbol"
# to be used with panel function panel.superpose
super.sym <- trellis.par.get("superpose.symbol")
# Scatterplot matrix of all numerical variables by Species
splom(~iris[-5], groups = Species, data = iris,
panel = panel.superpose,
key = list(title = "Three Varieties of Iris",
columns = 3,
points = list(pch = super.sym$pch[1:3],
col = super.sym$col[1:3]),
text = list(c("Setosa", "Versicolor", "Virginica")))
)
bwplot(Sepal.Length+Sepal.Width+Petal.Length+Petal.Width ~ Species, data=iris,
allow.multiple=T, # allow multiple responses (lhs)
outer=T, # draw multiple plots separately
pch=16, col="cyan", # plot character, color,
alpha=0.5, # and semi-transparancy color for median
scales="free") # use scale individually for each panel
### Start Linear discriminant analysis (from MASS library)
lda(Species ~ ., data=iris) -> irisLDA
# Confusion matrix: rows = actual, columns = classified into
table( Actual=(actual<-iris$Species),
Classified=(classified<-predict(irisLDA)$class) )
## Classified
## Actual setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 48 2
## virginica 0 1 49
# APparent Error Rate (error rate based on train data, underestimating!!)
mean(is_miss <- actual != classified) # APER
## [1] 0.02
# Posterior probabilities of missclassified objects
(as.data.frame(round(predict(irisLDA)$posterior[is_miss,], 4)) -> p)
## setosa versicolor virginica
## 71 0 0.2532 0.7468
## 84 0 0.1434 0.8566
## 134 0 0.7294 0.2706
# Actual membership
(as.character(actual[is_miss]) -> a)
## [1] "versicolor" "versicolor" "virginica"
# Classified membership
(as.character(classified[is_miss]) -> c)
## [1] "virginica" "virginica" "versicolor"
# Summary of missclassified individuals
cbind(p, actual=a, classified=c)
## setosa versicolor virginica actual classified
## 71 0 0.2532 0.7468 versicolor virginica
## 84 0 0.1434 0.8566 versicolor virginica
## 134 0 0.7294 0.2706 virginica versicolor
## Cross Validation confusion matrix based on holdout procedure (Jackknife
# method or Lachenbruch's method or delete-1 procedure)
table(actual, update(irisLDA, CV=T)$class->irisLDAcv)
##
## actual setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 48 2
## virginica 0 1 49
mean(actual != irisLDAcv) # expected AER
## [1] 0.02
# Confusion matrix when unequal priors assumed
table(actual, predict(update(irisLDA,prior=c(4,3,3)/10))$class)
##
## actual setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 48 2
## virginica 0 1 49
### Start Quadratic discriminant analysis (from MASS library)
qda(Species ~ ., data=iris) -> irisQDA
# Confusion matrix: rows = actual, columns = classified into
table(actual<-iris$Species, classified<-predict(irisQDA)$class)
##
## setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 48 2
## virginica 0 1 49
# APparent Error Rate (error rate based on train data, underestimated!!)
mean(is_miss <- actual != classified) # APER
## [1] 0.02
# Posterior probabilities of missclassified objects
(as.data.frame(round(predict(irisQDA)$posterior[is_miss,], 4)) -> p)
## setosa versicolor virginica
## 71 0 0.3359 0.6641
## 84 0 0.1543 0.8457
## 134 0 0.6050 0.3950
# Actual membership
(as.character(actual[is_miss]) -> a)
## [1] "versicolor" "versicolor" "virginica"
# Classified membership
(as.character(classified[is_miss]) -> c)
## [1] "virginica" "virginica" "versicolor"
# Summary of missclassified individuals
cbind(p, actual=a, classified=c)
## setosa versicolor virginica actual classified
## 71 0 0.3359 0.6641 versicolor virginica
## 84 0 0.1543 0.8457 versicolor virginica
## 134 0 0.6050 0.3950 virginica versicolor
## Cross Validation confusion matrix based on holdout procedure (Jackknife
# method or Lachenbruch's method or delete-1 procedure)
table(actual, update(irisQDA, CV=T)$class->irisQDAcv)
##
## actual setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 47 3
## virginica 0 1 49
mean(actual != irisQDAcv) # expected AER
## [1] 0.02667
### Start minimum distance method
# Variance-covariance matrices of the three species
(cov(subset(iris,subset=Species=='setosa',select=-Species)) -> S1)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length 0.12425 0.099216 0.016355 0.010331
## Sepal.Width 0.09922 0.143690 0.011698 0.009298
## Petal.Length 0.01636 0.011698 0.030159 0.006069
## Petal.Width 0.01033 0.009298 0.006069 0.011106
(cov(subset(iris,subset=Species=='versicolor',select=-Species)) -> S2)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length 0.26643 0.08518 0.18290 0.05578
## Sepal.Width 0.08518 0.09847 0.08265 0.04120
## Petal.Length 0.18290 0.08265 0.22082 0.07310
## Petal.Width 0.05578 0.04120 0.07310 0.03911
(cov(subset(iris,subset=Species=='virginica',select=-Species)) -> S3)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length 0.40434 0.09376 0.30329 0.04909
## Sepal.Width 0.09376 0.10400 0.07138 0.04763
## Petal.Length 0.30329 0.07138 0.30459 0.04882
## Petal.Width 0.04909 0.04763 0.04882 0.07543
# Degrees of freedom for the 3 species
(table(iris$Species)-1 -> dof)
##
## setosa versicolor virginica
## 49 49 49
# Pooled variance-covariance matrix
((dof[1]*S1+dof[2]*S2+dof[3]*S3)/sum(dof) -> S)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length 0.26501 0.09272 0.16751 0.03840
## Sepal.Width 0.09272 0.11539 0.05524 0.03271
## Petal.Length 0.16751 0.05524 0.18519 0.04267
## Petal.Width 0.03840 0.03271 0.04267 0.04188
# Sample mean vectors for the 3 species
(colMeans(subset(iris,subset=Species=='setosa',select=-Species)) -> m1)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 5.006 3.428 1.462 0.246
(colMeans(subset(iris,subset=Species=='versicolor',select=-Species)) -> m2)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 5.936 2.770 4.260 1.326
(colMeans(subset(iris,subset=Species=='virginica',select=-Species)) -> m3)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 6.588 2.974 5.552 2.026
# Squared Mahalanobis distances of all objects to the 3 centers
d1 <- mahalanobis(iris[,-5], m1, S)
d2 <- mahalanobis(iris[,-5], m2, S)
d3 <- mahalanobis(iris[,-5], m3, S)
# Now place these into a data frame.
d <- data.frame(setosa=d1, versicolor=d2, virginica=d3)
# Do ties occur for any object?
any(apply(d,1,function(x)length(unique(x))<3))
## [1] FALSE
# Add classified results
d <- cbind(d, classified=names(d)[apply(d,1,which.min)])
# Add actual identities
d <- cbind(d, actual=iris$Species)
# Report missclassified individuals
subset(d, subset=classified!=actual)
## setosa versicolor virginica classified actual
## 71 130.9 8.670 6.507 virginica versicolor
## 84 149.0 8.439 4.864 virginica versicolor
## 134 133.1 5.253 7.236 versicolor virginica
### Using indicator matrix of response in linear regression
#
# Create a new data frame by adding indicator matrix response to iris data
x <- cbind(iris, nnet::class.ind(iris$Species))
# Fit linear model to indicator response for 'setosa'
x.lm <- lm(setosa~Sepal.Length+Sepal.Width+Petal.Length+Petal.Width, data=x)
# See what components we have in the fitted linear model object
names(x.lm)
## [1] "coefficients" "residuals" "effects" "rank"
## [5] "fitted.values" "assign" "qr" "df.residual"
## [9] "xlevels" "call" "terms" "model"
# Now, get all fitted responses based on indicator matrix of response
y <- cbind(setosa=x.lm$fitted,
versicolor=qr.fitted(x.lm$qr,x[,'versicolor'],k=x.lm$rank),
virginica=qr.fitted(x.lm$qr,x[,'virginica'],k=x.lm$qr$rank))
# Classified membership
classified <- levels(iris$Species)[apply(y,1,which.max)]
# Indices of missclassified flowers
(which(iris$Species!=classified)->imiss)
## [1] 51 52 53 57 62 65 66 67 71 76 78 79 85 86 87 89 108
## [18] 109 120 123 130 134 135
# Actual vs missclassified membership
data.frame(actual=iris$Species,classified=classified)[imiss,]
## actual classified
## 51 versicolor virginica
## 52 versicolor virginica
## 53 versicolor virginica
## 57 versicolor virginica
## 62 versicolor virginica
## 65 versicolor virginica
## 66 versicolor virginica
## 67 versicolor virginica
## 71 versicolor virginica
## 76 versicolor virginica
## 78 versicolor virginica
## 79 versicolor virginica
## 85 versicolor virginica
## 86 versicolor virginica
## 87 versicolor virginica
## 89 versicolor virginica
## 108 virginica versicolor
## 109 virginica versicolor
## 120 virginica versicolor
## 123 virginica versicolor
## 130 virginica versicolor
## 134 virginica versicolor
## 135 virginica versicolor
length(imiss)/nrow(iris) # APER
## [1] 0.1533
### 3-D rotatable plot, may give insight as what variables to use
# in discriminant and classification analysis
# require rgl package
require(rgl)
## Loading required package: rgl
# Hold left mouse button and drag, move
# Hold right mouse button to zoom in/out
plot3d(iris,col=c("red","green3","blue")[as.numeric(iris$Species)])
plot3d(iris[2:4],col=c("red","green3","blue")[as.numeric(iris$Species)])
plot3d(subset(iris,subset=Species!='setosa'),
col=c("green3","blue")[as.numeric(iris$Species)])