Discriminant Analysis of Iris Data

#### Discriminant and Classification Analysis of iris data
# Load latticeExtra library, need it for trellis display
require(latticeExtra)
## Loading required package: latticeExtra
## Loading required package: RColorBrewer
## Loading required package: lattice
# Load klaR library which loads MASS library first then klaR
require(klaR)
## Loading required package: klaR
## Loading required package: MASS
search()   # inspect search paths
##  [1] ".GlobalEnv"           "package:klaR"         "package:MASS"        
##  [4] "package:latticeExtra" "package:lattice"      "package:RColorBrewer"
##  [7] "package:knitr"        "package:stats"        "package:graphics"    
## [10] "package:grDevices"    "package:utils"        "package:datasets"    
## [13] "package:methods"      "Autoloads"            "package:base"
### Start plots
# Get trellis graphics parameters from the theme "superpose.symbol"
# to be used with panel function panel.superpose
super.sym <- trellis.par.get("superpose.symbol")
# Scatterplot matrix of all numerical variables by Species
splom(~iris[-5], groups = Species, data = iris,
      panel = panel.superpose,
      key = list(title = "Three Varieties of Iris",
                 columns = 3, 
                 points = list(pch = super.sym$pch[1:3],
                               col = super.sym$col[1:3]),
                 text = list(c("Setosa", "Versicolor", "Virginica")))
)

plot of chunk unnamed-chunk-1

bwplot(Sepal.Length+Sepal.Width+Petal.Length+Petal.Width ~ Species, data=iris,
       allow.multiple=T,   # allow multiple responses (lhs)
       outer=T,            # draw multiple plots separately
       pch=16, col="cyan", # plot character, color,
       alpha=0.5,          #   and semi-transparancy color for median
       scales="free")      # use scale individually for each panel

plot of chunk unnamed-chunk-1

### Start Linear discriminant analysis (from MASS library)
lda(Species ~ ., data=iris) -> irisLDA
# Confusion matrix: rows = actual, columns = classified into
table( Actual=(actual<-iris$Species), 
       Classified=(classified<-predict(irisLDA)$class) )
##             Classified
## Actual       setosa versicolor virginica
##   setosa         50          0         0
##   versicolor      0         48         2
##   virginica       0          1        49
# APparent Error Rate  (error rate based on train data, underestimating!!)
mean(is_miss <- actual != classified) # APER
## [1] 0.02
# Posterior probabilities of missclassified objects
(as.data.frame(round(predict(irisLDA)$posterior[is_miss,], 4)) -> p)
##     setosa versicolor virginica
## 71       0     0.2532    0.7468
## 84       0     0.1434    0.8566
## 134      0     0.7294    0.2706
# Actual membership
(as.character(actual[is_miss]) -> a)
## [1] "versicolor" "versicolor" "virginica"
# Classified membership
(as.character(classified[is_miss]) -> c)
## [1] "virginica"  "virginica"  "versicolor"
# Summary of missclassified individuals
cbind(p, actual=a, classified=c)
##     setosa versicolor virginica     actual classified
## 71       0     0.2532    0.7468 versicolor  virginica
## 84       0     0.1434    0.8566 versicolor  virginica
## 134      0     0.7294    0.2706  virginica versicolor
## Cross Validation confusion matrix based on holdout procedure (Jackknife
# method or Lachenbruch's method or delete-1 procedure)
table(actual, update(irisLDA, CV=T)$class->irisLDAcv)
##             
## actual       setosa versicolor virginica
##   setosa         50          0         0
##   versicolor      0         48         2
##   virginica       0          1        49
mean(actual != irisLDAcv) # expected AER
## [1] 0.02
# Confusion matrix when unequal priors assumed
table(actual, predict(update(irisLDA,prior=c(4,3,3)/10))$class)
##             
## actual       setosa versicolor virginica
##   setosa         50          0         0
##   versicolor      0         48         2
##   virginica       0          1        49
### Start Quadratic discriminant analysis (from MASS library)
qda(Species ~ ., data=iris) -> irisQDA
# Confusion matrix: rows = actual, columns = classified into
table(actual<-iris$Species, classified<-predict(irisQDA)$class)
##             
##              setosa versicolor virginica
##   setosa         50          0         0
##   versicolor      0         48         2
##   virginica       0          1        49
# APparent Error Rate  (error rate based on train data, underestimated!!)
mean(is_miss <- actual != classified) # APER
## [1] 0.02
# Posterior probabilities of missclassified objects
(as.data.frame(round(predict(irisQDA)$posterior[is_miss,], 4)) -> p)
##     setosa versicolor virginica
## 71       0     0.3359    0.6641
## 84       0     0.1543    0.8457
## 134      0     0.6050    0.3950
# Actual membership
(as.character(actual[is_miss]) -> a)
## [1] "versicolor" "versicolor" "virginica"
# Classified membership
(as.character(classified[is_miss]) -> c)
## [1] "virginica"  "virginica"  "versicolor"
# Summary of missclassified individuals
cbind(p, actual=a, classified=c)
##     setosa versicolor virginica     actual classified
## 71       0     0.3359    0.6641 versicolor  virginica
## 84       0     0.1543    0.8457 versicolor  virginica
## 134      0     0.6050    0.3950  virginica versicolor
## Cross Validation confusion matrix based on holdout procedure (Jackknife
# method or Lachenbruch's method or delete-1 procedure)
table(actual, update(irisQDA, CV=T)$class->irisQDAcv)
##             
## actual       setosa versicolor virginica
##   setosa         50          0         0
##   versicolor      0         47         3
##   virginica       0          1        49
mean(actual != irisQDAcv) # expected AER
## [1] 0.02667
### Start minimum distance method
# Variance-covariance matrices of the three species
(cov(subset(iris,subset=Species=='setosa',select=-Species)) -> S1)
##              Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length      0.12425    0.099216     0.016355    0.010331
## Sepal.Width       0.09922    0.143690     0.011698    0.009298
## Petal.Length      0.01636    0.011698     0.030159    0.006069
## Petal.Width       0.01033    0.009298     0.006069    0.011106
(cov(subset(iris,subset=Species=='versicolor',select=-Species)) -> S2)
##              Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length      0.26643     0.08518      0.18290     0.05578
## Sepal.Width       0.08518     0.09847      0.08265     0.04120
## Petal.Length      0.18290     0.08265      0.22082     0.07310
## Petal.Width       0.05578     0.04120      0.07310     0.03911
(cov(subset(iris,subset=Species=='virginica',select=-Species)) -> S3)
##              Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length      0.40434     0.09376      0.30329     0.04909
## Sepal.Width       0.09376     0.10400      0.07138     0.04763
## Petal.Length      0.30329     0.07138      0.30459     0.04882
## Petal.Width       0.04909     0.04763      0.04882     0.07543
# Degrees of freedom for the 3 species
(table(iris$Species)-1 -> dof)
## 
##     setosa versicolor  virginica 
##         49         49         49
# Pooled variance-covariance matrix
((dof[1]*S1+dof[2]*S2+dof[3]*S3)/sum(dof) -> S)
##              Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length      0.26501     0.09272      0.16751     0.03840
## Sepal.Width       0.09272     0.11539      0.05524     0.03271
## Petal.Length      0.16751     0.05524      0.18519     0.04267
## Petal.Width       0.03840     0.03271      0.04267     0.04188
# Sample mean vectors for the 3 species
(colMeans(subset(iris,subset=Species=='setosa',select=-Species)) -> m1)
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
##        5.006        3.428        1.462        0.246
(colMeans(subset(iris,subset=Species=='versicolor',select=-Species)) -> m2)
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
##        5.936        2.770        4.260        1.326
(colMeans(subset(iris,subset=Species=='virginica',select=-Species)) -> m3)
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
##        6.588        2.974        5.552        2.026
# Squared Mahalanobis distances of all objects to the 3 centers
d1 <- mahalanobis(iris[,-5], m1, S)
d2 <- mahalanobis(iris[,-5], m2, S)
d3 <- mahalanobis(iris[,-5], m3, S)
# Now place these into a data frame.
d <- data.frame(setosa=d1, versicolor=d2, virginica=d3)
# Do ties occur for any object?
any(apply(d,1,function(x)length(unique(x))<3))
## [1] FALSE
# Add classified results
d <- cbind(d, classified=names(d)[apply(d,1,which.min)])
# Add actual identities
d <- cbind(d, actual=iris$Species)
# Report missclassified individuals
subset(d, subset=classified!=actual)
##     setosa versicolor virginica classified     actual
## 71   130.9      8.670     6.507  virginica versicolor
## 84   149.0      8.439     4.864  virginica versicolor
## 134  133.1      5.253     7.236 versicolor  virginica
### Using indicator matrix of response in linear regression
#
# Create a new data frame by adding indicator matrix response to iris data
x <- cbind(iris, nnet::class.ind(iris$Species))
# Fit linear model to indicator response for 'setosa'
x.lm <- lm(setosa~Sepal.Length+Sepal.Width+Petal.Length+Petal.Width, data=x)
# See what components we have in the fitted linear model object
names(x.lm)
##  [1] "coefficients"  "residuals"     "effects"       "rank"         
##  [5] "fitted.values" "assign"        "qr"            "df.residual"  
##  [9] "xlevels"       "call"          "terms"         "model"
# Now, get all fitted responses based on indicator matrix of response
y <- cbind(setosa=x.lm$fitted,
           versicolor=qr.fitted(x.lm$qr,x[,'versicolor'],k=x.lm$rank),
           virginica=qr.fitted(x.lm$qr,x[,'virginica'],k=x.lm$qr$rank))
# Classified membership
classified <- levels(iris$Species)[apply(y,1,which.max)]
# Indices of missclassified flowers
(which(iris$Species!=classified)->imiss)
##  [1]  51  52  53  57  62  65  66  67  71  76  78  79  85  86  87  89 108
## [18] 109 120 123 130 134 135
# Actual vs missclassified membership
data.frame(actual=iris$Species,classified=classified)[imiss,]
##         actual classified
## 51  versicolor  virginica
## 52  versicolor  virginica
## 53  versicolor  virginica
## 57  versicolor  virginica
## 62  versicolor  virginica
## 65  versicolor  virginica
## 66  versicolor  virginica
## 67  versicolor  virginica
## 71  versicolor  virginica
## 76  versicolor  virginica
## 78  versicolor  virginica
## 79  versicolor  virginica
## 85  versicolor  virginica
## 86  versicolor  virginica
## 87  versicolor  virginica
## 89  versicolor  virginica
## 108  virginica versicolor
## 109  virginica versicolor
## 120  virginica versicolor
## 123  virginica versicolor
## 130  virginica versicolor
## 134  virginica versicolor
## 135  virginica versicolor
length(imiss)/nrow(iris)   # APER
## [1] 0.1533
### 3-D rotatable plot, may give insight as what variables to use
# in discriminant and classification analysis
# require rgl package
require(rgl)
## Loading required package: rgl
# Hold left mouse button and drag, move
# Hold right mouse button to zoom in/out
plot3d(iris,col=c("red","green3","blue")[as.numeric(iris$Species)])
plot3d(iris[2:4],col=c("red","green3","blue")[as.numeric(iris$Species)])
plot3d(subset(iris,subset=Species!='setosa'),
       col=c("green3","blue")[as.numeric(iris$Species)])