summary(cars)
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
# Load necessary libraries
library(MASS)
# Check the number of observations for each species
table(iris$Species)
##
## setosa versicolor virginica
## 50 50 50
# Define colors for species
lookup <- c(setosa='blue', versicolor='green', virginica='orange')
col.ind <- lookup[iris$Species]
# Pair plot to visualize relationships
pairs(iris[-5], pch=21, col="gray", bg=col.ind)
# Perform LDA
lda.fit <- lda(Species ~ ., data = iris)
print(lda.fit)
## Call:
## lda(Species ~ ., data = iris)
##
## Prior probabilities of groups:
## setosa versicolor virginica
## 0.3333333 0.3333333 0.3333333
##
## Group means:
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## setosa 5.006 3.428 1.462 0.246
## versicolor 5.936 2.770 4.260 1.326
## virginica 6.588 2.974 5.552 2.026
##
## Coefficients of linear discriminants:
## LD1 LD2
## Sepal.Length 0.8293776 -0.02410215
## Sepal.Width 1.5344731 -2.16452123
## Petal.Length -2.2012117 0.93192121
## Petal.Width -2.8104603 -2.83918785
##
## Proportion of trace:
## LD1 LD2
## 0.9912 0.0088
# Scatter plot of Sepal.Width vs. Sepal.Length
plot(Sepal.Width ~ Sepal.Length, data = iris, pch=21, col="gray", bg=col.ind)
points(lda.fit$means[,1], lda.fit$means[,2], pch=21, cex=2, col="black", bg=lookup)
# Make predictions
lda.pred <- predict(lda.fit)
head(lda.pred$x)
## LD1 LD2
## 1 8.061800 -0.3004206
## 2 7.128688 0.7866604
## 3 7.489828 0.2653845
## 4 6.813201 0.6706311
## 5 8.132309 -0.5144625
## 6 7.701947 -1.4617210
# Plot LDA results
plot(LD2 ~ LD1, data = lda.pred$x, pch=21, col="gray", bg=col.ind)
# Confusion matrix
table(pred=lda.pred$class, true=iris$Species)
## true
## pred setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 48 1
## virginica 0 2 49
# Misclassification rate
1 - mean(lda.pred$class == iris$Species)
## [1] 0.02
# Cross-validation
lda.cv <- predict(lda.fit, CV=TRUE)
In this example, we perform Linear Discriminant Analysis (LDA) on the
Smarket
dataset, following the example from An
Introduction to Statistical Learning.
# Load necessary libraries
library(MASS)
library(ISLR) # Ensure this is installed before running
# Inspect the data
summary(Smarket)
## Year Lag1 Lag2 Lag3
## Min. :2001 Min. :-4.922000 Min. :-4.922000 Min. :-4.922000
## 1st Qu.:2002 1st Qu.:-0.639500 1st Qu.:-0.639500 1st Qu.:-0.640000
## Median :2003 Median : 0.039000 Median : 0.039000 Median : 0.038500
## Mean :2003 Mean : 0.003834 Mean : 0.003919 Mean : 0.001716
## 3rd Qu.:2004 3rd Qu.: 0.596750 3rd Qu.: 0.596750 3rd Qu.: 0.596750
## Max. :2005 Max. : 5.733000 Max. : 5.733000 Max. : 5.733000
## Lag4 Lag5 Volume Today
## Min. :-4.922000 Min. :-4.92200 Min. :0.3561 Min. :-4.922000
## 1st Qu.:-0.640000 1st Qu.:-0.64000 1st Qu.:1.2574 1st Qu.:-0.639500
## Median : 0.038500 Median : 0.03850 Median :1.4229 Median : 0.038500
## Mean : 0.001636 Mean : 0.00561 Mean :1.4783 Mean : 0.003138
## 3rd Qu.: 0.596750 3rd Qu.: 0.59700 3rd Qu.:1.6417 3rd Qu.: 0.596750
## Max. : 5.733000 Max. : 5.73300 Max. :3.1525 Max. : 5.733000
## Direction
## Down:602
## Up :648
##
##
##
##
# Define training set
train <- Smarket$Year < 2005
# Fit the LDA model
lda.fit <- lda(Direction ~ Lag1 + Lag2, data = Smarket, subset = train)
print(lda.fit)
## Call:
## lda(Direction ~ Lag1 + Lag2, data = Smarket, subset = train)
##
## Prior probabilities of groups:
## Down Up
## 0.491984 0.508016
##
## Group means:
## Lag1 Lag2
## Down 0.04279022 0.03389409
## Up -0.03954635 -0.03132544
##
## Coefficients of linear discriminants:
## LD1
## Lag1 -0.6420190
## Lag2 -0.5135293
plot(lda.fit)
# Make predictions on 2005 data
lda.pred <- predict(lda.fit, Smarket[!train, ])
# Extract predicted classes
lda.class <- lda.pred$class
# True labels
Direction.2005 <- Smarket$Direction[!train]
# Confusion matrix
table(Predicted = lda.class, Actual = Direction.2005)
## Actual
## Predicted Down Up
## Down 35 35
## Up 76 106
# Compute accuracy
mean(lda.class == Direction.2005)
## [1] 0.5595238
# Inspect first 20 posterior probabilities
lda.pred$posterior[1:20, 1]
## 999 1000 1001 1002 1003 1004 1005 1006
## 0.4901792 0.4792185 0.4668185 0.4740011 0.4927877 0.4938562 0.4951016 0.4872861
## 1007 1008 1009 1010 1011 1012 1013 1014
## 0.4907013 0.4844026 0.4906963 0.5119988 0.4895152 0.4706761 0.4744593 0.4799583
## 1015 1016 1017 1018
## 0.4935775 0.5030894 0.4978806 0.4886331
lda.class[1:20]
## [1] Up Up Up Up Up Up Up Up Up Up Up Down Up Up Up
## [16] Up Up Down Up Up
## Levels: Down Up
# Count cases where the posterior probability of 'Down' is greater than 0.9
sum(lda.pred$posterior[, 1] > 0.9)
## [1] 0