Number 1

summary(cars)
##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Linear Discriminant Analysis (LDA) Example

# Load necessary libraries
library(MASS)

# Check the number of observations for each species
table(iris$Species)
## 
##     setosa versicolor  virginica 
##         50         50         50
# Define colors for species
lookup <- c(setosa='blue', versicolor='green', virginica='orange')
col.ind <- lookup[iris$Species]

# Pair plot to visualize relationships
pairs(iris[-5], pch=21, col="gray", bg=col.ind)

# Perform LDA
lda.fit <- lda(Species ~ ., data = iris)
print(lda.fit)
## Call:
## lda(Species ~ ., data = iris)
## 
## Prior probabilities of groups:
##     setosa versicolor  virginica 
##  0.3333333  0.3333333  0.3333333 
## 
## Group means:
##            Sepal.Length Sepal.Width Petal.Length Petal.Width
## setosa            5.006       3.428        1.462       0.246
## versicolor        5.936       2.770        4.260       1.326
## virginica         6.588       2.974        5.552       2.026
## 
## Coefficients of linear discriminants:
##                     LD1         LD2
## Sepal.Length  0.8293776 -0.02410215
## Sepal.Width   1.5344731 -2.16452123
## Petal.Length -2.2012117  0.93192121
## Petal.Width  -2.8104603 -2.83918785
## 
## Proportion of trace:
##    LD1    LD2 
## 0.9912 0.0088
# Scatter plot of Sepal.Width vs. Sepal.Length
plot(Sepal.Width ~ Sepal.Length, data = iris, pch=21, col="gray", bg=col.ind)
points(lda.fit$means[,1], lda.fit$means[,2], pch=21, cex=2, col="black", bg=lookup)

# Make predictions
lda.pred <- predict(lda.fit)
head(lda.pred$x)
##        LD1        LD2
## 1 8.061800 -0.3004206
## 2 7.128688  0.7866604
## 3 7.489828  0.2653845
## 4 6.813201  0.6706311
## 5 8.132309 -0.5144625
## 6 7.701947 -1.4617210
# Plot LDA results
plot(LD2 ~ LD1, data = lda.pred$x, pch=21, col="gray", bg=col.ind)

# Confusion matrix
table(pred=lda.pred$class, true=iris$Species)
##             true
## pred         setosa versicolor virginica
##   setosa         50          0         0
##   versicolor      0         48         1
##   virginica       0          2        49
# Misclassification rate
1 - mean(lda.pred$class == iris$Species)
## [1] 0.02
# Cross-validation
lda.cv <- predict(lda.fit, CV=TRUE)

Number 2

In this example, we perform Linear Discriminant Analysis (LDA) on the Smarket dataset, following the example from An Introduction to Statistical Learning.

# Load necessary libraries
library(MASS)
library(ISLR)  # Ensure this is installed before running

# Inspect the data
summary(Smarket)
##       Year           Lag1                Lag2                Lag3          
##  Min.   :2001   Min.   :-4.922000   Min.   :-4.922000   Min.   :-4.922000  
##  1st Qu.:2002   1st Qu.:-0.639500   1st Qu.:-0.639500   1st Qu.:-0.640000  
##  Median :2003   Median : 0.039000   Median : 0.039000   Median : 0.038500  
##  Mean   :2003   Mean   : 0.003834   Mean   : 0.003919   Mean   : 0.001716  
##  3rd Qu.:2004   3rd Qu.: 0.596750   3rd Qu.: 0.596750   3rd Qu.: 0.596750  
##  Max.   :2005   Max.   : 5.733000   Max.   : 5.733000   Max.   : 5.733000  
##       Lag4                Lag5              Volume           Today          
##  Min.   :-4.922000   Min.   :-4.92200   Min.   :0.3561   Min.   :-4.922000  
##  1st Qu.:-0.640000   1st Qu.:-0.64000   1st Qu.:1.2574   1st Qu.:-0.639500  
##  Median : 0.038500   Median : 0.03850   Median :1.4229   Median : 0.038500  
##  Mean   : 0.001636   Mean   : 0.00561   Mean   :1.4783   Mean   : 0.003138  
##  3rd Qu.: 0.596750   3rd Qu.: 0.59700   3rd Qu.:1.6417   3rd Qu.: 0.596750  
##  Max.   : 5.733000   Max.   : 5.73300   Max.   :3.1525   Max.   : 5.733000  
##  Direction 
##  Down:602  
##  Up  :648  
##            
##            
##            
## 
# Define training set
train <- Smarket$Year < 2005

# Fit the LDA model
lda.fit <- lda(Direction ~ Lag1 + Lag2, data = Smarket, subset = train)
print(lda.fit)
## Call:
## lda(Direction ~ Lag1 + Lag2, data = Smarket, subset = train)
## 
## Prior probabilities of groups:
##     Down       Up 
## 0.491984 0.508016 
## 
## Group means:
##             Lag1        Lag2
## Down  0.04279022  0.03389409
## Up   -0.03954635 -0.03132544
## 
## Coefficients of linear discriminants:
##             LD1
## Lag1 -0.6420190
## Lag2 -0.5135293

Plot LDA

plot(lda.fit)

Predictions

# Make predictions on 2005 data
lda.pred <- predict(lda.fit, Smarket[!train, ])

# Extract predicted classes
lda.class <- lda.pred$class

# True labels
Direction.2005 <- Smarket$Direction[!train]

# Confusion matrix
table(Predicted = lda.class, Actual = Direction.2005)
##          Actual
## Predicted Down  Up
##      Down   35  35
##      Up     76 106
# Compute accuracy
mean(lda.class == Direction.2005)
## [1] 0.5595238

Posterior Probabilities

# Inspect first 20 posterior probabilities
lda.pred$posterior[1:20, 1]
##       999      1000      1001      1002      1003      1004      1005      1006 
## 0.4901792 0.4792185 0.4668185 0.4740011 0.4927877 0.4938562 0.4951016 0.4872861 
##      1007      1008      1009      1010      1011      1012      1013      1014 
## 0.4907013 0.4844026 0.4906963 0.5119988 0.4895152 0.4706761 0.4744593 0.4799583 
##      1015      1016      1017      1018 
## 0.4935775 0.5030894 0.4978806 0.4886331
lda.class[1:20]
##  [1] Up   Up   Up   Up   Up   Up   Up   Up   Up   Up   Up   Down Up   Up   Up  
## [16] Up   Up   Down Up   Up  
## Levels: Down Up

Adjusting the Probability Threshold

# Count cases where the posterior probability of 'Down' is greater than 0.9
sum(lda.pred$posterior[, 1] > 0.9)
## [1] 0