Part 1 PCA in R

# checking data

summary(USArrests)
##      Murder          Assault         UrbanPop          Rape      
##  Min.   : 0.800   Min.   : 45.0   Min.   :32.00   Min.   : 7.30  
##  1st Qu.: 4.075   1st Qu.:109.0   1st Qu.:54.50   1st Qu.:15.07  
##  Median : 7.250   Median :159.0   Median :66.00   Median :20.10  
##  Mean   : 7.788   Mean   :170.8   Mean   :65.54   Mean   :21.23  
##  3rd Qu.:11.250   3rd Qu.:249.0   3rd Qu.:77.75   3rd Qu.:26.18  
##  Max.   :17.400   Max.   :337.0   Max.   :91.00   Max.   :46.00
# scaling data
scaled_df <- scale(USArrests)
head(scaled_df)
##                Murder   Assault   UrbanPop         Rape
## Alabama    1.24256408 0.7828393 -0.5209066 -0.003416473
## Alaska     0.50786248 1.1068225 -1.2117642  2.484202941
## Arizona    0.07163341 1.4788032  0.9989801  1.042878388
## Arkansas   0.23234938 0.2308680 -1.0735927 -0.184916602
## California 0.27826823 1.2628144  1.7589234  2.067820292
## Colorado   0.02571456 0.3988593  0.8608085  1.864967207
# calculate eigenvalues and eigenvectors
arrests.cov <-cov(scaled_df)
arrests.eigen <- eigen(arrests.cov)
str(arrests.eigen)
## List of 2
##  $ values : num [1:4] 2.48 0.99 0.357 0.173
##  $ vectors: num [1:4, 1:4] -0.536 -0.583 -0.278 -0.543 0.418 ...
##  - attr(*, "class")= chr "eigen"
# Extract the loading
phi <- arrests.eigen$vectors[, 1:2]
print(phi)
##            [,1]       [,2]
## [1,] -0.5358995  0.4181809
## [2,] -0.5831836  0.1879856
## [3,] -0.2781909 -0.8728062
## [4,] -0.5434321 -0.1673186
library(magrittr)
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 4.5.1
phi <- -1*phi
row.names(phi) <- c("Murder", "Assault", "UrbanPop", "Rape")
colnames(phi) <- c("PC1", "PC2")
kbl(phi) %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
PC1 PC2
Murder 0.5358995 -0.4181809
Assault 0.5831836 -0.1879856
UrbanPop 0.2781909 0.8728062
Rape 0.5434321 0.1673186
# calculate principal components scores
PC1 <- as.matrix(scaled_df) %*% phi[,1]
PC2 <- as.matrix(scaled_df) %*% phi[,2]

# create data frame with principal components scores
PC <- data.frame(State = row.names(USArrests), PC1,PC2)
head(PC)
##                 State        PC1        PC2
## Alabama       Alabama  0.9756604 -1.1220012
## Alaska         Alaska  1.9305379 -1.0624269
## Arizona       Arizona  1.7454429  0.7384595
## Arkansas     Arkansas -0.1399989 -1.1085423
## California California  2.4986128  1.5274267
## Colorado     Colorado  1.4993407  0.9776297
library(ggplot2)
gra <- ggplot(PC, aes(x = PC1, y = PC2, label = State))+
  geom_point() +
  geom_text(aes(label = State)) +
  labs(title = "First Two Principal Components of USArrests Data",
      x = "First Principal Component",
      y = "Second Principal Component")
gra

PVE <- arrests.eigen$values / sum(arrests.eigen$values)
round(PVE, 2)
## [1] 0.62 0.25 0.09 0.04
pve_df <- data.frame(
  Component = 1:length(PVE),
  PVE = PVE
)

pve_df$Cumulative <- cumsum(pve_df$PVE)

# scree plot
graph1 <- ggplot(pve_df, aes(x = Component, y = PVE)) +
  geom_point() +
  geom_line() +
  ylim(0,1) + 
  labs(title = "Scree Plot",
       x = "Principal Component",
       y = "PVE")

# cumulative scree plot
graph2 <- ggplot(pve_df, aes(x = Component, y =Cumulative)) +
  geom_point() +
  geom_line() +
  ylim(0,1) +
  labs(title = "Cumulative Scree Plot",
       x = "Principal Component",
       y = "PVE")
  
library(gridExtra)
gridExtra::grid.arrange(graph1,graph2, ncol = 2)

# PCA built-in analysis

pca_result <- prcomp(USArrests, scale = TRUE)
pca_result$rotation <- -pca_result$rotation
pca_result$x <- - pca_result$x
biplot(pca_result, scale = 0)

Part 2. Discriminant Analysis

library(MASS)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
## 
##     select
## The following object is masked from 'package:gridExtra':
## 
##     combine
## The following object is masked from 'package:kableExtra':
## 
##     group_rows
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ISLR)
## Warning: package 'ISLR' was built under R version 4.5.1
select <- dplyr::select
# check the data
head(Smarket)
##   Year   Lag1   Lag2   Lag3   Lag4   Lag5 Volume  Today Direction
## 1 2001  0.381 -0.192 -2.624 -1.055  5.010 1.1913  0.959        Up
## 2 2001  0.959  0.381 -0.192 -2.624 -1.055 1.2965  1.032        Up
## 3 2001  1.032  0.959  0.381 -0.192 -2.624 1.4112 -0.623      Down
## 4 2001 -0.623  1.032  0.959  0.381 -0.192 1.2760  0.614        Up
## 5 2001  0.614 -0.623  1.032  0.959  0.381 1.2057  0.213        Up
## 6 2001  0.213  0.614 -0.623  1.032  0.959 1.3491  1.392        Up
train = Smarket %>%
  filter(Year < 2005)

test = Smarket %>%
  filter(Year >= 2005)

model_LDA = lda(Direction~Lag1+Lag2, data=train)
print(model_LDA)
## Call:
## lda(Direction ~ Lag1 + Lag2, data = train)
## 
## Prior probabilities of groups:
##     Down       Up 
## 0.491984 0.508016 
## 
## Group means:
##             Lag1        Lag2
## Down  0.04279022  0.03389409
## Up   -0.03954635 -0.03132544
## 
## Coefficients of linear discriminants:
##             LD1
## Lag1 -0.6420190
## Lag2 -0.5135293
# visualization
plot(model_LDA)

predictions_LDA = data.frame(predict(model_LDA, test))
names(predictions_LDA)
## [1] "class"          "posterior.Down" "posterior.Up"   "LD1"
predictions_LDA = cbind(test, predictions_LDA)

predictions_LDA %>%
  count(class, Direction)
##   class Direction   n
## 1  Down      Down  35
## 2  Down        Up  35
## 3    Up      Down  76
## 4    Up        Up 106
predictions_LDA %>%
  summarize(score = mean(class == Direction))
##       score
## 1 0.5595238
# logistic model, for comparison

model_logistic = glm(Direction~Lag1+Lag2, data = train, family=binomial)

logistic_probs = data.frame(probs = predict(model_logistic, test, type="response"))

predictions_logistic = logistic_probs %>%
  mutate(class = ifelse(probs>.5, "Up", "Down"))

predictions_logistic = cbind(test, predictions_logistic)

predictions_logistic %>%
  count(class, Direction)
##   class Direction   n
## 1  Down      Down  35
## 2  Down        Up  35
## 3    Up      Down  76
## 4    Up        Up 106
predictions_logistic %>%
  summarize(score = mean(class == Direction))
##       score
## 1 0.5595238
# Quadratic discriminant analysis

model_QDA = qda(Direction~Lag1+Lag2, data=train)
model_QDA
## Call:
## qda(Direction ~ Lag1 + Lag2, data = train)
## 
## Prior probabilities of groups:
##     Down       Up 
## 0.491984 0.508016 
## 
## Group means:
##             Lag1        Lag2
## Down  0.04279022  0.03389409
## Up   -0.03954635 -0.03132544
predictions_QDA = data.frame(predict(model_QDA, test))

predictions_QDA = cbind(test, predictions_QDA)

predictions_QDA %>%
  count(class,Direction)
##   class Direction   n
## 1  Down      Down  30
## 2  Down        Up  20
## 3    Up      Down  81
## 4    Up        Up 121
predictions_QDA %>%
  summarise(score = mean(class == Direction))
##       score
## 1 0.5992063
# Carseats data
summary(Carseats)
##      Sales          CompPrice       Income        Advertising    
##  Min.   : 0.000   Min.   : 77   Min.   : 21.00   Min.   : 0.000  
##  1st Qu.: 5.390   1st Qu.:115   1st Qu.: 42.75   1st Qu.: 0.000  
##  Median : 7.490   Median :125   Median : 69.00   Median : 5.000  
##  Mean   : 7.496   Mean   :125   Mean   : 68.66   Mean   : 6.635  
##  3rd Qu.: 9.320   3rd Qu.:135   3rd Qu.: 91.00   3rd Qu.:12.000  
##  Max.   :16.270   Max.   :175   Max.   :120.00   Max.   :29.000  
##    Population        Price        ShelveLoc        Age          Education   
##  Min.   : 10.0   Min.   : 24.0   Bad   : 96   Min.   :25.00   Min.   :10.0  
##  1st Qu.:139.0   1st Qu.:100.0   Good  : 85   1st Qu.:39.75   1st Qu.:12.0  
##  Median :272.0   Median :117.0   Medium:219   Median :54.50   Median :14.0  
##  Mean   :264.8   Mean   :115.8                Mean   :53.32   Mean   :13.9  
##  3rd Qu.:398.5   3rd Qu.:131.0                3rd Qu.:66.00   3rd Qu.:16.0  
##  Max.   :509.0   Max.   :191.0                Max.   :80.00   Max.   :18.0  
##  Urban       US     
##  No :118   No :142  
##  Yes:282   Yes:258  
##                     
##                     
##                     
## 
head(Carseats)
##   Sales CompPrice Income Advertising Population Price ShelveLoc Age Education
## 1  9.50       138     73          11        276   120       Bad  42        17
## 2 11.22       111     48          16        260    83      Good  65        10
## 3 10.06       113     35          10        269    80    Medium  59        12
## 4  7.40       117    100           4        466    97    Medium  55        14
## 5  4.15       141     64           3        340   128       Bad  38        13
## 6 10.81       124    113          13        501    72       Bad  78        16
##   Urban  US
## 1   Yes Yes
## 2   Yes Yes
## 3   Yes Yes
## 4   Yes Yes
## 5   Yes  No
## 6    No Yes
# split the data into 7:3
set.seed(123)
sample_index <- sample(1:nrow(Carseats), 0.7 * nrow(Carseats))
train_app <- Carseats[sample_index, ]
test_app <- Carseats[-sample_index, ]

model_LDA_app = lda(ShelveLoc~Sales+CompPrice+Income+Advertising+Population+Price+Age, data=train_app)
print(model_LDA_app)
## Call:
## lda(ShelveLoc ~ Sales + CompPrice + Income + Advertising + Population + 
##     Price + Age, data = train_app)
## 
## Prior probabilities of groups:
##       Bad      Good    Medium 
## 0.2428571 0.2071429 0.5500000 
## 
## Group means:
##            Sales CompPrice   Income Advertising Population    Price      Age
## Bad     5.494265  125.7941 70.60294    5.617647   271.4265 114.5588 50.66176
## Good   10.222069  125.5000 66.67241    7.310345   261.9138 117.6379 51.24138
## Medium  7.247338  125.4870 67.86364    6.837662   266.2078 116.5649 54.38312
## 
## Coefficients of linear discriminants:
##                       LD1           LD2
## Sales        0.9936930918 -0.0323577436
## CompPrice   -0.0994891459 -0.0020342161
## Income      -0.0144712965 -0.0071636005
## Advertising -0.1071881086  0.0532279412
## Population  -0.0005356228 -0.0007702285
## Price        0.0979901846  0.0059272162
## Age          0.0462021888  0.0555535766
## 
## Proportion of trace:
##    LD1    LD2 
## 0.9948 0.0052
prediction_LDA_app = data.frame(predict(model_LDA_app, test_app))
names(prediction_LDA_app)
## [1] "class"            "posterior.Bad"    "posterior.Good"   "posterior.Medium"
## [5] "x.LD1"            "x.LD2"
prediction_LDA_app = cbind(test_app, prediction_LDA_app)

prediction_LDA_app %>%
  count(class, ShelveLoc)
##    class ShelveLoc  n
## 1    Bad       Bad 14
## 2    Bad    Medium  5
## 3   Good      Good 25
## 4   Good    Medium  3
## 5 Medium       Bad 14
## 6 Medium      Good  2
## 7 Medium    Medium 57
prediction_LDA_app %>%
  summarize(score= mean(class == ShelveLoc))
##   score
## 1   0.8
model_QDA_app = qda(ShelveLoc~Sales+CompPrice+Income+Advertising+Population+Price+Age, data=train_app)
model_QDA_app
## Call:
## qda(ShelveLoc ~ Sales + CompPrice + Income + Advertising + Population + 
##     Price + Age, data = train_app)
## 
## Prior probabilities of groups:
##       Bad      Good    Medium 
## 0.2428571 0.2071429 0.5500000 
## 
## Group means:
##            Sales CompPrice   Income Advertising Population    Price      Age
## Bad     5.494265  125.7941 70.60294    5.617647   271.4265 114.5588 50.66176
## Good   10.222069  125.5000 66.67241    7.310345   261.9138 117.6379 51.24138
## Medium  7.247338  125.4870 67.86364    6.837662   266.2078 116.5649 54.38312
prediction_QDA_app = data.frame(predict(model_QDA_app, test_app))

predictions_QDA_app = cbind(test_app,prediction_QDA_app )

predictions_QDA_app %>%
  count(class,ShelveLoc)
##    class ShelveLoc  n
## 1    Bad       Bad 14
## 2    Bad    Medium  4
## 3   Good      Good 25
## 4   Good    Medium  3
## 5 Medium       Bad 14
## 6 Medium      Good  2
## 7 Medium    Medium 58
predictions_QDA_app %>%
  summarize(score=mean(class == ShelveLoc))
##       score
## 1 0.8083333