Part 1. PCA in R
# checking data
summary(USArrests)
## Murder Assault UrbanPop Rape
## Min. : 0.800 Min. : 45.0 Min. :32.00 Min. : 7.30
## 1st Qu.: 4.075 1st Qu.:109.0 1st Qu.:54.50 1st Qu.:15.07
## Median : 7.250 Median :159.0 Median :66.00 Median :20.10
## Mean : 7.788 Mean :170.8 Mean :65.54 Mean :21.23
## 3rd Qu.:11.250 3rd Qu.:249.0 3rd Qu.:77.75 3rd Qu.:26.18
## Max. :17.400 Max. :337.0 Max. :91.00 Max. :46.00
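The summary shows why scaling matters here: Assault ranges up to 337 while Murder tops out at 17.4, so the raw variances differ by orders of magnitude and an unscaled PCA would be dominated almost entirely by Assault. A quick check before scaling:
# raw variances: Assault's variance dwarfs the others, so scale before PCA
apply(USArrests, 2, var)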
# scaling data
scaled_df <- scale(USArrests)
head(scaled_df)
## Murder Assault UrbanPop Rape
## Alabama 1.24256408 0.7828393 -0.5209066 -0.003416473
## Alaska 0.50786248 1.1068225 -1.2117642 2.484202941
## Arizona 0.07163341 1.4788032 0.9989801 1.042878388
## Arkansas 0.23234938 0.2308680 -1.0735927 -0.184916602
## California 0.27826823 1.2628144 1.7589234 2.067820292
## Colorado 0.02571456 0.3988593 0.8608085 1.864967207
# calculate eigenvalues and eigenvectors
arrests.cov <- cov(scaled_df)
arrests.eigen <- eigen(arrests.cov)
str(arrests.eigen)
## List of 2
## $ values : num [1:4] 2.48 0.99 0.357 0.173
## $ vectors: num [1:4, 1:4] -0.536 -0.583 -0.278 -0.543 0.418 ...
## - attr(*, "class")= chr "eigen"
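The $values component holds the variance of the data along each principal component, and the columns of $vectors are the corresponding loading vectors. The eigenvalues sum to 4, the number of (scaled) variables.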
# extract the loadings (eigenvectors of the first two principal components)
phi <- arrests.eigen$vectors[, 1:2]
print(phi)
## [,1] [,2]
## [1,] -0.5358995 0.4181809
## [2,] -0.5831836 0.1879856
## [3,] -0.2781909 -0.8728062
## [4,] -0.5434321 -0.1673186
library(magrittr)
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 4.5.1
# eigenvectors are only determined up to sign; flip so the crime loadings are positive
phi <- -1 * phi
row.names(phi) <- c("Murder", "Assault", "UrbanPop", "Rape")
colnames(phi) <- c("PC1", "PC2")
kbl(phi) %>%
kable_styling(bootstrap_options = c("striped", "hover"))
|          |       PC1 |        PC2 |
|----------|-----------|------------|
| Murder   | 0.5358995 | -0.4181809 |
| Assault  | 0.5831836 | -0.1879856 |
| UrbanPop | 0.2781909 |  0.8728062 |
| Rape     | 0.5434321 |  0.1673186 |
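PC1 puts roughly equal weight on the three crime variables (Murder, Assault, Rape) and much less on UrbanPop, so it can be read as an overall measure of serious crime; PC2 is dominated by UrbanPop and mostly captures the level of urbanization.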
# calculate principal component scores
PC1 <- as.matrix(scaled_df) %*% phi[,1]
PC2 <- as.matrix(scaled_df) %*% phi[,2]
# create a data frame with the principal component scores
PC <- data.frame(State = row.names(USArrests), PC1, PC2)
head(PC)
## State PC1 PC2
## Alabama Alabama 0.9756604 -1.1220012
## Alaska Alaska 1.9305379 -1.0624269
## Arizona Arizona 1.7454429 0.7384595
## Arkansas Arkansas -0.1399989 -1.1085423
## California California 2.4986128 1.5274267
## Colorado Colorado 1.4993407 0.9776297
library(ggplot2)
gra <- ggplot(PC, aes(x = PC1, y = PC2, label = State)) +
geom_point() +
geom_text() +
labs(title = "First Two Principal Components of USArrests Data",
x = "First Principal Component",
y = "Second Principal Component")
gra
# proportion of variance explained: each eigenvalue over the sum of all eigenvalues
PVE <- arrests.eigen$values / sum(arrests.eigen$values)
round(PVE, 2)
## [1] 0.62 0.25 0.09 0.04
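The first component alone accounts for 62% of the total variance, and the first two together account for about 87%, which justifies summarizing the data with the two-dimensional plot above.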
pve_df <- data.frame(
Component = 1:length(PVE),
PVE = PVE
)
pve_df$Cumulative <- cumsum(pve_df$PVE)
# scree plot
graph1 <- ggplot(pve_df, aes(x = Component, y = PVE)) +
geom_point() +
geom_line() +
ylim(0,1) +
labs(title = "Scree Plot",
x = "Principal Component",
y = "PVE")
# cumulative scree plot
graph2 <- ggplot(pve_df, aes(x = Component, y = Cumulative)) +
geom_point() +
geom_line() +
ylim(0,1) +
labs(title = "Cumulative Scree Plot",
x = "Principal Component",
y = "Cumulative PVE")
library(gridExtra)
gridExtra::grid.arrange(graph1, graph2, ncol = 2)
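Both plots show an elbow after the second component: PVE falls from 25% to 9% between PC2 and PC3, so retaining two components is a reasonable choice here.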
# PCA with the built-in prcomp() function
pca_result <- prcomp(USArrests, scale = TRUE)
# flip signs so the output matches the manual analysis above
pca_result$rotation <- -pca_result$rotation
pca_result$x <- -pca_result$x
biplot(pca_result, scale = 0)
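As a sanity check, the prcomp() results should agree with the manual eigen-decomposition up to numerical tolerance. A minimal sketch (absolute values are compared, since eigenvector signs are arbitrary):
# loadings and scores from prcomp() should match the manual calculation
all.equal(abs(unname(pca_result$rotation[, 1:2])), abs(unname(phi)))
all.equal(abs(unname(pca_result$x[, 1])), abs(as.vector(PC1)))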
Part 2. Discriminant Analysis
library(MASS)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
##
## select
## The following object is masked from 'package:gridExtra':
##
## combine
## The following object is masked from 'package:kableExtra':
##
## group_rows
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ISLR)
## Warning: package 'ISLR' was built under R version 4.5.1
# resolve the masking conflict: use dplyr's select(), not MASS's
select <- dplyr::select
# check the data
head(Smarket)
## Year Lag1 Lag2 Lag3 Lag4 Lag5 Volume Today Direction
## 1 2001 0.381 -0.192 -2.624 -1.055 5.010 1.1913 0.959 Up
## 2 2001 0.959 0.381 -0.192 -2.624 -1.055 1.2965 1.032 Up
## 3 2001 1.032 0.959 0.381 -0.192 -2.624 1.4112 -0.623 Down
## 4 2001 -0.623 1.032 0.959 0.381 -0.192 1.2760 0.614 Up
## 5 2001 0.614 -0.623 1.032 0.959 0.381 1.2057 0.213 Up
## 6 2001 0.213 0.614 -0.623 1.032 0.959 1.3491 1.392 Up
train = Smarket %>%
filter(Year < 2005)
test = Smarket %>%
filter(Year >= 2005)
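This is a temporal split rather than a random one: the model is trained on 2001-2004 and evaluated on the held-out 2005 observations, mimicking a real forecasting setting. A quick size check:
nrow(train) # 998 observations (2001-2004)
nrow(test) # 252 observations (2005)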
model_LDA = lda(Direction~Lag1+Lag2, data=train)
print(model_LDA)
## Call:
## lda(Direction ~ Lag1 + Lag2, data = train)
##
## Prior probabilities of groups:
## Down Up
## 0.491984 0.508016
##
## Group means:
## Lag1 Lag2
## Down 0.04279022 0.03389409
## Up -0.03954635 -0.03132544
##
## Coefficients of linear discriminants:
## LD1
## Lag1 -0.6420190
## Lag2 -0.5135293
# visualization
plot(model_LDA)
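For a two-class problem, plot() on an lda fit draws histograms of the LD1 scores separately for each class; the heavy overlap between the Down and Up groups here already suggests the classes will be hard to separate.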
predictions_LDA = data.frame(predict(model_LDA, test))
names(predictions_LDA)
## [1] "class" "posterior.Down" "posterior.Up" "LD1"
predictions_LDA = cbind(test, predictions_LDA)
predictions_LDA %>%
count(class, Direction)
## class Direction n
## 1 Down Down 35
## 2 Down Up 35
## 3 Up Down 76
## 4 Up Up 106
predictions_LDA %>%
summarize(score = mean(class == Direction))
## score
## 1 0.5595238
# logistic model, for comparison
model_logistic = glm(Direction~Lag1+Lag2, data = train, family=binomial)
logistic_probs = data.frame(probs = predict(model_logistic, test, type="response"))
predictions_logistic = logistic_probs %>%
mutate(class = ifelse(probs>.5, "Up", "Down"))
predictions_logistic = cbind(test, predictions_logistic)
predictions_logistic %>%
count(class, Direction)
## class Direction n
## 1 Down Down 35
## 2 Down Up 35
## 3 Up Down 76
## 4 Up Up 106
predictions_logistic %>%
summarize(score = mean(class == Direction))
## score
## 1 0.5595238
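Logistic regression yields exactly the same confusion matrix and 56% test accuracy as LDA. This is unsurprising: both methods produce linear decision boundaries in Lag1 and Lag2, and on this noisy data they end up making identical classifications.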
# Quadratic discriminant analysis
model_QDA = qda(Direction~Lag1+Lag2, data=train)
model_QDA
## Call:
## qda(Direction ~ Lag1 + Lag2, data = train)
##
## Prior probabilities of groups:
## Down Up
## 0.491984 0.508016
##
## Group means:
## Lag1 Lag2
## Down 0.04279022 0.03389409
## Up -0.03954635 -0.03132544
predictions_QDA = data.frame(predict(model_QDA, test))
predictions_QDA = cbind(test, predictions_QDA)
predictions_QDA %>%
count(class, Direction)
## class Direction n
## 1 Down Down 30
## 2 Down Up 20
## 3 Up Down 81
## 4 Up Up 121
predictions_QDA %>%
summarize(score = mean(class == Direction))
## score
## 1 0.5992063
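QDA raises test accuracy to about 60%, versus 56% for the two linear methods, suggesting that a quadratic decision boundary captures the relationship between the lagged returns and market direction somewhat better than a linear one.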
# Carseats data
summary(Carseats)
## Sales CompPrice Income Advertising
## Min. : 0.000 Min. : 77 Min. : 21.00 Min. : 0.000
## 1st Qu.: 5.390 1st Qu.:115 1st Qu.: 42.75 1st Qu.: 0.000
## Median : 7.490 Median :125 Median : 69.00 Median : 5.000
## Mean : 7.496 Mean :125 Mean : 68.66 Mean : 6.635
## 3rd Qu.: 9.320 3rd Qu.:135 3rd Qu.: 91.00 3rd Qu.:12.000
## Max. :16.270 Max. :175 Max. :120.00 Max. :29.000
##    Population        Price        ShelveLoc        Age          Education
##  Min.   : 10.0   Min.   : 24.0   Bad   : 96   Min.   :25.00   Min.   :10.0
##  1st Qu.:139.0   1st Qu.:100.0   Good  : 85   1st Qu.:39.75   1st Qu.:12.0
##  Median :272.0   Median :117.0   Medium:219   Median :54.50   Median :14.0
##  Mean   :264.8   Mean   :115.8                Mean   :53.32   Mean   :13.9
##  3rd Qu.:398.5   3rd Qu.:131.0                3rd Qu.:66.00   3rd Qu.:16.0
##  Max.   :509.0   Max.   :191.0                Max.   :80.00   Max.   :18.0
##  Urban       US
##  No :118   No :142
##  Yes:282   Yes:258
head(Carseats)
## Sales CompPrice Income Advertising Population Price ShelveLoc Age Education
## 1 9.50 138 73 11 276 120 Bad 42 17
## 2 11.22 111 48 16 260 83 Good 65 10
## 3 10.06 113 35 10 269 80 Medium 59 12
## 4 7.40 117 100 4 466 97 Medium 55 14
## 5 4.15 141 64 3 340 128 Bad 38 13
## 6 10.81 124 113 13 501 72 Bad 78 16
## Urban US
## 1 Yes Yes
## 2 Yes Yes
## 3 Yes Yes
## 4 Yes Yes
## 5 Yes No
## 6 No Yes
# split the data 70/30 into training and test sets (seed fixed for reproducibility)
set.seed(123)
sample_index <- sample(1:nrow(Carseats), 0.7 * nrow(Carseats))
train_app <- Carseats[sample_index, ]
test_app <- Carseats[-sample_index, ]
model_LDA_app = lda(ShelveLoc~Sales+CompPrice+Income+Advertising+Population+Price+Age, data=train_app)
print(model_LDA_app)
## Call:
## lda(ShelveLoc ~ Sales + CompPrice + Income + Advertising + Population +
## Price + Age, data = train_app)
##
## Prior probabilities of groups:
## Bad Good Medium
## 0.2428571 0.2071429 0.5500000
##
## Group means:
## Sales CompPrice Income Advertising Population Price Age
## Bad 5.494265 125.7941 70.60294 5.617647 271.4265 114.5588 50.66176
## Good 10.222069 125.5000 66.67241 7.310345 261.9138 117.6379 51.24138
## Medium 7.247338 125.4870 67.86364 6.837662 266.2078 116.5649 54.38312
##
## Coefficients of linear discriminants:
## LD1 LD2
## Sales 0.9936930918 -0.0323577436
## CompPrice -0.0994891459 -0.0020342161
## Income -0.0144712965 -0.0071636005
## Advertising -0.1071881086 0.0532279412
## Population -0.0005356228 -0.0007702285
## Price 0.0979901846 0.0059272162
## Age 0.0462021888 0.0555535766
##
## Proportion of trace:
## LD1 LD2
## 0.9948 0.0052
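The proportion of trace shows that LD1, which weights Sales most heavily, carries 99.5% of the between-class separation; LD2 contributes almost nothing.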
prediction_LDA_app = data.frame(predict(model_LDA_app, test_app))
names(prediction_LDA_app)
## [1] "class" "posterior.Bad" "posterior.Good" "posterior.Medium"
## [5] "x.LD1" "x.LD2"
prediction_LDA_app = cbind(test_app, prediction_LDA_app)
prediction_LDA_app %>%
count(class, ShelveLoc)
## class ShelveLoc n
## 1 Bad Bad 14
## 2 Bad Medium 5
## 3 Good Good 25
## 4 Good Medium 3
## 5 Medium Bad 14
## 6 Medium Good 2
## 7 Medium Medium 57
prediction_LDA_app %>%
summarize(score = mean(class == ShelveLoc))
## score
## 1 0.8
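LDA classifies 80% of the held-out stores correctly. The long count() output can also be collapsed into a conventional confusion matrix with base table(); a minimal equivalent sketch:
# predicted vs. actual shelf location, in matrix form
with(prediction_LDA_app, table(Predicted = class, Actual = ShelveLoc))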
model_QDA_app = qda(ShelveLoc~Sales+CompPrice+Income+Advertising+Population+Price+Age, data=train_app)
model_QDA_app
## Call:
## qda(ShelveLoc ~ Sales + CompPrice + Income + Advertising + Population +
## Price + Age, data = train_app)
##
## Prior probabilities of groups:
## Bad Good Medium
## 0.2428571 0.2071429 0.5500000
##
## Group means:
## Sales CompPrice Income Advertising Population Price Age
## Bad 5.494265 125.7941 70.60294 5.617647 271.4265 114.5588 50.66176
## Good 10.222069 125.5000 66.67241 7.310345 261.9138 117.6379 51.24138
## Medium 7.247338 125.4870 67.86364 6.837662 266.2078 116.5649 54.38312
prediction_QDA_app = data.frame(predict(model_QDA_app, test_app))
predictions_QDA_app = cbind(test_app, prediction_QDA_app)
predictions_QDA_app %>%
count(class, ShelveLoc)
## class ShelveLoc n
## 1 Bad Bad 14
## 2 Bad Medium 4
## 3 Good Good 25
## 4 Good Medium 3
## 5 Medium Bad 14
## 6 Medium Good 2
## 7 Medium Medium 58
predictions_QDA_app %>%
summarize(score = mean(class == ShelveLoc))
## score
## 1 0.8083333
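On this split QDA edges out LDA only slightly (80.8% vs. 80% test accuracy), so the extra flexibility of class-specific covariance matrices buys little here, and the simpler LDA model is a defensible choice.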