library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.2
## ✔ ggplot2 3.5.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(gridExtra)
##
## Attaching package: 'gridExtra'
##
## The following object is masked from 'package:dplyr':
##
## combine
library(kableExtra)
##
## Attaching package: 'kableExtra'
##
## The following object is masked from 'package:dplyr':
##
## group_rows
data("USArrests")
dt = head(USArrests, 10)
kbl(dt) %>%
kable_minimal()
|
|
Murder
|
Assault
|
UrbanPop
|
Rape
|
|
Alabama
|
13.2
|
236
|
58
|
21.2
|
|
Alaska
|
10.0
|
263
|
48
|
44.5
|
|
Arizona
|
8.1
|
294
|
80
|
31.0
|
|
Arkansas
|
8.8
|
190
|
50
|
19.5
|
|
California
|
9.0
|
276
|
91
|
40.6
|
|
Colorado
|
7.9
|
204
|
78
|
38.7
|
|
Connecticut
|
3.3
|
110
|
77
|
11.1
|
|
Delaware
|
5.9
|
238
|
72
|
15.8
|
|
Florida
|
15.4
|
335
|
80
|
31.9
|
|
Georgia
|
17.4
|
211
|
60
|
25.8
|
dt = apply(USArrests, 2, var)
kbl(dt) %>%
kable_styling(bootstrap_options = c("striped", "hover"))
|
|
x
|
|
Murder
|
18.97047
|
|
Assault
|
6945.16571
|
|
UrbanPop
|
209.51878
|
|
Rape
|
87.72916
|
scaled_df <- apply(USArrests, 2, scale)
dt = head(scaled_df)
kbl(dt) %>%
kable_styling(bootstrap_options = c("striped", "hover"))
|
Murder
|
Assault
|
UrbanPop
|
Rape
|
|
1.2425641
|
0.7828393
|
-0.5209066
|
-0.0034165
|
|
0.5078625
|
1.1068225
|
-1.2117642
|
2.4842029
|
|
0.0716334
|
1.4788032
|
0.9989801
|
1.0428784
|
|
0.2323494
|
0.2308680
|
-1.0735927
|
-0.1849166
|
|
0.2782682
|
1.2628144
|
1.7589234
|
2.0678203
|
|
0.0257146
|
0.3988593
|
0.8608085
|
1.8649672
|
arrests.cov <- cov(scaled_df)
arrests.eigen <- eigen(arrests.cov)
str(arrests.eigen)
## List of 2
## $ values : num [1:4] 2.48 0.99 0.357 0.173
## $ vectors: num [1:4, 1:4] -0.536 -0.583 -0.278 -0.543 0.418 ...
## - attr(*, "class")= chr "eigen"
phi <- arrests.eigen$vectors[,1:2]
print(phi)
## [,1] [,2]
## [1,] -0.5358995 0.4181809
## [2,] -0.5831836 0.1879856
## [3,] -0.2781909 -0.8728062
## [4,] -0.5434321 -0.1673186
phi <- -1*phi
row.names(phi) <- c("Murder", "Assault", "UrbanPop", "Rape")
colnames(phi) <- c("PC1", "PC2")
kbl(phi) %>%
kable_styling(bootstrap_options = c("striped", "hover"))
|
|
PC1
|
PC2
|
|
Murder
|
0.5358995
|
-0.4181809
|
|
Assault
|
0.5831836
|
-0.1879856
|
|
UrbanPop
|
0.2781909
|
0.8728062
|
|
Rape
|
0.5434321
|
0.1673186
|
PC1 <- as.matrix(scaled_df) %*% phi[,1]
PC2 <- as.matrix(scaled_df) %*% phi[,2]
PC <- data.frame(State = row.names(USArrests), PC1, PC2)
head(PC)
## State PC1 PC2
## 1 Alabama 0.9756604 -1.1220012
## 2 Alaska 1.9305379 -1.0624269
## 3 Arizona 1.7454429 0.7384595
## 4 Arkansas -0.1399989 -1.1085423
## 5 California 2.4986128 1.5274267
## 6 Colorado 1.4993407 0.9776297
ggplot(PC, aes(PC1, PC2)) +
modelr::geom_ref_line(h = 0) +
modelr::geom_ref_line(v = 0) +
geom_text(aes(label = State), size = 3) +
xlab("First Principal Component") +
ylab("Second Principal Component") +
ggtitle("First Two Principal Components of USArrests Data")

PVE <- arrests.eigen$values / sum(arrests.eigen$values)
round(PVE, 2)
## [1] 0.62 0.25 0.09 0.04
PVEplot <- qplot(c(1:4), PVE) +
geom_line() +
xlab("Principal Component") +
ylab("PVE") +
ggtitle("Scree Plot") +
ylim(0,1)
## Warning: `qplot()` was deprecated in ggplot2 3.4.0.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
cumPVE <- qplot(c(1:4), cumsum(PVE)) +
geom_line() +
xlab("Principal Component") +
ylab(NULL) +
ggtitle("Cumulative Scree Plot") +
ylim(0,1)
grid.arrange(PVEplot, cumPVE, ncol = 2)

pca_result <- prcomp(USArrests, scale = TRUE)
pca_result$rotation <- -pca_result$rotation
pca_result$x <- -pca_result$x
biplot(pca_result, scale = 0)

library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(dplyr)
library(ISLR)
select <- dplyr::select
train = Smarket %>%
filter(Year < 2005)
test = Smarket %>%
filter(Year >= 2005)
model_LDA = lda(Direction ~ Lag1 + Lag2, data = train)
print(model_LDA)
## Call:
## lda(Direction ~ Lag1 + Lag2, data = train)
##
## Prior probabilities of groups:
## Down Up
## 0.491984 0.508016
##
## Group means:
## Lag1 Lag2
## Down 0.04279022 0.03389409
## Up -0.03954635 -0.03132544
##
## Coefficients of linear discriminants:
## LD1
## Lag1 -0.6420190
## Lag2 -0.5135293
plot(model_LDA)

predictions_LDA = data.frame(predict(model_LDA, test))
names(predictions_LDA)
## [1] "class" "posterior.Down" "posterior.Up" "LD1"
predictions_LDA = cbind(test, predictions_LDA)
predictions_LDA %>%
count(class, Direction)
## class Direction n
## 1 Down Down 35
## 2 Down Up 35
## 3 Up Down 76
## 4 Up Up 106
predictions_LDA %>%
summarize(score = mean(class == Direction))
## score
## 1 0.5595238
model_logistic = glm(Direction ~ Lag1 + Lag2, data = train, family = binomial)
logistic_probs = data.frame(probs = predict(model_logistic, test, type = "response"))
predictions_logistic = logistic_probs %>%
mutate(class = ifelse(probs>.5, "Up", "Down"))
predictions_logistic = cbind(test, predictions_logistic)
predictions_logistic %>%
count(class, Direction)
## class Direction n
## 1 Down Down 35
## 2 Down Up 35
## 3 Up Down 76
## 4 Up Up 106
predictions_logistic %>%
summarize(score = mean(class == Direction))
## score
## 1 0.5595238
model_QDA = qda(Direction ~ Lag1 + Lag2, data = train)
model_QDA
## Call:
## qda(Direction ~ Lag1 + Lag2, data = train)
##
## Prior probabilities of groups:
## Down Up
## 0.491984 0.508016
##
## Group means:
## Lag1 Lag2
## Down 0.04279022 0.03389409
## Up -0.03954635 -0.03132544
predictions_QDA = data.frame(predict(model_QDA, test))
predictions_QDA = cbind(test, predictions_QDA)
predictions_QDA %>%
count(class, Direction)
## class Direction n
## 1 Down Down 30
## 2 Down Up 20
## 3 Up Down 81
## 4 Up Up 121
predictions_QDA %>%
summarize(score = mean(class == Direction))
## score
## 1 0.5992063
summary(Carseats)
## Sales CompPrice Income Advertising
## Min. : 0.000 Min. : 77 Min. : 21.00 Min. : 0.000
## 1st Qu.: 5.390 1st Qu.:115 1st Qu.: 42.75 1st Qu.: 0.000
## Median : 7.490 Median :125 Median : 69.00 Median : 5.000
## Mean : 7.496 Mean :125 Mean : 68.66 Mean : 6.635
## 3rd Qu.: 9.320 3rd Qu.:135 3rd Qu.: 91.00 3rd Qu.:12.000
## Max. :16.270 Max. :175 Max. :120.00 Max. :29.000
## Population Price ShelveLoc Age Education
## Min. : 10.0 Min. : 24.0 Bad : 96 Min. :25.00 Min. :10.0
## 1st Qu.:139.0 1st Qu.:100.0 Good : 85 1st Qu.:39.75 1st Qu.:12.0
## Median :272.0 Median :117.0 Medium:219 Median :54.50 Median :14.0
## Mean :264.8 Mean :115.8 Mean :53.32 Mean :13.9
## 3rd Qu.:398.5 3rd Qu.:131.0 3rd Qu.:66.00 3rd Qu.:16.0
## Max. :509.0 Max. :191.0 Max. :80.00 Max. :18.0
## Urban US
## No :118 No :142
## Yes:282 Yes:258
##
##
##
##
library(dplyr)
Carseats_scaled <- Carseats %>%
relocate(ShelveLoc, .after = Education) %>%
mutate(across(where(is.numeric), scale))
str(Carseats_scaled)
## 'data.frame': 400 obs. of 11 variables:
## $ Sales : num [1:400, 1] 0.7095 1.3185 0.9078 -0.0341 -1.1849 ...
## ..- attr(*, "scaled:center")= num 7.5
## ..- attr(*, "scaled:scale")= num 2.82
## $ CompPrice : num [1:400, 1] 0.849 -0.911 -0.781 -0.52 1.045 ...
## ..- attr(*, "scaled:center")= num 125
## ..- attr(*, "scaled:scale")= num 15.3
## $ Income : num [1:400, 1] 0.155 -0.738 -1.203 1.12 -0.166 ...
## ..- attr(*, "scaled:center")= num 68.7
## ..- attr(*, "scaled:scale")= num 28
## $ Advertising: num [1:400, 1] 0.656 1.408 0.506 -0.396 -0.547 ...
## ..- attr(*, "scaled:center")= num 6.63
## ..- attr(*, "scaled:scale")= num 6.65
## $ Population : num [1:400, 1] 0.0757 -0.0328 0.0282 1.3649 0.51 ...
## ..- attr(*, "scaled:center")= num 265
## ..- attr(*, "scaled:scale")= num 147
## $ Price : num [1:400, 1] 0.178 -1.385 -1.512 -0.794 0.515 ...
## ..- attr(*, "scaled:center")= num 116
## ..- attr(*, "scaled:scale")= num 23.7
## $ Age : num [1:400, 1] -0.699 0.721 0.35 0.104 -0.946 ...
## ..- attr(*, "scaled:center")= num 53.3
## ..- attr(*, "scaled:scale")= num 16.2
## $ Education : num [1:400, 1] 1.183 -1.4882 -0.725 0.0382 -0.3434 ...
## ..- attr(*, "scaled:center")= num 13.9
## ..- attr(*, "scaled:scale")= num 2.62
## $ ShelveLoc : Factor w/ 3 levels "Bad","Good","Medium": 1 2 3 3 1 1 3 2 3 3 ...
## $ Urban : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 1 2 2 1 1 ...
## $ US : Factor w/ 2 levels "No","Yes": 2 2 2 2 1 2 1 2 1 2 ...
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
set.seed(123)
t.idx <- createDataPartition(Carseats_scaled$ShelveLoc, p = 0.7, list = FALSE)
train_carseats <- Carseats_scaled[t.idx,]
test_carseats <- Carseats_scaled[-t.idx,]
model_LDA1 <- lda(ShelveLoc ~ ., data = train_carseats)
print(model_LDA1)
## Call:
## lda(ShelveLoc ~ ., data = train_carseats)
##
## Prior probabilities of groups:
## Bad Good Medium
## 0.2411348 0.2127660 0.5460993
##
## Group means:
## Sales CompPrice Income Advertising Population Price
## Bad -0.69063852 -0.19592484 0.229783710 -0.06894802 0.02573242 -0.13233432
## Good 0.98762797 0.12988132 -0.005032271 0.14009258 0.03987521 0.12832607
## Medium -0.08260041 -0.02462403 -0.083124710 0.03047387 0.03078256 -0.01136249
## Age Education UrbanYes USYes
## Bad -0.15697788 0.09427825 0.7794118 0.6029412
## Good -0.06620249 -0.06360041 0.6666667 0.7500000
## Medium 0.08430782 0.05798375 0.7272727 0.6363636
##
## Coefficients of linear discriminants:
## LD1 LD2
## Sales 2.71880530 -0.1423670
## CompPrice -1.24362409 0.2527614
## Income -0.48513557 -0.7345195
## Advertising -0.81733638 0.3394201
## Population 0.02168590 -0.0236600
## Price 2.14173028 -0.1795976
## Age 0.80586756 0.6475076
## Education 0.03677347 0.0136664
## UrbanYes -0.33391364 -0.2404916
## USYes 0.37937261 -0.5681783
##
## Proportion of trace:
## LD1 LD2
## 0.9899 0.0101
plot(model_LDA1)

lda_pred <- predict(model_LDA1, newdata = test_carseats)
table(Predict = lda_pred$class, Actual = test_carseats$ShelveLoc)
## Actual
## Predict Bad Good Medium
## Bad 20 0 5
## Good 0 18 1
## Medium 8 7 59
predictions_LDA = cbind(test_carseats, lda_pred)
predictions_LDA %>%
count(class, ShelveLoc)
## class ShelveLoc n
## 1 Bad Bad 20
## 2 Bad Medium 5
## 3 Good Good 18
## 4 Good Medium 1
## 5 Medium Bad 8
## 6 Medium Good 7
## 7 Medium Medium 59
predictions_LDA %>%
summarize(score = mean(class == ShelveLoc))
## score
## 1 0.8220339
model_QDA1 <- qda(ShelveLoc ~ ., data = train_carseats)
print(model_QDA1)
## Call:
## qda(ShelveLoc ~ ., data = train_carseats)
##
## Prior probabilities of groups:
## Bad Good Medium
## 0.2411348 0.2127660 0.5460993
##
## Group means:
## Sales CompPrice Income Advertising Population Price
## Bad -0.69063852 -0.19592484 0.229783710 -0.06894802 0.02573242 -0.13233432
## Good 0.98762797 0.12988132 -0.005032271 0.14009258 0.03987521 0.12832607
## Medium -0.08260041 -0.02462403 -0.083124710 0.03047387 0.03078256 -0.01136249
## Age Education UrbanYes USYes
## Bad -0.15697788 0.09427825 0.7794118 0.6029412
## Good -0.06620249 -0.06360041 0.6666667 0.7500000
## Medium 0.08430782 0.05798375 0.7272727 0.6363636
qda_pred = predict(model_QDA1, newdata = test_carseats)
predictions_QDA = cbind(test_carseats, qda_pred)
predictions_QDA %>%
count(class, ShelveLoc)
## class ShelveLoc n
## 1 Bad Bad 17
## 2 Bad Medium 6
## 3 Good Good 19
## 4 Good Medium 1
## 5 Medium Bad 11
## 6 Medium Good 6
## 7 Medium Medium 58
predictions_QDA %>%
summarize(score = mean(class == ShelveLoc))
## score
## 1 0.7966102
table(Predict = qda_pred$class, Actual = test_carseats$ShelveLoc)
## Actual
## Predict Bad Good Medium
## Bad 17 0 6
## Good 0 19 1
## Medium 11 6 58