library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.2
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows
data("USArrests")
dt = head(USArrests, 10)
kbl(dt) %>%
  kable_minimal()
Murder Assault UrbanPop Rape
Alabama 13.2 236 58 21.2
Alaska 10.0 263 48 44.5
Arizona 8.1 294 80 31.0
Arkansas 8.8 190 50 19.5
California 9.0 276 91 40.6
Colorado 7.9 204 78 38.7
Connecticut 3.3 110 77 11.1
Delaware 5.9 238 72 15.8
Florida 15.4 335 80 31.9
Georgia 17.4 211 60 25.8
dt = apply(USArrests, 2, var)
kbl(dt) %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
x
Murder 18.97047
Assault 6945.16571
UrbanPop 209.51878
Rape 87.72916
scaled_df <- apply(USArrests, 2, scale)
dt = head(scaled_df)
kbl(dt) %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Murder Assault UrbanPop Rape
1.2425641 0.7828393 -0.5209066 -0.0034165
0.5078625 1.1068225 -1.2117642 2.4842029
0.0716334 1.4788032 0.9989801 1.0428784
0.2323494 0.2308680 -1.0735927 -0.1849166
0.2782682 1.2628144 1.7589234 2.0678203
0.0257146 0.3988593 0.8608085 1.8649672
arrests.cov <- cov(scaled_df)
arrests.eigen <- eigen(arrests.cov)
str(arrests.eigen)
## List of 2
##  $ values : num [1:4] 2.48 0.99 0.357 0.173
##  $ vectors: num [1:4, 1:4] -0.536 -0.583 -0.278 -0.543 0.418 ...
##  - attr(*, "class")= chr "eigen"
phi <- arrests.eigen$vectors[,1:2]
print(phi)
##            [,1]       [,2]
## [1,] -0.5358995  0.4181809
## [2,] -0.5831836  0.1879856
## [3,] -0.2781909 -0.8728062
## [4,] -0.5434321 -0.1673186
phi <- -1*phi
row.names(phi) <- c("Murder", "Assault", "UrbanPop", "Rape")
colnames(phi) <- c("PC1", "PC2")
kbl(phi) %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
PC1 PC2
Murder 0.5358995 -0.4181809
Assault 0.5831836 -0.1879856
UrbanPop 0.2781909 0.8728062
Rape 0.5434321 0.1673186
PC1 <- as.matrix(scaled_df) %*% phi[,1]
PC2 <- as.matrix(scaled_df) %*% phi[,2]

PC <- data.frame(State = row.names(USArrests), PC1, PC2)
head(PC)
##        State        PC1        PC2
## 1    Alabama  0.9756604 -1.1220012
## 2     Alaska  1.9305379 -1.0624269
## 3    Arizona  1.7454429  0.7384595
## 4   Arkansas -0.1399989 -1.1085423
## 5 California  2.4986128  1.5274267
## 6   Colorado  1.4993407  0.9776297
ggplot(PC, aes(PC1, PC2)) +
  modelr::geom_ref_line(h = 0) +
  modelr::geom_ref_line(v = 0) +
  geom_text(aes(label = State), size = 3) +
  xlab("First Principal Component") +
  ylab("Second Principal Component") +
  ggtitle("First Two Principal Components of USArrests Data")

PVE <- arrests.eigen$values / sum(arrests.eigen$values)
round(PVE, 2)
## [1] 0.62 0.25 0.09 0.04
PVEplot <- qplot(c(1:4), PVE) +
  geom_line() +
  xlab("Principal Component") +
  ylab("PVE") +
  ggtitle("Scree Plot") +
  ylim(0,1)
## Warning: `qplot()` was deprecated in ggplot2 3.4.0.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
cumPVE <- qplot(c(1:4), cumsum(PVE)) +
  geom_line() +
  xlab("Principal Component") +
  ylab(NULL) +
  ggtitle("Cumulative Scree Plot") +
  ylim(0,1)

grid.arrange(PVEplot, cumPVE, ncol = 2)

pca_result <- prcomp(USArrests, scale = TRUE)
pca_result$rotation <- -pca_result$rotation
pca_result$x <- -pca_result$x
biplot(pca_result, scale = 0)

library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
library(dplyr)
library(ISLR)
select <- dplyr::select
train = Smarket %>%
  filter(Year < 2005)

test = Smarket %>%
  filter(Year >= 2005)

model_LDA = lda(Direction ~ Lag1 + Lag2, data = train)
print(model_LDA)
## Call:
## lda(Direction ~ Lag1 + Lag2, data = train)
## 
## Prior probabilities of groups:
##     Down       Up 
## 0.491984 0.508016 
## 
## Group means:
##             Lag1        Lag2
## Down  0.04279022  0.03389409
## Up   -0.03954635 -0.03132544
## 
## Coefficients of linear discriminants:
##             LD1
## Lag1 -0.6420190
## Lag2 -0.5135293
plot(model_LDA)

predictions_LDA = data.frame(predict(model_LDA, test))
names(predictions_LDA)
## [1] "class"          "posterior.Down" "posterior.Up"   "LD1"
predictions_LDA = cbind(test, predictions_LDA)

predictions_LDA %>%
  count(class, Direction)
##   class Direction   n
## 1  Down      Down  35
## 2  Down        Up  35
## 3    Up      Down  76
## 4    Up        Up 106
predictions_LDA %>%
  summarize(score = mean(class == Direction))
##       score
## 1 0.5595238
model_logistic = glm(Direction ~ Lag1 + Lag2, data = train, family = binomial)

logistic_probs = data.frame(probs = predict(model_logistic, test, type = "response"))

predictions_logistic = logistic_probs %>%
  mutate(class = ifelse(probs>.5, "Up", "Down"))

predictions_logistic = cbind(test, predictions_logistic)

predictions_logistic %>%
  count(class, Direction)
##   class Direction   n
## 1  Down      Down  35
## 2  Down        Up  35
## 3    Up      Down  76
## 4    Up        Up 106
predictions_logistic %>%
  summarize(score = mean(class == Direction))
##       score
## 1 0.5595238
model_QDA = qda(Direction ~ Lag1 + Lag2, data = train)
model_QDA
## Call:
## qda(Direction ~ Lag1 + Lag2, data = train)
## 
## Prior probabilities of groups:
##     Down       Up 
## 0.491984 0.508016 
## 
## Group means:
##             Lag1        Lag2
## Down  0.04279022  0.03389409
## Up   -0.03954635 -0.03132544
predictions_QDA = data.frame(predict(model_QDA, test))

predictions_QDA = cbind(test, predictions_QDA)

predictions_QDA %>%
  count(class, Direction)
##   class Direction   n
## 1  Down      Down  30
## 2  Down        Up  20
## 3    Up      Down  81
## 4    Up        Up 121
predictions_QDA %>%
  summarize(score = mean(class == Direction))
##       score
## 1 0.5992063
summary(Carseats)
##      Sales          CompPrice       Income        Advertising    
##  Min.   : 0.000   Min.   : 77   Min.   : 21.00   Min.   : 0.000  
##  1st Qu.: 5.390   1st Qu.:115   1st Qu.: 42.75   1st Qu.: 0.000  
##  Median : 7.490   Median :125   Median : 69.00   Median : 5.000  
##  Mean   : 7.496   Mean   :125   Mean   : 68.66   Mean   : 6.635  
##  3rd Qu.: 9.320   3rd Qu.:135   3rd Qu.: 91.00   3rd Qu.:12.000  
##  Max.   :16.270   Max.   :175   Max.   :120.00   Max.   :29.000  
##    Population        Price        ShelveLoc        Age          Education   
##  Min.   : 10.0   Min.   : 24.0   Bad   : 96   Min.   :25.00   Min.   :10.0  
##  1st Qu.:139.0   1st Qu.:100.0   Good  : 85   1st Qu.:39.75   1st Qu.:12.0  
##  Median :272.0   Median :117.0   Medium:219   Median :54.50   Median :14.0  
##  Mean   :264.8   Mean   :115.8                Mean   :53.32   Mean   :13.9  
##  3rd Qu.:398.5   3rd Qu.:131.0                3rd Qu.:66.00   3rd Qu.:16.0  
##  Max.   :509.0   Max.   :191.0                Max.   :80.00   Max.   :18.0  
##  Urban       US     
##  No :118   No :142  
##  Yes:282   Yes:258  
##                     
##                     
##                     
## 
library(dplyr)
Carseats_scaled <- Carseats %>%
  relocate(ShelveLoc, .after = Education) %>%
  mutate(across(where(is.numeric), scale))
str(Carseats_scaled)
## 'data.frame':    400 obs. of  11 variables:
##  $ Sales      : num [1:400, 1] 0.7095 1.3185 0.9078 -0.0341 -1.1849 ...
##   ..- attr(*, "scaled:center")= num 7.5
##   ..- attr(*, "scaled:scale")= num 2.82
##  $ CompPrice  : num [1:400, 1] 0.849 -0.911 -0.781 -0.52 1.045 ...
##   ..- attr(*, "scaled:center")= num 125
##   ..- attr(*, "scaled:scale")= num 15.3
##  $ Income     : num [1:400, 1] 0.155 -0.738 -1.203 1.12 -0.166 ...
##   ..- attr(*, "scaled:center")= num 68.7
##   ..- attr(*, "scaled:scale")= num 28
##  $ Advertising: num [1:400, 1] 0.656 1.408 0.506 -0.396 -0.547 ...
##   ..- attr(*, "scaled:center")= num 6.63
##   ..- attr(*, "scaled:scale")= num 6.65
##  $ Population : num [1:400, 1] 0.0757 -0.0328 0.0282 1.3649 0.51 ...
##   ..- attr(*, "scaled:center")= num 265
##   ..- attr(*, "scaled:scale")= num 147
##  $ Price      : num [1:400, 1] 0.178 -1.385 -1.512 -0.794 0.515 ...
##   ..- attr(*, "scaled:center")= num 116
##   ..- attr(*, "scaled:scale")= num 23.7
##  $ Age        : num [1:400, 1] -0.699 0.721 0.35 0.104 -0.946 ...
##   ..- attr(*, "scaled:center")= num 53.3
##   ..- attr(*, "scaled:scale")= num 16.2
##  $ Education  : num [1:400, 1] 1.183 -1.4882 -0.725 0.0382 -0.3434 ...
##   ..- attr(*, "scaled:center")= num 13.9
##   ..- attr(*, "scaled:scale")= num 2.62
##  $ ShelveLoc  : Factor w/ 3 levels "Bad","Good","Medium": 1 2 3 3 1 1 3 2 3 3 ...
##  $ Urban      : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 1 2 2 1 1 ...
##  $ US         : Factor w/ 2 levels "No","Yes": 2 2 2 2 1 2 1 2 1 2 ...
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
set.seed(123)
t.idx <- createDataPartition(Carseats_scaled$ShelveLoc, p = 0.7, list = FALSE)
train_carseats <- Carseats_scaled[t.idx,]
test_carseats <- Carseats_scaled[-t.idx,]
model_LDA1 <- lda(ShelveLoc ~ ., data = train_carseats)
print(model_LDA1)
## Call:
## lda(ShelveLoc ~ ., data = train_carseats)
## 
## Prior probabilities of groups:
##       Bad      Good    Medium 
## 0.2411348 0.2127660 0.5460993 
## 
## Group means:
##              Sales   CompPrice       Income Advertising Population       Price
## Bad    -0.69063852 -0.19592484  0.229783710 -0.06894802 0.02573242 -0.13233432
## Good    0.98762797  0.12988132 -0.005032271  0.14009258 0.03987521  0.12832607
## Medium -0.08260041 -0.02462403 -0.083124710  0.03047387 0.03078256 -0.01136249
##                Age   Education  UrbanYes     USYes
## Bad    -0.15697788  0.09427825 0.7794118 0.6029412
## Good   -0.06620249 -0.06360041 0.6666667 0.7500000
## Medium  0.08430782  0.05798375 0.7272727 0.6363636
## 
## Coefficients of linear discriminants:
##                     LD1        LD2
## Sales        2.71880530 -0.1423670
## CompPrice   -1.24362409  0.2527614
## Income      -0.48513557 -0.7345195
## Advertising -0.81733638  0.3394201
## Population   0.02168590 -0.0236600
## Price        2.14173028 -0.1795976
## Age          0.80586756  0.6475076
## Education    0.03677347  0.0136664
## UrbanYes    -0.33391364 -0.2404916
## USYes        0.37937261 -0.5681783
## 
## Proportion of trace:
##    LD1    LD2 
## 0.9899 0.0101
plot(model_LDA1)

lda_pred <- predict(model_LDA1, newdata = test_carseats)
table(Predict = lda_pred$class, Actual = test_carseats$ShelveLoc)
##         Actual
## Predict  Bad Good Medium
##   Bad     20    0      5
##   Good     0   18      1
##   Medium   8    7     59
predictions_LDA = cbind(test_carseats, lda_pred)

predictions_LDA %>%
  count(class, ShelveLoc)
##    class ShelveLoc  n
## 1    Bad       Bad 20
## 2    Bad    Medium  5
## 3   Good      Good 18
## 4   Good    Medium  1
## 5 Medium       Bad  8
## 6 Medium      Good  7
## 7 Medium    Medium 59
predictions_LDA %>%
  summarize(score = mean(class == ShelveLoc))
##       score
## 1 0.8220339
model_QDA1 <- qda(ShelveLoc ~ ., data = train_carseats)
print(model_QDA1)
## Call:
## qda(ShelveLoc ~ ., data = train_carseats)
## 
## Prior probabilities of groups:
##       Bad      Good    Medium 
## 0.2411348 0.2127660 0.5460993 
## 
## Group means:
##              Sales   CompPrice       Income Advertising Population       Price
## Bad    -0.69063852 -0.19592484  0.229783710 -0.06894802 0.02573242 -0.13233432
## Good    0.98762797  0.12988132 -0.005032271  0.14009258 0.03987521  0.12832607
## Medium -0.08260041 -0.02462403 -0.083124710  0.03047387 0.03078256 -0.01136249
##                Age   Education  UrbanYes     USYes
## Bad    -0.15697788  0.09427825 0.7794118 0.6029412
## Good   -0.06620249 -0.06360041 0.6666667 0.7500000
## Medium  0.08430782  0.05798375 0.7272727 0.6363636
qda_pred = predict(model_QDA1, newdata = test_carseats)

predictions_QDA = cbind(test_carseats, qda_pred)

predictions_QDA %>%
  count(class, ShelveLoc)
##    class ShelveLoc  n
## 1    Bad       Bad 17
## 2    Bad    Medium  6
## 3   Good      Good 19
## 4   Good    Medium  1
## 5 Medium       Bad 11
## 6 Medium      Good  6
## 7 Medium    Medium 58
predictions_QDA %>%
  summarize(score = mean(class == ShelveLoc))
##       score
## 1 0.7966102
table(Predict = qda_pred$class, Actual = test_carseats$ShelveLoc)
##         Actual
## Predict  Bad Good Medium
##   Bad     17    0      6
##   Good     0   19      1
##   Medium  11    6     58