Phil Gonzalez_D590 ADS_Week 7 Coding Practice

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.2
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(gridExtra)

## 
## Attaching package: 'gridExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine

library(kableExtra)

## 
## Attaching package: 'kableExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows

data("USArrests")
dt = head(USArrests, 10)
kbl(dt) %>%
  kable_minimal()

	Murder	Assault	UrbanPop	Rape
Alabama	13.2	236	58	21.2
Alaska	10.0	263	48	44.5
Arizona	8.1	294	80	31.0
Arkansas	8.8	190	50	19.5
California	9.0	276	91	40.6
Colorado	7.9	204	78	38.7
Connecticut	3.3	110	77	11.1
Delaware	5.9	238	72	15.8
Florida	15.4	335	80	31.9
Georgia	17.4	211	60	25.8

dt = apply(USArrests, 2, var)
kbl(dt) %>%
  kable_styling(bootstrap_options = c("striped", "hover"))

	x
Murder	18.97047
Assault	6945.16571
UrbanPop	209.51878
Rape	87.72916

scaled_df <- apply(USArrests, 2, scale)
dt = head(scaled_df)
kbl(dt) %>%
  kable_styling(bootstrap_options = c("striped", "hover"))

Murder	Assault	UrbanPop	Rape
1.2425641	0.7828393	-0.5209066	-0.0034165
0.5078625	1.1068225	-1.2117642	2.4842029
0.0716334	1.4788032	0.9989801	1.0428784
0.2323494	0.2308680	-1.0735927	-0.1849166
0.2782682	1.2628144	1.7589234	2.0678203
0.0257146	0.3988593	0.8608085	1.8649672

arrests.cov <- cov(scaled_df)
arrests.eigen <- eigen(arrests.cov)
str(arrests.eigen)

## List of 2
##  $ values : num [1:4] 2.48 0.99 0.357 0.173
##  $ vectors: num [1:4, 1:4] -0.536 -0.583 -0.278 -0.543 0.418 ...
##  - attr(*, "class")= chr "eigen"

phi <- arrests.eigen$vectors[,1:2]
print(phi)

##            [,1]       [,2]
## [1,] -0.5358995  0.4181809
## [2,] -0.5831836  0.1879856
## [3,] -0.2781909 -0.8728062
## [4,] -0.5434321 -0.1673186

phi <- -1*phi
row.names(phi) <- c("Murder", "Assault", "UrbanPop", "Rape")
colnames(phi) <- c("PC1", "PC2")
kbl(phi) %>%
  kable_styling(bootstrap_options = c("striped", "hover"))

	PC1	PC2
Murder	0.5358995	-0.4181809
Assault	0.5831836	-0.1879856
UrbanPop	0.2781909	0.8728062
Rape	0.5434321	0.1673186

PC1 <- as.matrix(scaled_df) %*% phi[,1]
PC2 <- as.matrix(scaled_df) %*% phi[,2]

PC <- data.frame(State = row.names(USArrests), PC1, PC2)
head(PC)

##        State        PC1        PC2
## 1    Alabama  0.9756604 -1.1220012
## 2     Alaska  1.9305379 -1.0624269
## 3    Arizona  1.7454429  0.7384595
## 4   Arkansas -0.1399989 -1.1085423
## 5 California  2.4986128  1.5274267
## 6   Colorado  1.4993407  0.9776297

ggplot(PC, aes(PC1, PC2)) +
  modelr::geom_ref_line(h = 0) +
  modelr::geom_ref_line(v = 0) +
  geom_text(aes(label = State), size = 3) +
  xlab("First Principal Component") +
  ylab("Second Principal Component") +
  ggtitle("First Two Principal Components of USArrests Data")

PVE <- arrests.eigen$values / sum(arrests.eigen$values)
round(PVE, 2)

## [1] 0.62 0.25 0.09 0.04

PVEplot <- qplot(c(1:4), PVE) +
  geom_line() +
  xlab("Principal Component") +
  ylab("PVE") +
  ggtitle("Scree Plot") +
  ylim(0,1)

## Warning: `qplot()` was deprecated in ggplot2 3.4.0.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

cumPVE <- qplot(c(1:4), cumsum(PVE)) +
  geom_line() +
  xlab("Principal Component") +
  ylab(NULL) +
  ggtitle("Cumulative Scree Plot") +
  ylim(0,1)

grid.arrange(PVEplot, cumPVE, ncol = 2)

pca_result <- prcomp(USArrests, scale = TRUE)
pca_result$rotation <- -pca_result$rotation
pca_result$x <- -pca_result$x

biplot(pca_result, scale = 0)

library(MASS)

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

library(dplyr)
library(ISLR)
select <- dplyr::select

train = Smarket %>%
  filter(Year < 2005)

test = Smarket %>%
  filter(Year >= 2005)

model_LDA = lda(Direction ~ Lag1 + Lag2, data = train)
print(model_LDA)

## Call:
## lda(Direction ~ Lag1 + Lag2, data = train)
## 
## Prior probabilities of groups:
##     Down       Up 
## 0.491984 0.508016 
## 
## Group means:
##             Lag1        Lag2
## Down  0.04279022  0.03389409
## Up   -0.03954635 -0.03132544
## 
## Coefficients of linear discriminants:
##             LD1
## Lag1 -0.6420190
## Lag2 -0.5135293

plot(model_LDA)

predictions_LDA = data.frame(predict(model_LDA, test))
names(predictions_LDA)

## [1] "class"          "posterior.Down" "posterior.Up"   "LD1"

predictions_LDA = cbind(test, predictions_LDA)

predictions_LDA %>%
  count(class, Direction)

##   class Direction   n
## 1  Down      Down  35
## 2  Down        Up  35
## 3    Up      Down  76
## 4    Up        Up 106

predictions_LDA %>%
  summarize(score = mean(class == Direction))

##       score
## 1 0.5595238

model_logistic = glm(Direction ~ Lag1 + Lag2, data = train, family = binomial)

logistic_probs = data.frame(probs = predict(model_logistic, test, type = "response"))

predictions_logistic = logistic_probs %>%
  mutate(class = ifelse(probs>.5, "Up", "Down"))

predictions_logistic = cbind(test, predictions_logistic)

predictions_logistic %>%
  count(class, Direction)

##   class Direction   n
## 1  Down      Down  35
## 2  Down        Up  35
## 3    Up      Down  76
## 4    Up        Up 106

predictions_logistic %>%
  summarize(score = mean(class == Direction))

##       score
## 1 0.5595238

model_QDA = qda(Direction ~ Lag1 + Lag2, data = train)
model_QDA

## Call:
## qda(Direction ~ Lag1 + Lag2, data = train)
## 
## Prior probabilities of groups:
##     Down       Up 
## 0.491984 0.508016 
## 
## Group means:
##             Lag1        Lag2
## Down  0.04279022  0.03389409
## Up   -0.03954635 -0.03132544

predictions_QDA = data.frame(predict(model_QDA, test))

predictions_QDA = cbind(test, predictions_QDA)

predictions_QDA %>%
  count(class, Direction)

##   class Direction   n
## 1  Down      Down  30
## 2  Down        Up  20
## 3    Up      Down  81
## 4    Up        Up 121

predictions_QDA %>%
  summarize(score = mean(class == Direction))

##       score
## 1 0.5992063

summary(Carseats)

##      Sales          CompPrice       Income        Advertising    
##  Min.   : 0.000   Min.   : 77   Min.   : 21.00   Min.   : 0.000  
##  1st Qu.: 5.390   1st Qu.:115   1st Qu.: 42.75   1st Qu.: 0.000  
##  Median : 7.490   Median :125   Median : 69.00   Median : 5.000  
##  Mean   : 7.496   Mean   :125   Mean   : 68.66   Mean   : 6.635  
##  3rd Qu.: 9.320   3rd Qu.:135   3rd Qu.: 91.00   3rd Qu.:12.000  
##  Max.   :16.270   Max.   :175   Max.   :120.00   Max.   :29.000  
##    Population        Price        ShelveLoc        Age          Education   
##  Min.   : 10.0   Min.   : 24.0   Bad   : 96   Min.   :25.00   Min.   :10.0  
##  1st Qu.:139.0   1st Qu.:100.0   Good  : 85   1st Qu.:39.75   1st Qu.:12.0  
##  Median :272.0   Median :117.0   Medium:219   Median :54.50   Median :14.0  
##  Mean   :264.8   Mean   :115.8                Mean   :53.32   Mean   :13.9  
##  3rd Qu.:398.5   3rd Qu.:131.0                3rd Qu.:66.00   3rd Qu.:16.0  
##  Max.   :509.0   Max.   :191.0                Max.   :80.00   Max.   :18.0  
##  Urban       US     
##  No :118   No :142  
##  Yes:282   Yes:258  
##                     
##                     
##                     
##

library(dplyr)
Carseats_scaled <- Carseats %>%
  relocate(ShelveLoc, .after = Education) %>%
  mutate(across(where(is.numeric), scale))
str(Carseats_scaled)

## 'data.frame':    400 obs. of  11 variables:
##  $ Sales      : num [1:400, 1] 0.7095 1.3185 0.9078 -0.0341 -1.1849 ...
##   ..- attr(*, "scaled:center")= num 7.5
##   ..- attr(*, "scaled:scale")= num 2.82
##  $ CompPrice  : num [1:400, 1] 0.849 -0.911 -0.781 -0.52 1.045 ...
##   ..- attr(*, "scaled:center")= num 125
##   ..- attr(*, "scaled:scale")= num 15.3
##  $ Income     : num [1:400, 1] 0.155 -0.738 -1.203 1.12 -0.166 ...
##   ..- attr(*, "scaled:center")= num 68.7
##   ..- attr(*, "scaled:scale")= num 28
##  $ Advertising: num [1:400, 1] 0.656 1.408 0.506 -0.396 -0.547 ...
##   ..- attr(*, "scaled:center")= num 6.63
##   ..- attr(*, "scaled:scale")= num 6.65
##  $ Population : num [1:400, 1] 0.0757 -0.0328 0.0282 1.3649 0.51 ...
##   ..- attr(*, "scaled:center")= num 265
##   ..- attr(*, "scaled:scale")= num 147
##  $ Price      : num [1:400, 1] 0.178 -1.385 -1.512 -0.794 0.515 ...
##   ..- attr(*, "scaled:center")= num 116
##   ..- attr(*, "scaled:scale")= num 23.7
##  $ Age        : num [1:400, 1] -0.699 0.721 0.35 0.104 -0.946 ...
##   ..- attr(*, "scaled:center")= num 53.3
##   ..- attr(*, "scaled:scale")= num 16.2
##  $ Education  : num [1:400, 1] 1.183 -1.4882 -0.725 0.0382 -0.3434 ...
##   ..- attr(*, "scaled:center")= num 13.9
##   ..- attr(*, "scaled:scale")= num 2.62
##  $ ShelveLoc  : Factor w/ 3 levels "Bad","Good","Medium": 1 2 3 3 1 1 3 2 3 3 ...
##  $ Urban      : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 1 2 2 1 1 ...
##  $ US         : Factor w/ 2 levels "No","Yes": 2 2 2 2 1 2 1 2 1 2 ...

library(caret)

## Loading required package: lattice

## 
## Attaching package: 'caret'

## The following object is masked from 'package:purrr':
## 
##     lift

set.seed(123)
t.idx <- createDataPartition(Carseats_scaled$ShelveLoc, p = 0.7, list = FALSE)
train_carseats <- Carseats_scaled[t.idx,]
test_carseats <- Carseats_scaled[-t.idx,]

model_LDA1 <- lda(ShelveLoc ~ ., data = train_carseats)
print(model_LDA1)

## Call:
## lda(ShelveLoc ~ ., data = train_carseats)
## 
## Prior probabilities of groups:
##       Bad      Good    Medium 
## 0.2411348 0.2127660 0.5460993 
## 
## Group means:
##              Sales   CompPrice       Income Advertising Population       Price
## Bad    -0.69063852 -0.19592484  0.229783710 -0.06894802 0.02573242 -0.13233432
## Good    0.98762797  0.12988132 -0.005032271  0.14009258 0.03987521  0.12832607
## Medium -0.08260041 -0.02462403 -0.083124710  0.03047387 0.03078256 -0.01136249
##                Age   Education  UrbanYes     USYes
## Bad    -0.15697788  0.09427825 0.7794118 0.6029412
## Good   -0.06620249 -0.06360041 0.6666667 0.7500000
## Medium  0.08430782  0.05798375 0.7272727 0.6363636
## 
## Coefficients of linear discriminants:
##                     LD1        LD2
## Sales        2.71880530 -0.1423670
## CompPrice   -1.24362409  0.2527614
## Income      -0.48513557 -0.7345195
## Advertising -0.81733638  0.3394201
## Population   0.02168590 -0.0236600
## Price        2.14173028 -0.1795976
## Age          0.80586756  0.6475076
## Education    0.03677347  0.0136664
## UrbanYes    -0.33391364 -0.2404916
## USYes        0.37937261 -0.5681783
## 
## Proportion of trace:
##    LD1    LD2 
## 0.9899 0.0101

plot(model_LDA1)

lda_pred <- predict(model_LDA1, newdata = test_carseats)

table(Predict = lda_pred$class, Actual = test_carseats$ShelveLoc)

##         Actual
## Predict  Bad Good Medium
##   Bad     20    0      5
##   Good     0   18      1
##   Medium   8    7     59

predictions_LDA = cbind(test_carseats, lda_pred)

predictions_LDA %>%
  count(class, ShelveLoc)

##    class ShelveLoc  n
## 1    Bad       Bad 20
## 2    Bad    Medium  5
## 3   Good      Good 18
## 4   Good    Medium  1
## 5 Medium       Bad  8
## 6 Medium      Good  7
## 7 Medium    Medium 59

predictions_LDA %>%
  summarize(score = mean(class == ShelveLoc))

##       score
## 1 0.8220339

model_QDA1 <- qda(ShelveLoc ~ ., data = train_carseats)
print(model_QDA1)

## Call:
## qda(ShelveLoc ~ ., data = train_carseats)
## 
## Prior probabilities of groups:
##       Bad      Good    Medium 
## 0.2411348 0.2127660 0.5460993 
## 
## Group means:
##              Sales   CompPrice       Income Advertising Population       Price
## Bad    -0.69063852 -0.19592484  0.229783710 -0.06894802 0.02573242 -0.13233432
## Good    0.98762797  0.12988132 -0.005032271  0.14009258 0.03987521  0.12832607
## Medium -0.08260041 -0.02462403 -0.083124710  0.03047387 0.03078256 -0.01136249
##                Age   Education  UrbanYes     USYes
## Bad    -0.15697788  0.09427825 0.7794118 0.6029412
## Good   -0.06620249 -0.06360041 0.6666667 0.7500000
## Medium  0.08430782  0.05798375 0.7272727 0.6363636

qda_pred = predict(model_QDA1, newdata = test_carseats)

predictions_QDA = cbind(test_carseats, qda_pred)

predictions_QDA %>%
  count(class, ShelveLoc)

##    class ShelveLoc  n
## 1    Bad       Bad 17
## 2    Bad    Medium  6
## 3   Good      Good 19
## 4   Good    Medium  1
## 5 Medium       Bad 11
## 6 Medium      Good  6
## 7 Medium    Medium 58

predictions_QDA %>%
  summarize(score = mean(class == ShelveLoc))

##       score
## 1 0.7966102

table(Predict = qda_pred$class, Actual = test_carseats$ShelveLoc)

##         Actual
## Predict  Bad Good Medium
##   Bad     17    0      6
##   Good     0   19      1
##   Medium  11    6     58

Phil Gonzalez_D590 ADS_Week 7 Coding Practice_PCA

2025-10-13