#install.packages("tidyverse")
#install.packages("gridExtra")
library(tidyverse) # data manipulation and visualization
## Warning: package 'tidyverse' was built under R version 4.2.1
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.7 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.0
## ✔ readr 2.1.2 ✔ forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(gridExtra) # plot arrangement
## Warning: package 'gridExtra' was built under R version 4.2.1
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
data("USArrests")
head(USArrests, 10)
## Murder Assault UrbanPop Rape
## Alabama 13.2 236 58 21.2
## Alaska 10.0 263 48 44.5
## Arizona 8.1 294 80 31.0
## Arkansas 8.8 190 50 19.5
## California 9.0 276 91 40.6
## Colorado 7.9 204 78 38.7
## Connecticut 3.3 110 77 11.1
## Delaware 5.9 238 72 15.8
## Florida 15.4 335 80 31.9
## Georgia 17.4 211 60 25.8
apply(USArrests, 2, var)
## Murder Assault UrbanPop Rape
## 18.97047 6945.16571 209.51878 87.72916
scaled_df <- apply(USArrests, 2, scale)
head(scaled_df)
## Murder Assault UrbanPop Rape
## [1,] 1.24256408 0.7828393 -0.5209066 -0.003416473
## [2,] 0.50786248 1.1068225 -1.2117642 2.484202941
## [3,] 0.07163341 1.4788032 0.9989801 1.042878388
## [4,] 0.23234938 0.2308680 -1.0735927 -0.184916602
## [5,] 0.27826823 1.2628144 1.7589234 2.067820292
## [6,] 0.02571456 0.3988593 0.8608085 1.864967207
arrests.cov <- cov(scaled_df)
arrests.eigen <- eigen(arrests.cov)
str(arrests.eigen)
## List of 2
## $ values : num [1:4] 2.48 0.99 0.357 0.173
## $ vectors: num [1:4, 1:4] -0.536 -0.583 -0.278 -0.543 0.418 ...
## - attr(*, "class")= chr "eigen"
(phi <- arrests.eigen$vectors[,1:2])
## [,1] [,2]
## [1,] -0.5358995 0.4181809
## [2,] -0.5831836 0.1879856
## [3,] -0.2781909 -0.8728062
## [4,] -0.5434321 -0.1673186
phi <- -phi
row.names(phi) <- c("Murder", "Assault", "UrbanPop", "Rape")
colnames(phi) <- c("PC1", "PC2")
phi
## PC1 PC2
## Murder 0.5358995 -0.4181809
## Assault 0.5831836 -0.1879856
## UrbanPop 0.2781909 0.8728062
## Rape 0.5434321 0.1673186
PC1 <- as.matrix(scaled_df) %*% phi[,1]
PC2 <- as.matrix(scaled_df) %*% phi[,2]
PC <- data.frame(State = row.names(USArrests), PC1, PC2)
head(PC)
## State PC1 PC2
## 1 Alabama 0.9756604 -1.1220012
## 2 Alaska 1.9305379 -1.0624269
## 3 Arizona 1.7454429 0.7384595
## 4 Arkansas -0.1399989 -1.1085423
## 5 California 2.4986128 1.5274267
## 6 Colorado 1.4993407 0.9776297
#install.packages("ggplot2")
library(ggplot2)
ggplot(PC, aes(PC1, PC2)) +
modelr::geom_ref_line(h = 0) +
modelr::geom_ref_line(v = 0) +
geom_text(aes(label = State), size = 3) +
xlab("First Principal Component") +
ylab("Second Principal Component") +
ggtitle("First Two Principal Components of USArrests Data")

PVE <- arrests.eigen$values / sum(arrests.eigen$values)
round(PVE, 2)
## [1] 0.62 0.25 0.09 0.04
PVEplot <- qplot(c(1:4), PVE) +
geom_line() +
xlab("Principal Component") +
ylab("PVE") +
ggtitle("Scree Plot") +
ylim(0, 1)
cumPVE <- qplot(c(1:4), cumsum(PVE)) +
geom_line() +
xlab("Principal Component") +
ylab(NULL) +
ggtitle("Cumulative Scree Plot") +
ylim(0,1)
grid.arrange(PVEplot, cumPVE, ncol = 2)

pca_result <- prcomp(USArrests, scale = TRUE)
names(pca_result)
## [1] "sdev" "rotation" "center" "scale" "x"
pca_result$center
## Murder Assault UrbanPop Rape
## 7.788 170.760 65.540 21.232
pca_result$scale
## Murder Assault UrbanPop Rape
## 4.355510 83.337661 14.474763 9.366385
pca_result$rotation
## PC1 PC2 PC3 PC4
## Murder -0.5358995 0.4181809 -0.3412327 0.64922780
## Assault -0.5831836 0.1879856 -0.2681484 -0.74340748
## UrbanPop -0.2781909 -0.8728062 -0.3780158 0.13387773
## Rape -0.5434321 -0.1673186 0.8177779 0.08902432
pca_result$rotation <- -pca_result$rotation
pca_result$rotation
## PC1 PC2 PC3 PC4
## Murder 0.5358995 -0.4181809 0.3412327 -0.64922780
## Assault 0.5831836 -0.1879856 0.2681484 0.74340748
## UrbanPop 0.2781909 0.8728062 0.3780158 -0.13387773
## Rape 0.5434321 0.1673186 -0.8177779 -0.08902432
pca_result$x <- - pca_result$x
head(pca_result$x)
## PC1 PC2 PC3 PC4
## Alabama 0.9756604 -1.1220012 0.43980366 -0.154696581
## Alaska 1.9305379 -1.0624269 -2.01950027 0.434175454
## Arizona 1.7454429 0.7384595 -0.05423025 0.826264240
## Arkansas -0.1399989 -1.1085423 -0.11342217 0.180973554
## California 2.4986128 1.5274267 -0.59254100 0.338559240
## Colorado 1.4993407 0.9776297 -1.08400162 -0.001450164
biplot(pca_result, scale = 0)

pca_result$sdev
## [1] 1.5748783 0.9948694 0.5971291 0.4164494
(VE <- pca_result$sdev^2)
## [1] 2.4802416 0.9897652 0.3565632 0.1734301
PVE <- VE / sum(VE)
round(PVE, 2)
## [1] 0.62 0.25 0.09 0.04