Principle components analysis
Load the data
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.1 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
#load data
data("iris")
Calculate principal components
#calculate principal components
results <- prcomp(iris[,colnames(iris)[1:4]], scale = TRUE)
#reverse the signs
results$rotation <- -1*results$rotation
#display principal components
results$rotation
## PC1 PC2 PC3 PC4
## Sepal.Length -0.5210659 0.37741762 -0.7195664 -0.2612863
## Sepal.Width 0.2693474 0.92329566 0.2443818 0.1235096
## Petal.Length -0.5804131 0.02449161 0.1421264 0.8014492
## Petal.Width -0.5648565 0.06694199 0.6342727 -0.5235971
#reverse the signs of the scores
results$x <- -1*results$x
#display the first six scores
head(results$x)
## PC1 PC2 PC3 PC4
## [1,] 2.257141 0.4784238 -0.12727962 -0.024087508
## [2,] 2.074013 -0.6718827 -0.23382552 -0.102662845
## [3,] 2.356335 -0.3407664 0.04405390 -0.028282305
## [4,] 2.291707 -0.5953999 0.09098530 0.065735340
## [5,] 2.381863 0.6446757 0.01568565 0.035802870
## [6,] 2.068701 1.4842053 0.02687825 -0.006586116
Visualize the Results
biplot(results, scale = 0)

data_pot=data.frame(iris,results$x)
library(ggplot2)
# Scatter plot by group, upPCA
ggplot(data_pot, aes(x = Sepal.Length, y = Sepal.Width, color = Species)) +
geom_point()

# Scatter plot by group, PCA
ggplot(data_pot, aes(x = PC1, y = PC2, color = Species)) +
geom_point()

data_spedal=data.frame(results$rotation)
# Scatter plot
ggplot(data_spedal, aes(x = PC1, y = PC2)) +
geom_point()

Find Variance Explained by Each Principal Component
#calculate total variance explained by each principal component
results$sdev^2 / sum(results$sdev^2)
## [1] 0.729624454 0.228507618 0.036689219 0.005178709
#calculate total variance explained by each principal component
var_explained = results$sdev^2 / sum(results$sdev^2)
#create scree plot
qplot(c(1:4), var_explained) +
geom_line() +
xlab("Principal Component") +
ylab("Variance Explained") +
ggtitle("Scree Plot") +
ylim(0, 1)
## Warning: `qplot()` was deprecated in ggplot2 3.4.0.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Applications:
- reduce dimensions and biplot, then understand which observations are
similar to each other.
- principal components regression to prevent multicolinearity