Principle components analysis

Load the data

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.1     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
#load data
data("iris")

Calculate principal components

#calculate principal components
results <- prcomp(iris[,colnames(iris)[1:4]], scale = TRUE)

#reverse the signs
results$rotation <- -1*results$rotation

#display principal components
results$rotation
##                     PC1        PC2        PC3        PC4
## Sepal.Length -0.5210659 0.37741762 -0.7195664 -0.2612863
## Sepal.Width   0.2693474 0.92329566  0.2443818  0.1235096
## Petal.Length -0.5804131 0.02449161  0.1421264  0.8014492
## Petal.Width  -0.5648565 0.06694199  0.6342727 -0.5235971
#reverse the signs of the scores
results$x <- -1*results$x

#display the first six scores
head(results$x)
##           PC1        PC2         PC3          PC4
## [1,] 2.257141  0.4784238 -0.12727962 -0.024087508
## [2,] 2.074013 -0.6718827 -0.23382552 -0.102662845
## [3,] 2.356335 -0.3407664  0.04405390 -0.028282305
## [4,] 2.291707 -0.5953999  0.09098530  0.065735340
## [5,] 2.381863  0.6446757  0.01568565  0.035802870
## [6,] 2.068701  1.4842053  0.02687825 -0.006586116

Visualize the Results

biplot(results, scale = 0)

data_pot=data.frame(iris,results$x)

library(ggplot2)
# Scatter plot by group, upPCA
ggplot(data_pot, aes(x = Sepal.Length, y = Sepal.Width, color = Species)) +
  geom_point()

# Scatter plot by group, PCA
ggplot(data_pot, aes(x = PC1, y = PC2, color = Species)) +
  geom_point()

data_spedal=data.frame(results$rotation)
# Scatter plot 
ggplot(data_spedal, aes(x = PC1, y = PC2)) +
  geom_point()

Find Variance Explained by Each Principal Component

#calculate total variance explained by each principal component
results$sdev^2 / sum(results$sdev^2)
## [1] 0.729624454 0.228507618 0.036689219 0.005178709
#calculate total variance explained by each principal component
var_explained = results$sdev^2 / sum(results$sdev^2)

#create scree plot
qplot(c(1:4), var_explained) + 
  geom_line() + 
  xlab("Principal Component") + 
  ylab("Variance Explained") +
  ggtitle("Scree Plot") +
  ylim(0, 1)
## Warning: `qplot()` was deprecated in ggplot2 3.4.0.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Applications:

  • reduce dimensions and biplot, then understand which observations are similar to each other.
  • principal components regression to prevent multicolinearity