load library needed

suppressPackageStartupMessages(library(tidyverse))

prepare dataset

data("USArrests")
head(USArrests)
##            Murder Assault UrbanPop Rape
## Alabama      13.2     236       58 21.2
## Alaska       10.0     263       48 44.5
## Arizona       8.1     294       80 31.0
## Arkansas      8.8     190       50 19.5
## California    9.0     276       91 40.6
## Colorado      7.9     204       78 38.7

calculate Principal component analysis

results <- prcomp(USArrests, scale. = T)

reverse the sign

results$rotation <- -1 * results$rotation

display principal components

results$rotation
##                PC1        PC2        PC3         PC4
## Murder   0.5358995 -0.4181809  0.3412327 -0.64922780
## Assault  0.5831836 -0.1879856  0.2681484  0.74340748
## UrbanPop 0.2781909  0.8728062  0.3780158 -0.13387773
## Rape     0.5434321  0.1673186 -0.8177779 -0.08902432

Reverse the sign of the scores

results$x <- -1 * results$x

Display the first six scores

head(results$x)
##                   PC1        PC2         PC3          PC4
## Alabama     0.9756604 -1.1220012  0.43980366 -0.154696581
## Alaska      1.9305379 -1.0624269 -2.01950027  0.434175454
## Arizona     1.7454429  0.7384595 -0.05423025  0.826264240
## Arkansas   -0.1399989 -1.1085423 -0.11342217  0.180973554
## California  2.4986128  1.5274267 -0.59254100  0.338559240
## Colorado    1.4993407  0.9776297 -1.08400162 -0.001450164

Next, we create Bi-plot (a plot that projects each of the observations in the dataset onto a scatterplot that uses the first and second principal components as the axes)

biplot(results, scale = 0)

Display states with the highest murder rates in the original dataset

head(USArrests[order(-USArrests$Murder),])
##                Murder Assault UrbanPop Rape
## Georgia          17.4     211       60 25.8
## Mississippi      16.1     259       44 17.1
## Florida          15.4     335       80 31.9
## Louisiana        15.4     249       66 22.2
## South Carolina   14.4     279       48 22.5
## Alabama          13.2     236       58 21.2

calculate total variance explained by each principal component

results$sdev^2 / sum(results$sdev^2)
## [1] 0.62006039 0.24744129 0.08914080 0.04335752

Zach (2020)

calculate total variance explained by each principal component

var_explained = results$sdev^2 / sum(results$sdev^2)

create a screen plot

qplot(c(1:4), var_explained) + 
  geom_line() + 
  xlab("Principal Component") + 
  ylab("Variance Explained") +
  ggtitle("Scree Plot") +
  ylim(0, 1)

References

Zach. 2020. “Principal Components Analysis in r: Step-by-Step Example.” Statology. https://www.statology.org/principal-components-analysis-in-r/.