load library needed

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.5     ✔ purrr   0.3.4
## ✔ tibble  3.1.6     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

prepare dataset

data("USArrests")
head(USArrests)

calculate Principal component analysis

results <- prcomp(USArrests, scale. = T)

reverse the sign

results$rotation <- -1 * results$rotation

display principal components

results$rotation
##                PC1        PC2        PC3         PC4
## Murder   0.5358995 -0.4181809  0.3412327 -0.64922780
## Assault  0.5831836 -0.1879856  0.2681484  0.74340748
## UrbanPop 0.2781909  0.8728062  0.3780158 -0.13387773
## Rape     0.5434321  0.1673186 -0.8177779 -0.08902432

Reverse the sign of the scores

results$x <- -1 * results$x

Display the first six scores

head(results$x)
##                   PC1        PC2         PC3          PC4
## Alabama     0.9756604 -1.1220012  0.43980366 -0.154696581
## Alaska      1.9305379 -1.0624269 -2.01950027  0.434175454
## Arizona     1.7454429  0.7384595 -0.05423025  0.826264240
## Arkansas   -0.1399989 -1.1085423 -0.11342217  0.180973554
## California  2.4986128  1.5274267 -0.59254100  0.338559240
## Colorado    1.4993407  0.9776297 -1.08400162 -0.001450164

Next, we create Bi-plot (a plot that projects each of the observations in the dataset onto a scatterplot that uses the first and second principal components as the axes)

biplot(results, scale = 0)

Display states with the highest murder rates in the original dataset

head(USArrests[order(-USArrests$Murder),])

calculate total variance explained by each principal component

results$sdev^2 / sum(results$sdev^2)
## [1] 0.62006039 0.24744129 0.08914080 0.04335752

calculate total variance explained by each principal component

var_explained = results$sdev^2 / sum(results$sdev^2)

create a screen plot

qplot(c(1:4), var_explained) + 
  geom_line() + 
  xlab("Principal Component") + 
  ylab("Variance Explained") +
  ggtitle("Scree Plot") +
  ylim(0, 1)