#
#Principal Components Analysis
#PREPARED BY NEIL JOHN P.GUTIERREZ
homedir <- "C:/Users/NP Gutierrez/OneDrive/Desktop/R-Studio Only"
setwd(homedir)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.7
## v tidyr 1.1.4 v stringr 1.4.0
## v readr 2.1.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
#load data
data("USArrests")
#view first six rows of data
head(USArrests)
## Murder Assault UrbanPop Rape
## Alabama 13.2 236 58 21.2
## Alaska 10.0 263 48 44.5
## Arizona 8.1 294 80 31.0
## Arkansas 8.8 190 50 19.5
## California 9.0 276 91 40.6
## Colorado 7.9 204 78 38.7
#calculate principal components
results <- prcomp(USArrests, scale = TRUE)
#reverse the signs
results$rotation <- -1*results$rotation
#display principal components
results$rotation
## PC1 PC2 PC3 PC4
## Murder 0.5358995 -0.4181809 0.3412327 -0.64922780
## Assault 0.5831836 -0.1879856 0.2681484 0.74340748
## UrbanPop 0.2781909 0.8728062 0.3780158 -0.13387773
## Rape 0.5434321 0.1673186 -0.8177779 -0.08902432
#reverse the signs of the scores
results$x <- -1*results$x
#display the first six scores
head(results$x)
## PC1 PC2 PC3 PC4
## Alabama 0.9756604 -1.1220012 0.43980366 -0.154696581
## Alaska 1.9305379 -1.0624269 -2.01950027 0.434175454
## Arizona 1.7454429 0.7384595 -0.05423025 0.826264240
## Arkansas -0.1399989 -1.1085423 -0.11342217 0.180973554
## California 2.4986128 1.5274267 -0.59254100 0.338559240
## Colorado 1.4993407 0.9776297 -1.08400162 -0.001450164
biplot(results, scale = 0)

#display states with highest murder rates in original dataset
head(USArrests[order(-USArrests$Murder),])
## Murder Assault UrbanPop Rape
## Georgia 17.4 211 60 25.8
## Mississippi 16.1 259 44 17.1
## Florida 15.4 335 80 31.9
## Louisiana 15.4 249 66 22.2
## South Carolina 14.4 279 48 22.5
## Alabama 13.2 236 58 21.2
#calculate total variance explained by each principal component
results$sdev^2 / sum(results$sdev^2)
## [1] 0.62006039 0.24744129 0.08914080 0.04335752
#calculate total variance explained by each principal component
var_explained = results$sdev^2 / sum(results$sdev^2)
#create scree plot
qplot(c(1:4), var_explained) +
geom_line() +
xlab("Principal Component") +
ylab("Variance Explained") +
ggtitle("Scree Plot") +
ylim(0, 1)
