##IVY D. PONDIAS
##BS MATHEMATICS
##PROF. CARLITO DAAROL, MS
###################Principal Components Analysis###################
##install and load the package tidyverse
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.8
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
##load data
data("USArrests")
##view first six rows of data
head(USArrests)
## Murder Assault UrbanPop Rape
## Alabama 13.2 236 58 21.2
## Alaska 10.0 263 48 44.5
## Arizona 8.1 294 80 31.0
## Arkansas 8.8 190 50 19.5
## California 9.0 276 91 40.6
## Colorado 7.9 204 78 38.7
#calculate principal components
results <- prcomp(USArrests, scale = TRUE)
#reverse the signs
results$rotation <- -1*results$rotation
#display principal components
results$rotation
## PC1 PC2 PC3 PC4
## Murder 0.5358995 -0.4181809 0.3412327 -0.64922780
## Assault 0.5831836 -0.1879856 0.2681484 0.74340748
## UrbanPop 0.2781909 0.8728062 0.3780158 -0.13387773
## Rape 0.5434321 0.1673186 -0.8177779 -0.08902432
#reverse the signs of the scores
results$x <- -1*results$x
#display the first six scores
head(results$x)
## PC1 PC2 PC3 PC4
## Alabama 0.9756604 -1.1220012 0.43980366 -0.154696581
## Alaska 1.9305379 -1.0624269 -2.01950027 0.434175454
## Arizona 1.7454429 0.7384595 -0.05423025 0.826264240
## Arkansas -0.1399989 -1.1085423 -0.11342217 0.180973554
## California 2.4986128 1.5274267 -0.59254100 0.338559240
## Colorado 1.4993407 0.9776297 -1.08400162 -0.001450164
biplot(results, scale = 0)

#display states with highest murder rates in original dataset
head(USArrests[order(-USArrests$Murder),])
## Murder Assault UrbanPop Rape
## Georgia 17.4 211 60 25.8
## Mississippi 16.1 259 44 17.1
## Florida 15.4 335 80 31.9
## Louisiana 15.4 249 66 22.2
## South Carolina 14.4 279 48 22.5
## Alabama 13.2 236 58 21.2
#calculate total variance explained by each principal component
results$sdev^2 / sum(results$sdev^2)
## [1] 0.62006039 0.24744129 0.08914080 0.04335752
#calculate total variance explained by each principal component
var_explained = results$sdev^2 / sum(results$sdev^2)
#create scree plot
qplot(c(1:4), var_explained) +
geom_line() +
xlab("Principal Component") +
ylab("Variance Explained") +
ggtitle("Scree Plot") +
ylim(0, 1)

###################K-Means Clustering###################
##install and load the packages
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(cluster)
#load data
df <- USArrests
#remove rows with missing values
df <- na.omit(df)
#scale each variable to have a mean of 0 and sd of 1
df <- scale(df)
#view first six rows of dataset
head(df)
## Murder Assault UrbanPop Rape
## Alabama 1.24256408 0.7828393 -0.5209066 -0.003416473
## Alaska 0.50786248 1.1068225 -1.2117642 2.484202941
## Arizona 0.07163341 1.4788032 0.9989801 1.042878388
## Arkansas 0.23234938 0.2308680 -1.0735927 -0.184916602
## California 0.27826823 1.2628144 1.7589234 2.067820292
## Colorado 0.02571456 0.3988593 0.8608085 1.864967207
fviz_nbclust(df, kmeans, method = "wss")

#calculate gap statistic based on number of clusters
gap_stat <- clusGap(df,
FUN = kmeans,
nstart = 25,
K.max = 10,
B = 50)
#plot number of clusters vs. gap statistic
fviz_gap_stat(gap_stat)

#make this example reproducible
set.seed(1)
#perform k-means clustering with k = 4 clusters
km <- kmeans(df, centers = 4, nstart = 25)
#view results
km
## K-means clustering with 4 clusters of sizes 13, 13, 16, 8
##
## Cluster means:
## Murder Assault UrbanPop Rape
## 1 -0.9615407 -1.1066010 -0.9301069 -0.96676331
## 2 0.6950701 1.0394414 0.7226370 1.27693964
## 3 -0.4894375 -0.3826001 0.5758298 -0.26165379
## 4 1.4118898 0.8743346 -0.8145211 0.01927104
##
## Clustering vector:
## Alabama Alaska Arizona Arkansas California
## 4 2 2 4 2
## Colorado Connecticut Delaware Florida Georgia
## 2 3 3 2 4
## Hawaii Idaho Illinois Indiana Iowa
## 3 1 2 3 1
## Kansas Kentucky Louisiana Maine Maryland
## 3 1 4 1 2
## Massachusetts Michigan Minnesota Mississippi Missouri
## 3 2 1 4 2
## Montana Nebraska Nevada New Hampshire New Jersey
## 1 1 2 1 3
## New Mexico New York North Carolina North Dakota Ohio
## 2 2 4 1 3
## Oklahoma Oregon Pennsylvania Rhode Island South Carolina
## 3 3 3 3 4
## South Dakota Tennessee Texas Utah Vermont
## 1 4 2 3 1
## Virginia Washington West Virginia Wisconsin Wyoming
## 3 3 1 1 3
##
## Within cluster sum of squares by cluster:
## [1] 11.952463 19.922437 16.212213 8.316061
## (between_SS / total_SS = 71.2 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
#plot results of final k-means model
fviz_cluster(km, data = df)

#find means of each cluster
aggregate(USArrests, by=list(cluster=km$cluster), mean)
## cluster Murder Assault UrbanPop Rape
## 1 1 3.60000 78.53846 52.07692 12.17692
## 2 2 10.81538 257.38462 76.00000 33.19231
## 3 3 5.65625 138.87500 73.87500 18.78125
## 4 4 13.93750 243.62500 53.75000 21.41250
#add cluster assigment to original data
final_data <- cbind(USArrests, cluster = km$cluster)
#view final data
head(final_data)
## Murder Assault UrbanPop Rape cluster
## Alabama 13.2 236 58 21.2 4
## Alaska 10.0 263 48 44.5 2
## Arizona 8.1 294 80 31.0 2
## Arkansas 8.8 190 50 19.5 4
## California 9.0 276 91 40.6 2
## Colorado 7.9 204 78 38.7 2
###################K-Medoids Clustering###################
##install and load the packages
library(factoextra)
library(cluster)
#load data
df <- USArrests
#remove rows with missing values
df <- na.omit(df)
#scale each variable to have a mean of 0 and sd of 1
df <- scale(df)
#plot clusters vs within sum of squares
fviz_nbclust(df, pam, method = "wss")

#calculate gap statistic based on number of clusters
gap_stat <- clusGap(df,
FUN = pam,
K.max = 10, #max clusters to consider
B = 50) #total bootstrapped iterations
#plot number of clusters vs. gap statistic
fviz_gap_stat(gap_stat)

#make this example reproducible
set.seed(1)
#perform k-medoids clustering with k = 4 clusters
kmed <- pam(df, k = 4)
#view results
kmed
## Medoids:
## ID Murder Assault UrbanPop Rape
## Alabama 1 1.2425641 0.7828393 -0.5209066 -0.003416473
## Michigan 22 0.9900104 1.0108275 0.5844655 1.480613993
## Oklahoma 36 -0.2727580 -0.2371077 0.1699510 -0.131534211
## New Hampshire 29 -1.3059321 -1.3650491 -0.6590781 -1.252564419
## Clustering vector:
## Alabama Alaska Arizona Arkansas California
## 1 2 2 1 2
## Colorado Connecticut Delaware Florida Georgia
## 2 3 3 2 1
## Hawaii Idaho Illinois Indiana Iowa
## 3 4 2 3 4
## Kansas Kentucky Louisiana Maine Maryland
## 3 3 1 4 2
## Massachusetts Michigan Minnesota Mississippi Missouri
## 3 2 4 1 3
## Montana Nebraska Nevada New Hampshire New Jersey
## 3 3 2 4 3
## New Mexico New York North Carolina North Dakota Ohio
## 2 2 1 4 3
## Oklahoma Oregon Pennsylvania Rhode Island South Carolina
## 3 3 3 3 1
## South Dakota Tennessee Texas Utah Vermont
## 4 1 2 3 4
## Virginia Washington West Virginia Wisconsin Wyoming
## 3 3 4 4 3
## Objective function:
## build swap
## 1.035116 1.027102
##
## Available components:
## [1] "medoids" "id.med" "clustering" "objective" "isolation"
## [6] "clusinfo" "silinfo" "diss" "call" "data"
#plot results of final k-medoids model
fviz_cluster(kmed, data = df)

#add cluster assignment to original data
final_data <- cbind(USArrests, cluster = kmed$cluster)
#view final data
head(final_data)
## Murder Assault UrbanPop Rape cluster
## Alabama 13.2 236 58 21.2 1
## Alaska 10.0 263 48 44.5 2
## Arizona 8.1 294 80 31.0 2
## Arkansas 8.8 190 50 19.5 1
## California 9.0 276 91 40.6 2
## Colorado 7.9 204 78 38.7 2
###################Hierarchical Clustering###################
##install and load the packages
library(factoextra)
library(cluster)
#load data
df <- USArrests
#remove rows with missing values
df <- na.omit(df)
#scale each variable to have a mean of 0 and sd of 1
df <- scale(df)
#define linkage methods
m <- c( "average", "single", "complete", "ward")
names(m) <- c( "average", "single", "complete", "ward")
#function to compute agglomerative coefficient
ac <- function(x) {
agnes(df, method = x)$ac
}
#calculate agglomerative coefficient for each clustering linkage method
sapply(m, ac)
## average single complete ward
## 0.7379371 0.6276128 0.8531583 0.9346210
#perform hierarchical clustering using Ward's minimum variance
clust <- agnes(df, method = "ward")
#produce dendrogram
pltree(clust, cex = 0.6, hang = -1, main = "Dendrogram")

#calculate gap statistic for each number of clusters (up to 10 clusters)
gap_stat <- clusGap(df, FUN = hcut, nstart = 25, K.max = 10, B = 50)
#produce plot of clusters vs. gap statistic
fviz_gap_stat(gap_stat)

#compute distance matrix
d <- dist(df, method = "euclidean")
#perform hierarchical clustering using Ward's method
final_clust <- hclust(d, method = "ward.D2" )
#cut the dendrogram into 4 clusters
groups <- cutree(final_clust, k=4)
# Number of members in each cluster
table(groups)
## groups
## 1 2 3 4
## 7 12 19 12
#append cluster labels to original data
final_data <- cbind(USArrests, cluster = groups)
#display first six rows of final data
head(final_data)
## Murder Assault UrbanPop Rape cluster
## Alabama 13.2 236 58 21.2 1
## Alaska 10.0 263 48 44.5 2
## Arizona 8.1 294 80 31.0 2
## Arkansas 8.8 190 50 19.5 3
## California 9.0 276 91 40.6 2
## Colorado 7.9 204 78 38.7 2
#find mean values for each cluster
aggregate(final_data, by=list(cluster=final_data$cluster), mean)
## cluster Murder Assault UrbanPop Rape cluster
## 1 1 14.671429 251.2857 54.28571 21.68571 1
## 2 2 10.966667 264.0000 76.50000 33.60833 2
## 3 3 6.210526 142.0526 71.26316 19.18421 3
## 4 4 3.091667 76.0000 52.08333 11.83333 4