# Necessary Libraries
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# Load the data
data <- read.csv("us_crime_data_02101dc2-349e-48cf-8789-793b608d4210.csv")
# Keep the State labels aside (one per row); they identify the cases later on.
country <- data$State
# Drop the first column so that `data` holds only the measured variables.
# NOTE(review): this assumes State is the first column of the CSV -- confirm.
data <- data[, -1]
# Standardise every variable (centre to mean 0, scale to unit variance).
nd <- scale(data)
# (a) Obtain a PCA based 3-dimensional projection of the data. PCA to be done using
# standardized data.
# `nd` is already standardised, so `scale. = TRUE` is redundant but harmless.
# The argument is spelled `scale.` in full: prcomp() has no `scale` argument
# and the previous call only worked through partial argument matching.
pca_data <- prcomp(nd, scale. = TRUE)
#summary(pca_data)
#dim(pca_data$x)
# Extract the scores on the first 3 principal components
pca_projection <- as.data.frame(pca_data$x[, 1:3])
# Plot the 3D projection
plot_ly(
  pca_projection,
  x = ~PC1, y = ~PC2, z = ~PC3,
  type = "scatter3d", mode = "markers"
)
# (b) Detect outliers, if any, from the PCA projection plot obtained in (a).
# Using mahalanobis distance for detecting outliers: each case's squared
# distance from the centroid of the 3 PC scores is compared with the 97.5%
# quantile of a chi-square distribution with 3 degrees of freedom.
maha_dist <- mahalanobis(
  pca_projection,
  colMeans(pca_projection),
  cov(pca_projection)
)
outliers <- which(maha_dist > qchisq(0.975, df = 3))
print("Outliers detected in PCA projection :")
## [1] "Outliers detected in PCA projection :"
# The State column was removed from `data` when it was loaded, so indexing
# `data[outliers, 1]` returned values of the first numeric variable rather
# than state names. The names live in `country`, in the same row order.
print(country[outliers])
# (c) Scree plot: percentage of variance explained by each principal
# component, with the percentages printed on the bars. Use it to choose how
# many components to keep (per the recorded output below, PC1 alone already
# accounts for about 99.7% of the total variance).
fviz_eig(pca_data, addlabels = TRUE)

# (d) Share of the total sample variation captured by the first 3 PCs:
# standard deviation, proportion of variance and cumulative proportion.
prop_var <- summary(pca_data)[["importance"]][, seq_len(3)]
print("Proportion of variance explained by the first 3 PCs : ")
## [1] "Proportion of variance explained by the first 3 PCs : "
print(prop_var)
## PC1 PC2 PC3
## Standard deviation 2.642358 0.1030908 0.05768881
## Proportion of Variance 0.997440 0.0015200 0.00048000
## Cumulative Proportion 0.997440 0.9989500 0.99943000
# Rank the cases by their score on the first principal component: pair each
# case label with its PC1 score, sort from the largest score down, and number
# the rows so that the largest PC1 score receives Rank 1.
ranking1 <- data.frame(CaseTag = country, PC1_Score = pca_data$x[, 1]) %>%
  arrange(desc(PC1_Score)) %>%
  mutate(Rank = row_number())
# (e) Find the sample correlation coefficient between the first PC and the variable “assault”.
# The column is located case-insensitively: the previous exact test
# `colnames(data) == "assault"` matched nothing, so cor() was silently run on
# a zero-column data frame and printed an empty matrix.
assault_index <- grep("assault", colnames(data), ignore.case = TRUE)
if (length(assault_index) != 1L) {
  # Fail loudly instead of producing an empty or ambiguous result.
  stop(
    "Expected exactly one column matching 'assault', found ",
    length(assault_index), ": ",
    paste(colnames(data)[assault_index], collapse = ", "),
    call. = FALSE
  )
}
# `[[` extracts the column as a plain numeric vector, so cor() returns a
# single number. The sign of a principal component is arbitrary, so only the
# magnitude of this correlation is meaningful.
correlation <- cor(pca_data$x[, 1], data[[assault_index]])
print("Correlation between first PC and 'assault':")
## [1] "Correlation between first PC and 'assault':"
print(correlation)
# (f) Obtain complete linkage hierarchical cluster dendrogram. Partition the states into 5
# clusters using the constructed dendrogram and list the states in each of the clusters.
# Distances are computed on `nd`, the standardised matrix of all variables.
# The State column was already removed from `data` at load time, so the former
# `scale(data[, -1])` discarded a genuine variable before clustering.
# NOTE(review): the data set contains a "United States" row, which looks like
# a national aggregate rather than a state -- confirm whether it belongs here.
# NOTE(review): the cluster listing recorded below was produced before this
# fix; re-run the script to refresh it.
dist_matrix <- dist(nd)
hc <- hclust(dist_matrix, method = "complete")
# Label the leaves with state names instead of row numbers.
plot(
  hc,
  labels = as.character(country),
  hang = -1,
  main = "Dendrogram of US Crime Data"
)

# Cut dendrogram into 5 clusters
clusters <- cutree(hc, k = 5)
# Store the cluster membership alongside the variables.
data$cluster <- clusters
# List states in each cluster
cluster_list <- split(country, data$cluster)
print("States in each cluster:")
## [1] "States in each cluster:"
print(cluster_list)
## $`1`
## [1] "Alabama" "Alaska" "Arizona"
## [4] "Arkansas" "Colorado" "Connecticut"
## [7] "Delaware" "District of Columbia" "Georgia"
## [10] "Hawaii" "Idaho" "Indiana"
## [13] "Iowa" "Kansas" "Kentucky"
## [16] "Louisiana" "Maine" "Maryland"
## [19] "Massachusetts" "Michigan" "Minnesota"
## [22] "Mississippi" "Missouri" "Montana"
## [25] "Nebraska" "Nevada" "New Hampshire"
## [28] "New Jersey" "New Mexico" "North Carolina"
## [31] "North Dakota" "Ohio" "Oklahoma"
## [34] "Oregon" "Pennsylvania" "Rhode Island"
## [37] "South Carolina" "South Dakota" "Tennessee"
## [40] "Utah" "Vermont" "Virginia"
## [43] "Washington" "West Virginia" "Wisconsin"
## [46] "Wyoming"
##
## $`2`
## [1] "California"
##
## $`3`
## [1] "Florida" "Illinois" "New York"
##
## $`4`
## [1] "Texas"
##
## $`5`
## [1] "United States"