# Necessary Libraries
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# Load the data
# Read the raw table from the CSV file in the working directory.
data <- read.csv(file = "us_crime_data_02101dc2-349e-48cf-8789-793b608d4210.csv")
# Keep the `State` labels aside so results can be reported by name later.
country <- data$State
# Drop the first column (presumably the `State` label column -- verify against
# the CSV header) so that only the feature columns remain in `data`.
data <- data[, -1L]
# Column-wise centered and unit-variance scaled copy of the features.
nd <- scale(data, center = TRUE, scale = TRUE)
# (a) Obtain a PCA based 3-dimensional projection of the data. PCA to be done using
# standardized data.
# The argument is spelled `scale.` in full: `scale = TRUE` only reached prcomp()
# through partial argument matching. `nd` is already standardized, so scaling
# again here leaves the result unchanged; it is kept so the PCA stays on
# standardized data even if the input is ever swapped for raw values.
pca_data <- prcomp(nd, scale. = TRUE)
#summary(pca_data)
#dim(pca_data$x)
# Extract the scores on the first 3 principal components
pca_projection <- as.data.frame(pca_data$x[, 1:3])
# Plot the 3D projection (one marker per row of the data)
plot_ly(
  pca_projection,
  x = ~PC1, y = ~PC2, z = ~PC3,
  type = "scatter3d", mode = "markers"
)
# (b) Detect outliers, if any, from the PCA projection plot obtained in (a).
# Using mahalanobis distance for detecting outliers
# mahalanobis() returns the squared distance of every row from the centroid of
# the 3 retained PC scores.
maha_dist <- mahalanobis(
  pca_projection,
  colMeans(pca_projection),
  cov(pca_projection)
)
# Flag rows whose squared distance exceeds the 97.5% chi-square quantile, with
# one degree of freedom per retained component.
outliers <- which(maha_dist > qchisq(0.975, df = ncol(pca_projection)))
print("Outliers detected in PCA projection :")
## [1] "Outliers detected in PCA projection :"
# Report the flagged rows by state name. The labels live in `country`; the
# State column was removed from `data`, so `data[outliers, 1]` printed values of
# the first numeric feature instead of names.
# (Previously recorded output removed: it showed those numeric values. Re-run
# the script to regenerate it.)
print(country[outliers])
# (c) Obtain scree plot and suggest the number of principal components for efficient data
# dimensional reduction.
# Scree plot of the fitted PCA, with the value labelled on each component.
fviz_eig(pca_data, addlabels = TRUE)

# (d) Find the proportion of total sample variation captured by the first 3 principal components.
# Take the importance table of the PCA summary and keep its first three columns
# (standard deviation, proportion of variance and cumulative proportion).
prop_var <- summary(pca_data)[["importance"]][, seq_len(3)]
print("Proportion of variance explained by the first 3 PCs : ")
## [1] "Proportion of variance explained by the first 3 PCs : "
print(prop_var)
##                             PC1       PC2        PC3
## Standard deviation     2.642358 0.1030908 0.05768881
## Proportion of Variance 0.997440 0.0015200 0.00048000
## Cumulative Proportion  0.997440 0.9989500 0.99943000
# Rank the cases by their score on the first principal component.
# Each case label is paired with its PC1 score, rows are ordered from the
# highest score to the lowest, and Rank is the row position after sorting
# (Rank 1 = highest PC1 score).
ranking1 <- data.frame(CaseTag = country, PC1_Score = pca_data$x[, 1]) %>%
  arrange(desc(PC1_Score)) %>%
  mutate(Rank = row_number())

# (e) Find the sample correlation coefficient between the first PC and the variable “assault”.

# Locate the assault column ignoring letter case. The exact, case-sensitive
# comparison `colnames(data) == "assault"` matched nothing, so cor() silently
# returned an empty result.
assault_index <- grep("assault", colnames(data), ignore.case = TRUE)
# Fail loudly rather than correlating against zero or several columns.
if (length(assault_index) != 1L) {
  stop(
    "Expected exactly one column matching 'assault', found ",
    length(assault_index), "; available columns: ",
    paste(colnames(data), collapse = ", "),
    call. = FALSE
  )
}
# Correlation between the PC1 scores and the assault variable. Correlation is
# unaffected by centering/scaling, so the unstandardized column is used directly.
correlation <- cor(pca_data$x[, 1], data[[assault_index]])
print("Correlation between first PC and 'assault':")
## [1] "Correlation between first PC and 'assault':"
# (Previously recorded output removed: it was an empty matrix. Re-run the
# script to regenerate it.)
print(correlation)
# (f) Obtain complete linkage hierarchical cluster dendrogram. Partition the states into 5
# clusters using the constructed dendrogram and list the states in each of the clusters.

# Euclidean distances between rows of the standardized features `nd`.
# The State column was already removed from `data`, so the earlier
# `scale(data[, -1])` dropped the first feature column from the clustering.
dist_matrix <- dist(nd)
hc <- hclust(dist_matrix, method = "complete")
# Label the leaves with the state names instead of row numbers.
plot(
  hc,
  labels = as.character(country),
  hang = -1,
  main = "Dendrogram of US Crime Data"
)

# Cut dendrogram into 5 clusters
clusters <- cutree(hc, k = 5)
# Keep the assignment alongside the features. `data` gains a non-feature
# column from this point on.
data$cluster <- clusters

# List states in each cluster
cluster_list <- split(country, clusters)
print("States in each cluster:")
## [1] "States in each cluster:"
# NOTE(review): the listing recorded after this call was produced while the
# first feature column was excluded; re-run the script to refresh it.
# NOTE(review): that listing shows "United States" as its own cluster. If that
# row is a national aggregate rather than a state, consider excluding it
# before the PCA and the clustering -- confirm against the data source.
print(cluster_list)
## $`1`
##  [1] "Alabama"              "Alaska"               "Arizona"             
##  [4] "Arkansas"             "Colorado"             "Connecticut"         
##  [7] "Delaware"             "District of Columbia" "Georgia"             
## [10] "Hawaii"               "Idaho"                "Indiana"             
## [13] "Iowa"                 "Kansas"               "Kentucky"            
## [16] "Louisiana"            "Maine"                "Maryland"            
## [19] "Massachusetts"        "Michigan"             "Minnesota"           
## [22] "Mississippi"          "Missouri"             "Montana"             
## [25] "Nebraska"             "Nevada"               "New Hampshire"       
## [28] "New Jersey"           "New Mexico"           "North Carolina"      
## [31] "North Dakota"         "Ohio"                 "Oklahoma"            
## [34] "Oregon"               "Pennsylvania"         "Rhode Island"        
## [37] "South Carolina"       "South Dakota"         "Tennessee"           
## [40] "Utah"                 "Vermont"              "Virginia"            
## [43] "Washington"           "West Virginia"        "Wisconsin"           
## [46] "Wyoming"             
## 
## $`2`
## [1] "California"
## 
## $`3`
## [1] "Florida"  "Illinois" "New York"
## 
## $`4`
## [1] "Texas"
## 
## $`5`
## [1] "United States"