# Read the CNBC 2024 rankings; header = TRUE takes column names from row one
cnbc <- read.csv(
  file = "/Users/farihaarpa/Downloads/cnbc_data_2024.csv",
  header = TRUE
)
# Build the clustering input: drop the "overall" ranking column so it is not
# used in the clustering analysis, then move the state column into the row
# names so each row stays labelled by its state
cnbc_clean <- cnbc %>%
  select(-overall) %>%
  tibble::column_to_rownames(var = "state")
# Check if the data is clusterable using the Hopkins statistic, sampling one
# tenth of the rows. A value close to 0.5 means the data look little different
# from uniformly random points, i.e. there is no strong cluster structure.
# The check is run once; repeating it verbatim only recomputes the same value.
set.seed(42)
hopkins_stat <- get_clust_tendency(cnbc_clean, n = nrow(cnbc_clean) / 10)$hopkins_stat
print(paste("Hopkins Statistic:", round(hopkins_stat, 3)))
## [1] "Hopkins Statistic: 0.527"
# Find the optimal number of clusters: NbClust is asked to evaluate k-means
# partitions for every k from 2 to 10 using Euclidean distance; the seed is
# fixed beforehand so the run is repeatable
set.seed(42)
nb_clusters <- NbClust(
  data = cnbc_clean,
  distance = "euclidean",
  method = "kmeans",
  min.nc = 2,
  max.nc = 10
)

## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##

## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 6 proposed 2 as the best number of clusters
## * 10 proposed 3 as the best number of clusters
## * 1 proposed 5 as the best number of clusters
## * 1 proposed 7 as the best number of clusters
## * 2 proposed 9 as the best number of clusters
## * 3 proposed 10 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 3
##
##
## *******************************************************************
# Pick the number of clusters by majority rule across all NbClust indices.
# Best.nc is a matrix with one column per index; its first row holds the
# number of clusters each index proposes. Indexing it with a bare [1] returns
# only the first index's proposal, not the consensus, so tally the whole row.
k_votes <- nb_clusters$Best.nc[1, ]
# Ignore indices without a usable proposal (NA, or 0 for the graphical
# Hubert and D index methods, which do not nominate a k)
k_votes <- k_votes[!is.na(k_votes) & k_votes > 0]
k_tally <- table(k_votes)
# which.max() returns the first maximum, so ties resolve to the smallest k
optimal_k <- as.integer(names(k_tally)[which.max(k_tally)])
print(paste("Optimal Number of Clusters:", optimal_k))
# NOTE(review): expected output below follows the majority rule reported by
# NbClust; any results recorded earlier with 9 clusters need regenerating.
## [1] "Optimal Number of Clusters: 3"
# Run k-means with the chosen number of clusters. The seed is fixed because
# the 25 random starts requested via nstart would otherwise differ per run.
set.seed(42)
kmeans_result <- kmeans(
  x = cnbc_clean,
  centers = optimal_k,
  nstart = 25
)
# Store each row's cluster assignment on the original data as a factor
# (cnbc_clean was derived from cnbc without dropping or reordering rows)
cnbc[["cluster"]] <- as.factor(kmeans_result[["cluster"]])
library(grid)
library(viridis)
# Cluster chart: one point per state, placed by cluster on the x axis and by
# state (ordered by descending `overall`) on the y axis; point size is mapped
# to `overall` and colour to the cluster
ggplot(
  data = cnbc,
  mapping = aes(
    x = as.factor(cluster),
    y = reorder(state, desc(overall)),
    label = state,
    color = as.factor(cluster)
  )
) +
  geom_point(aes(size = overall), alpha = 0.6, show.legend = FALSE) +
  # check_overlap skips labels that would be drawn over an earlier label
  geom_text(size = 2.5, hjust = 1, check_overlap = TRUE, show.legend = FALSE) +
  scale_size_continuous(range = c(2, 6)) +
  scale_color_viridis_d(option = "plasma") +
  labs(
    x = "Cluster Group",
    y = "State Ranking",
    title = "Business Competitiveness Clusters in the U.S. (2024)",
    subtitle = "Cluster Grouping of CNBC's Top States for Business Rankings",
    caption = "Source: CNBC Top States for Business 2024"
  ) +
  theme_minimal(base_size = 14) +
  # Fine-tune text sizes and centring on top of the minimal theme
  theme(
    plot.title = element_text(face = "bold", size = 16, hjust = 0.5),
    plot.subtitle = element_text(size = 12, hjust = 0.5),
    plot.caption = element_text(size = 8, hjust = 0.5),
    axis.title = element_text(size = 12),
    axis.text.x = element_text(size = 10),
    axis.text.y = element_text(size = 8),
    legend.position = "none",
    plot.margin = unit(c(5, 5, 5, 5), "pt")
  )

## [1] "Connecticut is in Cluster: 4"
## [1] "States in the same cluster as Connecticut:"
## [1] "Washington" "Colorado" "New York" "California"
## [5] "New Jersey" "Oregon" "Maryland" "Connecticut"
## [9] "Massachusetts"
## [1] "Defining Features of Connecticut's Cluster:"
## overall infra_structure workforce economy quality_of_life
## 1 25 23.44444 22.77778 26.55556 9.777778
## cost_of_doing_business technology_innovation business_friendliness education
## 1 42.11111 10.55556 42.66667 17.77778
## access_to_capital cost_of_living
## 1 13.22222 40