# Read the CNBC 2024 state rankings from CSV (first row holds column names).
cnbc <- read.csv(
  file = "/Users/farihaarpa/Downloads/cnbc_data_2024.csv",
  header = TRUE
)

# Build the clustering input in one pipeline:
#   1. drop the `overall` ranking column so it does not take part in clustering
#   2. move `state` into the row names, leaving only the remaining columns
cnbc_clean <- cnbc %>%
  select(-overall) %>%
  tibble::column_to_rownames(var = "state")
# Check whether the data shows cluster tendency using the Hopkins statistic.
# The sample size is roughly 10% of the rows; integer division plus a floor of
# 1 keeps `n` a whole, positive number even when the row count is not a
# multiple of 10 (or is below 10).
# NOTE(review): a value close to 0.5 usually indicates weak cluster tendency;
# confirm the convention used by the installed factoextra version.
set.seed(42)
hopkins_n <- max(1L, nrow(cnbc_clean) %/% 10L)
hopkins_stat <- get_clust_tendency(cnbc_clean, n = hopkins_n)$hopkins_stat
print(paste("Hopkins Statistic:", round(hopkins_stat, 3)))
## [1] "Hopkins Statistic: 0.527"
# Estimate how many clusters the data supports: NbClust is run with k-means
# partitions on Euclidean distances for every k from 2 to 10. The seed is fixed
# so the run is reproducible.
set.seed(42)
nb_clusters <- NbClust(
  data = cnbc_clean,
  distance = "euclidean",
  min.nc = 2,
  max.nc = 10,
  method = "kmeans"
)

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 6 proposed 2 as the best number of clusters 
## * 10 proposed 3 as the best number of clusters 
## * 1 proposed 5 as the best number of clusters 
## * 1 proposed 7 as the best number of clusters 
## * 2 proposed 9 as the best number of clusters 
## * 3 proposed 10 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  3 
##  
##  
## *******************************************************************
# Pick the number of clusters by majority vote across the NbClust indices.
# `Best.nc` holds one column per index, with the proposed number of clusters in
# its first row, so `Best.nc[1]` would return only the first index's proposal
# rather than the majority-rule result NbClust prints in its conclusion.
k_votes <- nb_clusters$Best.nc[1, ]
# Keep real proposals only: missing values and non-positive entries are
# dropped (presumably the graphical indices report 0 here; verify).
k_votes <- k_votes[!is.na(k_votes) & k_votes > 0]
k_vote_counts <- table(k_votes)
# which.max() returns the first maximum, so ties resolve to the smallest k.
optimal_k <- as.integer(names(k_vote_counts)[which.max(k_vote_counts)])
print(paste("Optimal Number of Clusters:", optimal_k))
## [1] "Optimal Number of Clusters: 3"
# NOTE(review): the line above is the value expected from the majority-rule
# summary NbClust printed; re-run to confirm. Recorded outputs further down the
# file (e.g. cluster membership listings) were produced with the previous
# value of 9 and need to be regenerated.
# Fit k-means with the chosen number of clusters. 25 random starts are tried
# and the seed is fixed so the partition is reproducible.
set.seed(42)
kmeans_result <- kmeans(x = cnbc_clean, centers = optimal_k, nstart = 25)

# Attach each row's cluster assignment to the original data as a factor.
# Assignments are matched to rows by position.
cnbc[["cluster"]] <- as.factor(kmeans_result$cluster)
library(grid)  
library(viridis)  

# Cluster membership chart: one labelled point per state, placed in its
# cluster's column. States are ordered on the y axis by `overall` so that the
# smallest `overall` value sits at the top; point size also follows `overall`.
cluster_plot <- ggplot(
  cnbc,
  mapping = aes(
    x = as.factor(cluster),
    y = reorder(state, desc(overall)),
    label = state,
    color = as.factor(cluster)
  )
)

# Layers and scales: semi-transparent points plus right-aligned state labels
# (overlapping labels are skipped), coloured with the discrete plasma palette.
cluster_plot <- cluster_plot +
  geom_point(aes(size = overall), alpha = 0.6, show.legend = FALSE) +
  geom_text(size = 2.5, hjust = 1, check_overlap = TRUE, show.legend = FALSE) +
  scale_color_viridis_d(option = "plasma") +
  scale_size_continuous(range = c(2, 6))

# Titles, axis labels and source caption.
cluster_plot <- cluster_plot +
  labs(
    title = "Business Competitiveness Clusters in the U.S. (2024)",
    subtitle = "Cluster Grouping of CNBC's Top States for Business Rankings",
    x = "Cluster Group",
    y = "State Ranking",
    caption = "Source: CNBC Top States for Business 2024"
  )

# Theme: minimal base, no legend, centred titles, 5pt margin on every side.
cluster_plot <- cluster_plot +
  theme_minimal(base_size = 14) +
  theme(
    legend.position = "none",
    axis.text.y = element_text(size = 8),
    axis.text.x = element_text(size = 10),
    plot.title = element_text(face = "bold", size = 16, hjust = 0.5),
    plot.subtitle = element_text(size = 12, hjust = 0.5),
    plot.caption = element_text(size = 8, hjust = 0.5),
    plot.margin = unit(rep(5, 4), "pt"),
    axis.title = element_text(size = 12)
  )

cluster_plot

## [1] "Connecticut is in Cluster: 4"
## [1] "States in the same cluster as Connecticut:"
## [1] "Washington"    "Colorado"      "New York"      "California"   
## [5] "New Jersey"    "Oregon"        "Maryland"      "Connecticut"  
## [9] "Massachusetts"
## [1] "Defining Features of Connecticut's Cluster:"
##   overall infra_structure workforce  economy quality_of_life
## 1      25        23.44444  22.77778 26.55556        9.777778
##   cost_of_doing_business technology_innovation business_friendliness education
## 1               42.11111              10.55556              42.66667  17.77778
##   access_to_capital cost_of_living
## 1          13.22222             40