Data management & Exploratory analysis

# Load the raw crime table and keep only the records for 2011.
crime1 <- read.table("crime_data.csv", sep = ",", header = TRUE)
year <- crime1[crime1$Year == 2011, ]

# Drop every column in which more than half of the rows equal zero.
# The cut-off is derived from the data (nrow(year) / 2) instead of the
# literal 77 / 2, so it stays correct if the number of rows changes.
zero_counts <- colSums(year == 0)
year <- year[, zero_counts <= nrow(year) / 2]

# Keep the four crime types analysed below. They are selected by name
# (previously positions 28, 19, 17, 13) so the selection cannot silently
# shift when the zero filter above keeps a different set of columns.
crime <- year[, c("THEFT", "MOTOR.VEHICLE.THEFT", "KIDNAPPING", "GAMBLING")]

# Turn raw counts into per-capita rates and label each row by community.
crime <- data.frame(lapply(crime, function(x) x / year$Population))
rownames(crime) <- year$Name_Community
#######
# Summary statistics for each per-capita crime variable, used to compare
# the location and spread of the variables before scaling.
desc_stats <- data.frame(
  Min = vapply(crime, min, numeric(1)),
  Med = vapply(crime, median, numeric(1)),
  Mean = vapply(crime, mean, numeric(1)),
  SD = vapply(crime, sd, numeric(1)),
  Max = vapply(crime, max, numeric(1))
)
head(desc_stats)
##                              Min          Med         Mean           SD
## THEFT               0.0071422273 2.492282e-02 0.0283471059 1.919120e-02
## MOTOR.VEHICLE.THEFT 0.0006172295 7.467330e-03 0.0075568728 4.614360e-03
## KIDNAPPING          0.0000000000 7.787859e-05 0.0001127810 9.500035e-05
## GAMBLING            0.0000000000 5.605067e-05 0.0002943944 4.878404e-04
##                              Max
## THEFT               0.1361540826
## MOTOR.VEHICLE.THEFT 0.0229485396
## KIDNAPPING          0.0003888673
## GAMBLING            0.0019770550
# Standardize every variable (centre and scale column-wise) and convert
# the resulting matrix back into a data frame.
crime <- data.frame(scale(crime))

Before clustering, we need to inspect and prepare the data. Because the communities have very different population sizes, we divide each of the four crime variables by the community's population and then standardize the resulting rates so that they are comparable.

Before applying any clustering method, we examine the data to see whether there are visible patterns or hidden structure to discover.

# Scatterplot matrix of the standardized rates; the lower panels add a
# linear-model fit. No colour is passed through aes(): a constant such as
# aes(col = "black") is treated as a one-level grouping variable, so the
# plot would be drawn in the first default palette colour and grouped by
# "black" rather than drawn in black. Without the mapping the default
# colour is used.
ggpairs(
  crime,
  lower = list(continuous = wrap("smooth", method = "lm"))
) +
  labs(subtitle = "Plot 1") +
  theme(plot.subtitle = element_text(size = 15, colour = "red3"))

One or two points lie very far from the line of best fit, which lowers the correlation. This suggests that we may end up with a cluster made up of only one or two communities.

Dendrograms

# Pairwise Euclidean distances between communities, followed by
# agglomerative clustering with complete linkage. Both methods are the
# functions' defaults; they are spelled out here for the reader.
discrime <- dist(crime, method = "euclidean")
hcrime <- hclust(discrime, method = "complete")

######general dendrogram########################

# Full dendrogram of the hierarchical clustering (Plot 2).
# TRUE/FALSE are written out because the shorthands T and F are ordinary
# variables that can be reassigned.
fullden <- ggdendrogram(
  hcrime,
  size = 2, rotate = FALSE, hang = -1,
  theme_dendro = FALSE, labels = TRUE
) +
  ggtitle("Dendrogram", subtitle = "Plot 2") +
  theme(
    plot.title = element_text(hjust = 0.5, size = 15),
    plot.subtitle = element_text(size = 15, colour = "red3")
  )

# Complete-linkage hierarchical clustering cut into 2 and into 3 clusters.
hclustcomplete2 <- eclust(
  crime,
  FUNcluster = "hclust", k = 2,
  hc_method = "complete", nboot = 50
)
hclustcomplete3 <- eclust(
  crime,
  FUNcluster = "hclust", k = 3,
  hc_method = "complete", nboot = 50
)

####################Dendrograms
#####################################################
# Dendrogram for the 2-cluster solution, with a rectangle drawn around
# each cluster (Plot 3).
hden2 <- fviz_dend(hclustcomplete2, rect = TRUE) +
  ggtitle("Cluster Dendrogram for 2 Clusters", subtitle = "Plot 3") +
  theme(
    plot.title = element_text(hjust = 0.5, size = 15),
    plot.subtitle = element_text(size = 15, colour = "red3")
  )

# Dendrogram for the 3-cluster solution, with a rectangle drawn around
# each cluster (Plot 4).
hden3 <- fviz_dend(hclustcomplete3, rect = TRUE) +
  ggtitle("Cluster Dendrogram for 3 Clusters", subtitle = "Plot 4") +
  theme(
    plot.title = element_text(hjust = 0.5, size = 15),
    plot.subtitle = element_text(size = 15, colour = "red3")
  )
# Combine the three dendrograms: the full dendrogram fills the left
# column, the 2- and 3-cluster dendrograms are stacked on the right.
grid.arrange(
  fullden, hden2, hden3,
  layout_matrix = matrix(c(1, 1, 2, 3), nrow = 2)
)

Cluster plots & silhouette plots

# Cluster plot for the 2-cluster solution (Plot 5).
hcluster2 <- fviz_cluster(hclustcomplete2) +
  ggtitle("Cluster plot for 2 Clusters", subtitle = "Plot 5") +
  theme(
    plot.title = element_text(hjust = 0.5, size = 15),
    plot.subtitle = element_text(size = 15, colour = "red3")
  )

# Cluster plot for the 3-cluster solution (Plot 6).
hcluster3 <- fviz_cluster(hclustcomplete3) +
  ggtitle("Cluster plot for 3 Clusters", subtitle = "Plot 6") +
  theme(
    plot.title = element_text(hjust = 0.5, size = 15),
    plot.subtitle = element_text(size = 15, colour = "red3")
  )

# Silhouette plot for the 2-cluster solution (Plot 7).
# The caption reports the average silhouette width taken from the fitted
# object rather than a hand-typed number, so it cannot drift from the data.
hsilhouette2 <- fviz_silhouette(hclustcomplete2) +
  ggtitle("Silhouette plot for 2 Clusters", subtitle = "Plot 7") +
  labs(caption = paste(
    "Average Silhouette width =",
    round(hclustcomplete2$silinfo$avg.width, 2)
  )) +
  theme(
    plot.caption = element_text(colour = "purple", size = 15, hjust = 0.5),
    plot.title = element_text(hjust = 0.5, size = 15),
    plot.subtitle = element_text(size = 15, colour = "red3")
  )
##   cluster size ave.sil.width
## 1       1   75          0.60
## 2       2    2          0.06
##########################
# Silhouette plot for the 3-cluster solution (Plot 8).
# The caption reports the average silhouette width taken from the fitted
# object rather than a hand-typed number, so it cannot drift from the data.
hsilhouette3 <- fviz_silhouette(hclustcomplete3) +
  ggtitle("Silhouette plot for 3 Clusters", subtitle = "Plot 8") +
  labs(caption = paste(
    "Average Silhouette width =",
    round(hclustcomplete3$silinfo$avg.width, 2)
  )) +
  theme(
    plot.caption = element_text(
      colour = "purple", size = 15, hjust = 0.5, vjust = 1
    ),
    plot.title = element_text(hjust = 0.5, size = 15),
    plot.subtitle = element_text(size = 15, colour = "red3")
  )
##   cluster size ave.sil.width
## 1       1   67          0.52
## 2       2    8          0.49
## 3       3    2         -0.02
# Combine the plots in a 2 x 2 grid: cluster plots on the top row,
# silhouette plots on the bottom row.
grid.arrange(
  hcluster2, hcluster3,
  hsilhouette2, hsilhouette3,
  nrow = 2, ncol = 2
)