Clustering

Data management & Exploratory analysis

# Load the crime data set; the first line of the file holds the column names.
crime1 <- read.table("crime_data.csv", sep = ",", header = TRUE)

# Keep only the rows recorded for 2011.
year <- crime1[crime1$Year == 2011, ]

# Drop every column in which more than half of the rows are zero.
# The threshold follows the actual number of rows instead of a hard-coded 77,
# and na.rm = TRUE keeps a missing value from turning the column mask into NA
# (which would make the subsetting fail).
year <- year[, colSums(year == 0, na.rm = TRUE) <= nrow(year) / 2,
             drop = FALSE]

# Select the four crime columns by name rather than by position
# (previously 28, 19, 17, 13), so the selection cannot silently shift when the
# zero filter above keeps a different set of columns.
# NOTE(review): the names are taken from the recorded summary output of this
# script -- confirm them against the CSV header.
crime <- year[, c("THEFT", "MOTOR.VEHICLE.THEFT", "KIDNAPPING", "GAMBLING")]

# Divide every selected column by the Population value of the same row.
crime <- data.frame(lapply(crime, function(x) x / year$Population))

# Label the rows with the Name_Community value of each record.
rownames(crime) <- year$Name_Community
#######
# Column-wise summary of the rate variables (one row per variable), used to
# compare their means and standard deviations before scaling.
desc_stats <- data.frame(
  Min  = vapply(crime, min, numeric(1)),     # smallest value
  Med  = vapply(crime, median, numeric(1)),  # median
  Mean = vapply(crime, mean, numeric(1)),    # arithmetic mean
  SD   = vapply(crime, sd, numeric(1)),      # standard deviation
  Max  = vapply(crime, max, numeric(1))      # largest value
)
head(desc_stats)

##                              Min          Med         Mean           SD
## THEFT               0.0071422273 2.492282e-02 0.0283471059 1.919120e-02
## MOTOR.VEHICLE.THEFT 0.0006172295 7.467330e-03 0.0075568728 4.614360e-03
## KIDNAPPING          0.0000000000 7.787859e-05 0.0001127810 9.500035e-05
## GAMBLING            0.0000000000 5.605067e-05 0.0002943944 4.878404e-04
##                              Max
## THEFT               0.1361540826
## MOTOR.VEHICLE.THEFT 0.0229485396
## KIDNAPPING          0.0003888673
## GAMBLING            0.0019770550

# Standardise every column with scale() (default centring and scaling) and
# turn the resulting matrix back into a data frame.
crime <- data.frame(scale(crime))

K-means Clustering

Choosing the value of K

# Plot 9: choose the number of clusters k by the average silhouette width.
nsilhouette <- fviz_nbclust(
  crime,
  kmeans,
  method = "silhouette",
  linecolor = "white",
  barfill = "pink"
) +
  theme_dark() +
  labs(
    title = "Optimal number of clusters by Silhouette average",
    subtitle = "Plot 9"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, size = 15),
    plot.subtitle = element_text(size = 15, colour = "red3")
  )

# Plot 10: choose k by the within-cluster sum of squares (WSS); the dashed
# vertical line marks k = 3.
nwss <- fviz_nbclust(crime, kmeans, method = "wss") +
  geom_vline(xintercept = 3, linetype = 2) +
  theme_gray() +
  labs(
    title = "Optimal number of clusters by WSS",
    subtitle = "Plot 10"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, size = 15),
    plot.subtitle = element_text(size = 15, colour = "red3")
  )

# Stack plots 9 and 10 vertically on one page.
grid.arrange(nsilhouette, nwss, nrow = 2, ncol = 1)

# Pairwise distances between the scaled observations (dist() defaults), reused
# by the silhouette computations further down.
discrime <- dist(crime)

# Hierarchical clustering on the same distances (hclust() defaults).
hcrime <- hclust(discrime)

# k-means starts from randomly chosen centres, so a single unseeded run can
# return a different partition each time the script is executed. Fix the seed
# and keep the best of 25 random starts so the 2- and 3-cluster solutions are
# reproducible and less likely to be a poor local optimum.
set.seed(2011)
mcrimekn2 <- kmeans(crime, centers = 2, nstart = 25)
mcrimekn3 <- kmeans(crime, centers = 3, nstart = 25)
########################################################
# Plot 11: silhouette plot of the 2-cluster k-means solution.
# The caption is computed from the silhouette widths instead of being typed in
# by hand, so it always matches the clustering that is actually plotted.
# local() keeps the helper objects out of the global workspace.
ksilhouette2 <- local({
  sil <- silhouette(mcrimekn2$cluster, discrime)
  avg_width <- mean(sil[, "sil_width"])

  fviz_silhouette(sil) +
    ggtitle("2 Clusters K- means Silhouette plot", subtitle = "Plot 11") +
    labs(caption = sprintf("Average Silhouette width = %.2f", avg_width)) +
    theme(
      plot.caption = element_text(colour = "purple", size = 15, hjust = 0.5),
      plot.title = element_text(hjust = 0.5, size = 15),
      plot.subtitle = element_text(size = 15, colour = "red3")
    )
})

##   cluster size ave.sil.width
## 1       1   28          0.17
## 2       2   49          0.61

######
# Plot 12: silhouette plot of the 3-cluster k-means solution.
# The caption is computed from the silhouette widths instead of being typed in
# by hand, so it always matches the clustering that is actually plotted.
# local() keeps the helper objects out of the global workspace.
ksilhouette3 <- local({
  sil <- silhouette(mcrimekn3$cluster, discrime)
  avg_width <- mean(sil[, "sil_width"])

  fviz_silhouette(sil) +
    ggtitle("3 Clusters K- means Silhouette plot", subtitle = "Plot 12") +
    labs(caption = sprintf("Average Silhouette width = %.2f", avg_width)) +
    theme(
      plot.caption = element_text(colour = "purple", size = 15, hjust = 0.5),
      plot.title = element_text(hjust = 0.5, size = 15),
      plot.subtitle = element_text(size = 15, colour = "red3")
    )
})

##   cluster size ave.sil.width
## 1       1    2         -0.03
## 2       2   26          0.25
## 3       3   49          0.58

################## combine plots 11 and 12 side by side (disabled; the 2x2 grid of plots 11-14 below is used instead)
##grid.arrange(ksilhouette2,ksilhouette3,nrow=1,ncol=2)
# Plot 13: cluster plot of the 2-cluster k-means solution.
k2clusters <- fviz_cluster(mcrimekn2, data = crime) +
  labs(
    title = "K-means Clusters plot for 2 clusters",
    subtitle = "Plot 13"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, size = 15),
    plot.subtitle = element_text(size = 15, colour = "red3")
  )



# Plot 14: cluster plot of the 3-cluster k-means solution.
k3clusters <- fviz_cluster(mcrimekn3, data = crime) +
  labs(
    title = "K-means Clusters plot for 3 clusters",
    subtitle = "Plot 14"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, size = 15),
    plot.subtitle = element_text(size = 15, colour = "red3")
  )

# Combine plots 11-14 in a 2 x 2 grid: the two silhouette plots on the top
# row, the two cluster plots on the bottom row.
grid.arrange(
  ksilhouette2, ksilhouette3,
  k2clusters, k3clusters,
  nrow = 2, ncol = 2
)

Clustering - K