# Data management & exploratory analysis ----
# Data preparation: load the crime counts, keep the 2011 records, and derive
# per-capita rates for the four offence types that are clustered further down.
crime1 <- read.table("crime_data.csv", sep = ",", header = TRUE)

# Keep the 2011 rows. which() drops rows whose Year is NA instead of turning
# them into all-NA rows.
year <- crime1[which(crime1$Year == 2011), ]

# Discard columns that are zero in more than half of the rows. The threshold
# follows the actual row count rather than a typed-in 77, and NA cells are not
# counted as zeros (an NA count would otherwise break the column selection).
zero_counts <- colSums(year == 0, na.rm = TRUE)
year <- year[, zero_counts <= nrow(year) / 2, drop = FALSE]

# Select the offence columns by name rather than by position: positions shift
# whenever the zero filter above keeps a different set of columns. These are
# the columns previously picked as c(28, 19, 17, 13), as shown by the row
# labels of the summary table printed below.
crime_vars <- c("THEFT", "MOTOR.VEHICLE.THEFT", "KIDNAPPING", "GAMBLING")
stopifnot(all(crime_vars %in% names(year)))
crime <- year[, crime_vars]

# Convert counts to rates per resident; the population vector has one entry
# per row and is applied down every column.
crime <- crime / year$Population
rownames(crime) <- year$Name_Community

# Per-variable summary, used to check how far the means and SDs differ
# between variables (which motivates standardising before clustering).
desc_stats <- data.frame(
  Min = vapply(crime, min, numeric(1)),
  Med = vapply(crime, median, numeric(1)),
  Mean = vapply(crime, mean, numeric(1)),
  SD = vapply(crime, sd, numeric(1)),
  Max = vapply(crime, max, numeric(1))
)
head(desc_stats)
## Min Med Mean SD
## THEFT 0.0071422273 2.492282e-02 0.0283471059 1.919120e-02
## MOTOR.VEHICLE.THEFT 0.0006172295 7.467330e-03 0.0075568728 4.614360e-03
## KIDNAPPING 0.0000000000 7.787859e-05 0.0001127810 9.500035e-05
## GAMBLING 0.0000000000 5.605067e-05 0.0002943944 4.878404e-04
## Max
## THEFT 0.1361540826
## MOTOR.VEHICLE.THEFT 0.0229485396
## KIDNAPPING 0.0003888673
## GAMBLING 0.0019770550
# Standardise every variable with scale()'s defaults (centre and scale each
# column), then turn the resulting matrix back into a data frame.
crime <- data.frame(scale(crime))
# K-means clustering ----
# Choosing the value of k
# Candidate numbers of clusters compared by the silhouette method (Plot 9).
nsilhouette <- fviz_nbclust(
  crime,
  kmeans,
  method = "silhouette",
  linecolor = "white",
  barfill = "pink"
)
nsilhouette <- nsilhouette +
  theme_dark() +
  labs(
    title = "Optimal number of clusters by Silhouette average",
    subtitle = "Plot 9"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, size = 15),
    plot.subtitle = element_text(size = 15, colour = "red3")
  )
# Candidate numbers of clusters compared by the WSS method (Plot 10); a
# vertical reference line (linetype 2) is drawn at k = 3.
nwss <- fviz_nbclust(crime, kmeans, method = "wss")
nwss <- nwss +
  geom_vline(xintercept = 3, linetype = 2) +
  theme_gray() +
  labs(
    title = "Optimal number of clusters by WSS",
    subtitle = "Plot 10"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, size = 15),
    plot.subtitle = element_text(size = 15, colour = "red3")
  )
# Show Plot 9 (silhouette) above Plot 10 (WSS) in a single column.
grid.arrange(nsilhouette, nwss, ncol = 1, nrow = 2)
# Pairwise distances between rows of the scaled data (dist() defaults), reused
# by the silhouette computations further down.
discrime <- dist(crime)
# Hierarchical clustering on the same distances (hclust() defaults). Not used
# in the lines that follow here; presumably consumed later in the analysis.
hcrime <- hclust(discrime)

# k-means picks its initial centres at random. Fix the seed so the cluster
# labels and sizes are reproducible, and take the best of several random
# starts so one unlucky initialisation does not decide the reported solution.
set.seed(123)
mcrimekn2 <- kmeans(crime, centers = 2, nstart = 25)
mcrimekn3 <- kmeans(crime, centers = 3, nstart = 25)
########################################################
#######################silhouette k means############
# Silhouette widths of the 2-cluster k-means solution, measured on the
# distance object computed from the scaled data (Plot 11).
sil_k2 <- silhouette(mcrimekn2$cluster, discrime)
# Derive the caption figure from the silhouette object itself: the k-means
# fit depends on random starts, so a typed-in average can silently go stale.
avg_sil_k2 <- mean(sil_k2[, "sil_width"])
ksilhouette2 <- fviz_silhouette(sil_k2) +
  ggtitle("2 Clusters K- means Silhouette plot", subtitle = "Plot 11") +
  labs(caption = sprintf("Average Silhouette width = %.2f", avg_sil_k2)) +
  theme(
    plot.caption = element_text(colour = "purple", size = 15, hjust = 0.5),
    plot.title = element_text(hjust = 0.5, size = 15),
    plot.subtitle = element_text(size = 15, colour = "red3")
  )
## cluster size ave.sil.width
## 1 1 28 0.17
## 2 2 49 0.61
######
# Silhouette widths of the 3-cluster k-means solution, measured on the
# distance object computed from the scaled data (Plot 12).
sil_k3 <- silhouette(mcrimekn3$cluster, discrime)
# Derive the caption figure from this fit rather than reusing the number
# typed for the 2-cluster plot; it changes whenever the clustering changes.
avg_sil_k3 <- mean(sil_k3[, "sil_width"])
ksilhouette3 <- fviz_silhouette(sil_k3) +
  ggtitle("3 Clusters K- means Silhouette plot", subtitle = "Plot 12") +
  labs(caption = sprintf("Average Silhouette width = %.2f", avg_sil_k3)) +
  theme(
    plot.caption = element_text(colour = "purple", size = 15, hjust = 0.5),
    plot.title = element_text(hjust = 0.5, size = 15),
    plot.subtitle = element_text(size = 15, colour = "red3")
  )
## cluster size ave.sil.width
## 1 1 2 -0.03
## 2 2 26 0.25
## 3 3 49 0.58
# Combine Plots 11 and 12 side by side (currently disabled)
##grid.arrange(ksilhouette2,ksilhouette3,nrow=1,ncol=2)
# Cluster plots of the two k-means fits on the scaled data
# (Plot 13 for k = 2, Plot 14 for k = 3).
k2clusters <- fviz_cluster(mcrimekn2, crime) +
  labs(
    title = "K-means Clusters plot for 2 clusters",
    subtitle = "Plot 13"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, size = 15),
    plot.subtitle = element_text(size = 15, colour = "red3")
  )

k3clusters <- fviz_cluster(mcrimekn3, crime) +
  labs(
    title = "K-means Clusters plot for 3 clusters",
    subtitle = "Plot 14"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, size = 15),
    plot.subtitle = element_text(size = 15, colour = "red3")
  )
# Arrange Plots 11-14 in a 2 x 2 grid: the two silhouette plots first,
# followed by the two cluster plots.
grid.arrange(
  ksilhouette2, ksilhouette3,
  k2clusters, k3clusters,
  nrow = 2, ncol = 2
)