Crime data analysis using k-means clustering:
The cluster with the highest murder rate appears to be a major threat to life.
The cluster with the highest values for Assault, Rape and other factors ranks second
among the most dangerous states to live in.
The remaining two clusters are ranked after these, based on their crime rates
for Rape, Murder and Assault and on the UrbanPop metric.
# Install plyr only when it is not already available, so that re-running the
# script does not re-download the package (or fail when offline) every time.
if (!requireNamespace("plyr", quietly = TRUE)) {
  install.packages("plyr", repos = "http://cran.us.r-project.org")
}
## Installing package into 'C:/Users/tswaminathan/Documents/R/win-library/3.5'
## (as 'lib' is unspecified)
## package 'plyr' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\tswaminathan\AppData\Local\Temp\RtmpkXoLDt\downloaded_packages
# Attach plyr for the rest of the session.
library(plyr)

# Ask the user to pick the crime CSV through a file dialog and load it.
mydata <- utils::read.csv(file = file.choose())

# Print a compact overview of the columns and their types.
utils::str(mydata)
## 'data.frame': 50 obs. of 5 variables:
## $ X : Factor w/ 50 levels "Alabama","Alaska",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ Murder : num 13.2 10 8.1 8.8 9 7.9 3.3 5.9 15.4 17.4 ...
## $ Assault : int 236 263 294 190 276 204 110 238 335 211 ...
## $ UrbanPop: int 58 48 80 50 91 78 77 72 80 60 ...
## $ Rape : num 21.2 44.5 31 19.5 40.6 38.7 11.1 15.8 31.9 25.8 ...
# Standardize the four numeric crime columns (columns 2-5) with scale() so
# that columns measured on larger numeric ranges do not dominate the
# distances used by k-means.
normalized_data <- scale(mydata[, 2:5])

# Scree plot: total within-cluster sum of squares for k = 1..5.
# kmeans() starts from randomly chosen centers, so fix the seed and use
# several restarts; otherwise the curve changes from run to run and a poor
# single start can make it non-monotone.
set.seed(123)

# k = 1: the within-group sum of squares equals the total sum of squares
# around the column means.
wss <- (nrow(normalized_data) - 1) * sum(apply(normalized_data, 2, var))

# k = 2..5: tot.withinss is the sum of the per-cluster withinss values.
wss <- c(
  wss,
  vapply(
    2:5,
    function(k) kmeans(normalized_data, centers = k, nstart = 25)$tot.withinss,
    numeric(1)
  )
)

# Look for an "elbow" in the scree plot to choose the number of clusters.
plot(
  1:5, wss,
  type = "b",
  xlab = "Number of Clusters",
  ylab = "Within groups sum of squares"
)
title(sub = "K-Means Clustering Scree-Plot")

# Fit the 4-cluster solution on the standardized data.
# kmeans() uses random starting centers: fix the seed and keep the best of
# several restarts so that cluster membership and the (arbitrary) cluster
# numbers are the same on every run. Cluster numbers carry no ordering; any
# previously recorded output may use a different labelling.
set.seed(123)
fit <- kmeans(normalized_data, centers = 4, nstart = 25)

# Append each row's cluster membership to the original (unscaled) data.
# The column is named fit.cluster, as data.frame() would name it implicitly.
final2 <- data.frame(mydata, fit.cluster = fit$cluster)
final2
## X Murder Assault UrbanPop Rape fit.cluster
## 1 Alabama 13.2 236 58 21.2 2
## 2 Alaska 10.0 263 48 44.5 4
## 3 Arizona 8.1 294 80 31.0 4
## 4 Arkansas 8.8 190 50 19.5 2
## 5 California 9.0 276 91 40.6 4
## 6 Colorado 7.9 204 78 38.7 4
## 7 Connecticut 3.3 110 77 11.1 3
## 8 Delaware 5.9 238 72 15.8 3
## 9 Florida 15.4 335 80 31.9 4
## 10 Georgia 17.4 211 60 25.8 2
## 11 Hawaii 5.3 46 83 20.2 3
## 12 Idaho 2.6 120 54 14.2 1
## 13 Illinois 10.4 249 83 24.0 4
## 14 Indiana 7.2 113 65 21.0 3
## 15 Iowa 2.2 56 57 11.3 1
## 16 Kansas 6.0 115 66 18.0 3
## 17 Kentucky 9.7 109 52 16.3 1
## 18 Louisiana 15.4 249 66 22.2 2
## 19 Maine 2.1 83 51 7.8 1
## 20 Maryland 11.3 300 67 27.8 4
## 21 Massachusetts 4.4 149 85 16.3 3
## 22 Michigan 12.1 255 74 35.1 4
## 23 Minnesota 2.7 72 66 14.9 1
## 24 Mississippi 16.1 259 44 17.1 2
## 25 Missouri 9.0 178 70 28.2 4
## 26 Montana 6.0 109 53 16.4 1
## 27 Nebraska 4.3 102 62 16.5 1
## 28 Nevada 12.2 252 81 46.0 4
## 29 New Hampshire 2.1 57 56 9.5 1
## 30 New Jersey 7.4 159 89 18.8 3
## 31 New Mexico 11.4 285 70 32.1 4
## 32 New York 11.1 254 86 26.1 4
## 33 North Carolina 13.0 337 45 16.1 2
## 34 North Dakota 0.8 45 44 7.3 1
## 35 Ohio 7.3 120 75 21.4 3
## 36 Oklahoma 6.6 151 68 20.0 3
## 37 Oregon 4.9 159 67 29.3 3
## 38 Pennsylvania 6.3 106 72 14.9 3
## 39 Rhode Island 3.4 174 87 8.3 3
## 40 South Carolina 14.4 279 48 22.5 2
## 41 South Dakota 3.8 86 45 12.8 1
## 42 Tennessee 13.2 188 59 26.9 2
## 43 Texas 12.7 201 80 25.5 4
## 44 Utah 3.2 120 80 22.9 3
## 45 Vermont 2.2 48 32 11.2 1
## 46 Virginia 8.5 156 63 20.7 3
## 47 Washington 4.0 145 73 26.2 3
## 48 West Virginia 5.7 81 39 9.3 1
## 49 Wisconsin 2.6 53 66 10.8 1
## 50 Wyoming 6.8 161 60 15.6 3
# Per-cluster mean of each unscaled crime variable (columns 2-5 of mydata).
aggregate(x = mydata[, 2:5], by = list(fit[["cluster"]]), FUN = mean)
## Group.1 Murder Assault UrbanPop Rape
## 1 1 3.60000 78.53846 52.07692 12.17692
## 2 2 13.93750 243.62500 53.75000 21.41250
## 3 3 5.65625 138.87500 73.87500 18.78125
## 4 4 10.81538 257.38462 76.00000 33.19231
# Number of rows assigned to each cluster.
table(fit[["cluster"]])
##
## 1 2 3 4
## 13 8 16 13
# install.packages("animation")
# library(animation)
# nm <- (normalized_data[,1:1])
# km <- kmeans(normalized_data,4) #kmeans clustering
# str(km)
# km$cluster
# km1 <- kmeans.ani(normalized_data, 4)
# str(km1)
# km1$centers