Read data

# Read the state crime data from CSV. The first row holds the column names and
# character columns are converted to factors.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
crime <- read.csv(
  "state_crime.csv",
  header = TRUE,
  sep = ",",
  stringsAsFactors = TRUE
)

Convert “Year” to a factor

# Treat Year as a categorical variable rather than a number.
crime[["Year"]] <- as.factor(crime[["Year"]])

Load Library DPLYR

library(dplyr)

Using dplyr functions, select the required columns, keep only the rows for the year 2019, and drop the “United States” row

# Keep the 2019 rows, excluding the "United States" row, then retain the two
# identifier columns followed by the seven crime-rate columns.
crime1 <- crime %>%
  filter(Year == 2019, State != "United States") %>%
  select(
    State,
    Year,
    Data.Rates.Property.Burglary,
    Data.Rates.Property.Larceny,
    Data.Rates.Property.Motor,
    Data.Rates.Violent.Assault,
    Data.Rates.Violent.Murder,
    Data.Rates.Violent.Rape,
    Data.Rates.Violent.Robbery
  )

Standardize the seven crime-rate columns (centre each to mean 0 and scale it to standard deviation 1)

# Standardize the crime-rate columns: scale() with its defaults centres each
# column and divides it by its standard deviation, returning a numeric matrix.
# The columns are picked by name prefix rather than by position (3:9) so the
# selection cannot silently shift if the column order of crime1 changes.
rate_columns <- startsWith(names(crime1), "Data.Rates.")
crime2 <- scale(crime1[rate_columns])

Activate library factoextra for visualizing optimal number of clusters

library(factoextra)

Use the fviz_nbclust function to view the elbow plot (within-cluster sum of squares) and the average silhouette plot for choosing the optimal number of clusters

# kmeans() starts from randomly chosen centres, so seed the RNG before these
# diagnostic plots; otherwise they can differ from one run to the next.
set.seed(1234)

# Elbow plot: total within-cluster sum of squares for each candidate k.
fviz_nbclust(crime2, kmeans, method = "wss")

# Average silhouette width for each candidate k.
fviz_nbclust(crime2, kmeans, method = "silhouette")

Set seed for obtaining same results every time

# Fix the random number generator state so the random starting centres used by
# the kmeans() calls that follow are the same on every run.
set.seed(1234)

Run the kmeans algorithm, passing the scaled data and the number of clusters to be formed (2 here)

Analyse the cluster values. Check the ratio of the between-cluster sum of squares to the total sum of squares (between_SS / total_SS)

# Fit k-means with two clusters on the standardized rates, then print the fit
# (cluster sizes, centres, assignments and sums of squares).
crime_cluster1 <- kmeans(crime2, centers = 2)
print(crime_cluster1)
## K-means clustering with 2 clusters of sizes 20, 31
## 
## Cluster means:
##   Data.Rates.Property.Burglary Data.Rates.Property.Larceny
## 1                    0.9427298                   0.8224926
## 2                   -0.6082128                  -0.5306404
##   Data.Rates.Property.Motor Data.Rates.Violent.Assault
## 1                 0.9439707                  0.7017456
## 2                -0.6090134                 -0.4527391
##   Data.Rates.Violent.Murder Data.Rates.Violent.Rape Data.Rates.Violent.Robbery
## 1                 0.6286338               0.3573943                  0.5123014
## 2                -0.4055702              -0.2305770                 -0.3305170
## 
## Clustering vector:
##  [1] 1 1 1 1 1 1 2 2 1 2 2 1 2 2 2 2 2 2 1 2 2 2 2 2 1 1 2 2 1 2 2 1 2 1 2 2 1 1
## [39] 2 2 1 2 1 1 2 2 2 1 2 2 2
## 
## Within cluster sum of squares by cluster:
## [1] 158.29420  68.84104
##  (between_SS / total_SS =  35.1 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

Use the fviz_cluster function to visualize the clusters formed

# Plot the two-cluster assignment over the standardized data.
fviz_cluster(object = crime_cluster1, data = crime2)

Use kmeans algorithm and create three clusters and visualize the clusters

# Fit k-means with three clusters on the standardized rates, then print the fit.
crime_cluster2 <- kmeans(crime2, centers = 3)
print(crime_cluster2)
## K-means clustering with 3 clusters of sizes 15, 10, 26
## 
## Cluster means:
##   Data.Rates.Property.Burglary Data.Rates.Property.Larceny
## 1                  -0.96598611                 -0.98238282
## 2                   1.25279290                  1.25623495
## 3                   0.07545625                  0.08359203
##   Data.Rates.Property.Motor Data.Rates.Violent.Assault
## 1                -1.1239134                 -0.7559138
## 2                 0.9745695                  1.5679476
## 3                 0.2735771                 -0.1669526
##   Data.Rates.Violent.Murder Data.Rates.Violent.Rape Data.Rates.Violent.Robbery
## 1                -0.6214763             -0.51417653               -0.535548273
## 2                 1.3343556              0.73601714                0.784048716
## 3                -0.1546697              0.01355679                0.007412959
## 
## Clustering vector:
##  [1] 2 2 3 2 3 3 1 3 2 3 3 3 1 3 3 1 3 3 2 1 3 1 3 3 3 2 3 3 3 1 1 2 1 3 3 3 2 3
## [39] 1 1 2 3 2 3 3 1 1 3 1 1 1
## 
## Within cluster sum of squares by cluster:
## [1] 16.48446 99.62064 65.28491
##  (between_SS / total_SS =  48.2 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
# Plot the three-cluster assignment over the standardized data.
fviz_cluster(object = crime_cluster2, data = crime2)

Use the kmeans algorithm to create 4 clusters (centers = 4), trying 100 random starts (nstart = 100), and visualize the clusters

# Fit k-means with four clusters, trying 100 random starts, then print the fit.
crime_cluster3 <- kmeans(
  crime2,
  centers = 4,
  nstart = 100
)
print(crime_cluster3)
## K-means clustering with 4 clusters of sizes 15, 1, 10, 25
## 
## Cluster means:
##   Data.Rates.Property.Burglary Data.Rates.Property.Larceny
## 1                  -0.96598611                 -0.98238282
## 2                  -0.59733629                  4.57391402
## 3                   1.42413519                  0.77230560
## 4                   0.03383104                  0.09755089
##   Data.Rates.Property.Motor Data.Rates.Violent.Assault
## 1                -1.1239134                 -0.7559138
## 2                 1.1845513                  2.6124762
## 3                 1.0105077                  1.3496809
## 4                 0.2227629                 -0.1908231
##   Data.Rates.Violent.Murder Data.Rates.Violent.Rape Data.Rates.Violent.Robbery
## 1                -0.6214763             -0.51417653                -0.53554827
## 2                 4.9630097              0.06330600                 5.70824842
## 3                 0.8233157              0.84448608                 0.28162367
## 4                -0.1549608             -0.03182075                -0.01965044
## 
## Clustering vector:
##  [1] 3 3 4 3 4 4 1 4 2 4 4 4 1 4 4 1 4 4 3 1 4 1 4 4 4 3 4 4 3 1 1 3 1 4 4 4 3 4
## [39] 1 1 3 4 3 4 4 1 1 4 1 1 1
## 
## Within cluster sum of squares by cluster:
## [1] 16.48446  0.00000 44.13860 60.16874
##  (between_SS / total_SS =  65.5 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
# Plot the four-cluster assignment over the standardized data.
fviz_cluster(object = crime_cluster3, data = crime2)

Copy crime1 into a new data frame, crime3

# Work on a copy so the cluster columns added later do not alter crime1.
crime3 <- crime1

Create a new column named “Clusters” from the four-cluster solution (crime_cluster3)

# Record, for every row, the cluster number assigned by the four-cluster fit.
crime3[["Clusters"]] <- crime_cluster3[["cluster"]]

Name the Clusters

# Name the clusters by overall severity instead of by raw cluster number.
# kmeans() numbers its clusters arbitrarily, so a fixed number-to-name table
# would attach the wrong names if the seed, the data or earlier use of the
# random number generator changed.
# Severity is the mean of each cluster's standardized centre coordinates: the
# lowest-ranked cluster is named "Peaceful" and the highest "Critical".
severity_names <- c("Peaceful", "Under Control", "Serious", "Critical")
cluster_severity <- rowMeans(crime_cluster3$centers)

# The naming scheme only makes sense for exactly four clusters.
stopifnot(length(cluster_severity) == length(severity_names))

# ties.method = "first" guarantees each cluster gets a distinct rank.
severity_rank <- rank(cluster_severity, ties.method = "first")

# Factor levels stay in cluster-number order, as before; only the label
# attached to each number is derived from the ranking.
crime3$Cluster_names <- factor(
  crime3$Clusters,
  levels = seq_along(severity_rank),
  labels = severity_names[severity_rank]
)

Attach crime3

# Put the columns of crime3 on the search path so they can be referenced by
# bare name.
# NOTE(review): the dplyr and ggplot2 calls visible after this line look columns
# up in the data frame they are given, so they do not appear to need this
# attach() — confirm no later code relies on it, then consider removing it.
attach(crime3)

Group the rows by cluster name and summarize each crime-rate column by its mean, along with the number of rows in each cluster

# Per named cluster: the mean of each crime-rate column plus the number of rows
# that fell into the cluster.
crime4 <- crime3 %>%
  group_by(Cluster_names) %>%
  summarise(
    Larceny = mean(Data.Rates.Property.Larceny),
    Burglary = mean(Data.Rates.Property.Burglary),
    Motor = mean(Data.Rates.Property.Motor),
    Assault = mean(Data.Rates.Violent.Assault),
    Murder = mean(Data.Rates.Violent.Murder),
    Rape = mean(Data.Rates.Violent.Rape),
    Robbery = mean(Data.Rates.Violent.Robbery),
    Count = n()
  )

View DataFrame

# Display the per-cluster summary table.
print(crime4)
## # A tibble: 4 x 9
##   Cluster_names Larceny Burglary Motor Assault Murder  Rape Robbery Count
##   <fct>           <dbl>    <dbl> <dbl>   <dbl>  <dbl> <dbl>   <dbl> <int>
## 1 Peaceful        1109.     209.  106.    160.   2.85  37.6    39.3    15
## 2 Critical        3775.     261.  331.    592.  23.5   48.9   384.      1
## 3 Serious         1951.     547.  314.    430.   8.19  64.2    84.5    10
## 4 Under Control   1627.     350.  237.    233.   4.57  47.0    67.8    25

Load Library GGPLOT2 for Visualizing the Grouped Data

library(ggplot2)

Bar plots of the mean rate of each crime vs. cluster name

# One bar chart per crime type: the mean rate for each named cluster, with every
# bar annotated by its value rounded to two decimals.

# Larceny
ggplot(
  crime4,
  aes(x = Cluster_names, y = Larceny, fill = Cluster_names,
      label = round(Larceny, digits = 2))
) +
  geom_col() +
  geom_text()

# Burglary
ggplot(
  crime4,
  aes(x = Cluster_names, y = Burglary, fill = Cluster_names,
      label = round(Burglary, digits = 2))
) +
  geom_col() +
  geom_text()

# Motor vehicle theft
ggplot(
  crime4,
  aes(x = Cluster_names, y = Motor, fill = Cluster_names,
      label = round(Motor, digits = 2))
) +
  geom_col() +
  geom_text()

# Assault
ggplot(
  crime4,
  aes(x = Cluster_names, y = Assault, fill = Cluster_names,
      label = round(Assault, digits = 2))
) +
  geom_col() +
  geom_text()

# Murder
ggplot(
  crime4,
  aes(x = Cluster_names, y = Murder, fill = Cluster_names,
      label = round(Murder, digits = 2))
) +
  geom_col() +
  geom_text()

# Rape
ggplot(
  crime4,
  aes(x = Cluster_names, y = Rape, fill = Cluster_names,
      label = round(Rape, digits = 2))
) +
  geom_col() +
  geom_text()

# Robbery
ggplot(
  crime4,
  aes(x = Cluster_names, y = Robbery, fill = Cluster_names,
      label = round(Robbery, digits = 2))
) +
  geom_col() +
  geom_text()