Read data
# Read the state crime data from the working directory.
# The first row holds the column names, fields are comma-separated, and
# text columns are converted to factors.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
crime <- read.csv(
  "state_crime.csv",
  header = TRUE,
  sep = ",",
  stringsAsFactors = TRUE
)
Convert “Year” to a factor
# Store Year as a categorical variable (factor).
crime[["Year"]] <- as.factor(crime[["Year"]])
Load Library DPLYR
library(dplyr)
Using dplyr functions, select the required columns, keep only the rows for the year 2019, and drop the “United States” row
# Build the analysis table: State and Year plus the three property-crime and
# four violent-crime rate columns, restricted to 2019 and excluding the
# "United States" row (presumably the national aggregate — confirm).
crime1 <- filter(
  select(
    crime,
    State,
    Year,
    Data.Rates.Property.Burglary,
    Data.Rates.Property.Larceny,
    Data.Rates.Property.Motor,
    Data.Rates.Violent.Assault,
    Data.Rates.Violent.Murder,
    Data.Rates.Violent.Rape,
    Data.Rates.Violent.Robbery
  ),
  Year == 2019,
  State != "United States"
)
Use the fviz_nbclust() function to draw the elbow plot and the silhouette plot, which help in choosing the optimal number of clusters
# factoextra provides fviz_nbclust() and fviz_cluster(). It is attached here,
# at first use, the same way the script attaches its other packages.
library(factoextra)

# crime2 is the input to every clustering step, but the script never created it.
# It is rebuilt here as the standardized (z-scored) crime-rate columns of crime1.
# NOTE(review): this is based on the printed cluster means, which are z-scores
# of exactly these seven columns — confirm against the original analysis.
# NOTE(review): row names are left as default; set them to the state names if
# the cluster plots should label points by state.
crime2 <- crime1 %>%
  select(-State, -Year) %>%
  scale()

# Elbow plot: total within-cluster sum of squares for each candidate k.
fviz_nbclust(crime2, kmeans, method = "wss")

# Average silhouette width for each candidate k.
fviz_nbclust(crime2, kmeans, method = "silhouette")

Set the seed to obtain the same results on every run
# Fix the random number generator state so the k-means runs that follow
# are repeatable.
set.seed(seed = 1234)
Run the k-means algorithm, passing the data frame and the number of clusters to form
Analyse the cluster values. Check the ratio of the between-cluster sum of squares to the total sum of squares
# Fit k-means with two clusters on crime2, then print the fit
# (cluster sizes, centers, assignments and sums of squares).
crime_cluster1 <- kmeans(x = crime2, centers = 2)
print(crime_cluster1)
## K-means clustering with 2 clusters of sizes 20, 31
##
## Cluster means:
## Data.Rates.Property.Burglary Data.Rates.Property.Larceny
## 1 0.9427298 0.8224926
## 2 -0.6082128 -0.5306404
## Data.Rates.Property.Motor Data.Rates.Violent.Assault
## 1 0.9439707 0.7017456
## 2 -0.6090134 -0.4527391
## Data.Rates.Violent.Murder Data.Rates.Violent.Rape Data.Rates.Violent.Robbery
## 1 0.6286338 0.3573943 0.5123014
## 2 -0.4055702 -0.2305770 -0.3305170
##
## Clustering vector:
## [1] 1 1 1 1 1 1 2 2 1 2 2 1 2 2 2 2 2 2 1 2 2 2 2 2 1 1 2 2 1 2 2 1 2 1 2 2 1 1
## [39] 2 2 1 2 1 1 2 2 2 1 2 2 2
##
## Within cluster sum of squares by cluster:
## [1] 158.29420 68.84104
## (between_SS / total_SS = 35.1 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
Use kmeans algorithm and create three clusters and visualize the clusters
# Fit k-means with three clusters on crime2, then print the fit.
crime_cluster2 <- kmeans(x = crime2, centers = 3)
print(crime_cluster2)
## K-means clustering with 3 clusters of sizes 15, 10, 26
##
## Cluster means:
## Data.Rates.Property.Burglary Data.Rates.Property.Larceny
## 1 -0.96598611 -0.98238282
## 2 1.25279290 1.25623495
## 3 0.07545625 0.08359203
## Data.Rates.Property.Motor Data.Rates.Violent.Assault
## 1 -1.1239134 -0.7559138
## 2 0.9745695 1.5679476
## 3 0.2735771 -0.1669526
## Data.Rates.Violent.Murder Data.Rates.Violent.Rape Data.Rates.Violent.Robbery
## 1 -0.6214763 -0.51417653 -0.535548273
## 2 1.3343556 0.73601714 0.784048716
## 3 -0.1546697 0.01355679 0.007412959
##
## Clustering vector:
## [1] 2 2 3 2 3 3 1 3 2 3 3 3 1 3 3 1 3 3 2 1 3 1 3 3 3 2 3 3 3 1 1 2 1 3 3 3 2 3
## [39] 1 1 2 3 2 3 3 1 1 3 1 1 1
##
## Within cluster sum of squares by cluster:
## [1] 16.48446 99.62064 65.28491
## (between_SS / total_SS = 48.2 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# Plot the three-cluster solution over the data it was fitted on.
fviz_cluster(object = crime_cluster2, data = crime2)

Use kmeans algorithm and create 4 clusters(centers = 4) and visualize the clusters
# Fit k-means with four clusters on crime2. nstart = 100 runs the algorithm
# from 100 random starting configurations and keeps the best one.
crime_cluster3 <- kmeans(x = crime2, centers = 4, nstart = 100)
print(crime_cluster3)
## K-means clustering with 4 clusters of sizes 15, 1, 10, 25
##
## Cluster means:
## Data.Rates.Property.Burglary Data.Rates.Property.Larceny
## 1 -0.96598611 -0.98238282
## 2 -0.59733629 4.57391402
## 3 1.42413519 0.77230560
## 4 0.03383104 0.09755089
## Data.Rates.Property.Motor Data.Rates.Violent.Assault
## 1 -1.1239134 -0.7559138
## 2 1.1845513 2.6124762
## 3 1.0105077 1.3496809
## 4 0.2227629 -0.1908231
## Data.Rates.Violent.Murder Data.Rates.Violent.Rape Data.Rates.Violent.Robbery
## 1 -0.6214763 -0.51417653 -0.53554827
## 2 4.9630097 0.06330600 5.70824842
## 3 0.8233157 0.84448608 0.28162367
## 4 -0.1549608 -0.03182075 -0.01965044
##
## Clustering vector:
## [1] 3 3 4 3 4 4 1 4 2 4 4 4 1 4 4 1 4 4 3 1 4 1 4 4 4 3 4 4 3 1 1 3 1 4 4 4 3 4
## [39] 1 1 3 4 3 4 4 1 1 4 1 1 1
##
## Within cluster sum of squares by cluster:
## [1] 16.48446 0.00000 44.13860 60.16874
## (between_SS / total_SS = 65.5 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# Plot the four-cluster solution over the data it was fitted on.
fviz_cluster(object = crime_cluster3, data = crime2)

Copy crime1 into crime3 to create a new data frame
# Work on a copy so crime1 itself stays unchanged.
crime3 <- crime1
# Record each row's cluster id from the four-cluster fit. The naming step that
# follows reads crime3$Clusters, and nothing else in the script creates that
# column, so without this line there are no cluster ids to label.
# NOTE(review): this lines up only if crime2 has the same rows, in the same
# order, as crime1 — confirm how crime2 was derived.
crime3$Clusters <- crime_cluster3$cluster
Name the Clusters
# Translate the numeric cluster ids 1-4 into descriptive labels.
# NOTE(review): k-means cluster numbers are arbitrary, so this id-to-label
# mapping presumably depends on the seeded run — re-check it if the seed or
# the clustering calls change.
crime3[["Cluster_names"]] <- factor(
  x = crime3[["Clusters"]],
  levels = c(1, 2, 3, 4),
  labels = c("Peaceful", "Critical", "Serious", "Under Control")
)
Attach crime3
# Place crime3 on the search path so its columns can be referenced by bare name.
# NOTE(review): the visible steps after this all pass their data frame
# explicitly, so this attach() looks unnecessary; attach() is generally
# discouraged because it can mask other objects. Consider removing it once
# nothing else is confirmed to rely on it.
attach(what = crime3)
Group the Clusters by Cluster names and Summarize the Columns
# One row per named cluster: the mean of each crime-rate column and the number
# of rows (Count) in that cluster.
crime4 <- summarize(
  group_by(crime3, Cluster_names),
  Larceny = mean(Data.Rates.Property.Larceny),
  Burglary = mean(Data.Rates.Property.Burglary),
  Motor = mean(Data.Rates.Property.Motor),
  Assault = mean(Data.Rates.Violent.Assault),
  Murder = mean(Data.Rates.Violent.Murder),
  Rape = mean(Data.Rates.Violent.Rape),
  Robbery = mean(Data.Rates.Violent.Robbery),
  Count = n()
)
View DataFrame
# Display the per-cluster summary table.
print(crime4)
## # A tibble: 4 x 9
## Cluster_names Larceny Burglary Motor Assault Murder Rape Robbery Count
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1 Peaceful 1109. 209. 106. 160. 2.85 37.6 39.3 15
## 2 Critical 3775. 261. 331. 592. 23.5 48.9 384. 1
## 3 Serious 1951. 547. 314. 430. 8.19 64.2 84.5 10
## 4 Under Control 1627. 350. 237. 233. 4.57 47.0 67.8 25
Load Library GGPLOT2 for Visualizing the Grouped Data
library(ggplot2)
Bar plots of each crime rate versus cluster name
# One bar chart per crime type: mean rate by named cluster, with each bar
# filled by cluster and labelled with its value rounded to two decimals.

# Larceny
ggplot(
  crime4,
  aes(
    x = Cluster_names, y = Larceny, fill = Cluster_names,
    label = round(Larceny, digits = 2)
  )
) +
  geom_col() +
  geom_text()

# Burglary
ggplot(
  crime4,
  aes(
    x = Cluster_names, y = Burglary, fill = Cluster_names,
    label = round(Burglary, digits = 2)
  )
) +
  geom_col() +
  geom_text()

# Motor vehicle theft
ggplot(
  crime4,
  aes(
    x = Cluster_names, y = Motor, fill = Cluster_names,
    label = round(Motor, digits = 2)
  )
) +
  geom_col() +
  geom_text()

# Assault
ggplot(
  crime4,
  aes(
    x = Cluster_names, y = Assault, fill = Cluster_names,
    label = round(Assault, digits = 2)
  )
) +
  geom_col() +
  geom_text()

# Murder
ggplot(
  crime4,
  aes(
    x = Cluster_names, y = Murder, fill = Cluster_names,
    label = round(Murder, digits = 2)
  )
) +
  geom_col() +
  geom_text()

# Rape
ggplot(
  crime4,
  aes(
    x = Cluster_names, y = Rape, fill = Cluster_names,
    label = round(Rape, digits = 2)
  )
) +
  geom_col() +
  geom_text()

# Robbery
ggplot(
  crime4,
  aes(
    x = Cluster_names, y = Robbery, fill = Cluster_names,
    label = round(Robbery, digits = 2)
  )
) +
  geom_col() +
  geom_text()
