K-means clustering is an iterative algorithm that tries to partition the dataset into distinct, non-overlapping clusters where each data point belongs to exactly one group. It assigns the data points to clusters such that the sum of squared Euclidean distances between the data points and their cluster's centroid is at a minimum. The USArrests dataset contains statistics, in arrests per 100,000 residents, for assault, murder and rape in each of the 50 US states in 1973. The percentage of the population living in urban areas is also given. The aim of the analysis is to see whether there is any dependency between a state and its arrest history.

# Attach the built-in USArrests data set and preview its first rows
data(list = "USArrests")
head(x = USArrests)

##            Murder Assault UrbanPop Rape
## Alabama      13.2     236       58 21.2
## Alaska       10.0     263       48 44.5
## Arizona       8.1     294       80 31.0
## Arkansas      8.8     190       50 19.5
## California    9.0     276       91 40.6
## Colorado      7.9     204       78 38.7

# Inspect the structure: column names, types and leading values
str(object = USArrests)

## 'data.frame':    50 obs. of  4 variables:
##  $ Murder  : num  13.2 10 8.1 8.8 9 7.9 3.3 5.9 15.4 17.4 ...
##  $ Assault : int  236 263 294 190 276 204 110 238 335 211 ...
##  $ UrbanPop: int  58 48 80 50 91 78 77 72 80 60 ...
##  $ Rape    : num  21.2 44.5 31 19.5 40.6 38.7 11.1 15.8 31.9 25.8 ...

# Per-column summary statistics (min, quartiles, mean, max)
summary(object = USArrests)

##      Murder          Assault         UrbanPop          Rape      
##  Min.   : 0.800   Min.   : 45.0   Min.   :32.00   Min.   : 7.30  
##  1st Qu.: 4.075   1st Qu.:109.0   1st Qu.:54.50   1st Qu.:15.07  
##  Median : 7.250   Median :159.0   Median :66.00   Median :20.10  
##  Mean   : 7.788   Mean   :170.8   Mean   :65.54   Mean   :21.23  
##  3rd Qu.:11.250   3rd Qu.:249.0   3rd Qu.:77.75   3rd Qu.:26.18  
##  Max.   :17.400   Max.   :337.0   Max.   :91.00   Max.   :46.00

# Check whether any value in the data set is missing
anyNA(USArrests, recursive = TRUE)

## [1] FALSE

library(corrplot)

## Warning: package 'corrplot' was built under R version 3.6.1

## corrplot 0.84 loaded

# Draw the lower triangle of the correlation matrix as numeric coefficients
corrplot(corr = cor(USArrests), type = "lower", method = "number")

scale

# Standardise every column to mean 0 and standard deviation 1, then
# confirm that the dimensions are unchanged
USArrests <- scale(x = USArrests, center = TRUE, scale = TRUE)
dim(USArrests)

## [1] 50  4

# Preview the first five rows
head(x = USArrests, n = 5L)

##                Murder   Assault   UrbanPop         Rape
## Alabama    1.24256408 0.7828393 -0.5209066 -0.003416473
## Alaska     0.50786248 1.1068225 -1.2117642  2.484202941
## Arizona    0.07163341 1.4788032  0.9989801  1.042878388
## Arkansas   0.23234938 0.2308680 -1.0735927 -0.184916602
## California 0.27826823 1.2628144  1.7589234  2.067820292

Required R packages: we'll mainly use the following R packages:

cluster for computing clustering algorithms, and factoextra for ggplot2-based elegant visualization of clustering results.

library(cluster)
library(factoextra)

## Warning: package 'factoextra' was built under R version 3.6.1

## Loading required package: ggplot2

## Warning: package 'ggplot2' was built under R version 3.6.1

## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ

This section shows how to prepare your data for cluster analysis using the essential R packages for cluster analysis: we fix the random seed and draw a random sample of 10 states.

# Draw a reproducible random sample of 10 row indices and keep only those
# rows. The population size comes from the data instead of a hard-coded 50,
# so the sample stays valid if the number of rows ever changes.
set.seed(123)
crime <- sample(nrow(USArrests), size = 10)

crime_1 <- USArrests[crime, ]
head(crime_1)

##                 Murder    Assault    UrbanPop        Rape
## New Mexico  0.82929443  1.3708088  0.30812248  1.16031965
## Iowa       -1.28297267 -1.3770485 -0.58999237 -1.06038781
## Indiana    -0.13500142 -0.6930840 -0.03730631 -0.02476943
## Arizona     0.07163341  1.4788032  0.99898006  1.04287839
## Tennessee   1.24256408  0.2068693 -0.45182086  0.60514278
## Texas       1.12776696  0.3628612  0.99898006  0.45567209

Euclidean Distance

# Pairwise Euclidean distances between the sampled rows
dist.eucl <- stats::dist(x = crime_1, method = "euclidean")
head(x = dist.eucl)

## [1] 4.213139 2.590999 1.037685 1.552830 1.441824 2.155313

The Euclidean distances are put in a matrix and only the first 4 states are shown. Distances are rounded to 1 decimal place.

# Distances among the first four sampled rows, rounded to one decimal place
round(as.matrix(dist.eucl)[seq_len(4), seq_len(4)], digits = 1)

##            New Mexico Iowa Indiana Arizona
## New Mexico        0.0  4.2     2.6     1.0
## Iowa              4.2  0.0     1.8     4.1
## Indiana           2.6  1.8     0.0     2.6
## Arizona           1.0  4.1     2.6     0.0

Heat map

# Heat map of the pairwise distance matrix
fviz_dist(dist.obj = dist.eucl)

Let's find the optimal number of clusters

We first compute the elbow curve (total within-cluster sum of squares against the number of clusters) by hand. Similarly, the function fviz_nbclust() can be used to visualize and determine the optimal number of clusters.

# Elbow method: total within-cluster sum of squares for k = 1, ..., k_max.
# The upper bound is an explicit scalar. `crime` is a vector of sampled row
# indices, so `1:crime` only used its first element (with a warning) and the
# range of k depended on the random draw.
k_max <- 10
k_values <- seq_len(k_max)

# vapply() guarantees one numeric value per k
wss <- vapply(
  k_values,
  function(k) {
    kmeans(USArrests, centers = k, nstart = 20, iter.max = 15)$tot.withinss
  },
  numeric(1)
)

# Plot against the same k values used above so x and y always line up
plot(k_values, wss,
     type = "b", pch = 19, frame.plot = FALSE,
     xlab = "Number of clusters K",
     ylab = "Total within-clusters sum of squares")

# Elbow plot from factoextra, with a reference line at k = 4
fviz_nbclust(x = USArrests, FUNcluster = kmeans, method = "wss") +
  geom_vline(xintercept = 4, linetype = 5, colour = "darkred")

With nstart = 20, R will start from 20 random starting configurations and keep the one with the lowest within-cluster variation.

The output of kmeans() is a list of components. The most important ones are listed below:

- cluster: A vector of integers (from 1:k) indicating the cluster to which each point is allocated.
- centers: A matrix of cluster centers.
- totss: The total sum of squares.
- withinss: Vector of within-cluster sum of squares, one component per cluster.
- tot.withinss: Total within-cluster sum of squares, i.e. sum(withinss).
- betweenss: The between-cluster sum of squares, i.e. \(totss - tot.withinss\).
- size: The number of points in each cluster.

These components can be accessed as follows:

# Run k-means with 4 centers, keeping the best of 20 random starts,
# and print the fitted object
km.res <- kmeans(x = USArrests, centers = 4, nstart = 20)
print(km.res)

## K-means clustering with 4 clusters of sizes 13, 13, 8, 16
## 
## Cluster means:
##       Murder    Assault   UrbanPop        Rape
## 1 -0.9615407 -1.1066010 -0.9301069 -0.96676331
## 2  0.6950701  1.0394414  0.7226370  1.27693964
## 3  1.4118898  0.8743346 -0.8145211  0.01927104
## 4 -0.4894375 -0.3826001  0.5758298 -0.26165379
## 
## Clustering vector:
##        Alabama         Alaska        Arizona       Arkansas     California 
##              3              2              2              3              2 
##       Colorado    Connecticut       Delaware        Florida        Georgia 
##              2              4              4              2              3 
##         Hawaii          Idaho       Illinois        Indiana           Iowa 
##              4              1              2              4              1 
##         Kansas       Kentucky      Louisiana          Maine       Maryland 
##              4              1              3              1              2 
##  Massachusetts       Michigan      Minnesota    Mississippi       Missouri 
##              4              2              1              3              2 
##        Montana       Nebraska         Nevada  New Hampshire     New Jersey 
##              1              1              2              1              4 
##     New Mexico       New York North Carolina   North Dakota           Ohio 
##              2              2              3              1              4 
##       Oklahoma         Oregon   Pennsylvania   Rhode Island South Carolina 
##              4              4              4              4              3 
##   South Dakota      Tennessee          Texas           Utah        Vermont 
##              1              3              2              4              1 
##       Virginia     Washington  West Virginia      Wisconsin        Wyoming 
##              4              4              1              1              4 
## 
## Within cluster sum of squares by cluster:
## [1] 11.952463 19.922437  8.316061 16.212213
##  (between_SS / total_SS =  71.2 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"

# Total sum of squares
km.res[["totss"]]

## [1] 196

# Between-cluster sum of squares
km.res[["betweenss"]]

## [1] 139.5968

# Share of the total sum of squares explained by the clustering
# (between_SS / total_SS), computed from the fit rather than from
# numbers copied out of the printed output
km.res$betweenss / km.res$totss

## [1] 0.7122286

Adding the cluster assignment of each point to the (scaled) data.

# Append each row's cluster label as an extra column and preview ten rows
df_member <- cbind(USArrests, cluster = km.res[["cluster"]])
head(x = df_member, n = 10)

##                  Murder    Assault   UrbanPop         Rape cluster
## Alabama      1.24256408  0.7828393 -0.5209066 -0.003416473       3
## Alaska       0.50786248  1.1068225 -1.2117642  2.484202941       2
## Arizona      0.07163341  1.4788032  0.9989801  1.042878388       2
## Arkansas     0.23234938  0.2308680 -1.0735927 -0.184916602       3
## California   0.27826823  1.2628144  1.7589234  2.067820292       2
## Colorado     0.02571456  0.3988593  0.8608085  1.864967207       2
## Connecticut -1.03041900 -0.7290821  0.7917228 -1.081740768       4
## Delaware    -0.43347395  0.8068381  0.4462940 -0.579946294       4
## Florida      1.74767144  1.9707777  0.9989801  1.138966691       2
## Georgia      2.20685994  0.4828549 -0.3827351  0.487701523       3

# Save the data to a file with write.csv()
#write.csv(df_member,'file:///C:/Users/badal/Desktop/datset_/us_crime.csv')

Visualizing K-means Clusters

# Visualise the k-means result with one colour per cluster.
# TRUE is spelled out because T is an ordinary variable that can be
# reassigned, which would silently change these options.
fviz_cluster(km.res, data = USArrests,
             palette = c("red", "blue", "black", "darkgreen"),
             ellipse.type = "euclid",
             star.plot = TRUE,
             repel = TRUE,
             ggtheme = theme())

Hence, we have determined the optimal number of clusters and visualized the K-means clustering.

USArrests

scale

Euclidean Distance

With nstart = 20, R will start from 20 random starting configurations and keep the one with the lowest within-cluster variation.