Part 1: K-means Clustering
# Setting a random seed so data can be reproduced
set.seed(1234)
# Loading the data set
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
tornadoes <- read_csv("us_tornado_dataset_1950_2021 3.csv", show_col_types = FALSE)
# Focusing on tornado data from 2000-2010 only
tornadoes2021 <- tornadoes[-1:-67058, ]
# Loading summary statistics of data set
summary(tornadoes2021)
## yr mo dy date
## Min. :2021 Min. : 8.00 Min. : 1.00 Min. :2021-08-18
## 1st Qu.:2021 1st Qu.:10.00 1st Qu.:11.00 1st Qu.:2021-10-10
## Median :2021 Median :11.00 Median :15.00 Median :2021-11-10
## Mean :2021 Mean :10.63 Mean :15.92 Mean :2021-11-04
## 3rd Qu.:2021 3rd Qu.:12.00 3rd Qu.:21.00 3rd Qu.:2021-12-15
## Max. :2021 Max. :12.00 Max. :31.00 Max. :2021-12-31
## st mag inj fat
## Length:500 Min. :-9.000 Min. : 0.000 Min. : 0.000
## Class :character 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 0.000
## Mode :character Median : 1.000 Median : 0.000 Median : 0.000
## Mean :-0.098 Mean : 1.404 Mean : 0.184
## 3rd Qu.: 1.000 3rd Qu.: 0.000 3rd Qu.: 0.000
## Max. : 4.000 Max. :515.000 Max. :57.000
## slat slon elat elon
## Min. :26.51 Min. :-122.60 Min. :26.53 Min. :-122.52
## 1st Qu.:35.85 1st Qu.: -95.45 1st Qu.:35.91 1st Qu.: -95.41
## Median :40.06 Median : -91.99 Median :40.08 Median : -91.98
## Mean :39.02 Mean : -90.44 Mean :39.06 Mean : -90.37
## 3rd Qu.:42.43 3rd Qu.: -86.30 3rd Qu.:42.51 3rd Qu.: -86.15
## Max. :48.06 Max. : -70.21 Max. :48.10 Max. : -70.21
## len wid
## Min. : 0.0100 Min. : 1.0
## 1st Qu.: 0.8875 1st Qu.: 50.0
## Median : 2.6900 Median : 75.0
## Mean : 5.2762 Mean : 136.6
## 3rd Qu.: 6.3225 3rd Qu.: 150.0
## Max. :168.5300 Max. :2600.0
# Removing categorical & unnecessary variables and scaling data
data <- tornadoes2021 %>% select(-date, -st, -dy, -yr, -inj, -elat, elon)
# Generating clusters with k-means
kmeans(data, centers = 3, iter.max = 100, nstart = 100)
## K-means clustering with 3 clusters of sizes 426, 4, 70
##
## Cluster means:
## mo mag fat slat slon elon len
## 1 10.58451 -0.3661972 0.004694836 39.17028 -90.55788 -90.51692 3.756878
## 2 12.00000 3.5000000 16.250000000 36.31227 -88.99440 -87.40565 100.085000
## 3 10.80000 1.3285714 0.357142857 38.23395 -89.77881 -89.65041 9.105000
## wid
## 1 76.01174
## 2 1950.00000
## 3 401.68571
##
## Clustering vector:
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 3 1 1 1 1 1 1 1
## [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [75] 1 1 1 1 3 3 3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1
## [112] 1 3 1 1 1 1 1 3 1 1 1 3 1 1 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [149] 1 1 1 1 1 1 1 1 1 1 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 1 1 1 1 1
## [186] 1 1 1 1 3 1 1 3 1 1 1 1 1 1 1 1 1 1 3 1 1 1 3 1 1 1 3 1 1 1 1 1 1 1 3 1 1
## [223] 3 1 3 1 3 3 3 1 3 1 1 1 1 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 3 1 1 1 1 1 1 3
## [260] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [297] 1 3 3 2 1 1 3 3 1 1 3 1 1 1 1 1 1 3 1 1 1 1 2 2 1 1 1 1 3 1 3 1 1 1 3 3 3
## [334] 1 2 3 3 1 1 1 1 3 1 1 1 1 1 1 3 1 3 1 1 1 1 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1
## [371] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [408] 1 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 3
## [445] 1 1 1 1 1 1 1 1 1 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3
## [482] 1 1 1 1 1 3 3 3 1 1 1 1 1 1 1 3 1 1 1
##
## Within cluster sum of squares by cluster:
## [1] 1093668.6 763013.6 2205373.7
## (between_SS / total_SS = 82.9 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# Determining optimal number of clusters with k-means
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
fviz_nbclust(data, kmeans, method = "silhouette")
fviz_nbclust(data, kmeans, method = "wss")
fviz_nbclust(data, kmeans, method = "gap_stat")
# Using silhouette method value for k to plot clusters
fviz_cluster(kmeans(data, centers = 2, iter.max = 100, nstart = 100), data = data)
# Cluster visualization when comparing variables
clusters <- kmeans(data, centers = 2, iter.max = 100, nstart = 100)
# Extracting a cluster vector
tor_clust <- clusters$cluster
# Plotting the cluster vector
ggplot(data, aes(x = slat, y = wid , color = factor(tor_clust))) +
geom_point()
# Cluster visualization when comparing variables
clusters <- kmeans(data, centers = 2, iter.max = 100, nstart = 100)
# Extracting a cluster vector
tor_clust <- clusters$cluster
# Plotting the cluster vector
ggplot(data, aes(x = slon, y = wid , color = factor(tor_clust))) +
geom_point()
# Cluster visualization when comparing variables
clusters <- kmeans(data, centers = 2, iter.max = 100, nstart = 100)
# Extracting a cluster vector
tor_clust <- clusters$cluster
# Plotting the cluster vector
ggplot(data, aes(x = slat, y = len , color = factor(tor_clust))) +
geom_point()
# Cluster visualization when comparing variables
clusters <- kmeans(data, centers = 2, iter.max = 100, nstart = 100)
# Extracting a cluster vector
tor_clust <- clusters$cluster
# Plotting the cluster vector
ggplot(data, aes(x = slon, y = len , color = factor(tor_clust))) +
geom_point()
Part 2: Hierarchical Clustering
# Creating linkage steps to extract clusters
dist_tornadoes <- dist(data, method = 'euclidean')
hc_tornadoes_comp <- hclust(dist_tornadoes, method = 'complete')
hc_tornadoes_avg <- hclust(dist_tornadoes, method = 'average')
hc_tornadoes_sing <- hclust(dist_tornadoes, method = 'single')
# Extracting k clusters
cluster_assignments <- cutree(hc_tornadoes_comp, k = 2)
cluster_assignments
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [75] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [112] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [149] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [186] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [223] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [260] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [297] 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1
## [334] 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [371] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [408] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [445] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [482] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
tornadoes_clustered <- mutate(data, cluster = cluster_assignments)
tornadoes_clustered
## # A tibble: 500 × 9
## mo mag fat slat slon elon len wid cluster
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1 8 1 0 40.5 -76.0 -76.0 1.07 125 1
## 2 8 1 0 40.3 -75.3 -75.2 8.76 140 1
## 3 8 0 0 40.6 -102. -102. 0.01 25 1
## 4 8 0 0 40.5 -102. -102. 0.02 25 1
## 5 8 1 0 40.5 -102. -102. 0.01 50 1
## 6 8 0 0 42.0 -71.9 -71.8 4.64 50 1
## 7 8 0 0 42.4 -71.7 -71.7 0.54 35 1
## 8 8 0 0 40.9 -74.5 -74.5 0.87 50 1
## 9 8 -9 0 42.6 -94.6 -94.6 1.18 20 1
## 10 8 -9 0 42.7 -94.3 -94.3 2.16 60 1
## # ℹ 490 more rows
# Visualizing k clusters
ggplot(tornadoes_clustered, aes(x = slon, y = wid, color = factor(cluster))) + geom_point()
# Visualizing the dendograms
plot(hc_tornadoes_comp, cex = 0.5)
plot(hc_tornadoes_avg, cex = 0.5)
plot(hc_tornadoes_sing, cex = 0.5)
# Plotting the dendogram
library(dendextend)
##
## ---------------------
## Welcome to dendextend version 1.17.1
## Type citation('dendextend') for how to cite the package.
##
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
##
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## You may ask questions at stackoverflow, use the r and dendextend tags:
## https://stackoverflow.com/questions/tagged/dendextend
##
## To suppress this message use: suppressPackageStartupMessages(library(dendextend))
## ---------------------
##
## Attaching package: 'dendextend'
## The following object is masked from 'package:stats':
##
## cutree
dend_tornadoes <- as.dendrogram(hc_tornadoes_comp)
dend_colored <- color_branches(dend_tornadoes, h = 2)
plot(dend_colored)
Part 3: DBSCAN Clustering
#Installing packages
library(fpc)
library(dbscan)
##
## Attaching package: 'dbscan'
## The following object is masked from 'package:fpc':
##
## dbscan
## The following object is masked from 'package:stats':
##
## as.dendrogram
# Obtaining optimal eps value
kNNdistplot(data, k = 2)
abline(h = 40, lty = 2)
# Density based clustering
set.seed(123)
f <- fpc::dbscan(data, eps = 40, MinPts = 5)
d <- dbscan::dbscan(data, eps = 40, minPts = 5)
# Visualizing clusters with dbscan
library(factoextra)
fviz_cluster(d, data, geom = "point")
Fin