MGT 665: Solving Problems w/ Machine Learning

Clustering Analysis of Historical Tornado Data

Part 1: K-means Clustering

# Setting a random seed so data can be reproduced
set.seed(1234)
# Loading the data set
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

tornadoes <- read_csv("us_tornado_dataset_1950_2021 3.csv", show_col_types = FALSE)

# Focusing on tornado data from 2000-2010 only
tornadoes2021 <- tornadoes[-1:-67058, ]

# Loading summary statistics of data set
summary(tornadoes2021)

##        yr             mo              dy             date           
##  Min.   :2021   Min.   : 8.00   Min.   : 1.00   Min.   :2021-08-18  
##  1st Qu.:2021   1st Qu.:10.00   1st Qu.:11.00   1st Qu.:2021-10-10  
##  Median :2021   Median :11.00   Median :15.00   Median :2021-11-10  
##  Mean   :2021   Mean   :10.63   Mean   :15.92   Mean   :2021-11-04  
##  3rd Qu.:2021   3rd Qu.:12.00   3rd Qu.:21.00   3rd Qu.:2021-12-15  
##  Max.   :2021   Max.   :12.00   Max.   :31.00   Max.   :2021-12-31  
##       st                 mag              inj               fat        
##  Length:500         Min.   :-9.000   Min.   :  0.000   Min.   : 0.000  
##  Class :character   1st Qu.: 0.000   1st Qu.:  0.000   1st Qu.: 0.000  
##  Mode  :character   Median : 1.000   Median :  0.000   Median : 0.000  
##                     Mean   :-0.098   Mean   :  1.404   Mean   : 0.184  
##                     3rd Qu.: 1.000   3rd Qu.:  0.000   3rd Qu.: 0.000  
##                     Max.   : 4.000   Max.   :515.000   Max.   :57.000  
##       slat            slon              elat            elon        
##  Min.   :26.51   Min.   :-122.60   Min.   :26.53   Min.   :-122.52  
##  1st Qu.:35.85   1st Qu.: -95.45   1st Qu.:35.91   1st Qu.: -95.41  
##  Median :40.06   Median : -91.99   Median :40.08   Median : -91.98  
##  Mean   :39.02   Mean   : -90.44   Mean   :39.06   Mean   : -90.37  
##  3rd Qu.:42.43   3rd Qu.: -86.30   3rd Qu.:42.51   3rd Qu.: -86.15  
##  Max.   :48.06   Max.   : -70.21   Max.   :48.10   Max.   : -70.21  
##       len                wid        
##  Min.   :  0.0100   Min.   :   1.0  
##  1st Qu.:  0.8875   1st Qu.:  50.0  
##  Median :  2.6900   Median :  75.0  
##  Mean   :  5.2762   Mean   : 136.6  
##  3rd Qu.:  6.3225   3rd Qu.: 150.0  
##  Max.   :168.5300   Max.   :2600.0

# Removing categorical & unnecessary variables and scaling data
data <- tornadoes2021 %>% select(-date, -st, -dy, -yr, -inj, -elat, elon)

# Generating clusters with k-means
kmeans(data, centers = 3, iter.max = 100, nstart = 100)

## K-means clustering with 3 clusters of sizes 426, 4, 70
## 
## Cluster means:
##         mo        mag          fat     slat      slon      elon        len
## 1 10.58451 -0.3661972  0.004694836 39.17028 -90.55788 -90.51692   3.756878
## 2 12.00000  3.5000000 16.250000000 36.31227 -88.99440 -87.40565 100.085000
## 3 10.80000  1.3285714  0.357142857 38.23395 -89.77881 -89.65041   9.105000
##          wid
## 1   76.01174
## 2 1950.00000
## 3  401.68571
## 
## Clustering vector:
##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 3 1 1 1 1 1 1 1
##  [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [75] 1 1 1 1 3 3 3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1
## [112] 1 3 1 1 1 1 1 3 1 1 1 3 1 1 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [149] 1 1 1 1 1 1 1 1 1 1 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 1 1 1 1 1
## [186] 1 1 1 1 3 1 1 3 1 1 1 1 1 1 1 1 1 1 3 1 1 1 3 1 1 1 3 1 1 1 1 1 1 1 3 1 1
## [223] 3 1 3 1 3 3 3 1 3 1 1 1 1 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 3 1 1 1 1 1 1 3
## [260] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [297] 1 3 3 2 1 1 3 3 1 1 3 1 1 1 1 1 1 3 1 1 1 1 2 2 1 1 1 1 3 1 3 1 1 1 3 3 3
## [334] 1 2 3 3 1 1 1 1 3 1 1 1 1 1 1 3 1 3 1 1 1 1 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1
## [371] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [408] 1 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 3
## [445] 1 1 1 1 1 1 1 1 1 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3
## [482] 1 1 1 1 1 3 3 3 1 1 1 1 1 1 1 3 1 1 1
## 
## Within cluster sum of squares by cluster:
## [1] 1093668.6  763013.6 2205373.7
##  (between_SS / total_SS =  82.9 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

# Determining optimal number of clusters with k-means
library(factoextra)

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

fviz_nbclust(data, kmeans, method = "silhouette")

fviz_nbclust(data, kmeans, method = "wss")

fviz_nbclust(data, kmeans, method = "gap_stat")

# Using silhouette method value for k to plot clusters
fviz_cluster(kmeans(data, centers = 2, iter.max = 100, nstart = 100), data = data)

# Cluster visualization when comparing variables
clusters <- kmeans(data, centers = 2, iter.max = 100, nstart = 100)
# Extracting a cluster vector
tor_clust <- clusters$cluster
# Plotting the cluster vector
ggplot(data, aes(x = slat, y = wid , color = factor(tor_clust))) + 
  geom_point()

# Cluster visualization when comparing variables
clusters <- kmeans(data, centers = 2, iter.max = 100, nstart = 100)
# Extracting a cluster vector
tor_clust <- clusters$cluster
# Plotting the cluster vector
ggplot(data, aes(x = slon, y = wid , color = factor(tor_clust))) + 
  geom_point()

# Cluster visualization when comparing variables
clusters <- kmeans(data, centers = 2, iter.max = 100, nstart = 100)
# Extracting a cluster vector
tor_clust <- clusters$cluster
# Plotting the cluster vector
ggplot(data, aes(x = slat, y = len , color = factor(tor_clust))) + 
  geom_point()

# Cluster visualization when comparing variables
clusters <- kmeans(data, centers = 2, iter.max = 100, nstart = 100)
# Extracting a cluster vector
tor_clust <- clusters$cluster
# Plotting the cluster vector
ggplot(data, aes(x = slon, y = len , color = factor(tor_clust))) + 
  geom_point()

Part 2: Hierarchical Clustering

# Creating linkage steps to extract clusters
dist_tornadoes <- dist(data, method = 'euclidean')
hc_tornadoes_comp <- hclust(dist_tornadoes, method = 'complete')
hc_tornadoes_avg <- hclust(dist_tornadoes, method = 'average')
hc_tornadoes_sing <- hclust(dist_tornadoes, method = 'single')

# Extracting k clusters
cluster_assignments <- cutree(hc_tornadoes_comp, k = 2)
cluster_assignments

##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [75] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [112] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [149] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [186] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [223] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [260] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [297] 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1
## [334] 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [371] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [408] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [445] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [482] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1

tornadoes_clustered <- mutate(data, cluster = cluster_assignments)
tornadoes_clustered

## # A tibble: 500 × 9
##       mo   mag   fat  slat   slon   elon   len   wid cluster
##    <dbl> <dbl> <dbl> <dbl>  <dbl>  <dbl> <dbl> <dbl>   <int>
##  1     8     1     0  40.5  -76.0  -76.0  1.07   125       1
##  2     8     1     0  40.3  -75.3  -75.2  8.76   140       1
##  3     8     0     0  40.6 -102.  -102.   0.01    25       1
##  4     8     0     0  40.5 -102.  -102.   0.02    25       1
##  5     8     1     0  40.5 -102.  -102.   0.01    50       1
##  6     8     0     0  42.0  -71.9  -71.8  4.64    50       1
##  7     8     0     0  42.4  -71.7  -71.7  0.54    35       1
##  8     8     0     0  40.9  -74.5  -74.5  0.87    50       1
##  9     8    -9     0  42.6  -94.6  -94.6  1.18    20       1
## 10     8    -9     0  42.7  -94.3  -94.3  2.16    60       1
## # ℹ 490 more rows

# Visualizing k clusters
ggplot(tornadoes_clustered, aes(x = slon, y = wid, color = factor(cluster))) +        geom_point()

# Visualizing the dendograms
plot(hc_tornadoes_comp, cex = 0.5)

plot(hc_tornadoes_avg, cex = 0.5)

plot(hc_tornadoes_sing, cex = 0.5)

# Plotting the dendogram
library(dendextend)

## 
## ---------------------
## Welcome to dendextend version 1.17.1
## Type citation('dendextend') for how to cite the package.
## 
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
## 
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## You may ask questions at stackoverflow, use the r and dendextend tags: 
##   https://stackoverflow.com/questions/tagged/dendextend
## 
##  To suppress this message use:  suppressPackageStartupMessages(library(dendextend))
## ---------------------

## 
## Attaching package: 'dendextend'

## The following object is masked from 'package:stats':
## 
##     cutree

dend_tornadoes <- as.dendrogram(hc_tornadoes_comp)
dend_colored <- color_branches(dend_tornadoes, h = 2)
plot(dend_colored)

Part 3: DBSCAN Clustering

#Installing packages
library(fpc)
library(dbscan)

## 
## Attaching package: 'dbscan'

## The following object is masked from 'package:fpc':
## 
##     dbscan

## The following object is masked from 'package:stats':
## 
##     as.dendrogram

# Obtaining optimal eps value
kNNdistplot(data, k = 2)
abline(h = 40, lty = 2)

# Density based clustering
set.seed(123)
f <- fpc::dbscan(data, eps = 40, MinPts = 5)
d <- dbscan::dbscan(data, eps = 40, minPts = 5)

# Visualizing clusters with dbscan
library(factoextra)
fviz_cluster(d, data, geom = "point")

Fin

MGT 665: Solving Problems w/ Machine Learning

Final Project

Dominic Valdiserri

2024-05-09

Clustering Analysis of Historical Tornado Data