Introduction

This is an attempt to perform cluster analysis on the Virginia Beach police incident crime file.

In my initial attempts, I ran into problems processing the entire dataset. Trying to process the dataset resulted in slow performance and insufficient memory errors. My i5-2540M 16GB Windows 10 Pro laptop and my coding implementation were not up to the task.

I revised how I processed the data. Breaking up the dataset by year, resulted in similar problems if there were a large number of data points. Further filtering by year and quarter solved my issues.

Let’s begin.

rm(list = ls())

library(tidyverse)

Load CSV file.

library(lubridate)
library(stringr)

# get file
df <- read_csv("Data/Police_Incident_Reports.csv")

# extract specific fields to process
df.1 <- df %>% dplyr::select("Police Case Number", "Location", "Date Occured")
# create a "coordinates" column and extract lat/lng from the location column
df.1$coord <- gsub("[()]", "", str_extract(df.1$Location, "\\((.*)"))
# break coord column into lat and lng columns
df.1 <- df.1 %>% separate(coord, c("lat", "lng"), sep = ",", remove=FALSE)
# make lat and lng columns numeric
df.1$lat <- as.numeric(df.1$lat)
df.1$lng <- as.numeric(df.1$lng)
# create year and qtr columns
df.1$year <- year(mdy_hms(df.1$`Date Occured`))
df.1$qtr <- quarter(mdy_hms(df.1$`Date Occured`))

# define years to perform analysis
start_year <- 2014
end_year <- 2017

# define VB max longitude and latitude
max_vb_lng <- -77
max_vb_lat <- 37


set.seed(334455)

What does the data look like?

# get records that have a lat and lng
df.1 <- df.1 %>% filter(!is.na(lat) & !is.na(lng))

print(ggplot(df.1 %>% 
               dplyr::select(lat, lng) %>% 
               filter(complete.cases(.))
       , aes(lng, lat)) +
       geom_point() +
       labs(title = paste0("Virginia Beach Crime Incidents "
                          , "("
                          , min(df.1$year)
                          , "-"
                          , max(df.1$year)
                          , ")")
            , x = "Longitude"
            , y = "Latitude"))

Bad data found. Any longitude less than -77 and a latitude greater than 37 is outside of Virginia Beach.

df.2 <- df.1 %>% filter((lng < max_vb_lng) | (lat > max_vb_lat))
#knitr::kable(df.2, caption = "Non Virginia Beach Locations")
# display unformatted dataframe -- RMarkdown tables don't display this data well
as.data.frame(df.2)
##   Police Case Number
## 1         2017032567
## 2         2017001162
## 3         2016014953
##                                                                 Location
## 1        200 N MURRAY BL\nCOLORADO SPRINGS, CO\n(38.835659, -104.745383)
## 2          22 HALF ST\nVIRGINIA BEACH, VA 23451\n(37.707636, -75.733434)
## 3 2700 LAKE FOREST DR\nUPPER MARLBORO, MD 20774\n(38.853327, -76.741952)
##             Date Occured                  coord      lat        lng year
## 1 12/01/2016 12:00:00 PM 38.835659, -104.745383 38.83566 -104.74538 2016
## 2 01/11/2017 05:00:00 PM  37.707636, -75.733434 37.70764  -75.73343 2017
## 3 04/08/2016 06:00:00 PM  38.853327, -76.741952 38.85333  -76.74195 2016
##   qtr
## 1   4
## 2   1
## 3   2

Exclude above records from the analysis.

df.1 <- anti_join(df.1, df.2, by = "Police Case Number") 

print(ggplot(df.1
       , aes(lng, lat)) +
       geom_point() +
       labs(title = paste0("Virginia Beach Crime Incidents "
                          , "("
                          , min(df.1$year)
                          , "-"
                          , max(df.1$year)
                          , ")")
            , x = "Longitude"
            , y = "Latitude"))

Breakdown of incidents by year and quarter.

table(df.1$year, df.1$qtr)
##       
##           1    2    3    4
##   1961    2    0    0    0
##   1962    0    1    0    0
##   1972    0    1    0    0
##   1977    0    0    0    1
##   1979    1    0    0    0
##   1981    0    1    0    0
##   1987    0    1    0    0
##   1988    2    0    0    0
##   1989    1    0    0    0
##   1992    1    0    0    0
##   1994    2    0    1    0
##   1996    1    0    0    0
##   1998    7    0    0    0
##   1999    3    1    0    1
##   2000    4    2    0    3
##   2001    7    1    6    2
##   2002    3    2    3    0
##   2003    3    2    1    2
##   2004    2    2    3    2
##   2005    8    1    0    2
##   2006    7    7    4    9
##   2007   12   10    2    4
##   2008   12    7    5    5
##   2009   21    3    5    9
##   2010   18    9   10    7
##   2011   28    9   16   16
##   2012   49   25   31   29
##   2013  142   68  134  462
##   2014 6833 7995 8200 7720
##   2015 6918 8268 8325 7351
##   2016 7239 8272 8003 7129
##   2017 6466 7579 7499 2430

There is not much data prior to 2014. Given this information, let’s base the cluster analysis on data from 2014 forward.


# Return dataframe filtered by year and quarter
filter_data = function(y, q) {
  d <- df.1 %>%
    dplyr::select(lat, lng, year, qtr) %>%
    filter(year == y & qtr == q)
    d
}
# https://gis.stackexchange.com/questions/17638/how-to-cluster-spatial-data-in-r

# Core functionality based upon the above link to indentify clusters.
library(sp)
library(rgdal)
library(geosphere)
library(dismo)
library(rgeos)

plot_cluster = function(y, q) {
  # get filtered data
  x <- filter_data(y, q)
  
  # convert data to a SpatialPointsDataFrame object
  xy <- SpatialPointsDataFrame(
    matrix(c(x$lng, x$lat), ncol = 2),
    data.frame(ID = seq(1:length(x$lng))),
    proj4string = CRS("+proj=longlat +ellps=WGS84 +datum=WGS84")
    )
    
  # use the distm function to generate a geodesic distance matrix in meters
  mdist <- distm(xy)
  
  # cluster all points using a hierarchical clustering approach
  hc <- hclust(as.dist(mdist), method = "complete")
  
  # define the distance threshold, in this case 
  d = 12875 # 8 miles to meters
  
  # define clusters based on a tree "height" cutoff "d" and add them to the SpDataFrame
  xy$clust <- cutree(hc, h = d)
  
  # expand the extent of plotting frame
  xy@bbox[] <- as.matrix(extend(extent(xy), 0.001))
  
  # get the centroid coords for each cluster
  cent <- matrix(ncol = 2, nrow = max(xy$clust))
  for (i in 1:max(xy$clust)) {
    # gCentroid from the rgeos package
    cent[i,] <- gCentroid(subset(xy, clust == i))@coords
  }
  
  # compute circles around the centroid coords using the above radius
  # from the dismo package
  ci <- circles(cent, lonlat = T)
  
  # plot
  plot(ci@polygons, axes = T)
  plot(xy, col = rainbow(4)[factor(xy$clust)], add = T)
  points(cent, cex = 1, pch=18)
  title(main = paste(y, " Quarter ", q))

  # return center coordinates for clusters  
  cent
}
# Helper function to convert a center coordinate matrix into a dataframe and
# add year and qtr columns.
get_df = function(m, y, q) {
  # convert matrix to dataframe and add year and qtr columns
  df <- as.data.frame(m)
  df["year"] <- y
  df["qtr"] <- q
  colnames(df) <- c("lng", "lat", "year", "qtr")
  df
}

Plot clusters by year and quarter.

for (year in start_year:end_year) {
  for (qtr in 1:4) {
    # create a dataframe of cluster centers for each year and qtr
    if (year == start_year & qtr == 1) {
      # first dataframe
      df.centers <- get_df(plot_cluster(year, qtr), year, qtr)
    } else {
      df.centers <- bind_rows(df.centers, get_df(plot_cluster(year, qtr), year, qtr))
    }
  }
}

Cluster center details.

knitr::kable(table(df.centers$year, df.centers$qtr)
             , caption = "Number Of Clusters By Year And Quarter")
Number Of Clusters By Year And Quarter
1 2 3 4
2014 11 11 10 11
2015 10 12 14 13
2016 9 11 11 12
2017 11 11 11 12
knitr::kable(df.centers
             , caption = "Cluster Center coordinates")
Cluster Center coordinates
lng lat year qtr
-75.99759 36.84910 2014 1
-76.16410 36.79756 2014 1
-76.10412 36.90381 2014 1
-76.09864 36.82902 2014 1
-76.16335 36.86619 2014 1
-76.09669 36.77827 2014 1
-76.00721 36.77808 2014 1
-75.99973 36.74004 2014 1
-76.20160 36.81882 2014 1
-76.02042 36.63542 2014 1
-76.09442 36.59992 2014 1
-76.18112 36.80990 2014 2
-75.99783 36.84992 2014 2
-75.98643 36.73616 2014 2
-76.15093 36.85913 2014 2
-76.05038 36.75652 2014 2
-76.09830 36.80823 2014 2
-75.99666 36.79061 2014 2
-76.09283 36.90147 2014 2
-76.07053 36.61828 2014 2
-75.99454 36.65191 2014 2
-76.01123 36.57056 2014 2
-76.09496 36.85116 2014 3
-76.09923 36.80161 2014 3
-76.17225 36.81891 2014 3
-75.98895 36.82439 2014 3
-75.99542 36.85340 2014 3
-76.04130 36.63291 2014 3
-76.04326 36.75688 2014 3
-76.16168 36.88120 2014 3
-75.98329 36.75862 2014 3
-76.05045 36.57830 2014 3
-76.10449 36.82165 2014 4
-76.00213 36.84909 2014 4
-76.16357 36.87099 2014 4
-76.17685 36.80757 2014 4
-76.08955 36.76275 2014 4
-76.00801 36.77442 2014 4
-76.09672 36.90319 2014 4
-75.98660 36.73042 2014 4
-76.03716 36.63932 2014 4
-76.07348 36.58361 2014 4
-75.92794 36.55713 2014 4
-76.15648 36.85697 2015 1
-76.11534 36.80810 2015 1
-76.00508 36.84773 2015 1
-76.06825 36.77233 2015 1
-75.99822 36.77361 2015 1
-76.10142 36.87909 2015 1
-76.20288 36.80497 2015 1
-75.96491 36.69466 2015 1
-76.04949 36.62202 2015 1
-76.25300 36.94351 2015 1
-76.16176 36.85668 2015 2
-76.11058 36.81133 2015 2
-75.98561 36.84647 2015 2
-76.05737 36.84370 2015 2
-76.18442 36.80032 2015 2
-76.11457 36.89859 2015 2
-76.05383 36.76186 2015 2
-75.99428 36.77373 2015 2
-76.02069 36.66552 2015 2
-76.10148 36.64164 2015 2
-75.93251 36.71232 2015 2
-76.06088 36.58947 2015 2
-76.08314 36.76159 2015 3
-76.15618 36.82221 2015 3
-76.09942 36.80259 2015 3
-75.98630 36.83168 2015 3
-76.06233 36.84297 2015 3
-76.15517 36.87963 2015 3
-76.03359 36.89880 2015 3
-76.06614 36.64680 2015 3
-75.99848 36.73882 2015 3
-76.21068 36.81201 2015 3
-76.03586 36.59509 2015 3
-75.92794 36.55713 2015 3
-76.61758 36.78092 2015 3
-76.24132 36.72625 2015 3
-76.16880 36.82441 2015 4
-76.00411 36.77321 2015 4
-76.09472 36.79135 2015 4
-76.09478 36.83281 2015 4
-76.14975 36.88705 2015 4
-76.03753 36.86876 2015 4
-75.98650 36.84369 2015 4
-76.07234 36.73014 2015 4
-76.05966 36.64005 2015 4
-76.02487 36.58301 2015 4
-75.96423 36.69486 2015 4
-75.92794 36.55713 2015 4
-76.27800 36.96689 2015 4
-76.09657 36.81545 2016 1
-76.15052 36.86439 2016 1
-75.98939 36.83302 2016 1
-76.18118 36.81084 2016 1
-76.02372 36.86374 2016 1
-76.08511 36.76251 2016 1
-75.99933 36.75856 2016 1
-76.09852 36.63456 2016 1
-76.04051 36.59794 2016 1
-76.08285 36.80269 2016 2
-76.14675 36.78873 2016 2
-76.00377 36.84691 2016 2
-76.13596 36.85250 2016 2
-76.01876 36.77372 2016 2
-76.19260 36.81848 2016 2
-75.97710 36.73120 2016 2
-76.05397 36.90491 2016 2
-76.03404 36.61038 2016 2
-75.92794 36.55713 2016 2
-76.39741 36.79634 2016 2
-75.99087 36.83617 2016 3
-76.08777 36.76733 2016 3
-76.16407 36.81324 2016 3
-76.08648 36.82120 2016 3
-76.03554 36.87467 2016 3
-76.15083 36.86763 2016 3
-76.00345 36.75654 2016 3
-75.94756 36.70974 2016 3
-76.06446 36.63030 2016 3
-76.00887 36.57641 2016 3
-76.58114 36.72202 2016 3
-76.15047 36.85937 2016 4
-75.99183 36.84410 2016 4
-76.18423 36.80505 2016 4
-76.11407 36.79458 2016 4
-75.98934 36.73385 2016 4
-76.01770 36.77236 2016 4
-76.07714 36.82239 2016 4
-76.04313 36.87498 2016 4
-76.05950 36.60152 2016 4
-75.99386 36.64889 2016 4
-76.61108 36.98102 2016 4
-76.36720 36.81620 2016 4
-76.00232 36.84676 2017 1
-76.04587 36.75427 2017 1
-76.15070 36.88637 2017 1
-76.05714 36.60357 2017 1
-76.15445 36.78597 2017 1
-76.13648 36.84202 2017 1
-76.09295 36.80164 2017 1
-76.04020 36.89625 2017 1
-75.99679 36.78153 2017 1
-75.97625 36.71470 2017 1
-76.21068 36.81264 2017 1
-76.15602 36.82267 2017 2
-75.98654 36.84532 2017 2
-76.01895 36.76853 2017 2
-76.15663 36.88043 2017 2
-76.09067 36.80539 2017 2
-76.05701 36.85978 2017 2
-75.96616 36.72822 2017 2
-76.03452 36.59083 2017 2
-76.06946 36.72690 2017 2
-76.02936 36.65760 2017 2
-76.28092 36.84770 2017 2
-76.10429 36.82847 2017 3
-76.09695 36.77699 2017 3
-76.18029 36.80769 2017 3
-76.11930 36.88990 2017 3
-76.17162 36.86872 2017 3
-75.99523 36.84957 2017 3
-75.99597 36.77748 2017 3
-75.97947 36.69822 2017 3
-76.02135 36.60560 2017 3
-76.09722 36.64208 2017 3
-76.30223 36.83224 2017 3
-76.15995 36.86252 2017 4
-76.18730 36.79979 2017 4
-76.04568 36.75358 2017 4
-75.99744 36.85006 2017 4
-75.99351 36.79358 2017 4
-76.10830 36.79751 2017 4
-76.09079 36.83211 2017 4
-75.94111 36.73569 2017 4
-76.06727 36.90225 2017 4
-76.02060 36.66728 2017 4
-76.06952 36.59869 2017 4
-75.92794 36.55713 2017 4
for (y in start_year:end_year) {
  c <- df.centers %>% filter(year == y)
  
  print(ggplot(c
         , aes(lng, lat)) +
         geom_point() +
         geom_text(aes(label=paste0(round(lng, 3), ',', round(lat,3)), hjust = 0.5, vjust=1)
                   , size=2.5
                   , color="blue") +
         facet_wrap( ~ qtr, labeller = label_both) +
         labs(title = paste(y, " Cluster"), x = "Longitude", y = "Latitude"))
}

rm(c)

Overlay cluster centers on map.

library(leaflet)

# create a palette for cluster years
pal <- colorFactor(c(palette()[3], palette()[4], palette()[5], palette()[6])
                   , domain = as.factor(df.centers$year))

# get Virginia Beach police precincts
vbPrecincts <- read_csv("Data/vb_precincts.csv")

# create custom icon for precincts
icons <- awesomeIcons(
  icon = 'glyphicon-earphone'
  , iconColor = 'black'
  , library = 'glyphicon'
)

map <- leaflet(df.centers) %>%
  addTiles() %>%
  addAwesomeMarkers(lng=vbPrecincts$lng, lat=vbPrecincts$lat, popup=vbPrecincts$descr, icon=icons) %>%
  addCircleMarkers(lng=df.centers$lng
             , lat=df.centers$lat
             , popup=paste0('[', df.centers$year, '-', df.centers$qtr, '] ('
                            , round(df.centers$lng, 3), ',', round(df.centers$lat,3), ')')
             , color = ~pal(df.centers$year)
             , radius = 3
             , fillOpacity = 0.5
             ) %>%
             setView(-76.1339487, 36.8464062, 11)
                                                  
map

Other things to do and explore.

  • Identify number of incidents per cluster
  • Re-explore DBSCAN clustering. Compare results.

Endnotes.

Loaded packages.

sessionInfo()
## R version 3.4.2 (2017-09-28)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 15063)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_United States.1252 
## [2] LC_CTYPE=English_United States.1252   
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.1252    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] leaflet_1.1.0   rgeos_0.3-25    dismo_1.1-4     raster_2.5-8   
##  [5] geosphere_1.5-5 rgdal_1.2-15    sp_1.2-5        bindrcpp_0.2   
##  [9] dplyr_0.7.4     purrr_0.2.4     readr_1.1.1     tidyr_0.7.2    
## [13] tibble_1.3.4    ggplot2_2.2.1   tidyverse_1.1.1
## 
## loaded via a namespace (and not attached):
##  [1] reshape2_1.4.2   haven_1.1.0      lattice_0.20-35  colorspace_1.3-2
##  [5] htmltools_0.3.6  yaml_2.1.14      rlang_0.1.2      foreign_0.8-69  
##  [9] glue_1.2.0       modelr_0.1.1     readxl_1.0.0     bindr_0.1       
## [13] plyr_1.8.4       stringr_1.2.0    munsell_0.4.3    gtable_0.2.0    
## [17] cellranger_1.1.0 rvest_0.3.2      htmlwidgets_0.9  psych_1.7.8     
## [21] evaluate_0.10.1  labeling_0.3     knitr_1.17       forcats_0.2.0   
## [25] httpuv_1.3.5     crosstalk_1.0.0  parallel_3.4.2   highr_0.6       
## [29] broom_0.4.2      Rcpp_0.12.13     xtable_1.8-2     scales_0.5.0    
## [33] backports_1.1.1  jsonlite_1.5     mime_0.5         mnormt_1.5-5    
## [37] hms_0.3          digest_0.6.12    stringi_1.1.5    shiny_1.0.5     
## [41] grid_3.4.2       rprojroot_1.2    tools_3.4.2      magrittr_1.5    
## [45] lazyeval_0.2.1   pkgconfig_2.0.1  xml2_1.1.1       lubridate_1.7.0 
## [49] assertthat_0.2.0 rmarkdown_1.6    httr_1.3.1       R6_2.2.2        
## [53] nlme_3.1-131     compiler_3.4.2

Report generated: 2017-11-12 16:18:36