This is an attempt to perform cluster analysis on the Virginia Beach police incident crime file.
In my initial attempts, I ran into problems processing the entire dataset. Trying to process the dataset resulted in slow performance and insufficient memory errors. My i5-2540M 16GB Windows 10 Pro laptop and my coding implementation were not up to the task.
I revised how I processed the data. Breaking up the dataset by year, resulted in similar problems if there were a large number of data points. Further filtering by year and quarter solved my issues.
Let’s begin.
rm(list = ls())
library(tidyverse)
library(lubridate)
library(stringr)
# get file
df <- read_csv("Data/Police_Incident_Reports.csv")
# extract specific fields to process
df.1 <- df %>% dplyr::select("Police Case Number", "Location", "Date Occured")
# create a "coordinates" column and extract lat/lng from the location column
df.1$coord <- gsub("[()]", "", str_extract(df.1$Location, "\\((.*)"))
# break coord column into lat and lng columns
df.1 <- df.1 %>% separate(coord, c("lat", "lng"), sep = ",", remove=FALSE)
# make lat and lng columns numeric
df.1$lat <- as.numeric(df.1$lat)
df.1$lng <- as.numeric(df.1$lng)
# create year and qtr columns
df.1$year <- year(mdy_hms(df.1$`Date Occured`))
df.1$qtr <- quarter(mdy_hms(df.1$`Date Occured`))
# define years to perform analysis
start_year <- 2014
end_year <- 2017
# define VB max longitude and latitude
max_vb_lng <- -77
max_vb_lat <- 37
set.seed(334455)
# get records that have a lat and lng
df.1 <- df.1 %>% filter(!is.na(lat) & !is.na(lng))
print(ggplot(df.1 %>%
dplyr::select(lat, lng) %>%
filter(complete.cases(.))
, aes(lng, lat)) +
geom_point() +
labs(title = paste0("Virginia Beach Crime Incidents "
, "("
, min(df.1$year)
, "-"
, max(df.1$year)
, ")")
, x = "Longitude"
, y = "Latitude"))
Bad data found. Any longitude less than -77 and a latitude greater than 37 is outside of Virginia Beach.
df.2 <- df.1 %>% filter((lng < max_vb_lng) | (lat > max_vb_lat))
#knitr::kable(df.2, caption = "Non Virginia Beach Locations")
# display unformatted dataframe -- RMarkdown tables don't display this data well
as.data.frame(df.2)
## Police Case Number
## 1 2017032567
## 2 2017001162
## 3 2016014953
## Location
## 1 200 N MURRAY BL\nCOLORADO SPRINGS, CO\n(38.835659, -104.745383)
## 2 22 HALF ST\nVIRGINIA BEACH, VA 23451\n(37.707636, -75.733434)
## 3 2700 LAKE FOREST DR\nUPPER MARLBORO, MD 20774\n(38.853327, -76.741952)
## Date Occured coord lat lng year
## 1 12/01/2016 12:00:00 PM 38.835659, -104.745383 38.83566 -104.74538 2016
## 2 01/11/2017 05:00:00 PM 37.707636, -75.733434 37.70764 -75.73343 2017
## 3 04/08/2016 06:00:00 PM 38.853327, -76.741952 38.85333 -76.74195 2016
## qtr
## 1 4
## 2 1
## 3 2
Exclude above records from the analysis.
df.1 <- anti_join(df.1, df.2, by = "Police Case Number")
print(ggplot(df.1
, aes(lng, lat)) +
geom_point() +
labs(title = paste0("Virginia Beach Crime Incidents "
, "("
, min(df.1$year)
, "-"
, max(df.1$year)
, ")")
, x = "Longitude"
, y = "Latitude"))
table(df.1$year, df.1$qtr)
##
## 1 2 3 4
## 1961 2 0 0 0
## 1962 0 1 0 0
## 1972 0 1 0 0
## 1977 0 0 0 1
## 1979 1 0 0 0
## 1981 0 1 0 0
## 1987 0 1 0 0
## 1988 2 0 0 0
## 1989 1 0 0 0
## 1992 1 0 0 0
## 1994 2 0 1 0
## 1996 1 0 0 0
## 1998 7 0 0 0
## 1999 3 1 0 1
## 2000 4 2 0 3
## 2001 7 1 6 2
## 2002 3 2 3 0
## 2003 3 2 1 2
## 2004 2 2 3 2
## 2005 8 1 0 2
## 2006 7 7 4 9
## 2007 12 10 2 4
## 2008 12 7 5 5
## 2009 21 3 5 9
## 2010 18 9 10 7
## 2011 28 9 16 16
## 2012 49 25 31 29
## 2013 142 68 134 462
## 2014 6833 7995 8200 7720
## 2015 6918 8268 8325 7351
## 2016 7239 8272 8003 7129
## 2017 6466 7579 7499 2430
There is not much data prior to 2014. Given this information, let’s base the cluster analysis on data from 2014 forward.
# Return dataframe filtered by year and quarter
filter_data = function(y, q) {
d <- df.1 %>%
dplyr::select(lat, lng, year, qtr) %>%
filter(year == y & qtr == q)
d
}
# https://gis.stackexchange.com/questions/17638/how-to-cluster-spatial-data-in-r
# Core functionality based upon the above link to indentify clusters.
library(sp)
library(rgdal)
library(geosphere)
library(dismo)
library(rgeos)
plot_cluster = function(y, q) {
# get filtered data
x <- filter_data(y, q)
# convert data to a SpatialPointsDataFrame object
xy <- SpatialPointsDataFrame(
matrix(c(x$lng, x$lat), ncol = 2),
data.frame(ID = seq(1:length(x$lng))),
proj4string = CRS("+proj=longlat +ellps=WGS84 +datum=WGS84")
)
# use the distm function to generate a geodesic distance matrix in meters
mdist <- distm(xy)
# cluster all points using a hierarchical clustering approach
hc <- hclust(as.dist(mdist), method = "complete")
# define the distance threshold, in this case
d = 12875 # 8 miles to meters
# define clusters based on a tree "height" cutoff "d" and add them to the SpDataFrame
xy$clust <- cutree(hc, h = d)
# expand the extent of plotting frame
xy@bbox[] <- as.matrix(extend(extent(xy), 0.001))
# get the centroid coords for each cluster
cent <- matrix(ncol = 2, nrow = max(xy$clust))
for (i in 1:max(xy$clust)) {
# gCentroid from the rgeos package
cent[i,] <- gCentroid(subset(xy, clust == i))@coords
}
# compute circles around the centroid coords using the above radius
# from the dismo package
ci <- circles(cent, lonlat = T)
# plot
plot(ci@polygons, axes = T)
plot(xy, col = rainbow(4)[factor(xy$clust)], add = T)
points(cent, cex = 1, pch=18)
title(main = paste(y, " Quarter ", q))
# return center coordinates for clusters
cent
}
# Helper function to convert a center coordinate matrix into a dataframe and
# add year and qtr columns.
get_df = function(m, y, q) {
# convert matrix to dataframe and add year and qtr columns
df <- as.data.frame(m)
df["year"] <- y
df["qtr"] <- q
colnames(df) <- c("lng", "lat", "year", "qtr")
df
}
for (year in start_year:end_year) {
for (qtr in 1:4) {
# create a dataframe of cluster centers for each year and qtr
if (year == start_year & qtr == 1) {
# first dataframe
df.centers <- get_df(plot_cluster(year, qtr), year, qtr)
} else {
df.centers <- bind_rows(df.centers, get_df(plot_cluster(year, qtr), year, qtr))
}
}
}
knitr::kable(table(df.centers$year, df.centers$qtr)
, caption = "Number Of Clusters By Year And Quarter")
| 1 | 2 | 3 | 4 | |
|---|---|---|---|---|
| 2014 | 11 | 11 | 10 | 11 |
| 2015 | 10 | 12 | 14 | 13 |
| 2016 | 9 | 11 | 11 | 12 |
| 2017 | 11 | 11 | 11 | 12 |
knitr::kable(df.centers
, caption = "Cluster Center coordinates")
| lng | lat | year | qtr |
|---|---|---|---|
| -75.99759 | 36.84910 | 2014 | 1 |
| -76.16410 | 36.79756 | 2014 | 1 |
| -76.10412 | 36.90381 | 2014 | 1 |
| -76.09864 | 36.82902 | 2014 | 1 |
| -76.16335 | 36.86619 | 2014 | 1 |
| -76.09669 | 36.77827 | 2014 | 1 |
| -76.00721 | 36.77808 | 2014 | 1 |
| -75.99973 | 36.74004 | 2014 | 1 |
| -76.20160 | 36.81882 | 2014 | 1 |
| -76.02042 | 36.63542 | 2014 | 1 |
| -76.09442 | 36.59992 | 2014 | 1 |
| -76.18112 | 36.80990 | 2014 | 2 |
| -75.99783 | 36.84992 | 2014 | 2 |
| -75.98643 | 36.73616 | 2014 | 2 |
| -76.15093 | 36.85913 | 2014 | 2 |
| -76.05038 | 36.75652 | 2014 | 2 |
| -76.09830 | 36.80823 | 2014 | 2 |
| -75.99666 | 36.79061 | 2014 | 2 |
| -76.09283 | 36.90147 | 2014 | 2 |
| -76.07053 | 36.61828 | 2014 | 2 |
| -75.99454 | 36.65191 | 2014 | 2 |
| -76.01123 | 36.57056 | 2014 | 2 |
| -76.09496 | 36.85116 | 2014 | 3 |
| -76.09923 | 36.80161 | 2014 | 3 |
| -76.17225 | 36.81891 | 2014 | 3 |
| -75.98895 | 36.82439 | 2014 | 3 |
| -75.99542 | 36.85340 | 2014 | 3 |
| -76.04130 | 36.63291 | 2014 | 3 |
| -76.04326 | 36.75688 | 2014 | 3 |
| -76.16168 | 36.88120 | 2014 | 3 |
| -75.98329 | 36.75862 | 2014 | 3 |
| -76.05045 | 36.57830 | 2014 | 3 |
| -76.10449 | 36.82165 | 2014 | 4 |
| -76.00213 | 36.84909 | 2014 | 4 |
| -76.16357 | 36.87099 | 2014 | 4 |
| -76.17685 | 36.80757 | 2014 | 4 |
| -76.08955 | 36.76275 | 2014 | 4 |
| -76.00801 | 36.77442 | 2014 | 4 |
| -76.09672 | 36.90319 | 2014 | 4 |
| -75.98660 | 36.73042 | 2014 | 4 |
| -76.03716 | 36.63932 | 2014 | 4 |
| -76.07348 | 36.58361 | 2014 | 4 |
| -75.92794 | 36.55713 | 2014 | 4 |
| -76.15648 | 36.85697 | 2015 | 1 |
| -76.11534 | 36.80810 | 2015 | 1 |
| -76.00508 | 36.84773 | 2015 | 1 |
| -76.06825 | 36.77233 | 2015 | 1 |
| -75.99822 | 36.77361 | 2015 | 1 |
| -76.10142 | 36.87909 | 2015 | 1 |
| -76.20288 | 36.80497 | 2015 | 1 |
| -75.96491 | 36.69466 | 2015 | 1 |
| -76.04949 | 36.62202 | 2015 | 1 |
| -76.25300 | 36.94351 | 2015 | 1 |
| -76.16176 | 36.85668 | 2015 | 2 |
| -76.11058 | 36.81133 | 2015 | 2 |
| -75.98561 | 36.84647 | 2015 | 2 |
| -76.05737 | 36.84370 | 2015 | 2 |
| -76.18442 | 36.80032 | 2015 | 2 |
| -76.11457 | 36.89859 | 2015 | 2 |
| -76.05383 | 36.76186 | 2015 | 2 |
| -75.99428 | 36.77373 | 2015 | 2 |
| -76.02069 | 36.66552 | 2015 | 2 |
| -76.10148 | 36.64164 | 2015 | 2 |
| -75.93251 | 36.71232 | 2015 | 2 |
| -76.06088 | 36.58947 | 2015 | 2 |
| -76.08314 | 36.76159 | 2015 | 3 |
| -76.15618 | 36.82221 | 2015 | 3 |
| -76.09942 | 36.80259 | 2015 | 3 |
| -75.98630 | 36.83168 | 2015 | 3 |
| -76.06233 | 36.84297 | 2015 | 3 |
| -76.15517 | 36.87963 | 2015 | 3 |
| -76.03359 | 36.89880 | 2015 | 3 |
| -76.06614 | 36.64680 | 2015 | 3 |
| -75.99848 | 36.73882 | 2015 | 3 |
| -76.21068 | 36.81201 | 2015 | 3 |
| -76.03586 | 36.59509 | 2015 | 3 |
| -75.92794 | 36.55713 | 2015 | 3 |
| -76.61758 | 36.78092 | 2015 | 3 |
| -76.24132 | 36.72625 | 2015 | 3 |
| -76.16880 | 36.82441 | 2015 | 4 |
| -76.00411 | 36.77321 | 2015 | 4 |
| -76.09472 | 36.79135 | 2015 | 4 |
| -76.09478 | 36.83281 | 2015 | 4 |
| -76.14975 | 36.88705 | 2015 | 4 |
| -76.03753 | 36.86876 | 2015 | 4 |
| -75.98650 | 36.84369 | 2015 | 4 |
| -76.07234 | 36.73014 | 2015 | 4 |
| -76.05966 | 36.64005 | 2015 | 4 |
| -76.02487 | 36.58301 | 2015 | 4 |
| -75.96423 | 36.69486 | 2015 | 4 |
| -75.92794 | 36.55713 | 2015 | 4 |
| -76.27800 | 36.96689 | 2015 | 4 |
| -76.09657 | 36.81545 | 2016 | 1 |
| -76.15052 | 36.86439 | 2016 | 1 |
| -75.98939 | 36.83302 | 2016 | 1 |
| -76.18118 | 36.81084 | 2016 | 1 |
| -76.02372 | 36.86374 | 2016 | 1 |
| -76.08511 | 36.76251 | 2016 | 1 |
| -75.99933 | 36.75856 | 2016 | 1 |
| -76.09852 | 36.63456 | 2016 | 1 |
| -76.04051 | 36.59794 | 2016 | 1 |
| -76.08285 | 36.80269 | 2016 | 2 |
| -76.14675 | 36.78873 | 2016 | 2 |
| -76.00377 | 36.84691 | 2016 | 2 |
| -76.13596 | 36.85250 | 2016 | 2 |
| -76.01876 | 36.77372 | 2016 | 2 |
| -76.19260 | 36.81848 | 2016 | 2 |
| -75.97710 | 36.73120 | 2016 | 2 |
| -76.05397 | 36.90491 | 2016 | 2 |
| -76.03404 | 36.61038 | 2016 | 2 |
| -75.92794 | 36.55713 | 2016 | 2 |
| -76.39741 | 36.79634 | 2016 | 2 |
| -75.99087 | 36.83617 | 2016 | 3 |
| -76.08777 | 36.76733 | 2016 | 3 |
| -76.16407 | 36.81324 | 2016 | 3 |
| -76.08648 | 36.82120 | 2016 | 3 |
| -76.03554 | 36.87467 | 2016 | 3 |
| -76.15083 | 36.86763 | 2016 | 3 |
| -76.00345 | 36.75654 | 2016 | 3 |
| -75.94756 | 36.70974 | 2016 | 3 |
| -76.06446 | 36.63030 | 2016 | 3 |
| -76.00887 | 36.57641 | 2016 | 3 |
| -76.58114 | 36.72202 | 2016 | 3 |
| -76.15047 | 36.85937 | 2016 | 4 |
| -75.99183 | 36.84410 | 2016 | 4 |
| -76.18423 | 36.80505 | 2016 | 4 |
| -76.11407 | 36.79458 | 2016 | 4 |
| -75.98934 | 36.73385 | 2016 | 4 |
| -76.01770 | 36.77236 | 2016 | 4 |
| -76.07714 | 36.82239 | 2016 | 4 |
| -76.04313 | 36.87498 | 2016 | 4 |
| -76.05950 | 36.60152 | 2016 | 4 |
| -75.99386 | 36.64889 | 2016 | 4 |
| -76.61108 | 36.98102 | 2016 | 4 |
| -76.36720 | 36.81620 | 2016 | 4 |
| -76.00232 | 36.84676 | 2017 | 1 |
| -76.04587 | 36.75427 | 2017 | 1 |
| -76.15070 | 36.88637 | 2017 | 1 |
| -76.05714 | 36.60357 | 2017 | 1 |
| -76.15445 | 36.78597 | 2017 | 1 |
| -76.13648 | 36.84202 | 2017 | 1 |
| -76.09295 | 36.80164 | 2017 | 1 |
| -76.04020 | 36.89625 | 2017 | 1 |
| -75.99679 | 36.78153 | 2017 | 1 |
| -75.97625 | 36.71470 | 2017 | 1 |
| -76.21068 | 36.81264 | 2017 | 1 |
| -76.15602 | 36.82267 | 2017 | 2 |
| -75.98654 | 36.84532 | 2017 | 2 |
| -76.01895 | 36.76853 | 2017 | 2 |
| -76.15663 | 36.88043 | 2017 | 2 |
| -76.09067 | 36.80539 | 2017 | 2 |
| -76.05701 | 36.85978 | 2017 | 2 |
| -75.96616 | 36.72822 | 2017 | 2 |
| -76.03452 | 36.59083 | 2017 | 2 |
| -76.06946 | 36.72690 | 2017 | 2 |
| -76.02936 | 36.65760 | 2017 | 2 |
| -76.28092 | 36.84770 | 2017 | 2 |
| -76.10429 | 36.82847 | 2017 | 3 |
| -76.09695 | 36.77699 | 2017 | 3 |
| -76.18029 | 36.80769 | 2017 | 3 |
| -76.11930 | 36.88990 | 2017 | 3 |
| -76.17162 | 36.86872 | 2017 | 3 |
| -75.99523 | 36.84957 | 2017 | 3 |
| -75.99597 | 36.77748 | 2017 | 3 |
| -75.97947 | 36.69822 | 2017 | 3 |
| -76.02135 | 36.60560 | 2017 | 3 |
| -76.09722 | 36.64208 | 2017 | 3 |
| -76.30223 | 36.83224 | 2017 | 3 |
| -76.15995 | 36.86252 | 2017 | 4 |
| -76.18730 | 36.79979 | 2017 | 4 |
| -76.04568 | 36.75358 | 2017 | 4 |
| -75.99744 | 36.85006 | 2017 | 4 |
| -75.99351 | 36.79358 | 2017 | 4 |
| -76.10830 | 36.79751 | 2017 | 4 |
| -76.09079 | 36.83211 | 2017 | 4 |
| -75.94111 | 36.73569 | 2017 | 4 |
| -76.06727 | 36.90225 | 2017 | 4 |
| -76.02060 | 36.66728 | 2017 | 4 |
| -76.06952 | 36.59869 | 2017 | 4 |
| -75.92794 | 36.55713 | 2017 | 4 |
for (y in start_year:end_year) {
c <- df.centers %>% filter(year == y)
print(ggplot(c
, aes(lng, lat)) +
geom_point() +
geom_text(aes(label=paste0(round(lng, 3), ',', round(lat,3)), hjust = 0.5, vjust=1)
, size=2.5
, color="blue") +
facet_wrap( ~ qtr, labeller = label_both) +
labs(title = paste(y, " Cluster"), x = "Longitude", y = "Latitude"))
}
rm(c)
library(leaflet)
# create a palette for cluster years
pal <- colorFactor(c(palette()[3], palette()[4], palette()[5], palette()[6])
, domain = as.factor(df.centers$year))
# get Virginia Beach police precincts
vbPrecincts <- read_csv("Data/vb_precincts.csv")
# create custom icon for precincts
icons <- awesomeIcons(
icon = 'glyphicon-earphone'
, iconColor = 'black'
, library = 'glyphicon'
)
map <- leaflet(df.centers) %>%
addTiles() %>%
addAwesomeMarkers(lng=vbPrecincts$lng, lat=vbPrecincts$lat, popup=vbPrecincts$descr, icon=icons) %>%
addCircleMarkers(lng=df.centers$lng
, lat=df.centers$lat
, popup=paste0('[', df.centers$year, '-', df.centers$qtr, '] ('
, round(df.centers$lng, 3), ',', round(df.centers$lat,3), ')')
, color = ~pal(df.centers$year)
, radius = 3
, fillOpacity = 0.5
) %>%
setView(-76.1339487, 36.8464062, 11)
map
Loaded packages.
sessionInfo()
## R version 3.4.2 (2017-09-28)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 15063)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=English_United States.1252
## [2] LC_CTYPE=English_United States.1252
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.1252
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] leaflet_1.1.0 rgeos_0.3-25 dismo_1.1-4 raster_2.5-8
## [5] geosphere_1.5-5 rgdal_1.2-15 sp_1.2-5 bindrcpp_0.2
## [9] dplyr_0.7.4 purrr_0.2.4 readr_1.1.1 tidyr_0.7.2
## [13] tibble_1.3.4 ggplot2_2.2.1 tidyverse_1.1.1
##
## loaded via a namespace (and not attached):
## [1] reshape2_1.4.2 haven_1.1.0 lattice_0.20-35 colorspace_1.3-2
## [5] htmltools_0.3.6 yaml_2.1.14 rlang_0.1.2 foreign_0.8-69
## [9] glue_1.2.0 modelr_0.1.1 readxl_1.0.0 bindr_0.1
## [13] plyr_1.8.4 stringr_1.2.0 munsell_0.4.3 gtable_0.2.0
## [17] cellranger_1.1.0 rvest_0.3.2 htmlwidgets_0.9 psych_1.7.8
## [21] evaluate_0.10.1 labeling_0.3 knitr_1.17 forcats_0.2.0
## [25] httpuv_1.3.5 crosstalk_1.0.0 parallel_3.4.2 highr_0.6
## [29] broom_0.4.2 Rcpp_0.12.13 xtable_1.8-2 scales_0.5.0
## [33] backports_1.1.1 jsonlite_1.5 mime_0.5 mnormt_1.5-5
## [37] hms_0.3 digest_0.6.12 stringi_1.1.5 shiny_1.0.5
## [41] grid_3.4.2 rprojroot_1.2 tools_3.4.2 magrittr_1.5
## [45] lazyeval_0.2.1 pkgconfig_2.0.1 xml2_1.1.1 lubridate_1.7.0
## [49] assertthat_0.2.0 rmarkdown_1.6 httr_1.3.1 R6_2.2.2
## [53] nlme_3.1-131 compiler_3.4.2