Introduction
Link NYC is a network of free Wifi Service in New York City, through which public wifi kiosks were installed beginning in late 2015 to replace the city’s payphones. In this project, I create a ShinyApp to visualize the proximity of these public Wifi kiosks to arrests. The goal is to determine if a visual relationship can be established that connects the presence of wifi kiosks to reduced crime. Such a relationship could allow government agencies to direct more funding to establishing wifi hotspots in high crime areas.
Data Acquisition
Three datasets provided by the City of New York were acquired by SoQL queries using the Open Data API, cleaned, and stored to GitHub:
Data Wrangling
The following steps were taken during the data wrangling process to prepare the data for exploration:
Wifi Data
First we read in the data,
# API endpoint
wifi.url <- 'https://data.cityofnewyork.us/resource/yjub-udmw.csv'
# querying the desired columns from website:
wifi.query <- paste0(wifi.url, "?$select=activated,boroname+AS+borough,ntaname+AS+neighborhood,latitude,longitude")
wifis <- read.csv(wifi.query)
kable_styling(kable(head(wifis), "html"), "hover", "left", font_size=12)
activated | borough | neighborhood | latitude | longitude |
---|---|---|---|---|
9999-09-09T00:00:00.000 | Queens | Springfield Gardens North | 40.67486 | -73.78412 |
9999-09-09T00:00:00.000 | Queens | Flushing | 40.74756 | -73.81815 |
9999-09-09T00:00:00.000 | Brooklyn | East Williamsburg | 40.71193 | -73.94067 |
9999-09-09T00:00:00.000 | Brooklyn | Brooklyn Heights-Cobble Hill | 40.68999 | -73.99200 |
9999-09-09T00:00:00.000 | Manhattan | Upper East Side-Carnegie Hill | 40.76463 | -73.96612 |
9999-09-09T00:00:00.000 | Queens | Queensboro Hill | 40.74243 | -73.81151 |
Next we do some cleaning to the date column. We deal with the bad dates and assume those dates represent active hotspots when the data was first collected. Then we write the file to disk.
# parse the date time column
wifis$activated <- as.Date(lubridate::parse_date_time(as.Date(wifis$activated), orders=c('mdY', 'Ymd')))
# store the "bad date" and look for the next earliest date to convert the bad dates to
bad.date <- "9999-09-09"
# find earliest date not 9999-09-09
earliest.date <- wifis |>
filter(activated != bad.date) |>
select(activated) |>
summarise(min(activated))
# isolate the date
earliest.date <- earliest.date[[1]]
# replace 9999-09-09 with earliest date
indices <- which(wifis$activated == bad.date)
wifis$activated <- wifis$activated |>
replace(indices, earliest.date)
# group dates by month
wifis <- wifis |>
mutate(activated = floor_date(activated, unit = "month"))
# write file to disk
write.csv(wifis, "wifis.csv", row.names = FALSE)
kable_styling(kable(head(wifis), "html"), "hover", "left", font_size=12)
activated | borough | neighborhood | latitude | longitude |
---|---|---|---|---|
2016-01-01 | Queens | Springfield Gardens North | 40.67486 | -73.78412 |
2016-01-01 | Queens | Flushing | 40.74756 | -73.81815 |
2016-01-01 | Brooklyn | East Williamsburg | 40.71193 | -73.94067 |
2016-01-01 | Brooklyn | Brooklyn Heights-Cobble Hill | 40.68999 | -73.99200 |
2016-01-01 | Manhattan | Upper East Side-Carnegie Hill | 40.76463 | -73.96612 |
2016-01-01 | Queens | Queensboro Hill | 40.74243 | -73.81151 |
Arrests Data
Now we take a look at the arrests data,
# API endpoint
arrests.url <- 'https://data.cityofnewyork.us/resource/8h9b-rp9u.csv'
# SoQL query for desired columns in years 2016-2019
arrests.query <- paste0(arrests.url, "?$select=date_trunc_ymd(arrest_date)+AS+arrest_date,arrest_precinct,arrest_boro,count(*)+AS+total_arrests&$where=date_extract_y(arrest_date)in('2016','2017','2018','2019')&$group=arrest_date,arrest_precinct,arrest_boro&$limit=5000000")
# read the file
arrests <- read.csv(arrests.query)
kable_styling(kable(head(arrests), "html"), "hover", "left", font_size=12)
arrest_date | arrest_precinct | arrest_boro | total_arrests |
---|---|---|---|
2018-10-25T00:00:00.000 | 102 | Q | 3 |
2017-03-16T00:00:00.000 | 19 | M | 7 |
2018-12-16T00:00:00.000 | 75 | K | 12 |
2018-12-18T00:00:00.000 | 104 | Q | 3 |
2017-12-25T00:00:00.000 | 34 | M | 3 |
2019-05-21T00:00:00.000 | 115 | Q | 10 |
First we map the values for the arrest_borough
column,
arrests$arrest_boro <- plyr::mapvalues(arrests$arrest_boro,
from=c("Q","M","K","B","S"),
to=c("Queens","Manhattan","Brooklyn",
"The Bronx","Staten Island"))
kable_styling(kable(head(arrests), "html"), "hover", "left", font_size=12)
arrest_date | arrest_precinct | arrest_boro | total_arrests |
---|---|---|---|
2018-10-25T00:00:00.000 | 102 | Queens | 3 |
2017-03-16T00:00:00.000 | 19 | Manhattan | 7 |
2018-12-16T00:00:00.000 | 75 | Brooklyn | 12 |
2018-12-18T00:00:00.000 | 104 | Queens | 3 |
2017-12-25T00:00:00.000 | 34 | Manhattan | 3 |
2019-05-21T00:00:00.000 | 115 | Queens | 10 |
And perform the same date grouping as with the wifi data, grouping by month, and once again aggregate the data by month:
arrests$arrest_date = as.Date(lubridate::parse_date_time(as.Date(arrests$arrest_date), orders=c('mdY', 'Ymd')))
arrests <- arrests |>
mutate(date = as.Date(floor_date(as.Date(arrest_date), "month"))) |>
dplyr::group_by(date, arrest_precinct, arrest_boro) |>
dplyr::summarise(total_arrests = sum(total_arrests)) |>
dplyr::rename(precinct = arrest_precinct,
borough = arrest_boro)
## `summarise()` has grouped output by 'date', 'arrest_precinct'. You can override
## using the `.groups` argument.
kable_styling(kable(head(arrests), "html"), "hover", "left", font_size=12)
date | precinct | borough | total_arrests |
---|---|---|---|
2016-01-01 | 1 | Manhattan | 313 |
2016-01-01 | 5 | Manhattan | 501 |
2016-01-01 | 6 | Manhattan | 291 |
2016-01-01 | 7 | Manhattan | 262 |
2016-01-01 | 9 | Manhattan | 317 |
2016-01-01 | 10 | Manhattan | 315 |
Police Precinct Data
Now we can read in the Police Precinct geojson data to combine with the arrests data.
geojson <- 'https://raw.githubusercontent.com/josh1den/DATA-608/main/FINAL%20PROJECT/data/PolicePrecincts.geojson'
# convert geojson to shape file
p.sf <- geojson_sf(geojson)
# merge geojson and precincts data
precinct.arrests <- merge(arrests, p.sf) |>
st_as_sf()
Combining Arrests and Wifi by Date
Lastly, we create a dataframe combining the total arrests and wifi locations by date,
# wifi by date
wifis.by.date <- wifis |>
group_by(date = as.Date(activated)) |>
dplyr::summarise(total = n()) |>
mutate(total = cumsum(total))
# arrests by date
arrests.by.date <- arrests |>
group_by(date) |>
dplyr::summarise(total = sum(total_arrests))
# combine the dataframes
wifis.v.arrests <- wifis.by.date |>
left_join(arrests.by.date, join_by(date)) |>
dplyr::rename(wifi = total.x, arrests = total.y)
kable_styling(kable(head(wifis.v.arrests), "html"), "hover", "left", font_size=12)
date | wifi | arrests |
---|---|---|
2016-01-01 | 461 | 27343 |
2016-02-01 | 464 | 27036 |
2016-03-01 | 479 | 28583 |
2016-04-01 | 495 | 26907 |
2016-05-01 | 506 | 28176 |
2016-06-01 | 513 | 26470 |
Data Visualization
Now we are ready to build our ShinyApp.
The ui
and server
functions are defined in
the local app.R
file, and the app is deployed to ShinyApps
knitr::include_app('https://josh-iden.shinyapps.io/ArrestsVsWifi/')
In this app, I used a leaflet map to visualize crime by police precinct, with wifi icons denoting the location of kiosks on the map. The map is controllable by the slider bar on the bottom right, where the bar can be dragged to observe the change in crime and wifi locations over time. There is also a dropdown to zoom in by borough. On the bottom left, a plotly graph displays the total arrests and wifi locations during the time frame. Hovering over the map displays the data points by their respective dates. We can also toggle the wifi locations to show or hide the hotspots.
Analysis
We observe from the graph on the bottom left that arrests decreased between January 2016 and January 2019, while total wifi locations increased during that time.
If we zoom in on specific precincts, for example, precinct 75, which includes the Ocean Hill, Cypress Hill, and East New York neighborhoods, we can observe that as wifi locations increased substantially, arrests did in fact decrease: in January 2016, there were 758 arrests and 12 hotspots, compared with January 2019, when there were 485 arrests and 26 hotspots.
Conclusion
The map does make it possible to visually identify high arrest areas and the distribution of wifi hotspots in those areas, but unfortunately one of the limitations of the available data is connecting the total number of wifi locations per police precinct. Another limitiation of the data is that crime is not available per capita by precinct, as population by police precinct data is not available, and the transformation of census data by geocoordinates was determined to be outside the scope of this analysis. That said, this map provides an immediate visual aid to identifying high crime areas and the distribution of hotspots in those areas over time, and provides the ability to zoom in on specific areas and glean insights that are not as easily recognizable in tabular form.
Appendix
library(shiny)
library(shinyWidgets)
library(tidyverse)
library(geojsonio)
library(sf)
library(leaflet)
library(plotly)
##
## Attaching package: 'plotly'
## The following objects are masked from 'package:plyr':
##
## arrange, mutate, rename, summarise
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(htmltools)
library(bslib)
## github paths ##
wifis <- read.csv('https://raw.githubusercontent.com/josh1den/DATA-608/main/FINAL%20PROJECT/data/wifis.csv')
pre <- read.csv('https://raw.githubusercontent.com/josh1den/DATA-608/main/FINAL%20PROJECT/data/precincts.csv')
wifi.v.arrests <- read.csv('https://raw.githubusercontent.com/josh1den/DATA-608/main/FINAL%20PROJECT/data/wifivarrests.csv')
wifi.v.arrests$date <- as.Date(wifi.v.arrests$date, "%Y-%m-%d")
geojson <- 'https://raw.githubusercontent.com/josh1den/DATA-608/main/FINAL%20PROJECT/data/PolicePrecincts.geojson'
pre.geo <- geojson_sf(geojson)
# store borough coordinates from web
borough_coords <- data.frame(borough = c("Full Overview",
"Manhattan",
"Brooklyn",
"The Bronx",
"Queens",
"Staten Island"),
longitude = c(-73.935242,
-73.971321,
-73.949997,
-73.865433,
-73.769417,
-74.151535),
latitude = c(40.730610,
40.776676,
40.650002,
40.837048,
40.742054,
40.579021),
zoom = c(10,
12,
12,
12,
12,
12))
# store list of months
choices_month <- format(seq.Date(from = as.Date("2016-01-01"), by = "month",
length.out = 37), "%b-%Y")
# set bins and color range
bins <- c(0, 135, 300, 400, 500, 700, 850, 1000, 1200)
pal <- colorBin("YlOrRd", domain = pre$arrests, bins = bins)
# load wifi icon
wifi.Icon <- makeIcon(
iconUrl = "wifi_icon.png",
iconWidth = 10, iconHeight = 10
)
# build Shiny ui
ui <- fluidPage(
# bootswatch theme
theme = bs_theme(version = 4, bootswatch = "minty"),
# Application title
tags$h1("Arrests in Proximity to Free Wifi Kiosks"),
leafletOutput("mymap"),
p(),
hr(), # horizontal rule
fluidRow(
column(
7,
fluidRow(
plotlyOutput(outputId = "areaPlot"),
style = "height:400px")
),
column(
4, offset = 1,
verticalLayout(
sliderTextInput(inputId = "date",
label = "Select Month:",
choices = choices_month),
pickerInput(
inputId = "borough",
label = "Zoom to Borough",
choices = c("Full Overview",
"Manhattan",
"Brooklyn",
"The Bronx",
"Queens",
"Staten Island"),
selected = "Full Overview")
)
)
)
)
server <- function(input, output) {
# reactive expression for the subset data (by date)
hotspots <- reactive({
stamp <- as.Date(paste("01", unlist(strsplit(input$date, ";")), sep="-"), "%d-%B-%Y")
wifis[wifis$Activated <= stamp, ]
})
arrests <- reactive({
stamp <- as.Date(paste("01", unlist(strsplit(input$date, ";")), sep="-"), "%d-%B-%Y")
pre[pre$date <= stamp, ]
})
borough <- reactive({
borough_coords[borough_coords$borough == as.character(input$borough), ]
})
output$areaPlot <- renderPlotly({
plot_ly(wifi.v.arrests, x= ~date, y = ~wifi, name = 'wifi', type = 'scatter',
mode = 'none', stackgroup = 'one', fillcolor = '#F5FF8D') |>
add_trace(y = ~arrests, name = 'arrests', fillcolor = '#50CB86') |>
layout(title = 'Arrests and Wifi, 2016-2019',
xaxis = list(title = "",
showgrid = FALSE),
yaxis = list(title = "Total",
showgrid = FALSE,
tickvals = list("", 5000, 10000, 15000, 20000, 25000, 30000),
ticktext = list("", "5k", "10k", "15k", "20k", "25k", "30k"),
tickmode = "array"))
})
output$mymap <- renderLeaflet({
leaflet(options = leafletOptions(minZoom = -30, maxZoom = 30)) |>
addTiles() |>
setView(lng = borough()$longitude, lat = borough()$latitude, zoom=borough()$zoom) |>
addPolygons(data = pre.geo,
color = "white", fillColor = ~pal(arrests()$arrests),
weight = 2,
opacity = 1,
dashArray = "3",
fillOpacity = 0.7,
highlightOptions = highlightOptions (
weight = 2,
color = "#800000",
dashArray = "",
fillOpacity = 0.7,
bringToFront = TRUE),
label = paste("Precinct:",
arrests()$precinct,
"<br>",
"Arrests:",
arrests()$arrests) |>
lapply(htmltools::HTML)) |>
addMarkers(data = hotspots(), lat = ~ Latitude, lng = ~ Longitude, icon = wifi.Icon,
label = ~as.character(Neighborhood), group = "Show/Hide Wifi") |>
addLayersControl(
overlayGroups = "Show/Hide Wifi",
options = layersControlOptions(collapsed = FALSE)
) |>
addLegend("bottomright", pal = pal, values = bins,
title = "Arrests",
opacity = 1)
})
}
# shinyApp(ui = ui, server = server) ##