#libraries used in producing this report
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.1.2
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.2 v dplyr 1.0.7
## v tidyr 1.1.4 v stringr 1.4.0
## v readr 2.0.0 v forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.1.2
## Warning: package 'tidyr' was built under R version 4.1.2
## Warning: package 'dplyr' was built under R version 4.1.2
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(lubridate)
## Warning: package 'lubridate' was built under R version 4.1.2
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(ggplot2)
library(data.table)
## Warning: package 'data.table' was built under R version 4.1.2
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:lubridate':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## The following object is masked from 'package:purrr':
##
## transpose
library(padr)
## Warning: package 'padr' was built under R version 4.1.2
library(cowplot)
## Warning: package 'cowplot' was built under R version 4.1.2
##
## Attaching package: 'cowplot'
## The following object is masked from 'package:lubridate':
##
## stamp
#List of every shooting incident that occurred in NYC going back to 2006
#through the end of the previous calendar year (2020).
nypd_url <- "https://data.cityofnewyork.us/api/views/833y-fsy8/rows.csv"
nypd_data <- read_csv(nypd_url, show_col_types = FALSE)
summary(nypd_data)
## INCIDENT_KEY OCCUR_DATE OCCUR_TIME BORO
## Min. : 9953245 Length:23585 Length:23585 Length:23585
## 1st Qu.: 55322804 Class :character Class1:hms Class :character
## Median : 83435362 Mode :character Class2:difftime Mode :character
## Mean :102280741 Mode :numeric
## 3rd Qu.:150911774
## Max. :230611229
##
## PRECINCT JURISDICTION_CODE LOCATION_DESC STATISTICAL_MURDER_FLAG
## Min. : 1.00 Min. :0.000 Length:23585 Mode :logical
## 1st Qu.: 44.00 1st Qu.:0.000 Class :character FALSE:19085
## Median : 69.00 Median :0.000 Mode :character TRUE :4500
## Mean : 66.21 Mean :0.333
## 3rd Qu.: 81.00 3rd Qu.:0.000
## Max. :123.00 Max. :2.000
## NA's :2
## PERP_AGE_GROUP PERP_SEX PERP_RACE VIC_AGE_GROUP
## Length:23585 Length:23585 Length:23585 Length:23585
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## VIC_SEX VIC_RACE X_COORD_CD Y_COORD_CD
## Length:23585 Length:23585 Min. : 914928 Min. :125757
## Class :character Class :character 1st Qu.: 999925 1st Qu.:182539
## Mode :character Mode :character Median :1007654 Median :193470
## Mean :1009379 Mean :207300
## 3rd Qu.:1016782 3rd Qu.:239163
## Max. :1066815 Max. :271128
##
## Latitude Longitude Lon_Lat
## Min. :40.51 Min. :-74.25 Length:23585
## 1st Qu.:40.67 1st Qu.:-73.94 Class :character
## Median :40.70 Median :-73.92 Mode :character
## Mean :40.74 Mean :-73.91
## 3rd Qu.:40.82 3rd Qu.:-73.88
## Max. :40.91 Max. :-73.70
##
nypd_data <- nypd_data %>%
mutate(OCCUR_DATE = mdy(OCCUR_DATE)) %>%
rename(DATE = OCCUR_DATE) %>%
select(INCIDENT_KEY, DATE, BORO) %>%
arrange(DATE)
head(nypd_data, n=10)
## # A tibble: 10 x 3
## INCIDENT_KEY DATE BORO
## <dbl> <date> <chr>
## 1 9953252 2006-01-01 MANHATTAN
## 2 9953247 2006-01-01 BROOKLYN
## 3 9953246 2006-01-01 BRONX
## 4 139716503 2006-01-01 BROOKLYN
## 5 9953248 2006-01-01 QUEENS
## 6 9953250 2006-01-01 QUEENS
## 7 9953250 2006-01-01 QUEENS
## 8 9953245 2006-01-01 BRONX
## 9 9953249 2006-01-02 BROOKLYN
## 10 9953257 2006-01-02 STATEN ISLAND
daily_totals <- nypd_data %>%
group_by(DATE,
BORO) %>%
arrange(DATE) %>%
mutate(DAILY_BORO_TOTAL = length(INCIDENT_KEY)) %>% #get daily victim count
pivot_wider(id_cols=DATE,
names_from=BORO,
values_from=DAILY_BORO_TOTAL,
values_fn=mean,
values_fill = 0) %>%
rename(STATEN_ISLAND = `STATEN ISLAND`) %>%
mutate(DAILY_TOTAL = sum(c_across(QUEENS:STATEN_ISLAND))) %>%
ungroup() %>%
pad() %>% #automatically add omitted dates (days with 0 shootings)
setnafill(type='const', fill=0) %>% #replace 'NA' with '0'
mutate( WEEK = (year(DATE) - year(min(DATE)))*52
+ week(DATE) - week(min(DATE)),
YEAR = year(DATE),
MONTH = as.character(lubridate::month(DATE,label=TRUE)))
## pad applied on the interval: day
head(daily_totals)
## # A tibble: 6 x 10
## DATE MANHATTAN BROOKLYN BRONX QUEENS STATEN_ISLAND DAILY_TOTAL WEEK
## <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2006-01-01 1 2 2 3 0 3 0
## 2 2006-01-02 0 3 0 0 1 1 0
## 3 2006-01-03 0 2 0 2 0 2 0
## 4 2006-01-04 0 1 1 2 0 2 0
## 5 2006-01-05 0 2 2 0 0 0 0
## 6 2006-01-06 0 0 3 1 0 1 0
## # ... with 2 more variables: YEAR <int>, MONTH <chr>
daily_totals %>%
ggplot(aes(x=DATE, y=DAILY_TOTAL)) +
geom_point(color="tomato", aes(y=DAILY_TOTAL)) +
labs(x="Date", y="Victims", title = "NYC Shooting Victims", subtitle ="Daily total" )

weekly_totals <- daily_totals %>%
select(WEEK,DATE, everything()) %>%
filter(WEEK != 0 & WEEK != 780) %>% #ignore partial weeks
group_by(WEEK) %>%
mutate(across(QUEENS:DAILY_TOTAL, sum)) %>%
rename(WEEKLY_TOTAL = DAILY_TOTAL) %>%
ungroup()
weekly_totals <- data.table(weekly_totals) %>%
unique(by=1)
weekly_totals %>%
ggplot(aes(x=DATE, y=WEEKLY_TOTAL)) +
geom_point(color="tomato", aes(y=WEEKLY_TOTAL)) +
labs(x="Week", y="Victims", title="NYC Shooting Victims", subtitle="Weekly total")

weekly_totals %>%
select(YEAR, WEEKLY_TOTAL) %>%
mutate(YEAR = as.factor(YEAR)) %>%
ggplot(aes(x=YEAR, y=WEEKLY_TOTAL)) +
geom_boxplot() +
labs(x="Year", y="Victims", title="NYC Shooting Victims", subtitle="Weekly totals")

#Includes monthly employment data for New York state from 2006-2020
labor_url <- "https://raw.githubusercontent.com/r-quillen/DTSA-5301/main/BLS_NY_employment.csv"
labor_data <- read_csv(labor_url, show_col_types = FALSE)
labor_data <- labor_data %>%
rename(YEAR = Year,
MONTH = Period,
UNEMP_RATE = `unemployment rate`) %>%
select(YEAR, MONTH, UNEMP_RATE) %>%
mutate(MONTH = as.character(MONTH),
YEAR = as.integer(YEAR))
head(labor_data)
## # A tibble: 6 x 3
## YEAR MONTH UNEMP_RATE
## <int> <chr> <dbl>
## 1 2006 Jan 4.8
## 2 2006 Feb 4.7
## 3 2006 Mar 4.7
## 4 2006 Apr 4.7
## 5 2006 May 4.7
## 6 2006 Jun 4.6
monthly_totals <- daily_totals %>%
select(DATE, YEAR, MONTH, DAILY_TOTAL) %>%
group_by(YEAR, MONTH) %>%
mutate(DAILY_TOTAL = sum(DAILY_TOTAL)) %>%
rename(MONTHLY_TOTAL = DAILY_TOTAL)
monthly_totals <- data.table(monthly_totals) %>%
unique(by=c(2,3)) %>%
full_join(labor_data)
## Joining, by = c("YEAR", "MONTH")
pl1 <- monthly_totals %>%
ggplot(aes(DATE)) +
geom_point(color="tomato", aes(y=UNEMP_RATE)) +
labs(x="", y="Unemployment Rate (%)",
title="Unemployment Rate", subtitle="NY state - seasonally adjusted")
pl2 <- monthly_totals %>%
ggplot(aes(DATE)) +
geom_point(color="dodgerblue", aes(y=MONTHLY_TOTAL)) +
labs(x="", y="Victims",
title="NYC Shooting Victims", subtitle="Monthly totals", )
pl3 <- monthly_totals %>%
ggplot(aes(DATE)) +
geom_point(color="tomato", aes(y=(max(MONTHLY_TOTAL)*UNEMP_RATE/max(UNEMP_RATE)))) +
geom_point(color="dodgerblue", aes(y=MONTHLY_TOTAL)) +
labs(x="", y="", title="Unemployment rate & Observed victims", subtitle="") +
theme(axis.text.y = element_blank())
plot_grid(pl1, pl2, pl3, nrow=1)

model <- lm(MONTHLY_TOTAL ~ UNEMP_RATE,monthly_totals)
monthly_totals <- monthly_totals %>%
mutate(PRED = predict(model),
RESID = resid(model))
p_mod1 <- monthly_totals %>%
ggplot(aes(x=DATE)) +
geom_point(aes(color="Observed", y=MONTHLY_TOTAL)) +
geom_point(aes(color = "Predicted", y=PRED)) +
labs(x="Date", y="Victims", title="Predicted & Observed Victims")
p_mod2 <- monthly_totals %>%
ggplot(aes(x=DATE)) +
geom_point(color="dodgerblue", aes(y=RESID)) +
labs(x="Date", y="residuals", title="Model residuals")
summary(model)
##
## Call:
## lm(formula = MONTHLY_TOTAL ~ UNEMP_RATE, data = monthly_totals)
##
## Residuals:
## Min 1Q Median 3Q Max
## -27.105 -6.733 -0.889 5.401 34.517
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.0624 1.9993 7.034 4.17e-11 ***
## UNEMP_RATE 1.4841 0.2954 5.023 1.23e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.254 on 178 degrees of freedom
## Multiple R-squared: 0.1242, Adjusted R-squared: 0.1192
## F-statistic: 25.23 on 1 and 178 DF, p-value: 1.228e-06
plot_grid(p_mod1, p_mod2, rel_widths=c(3,2))
