#libraries used in producing this report
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.1.2
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.2     v dplyr   1.0.7
## v tidyr   1.1.4     v stringr 1.4.0
## v readr   2.0.0     v forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.1.2
## Warning: package 'tidyr' was built under R version 4.1.2
## Warning: package 'dplyr' was built under R version 4.1.2
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(lubridate)
## Warning: package 'lubridate' was built under R version 4.1.2
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(ggplot2)
library(data.table)
## Warning: package 'data.table' was built under R version 4.1.2
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:lubridate':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## The following object is masked from 'package:purrr':
## 
##     transpose
library(padr)
## Warning: package 'padr' was built under R version 4.1.2
library(cowplot)
## Warning: package 'cowplot' was built under R version 4.1.2
## 
## Attaching package: 'cowplot'
## The following object is masked from 'package:lubridate':
## 
##     stamp
#List of every shooting incident that occurred in NYC going back to 2006
#through the end of the previous calendar year (2020).
nypd_url <- "https://data.cityofnewyork.us/api/views/833y-fsy8/rows.csv"
nypd_data <- read_csv(nypd_url, show_col_types = FALSE)
summary(nypd_data)
##   INCIDENT_KEY        OCCUR_DATE         OCCUR_TIME           BORO          
##  Min.   :  9953245   Length:23585       Length:23585      Length:23585      
##  1st Qu.: 55322804   Class :character   Class1:hms        Class :character  
##  Median : 83435362   Mode  :character   Class2:difftime   Mode  :character  
##  Mean   :102280741                      Mode  :numeric                      
##  3rd Qu.:150911774                                                          
##  Max.   :230611229                                                          
##                                                                             
##     PRECINCT      JURISDICTION_CODE LOCATION_DESC      STATISTICAL_MURDER_FLAG
##  Min.   :  1.00   Min.   :0.000     Length:23585       Mode :logical          
##  1st Qu.: 44.00   1st Qu.:0.000     Class :character   FALSE:19085            
##  Median : 69.00   Median :0.000     Mode  :character   TRUE :4500             
##  Mean   : 66.21   Mean   :0.333                                               
##  3rd Qu.: 81.00   3rd Qu.:0.000                                               
##  Max.   :123.00   Max.   :2.000                                               
##                   NA's   :2                                                   
##  PERP_AGE_GROUP       PERP_SEX          PERP_RACE         VIC_AGE_GROUP     
##  Length:23585       Length:23585       Length:23585       Length:23585      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##    VIC_SEX            VIC_RACE           X_COORD_CD        Y_COORD_CD    
##  Length:23585       Length:23585       Min.   : 914928   Min.   :125757  
##  Class :character   Class :character   1st Qu.: 999925   1st Qu.:182539  
##  Mode  :character   Mode  :character   Median :1007654   Median :193470  
##                                        Mean   :1009379   Mean   :207300  
##                                        3rd Qu.:1016782   3rd Qu.:239163  
##                                        Max.   :1066815   Max.   :271128  
##                                                                          
##     Latitude       Longitude        Lon_Lat         
##  Min.   :40.51   Min.   :-74.25   Length:23585      
##  1st Qu.:40.67   1st Qu.:-73.94   Class :character  
##  Median :40.70   Median :-73.92   Mode  :character  
##  Mean   :40.74   Mean   :-73.91                     
##  3rd Qu.:40.82   3rd Qu.:-73.88                     
##  Max.   :40.91   Max.   :-73.70                     
## 
nypd_data <- nypd_data %>%
mutate(OCCUR_DATE = mdy(OCCUR_DATE)) %>%
rename(DATE = OCCUR_DATE) %>%
select(INCIDENT_KEY, DATE, BORO) %>%
arrange(DATE)
head(nypd_data, n=10)
## # A tibble: 10 x 3
##    INCIDENT_KEY DATE       BORO         
##           <dbl> <date>     <chr>        
##  1      9953252 2006-01-01 MANHATTAN    
##  2      9953247 2006-01-01 BROOKLYN     
##  3      9953246 2006-01-01 BRONX        
##  4    139716503 2006-01-01 BROOKLYN     
##  5      9953248 2006-01-01 QUEENS       
##  6      9953250 2006-01-01 QUEENS       
##  7      9953250 2006-01-01 QUEENS       
##  8      9953245 2006-01-01 BRONX        
##  9      9953249 2006-01-02 BROOKLYN     
## 10      9953257 2006-01-02 STATEN ISLAND
daily_totals <- nypd_data %>%
group_by(DATE,
BORO) %>%
arrange(DATE) %>%
mutate(DAILY_BORO_TOTAL = length(INCIDENT_KEY)) %>% #get daily victim count
pivot_wider(id_cols=DATE,
names_from=BORO,
values_from=DAILY_BORO_TOTAL,
values_fn=mean,
values_fill = 0) %>%
rename(STATEN_ISLAND = `STATEN ISLAND`) %>%
mutate(DAILY_TOTAL = sum(c_across(QUEENS:STATEN_ISLAND))) %>%
ungroup() %>%
pad() %>% #automatically add omitted dates (days with 0 shootings)
setnafill(type='const', fill=0) %>% #replace 'NA' with '0'
mutate( WEEK = (year(DATE) - year(min(DATE)))*52
+ week(DATE) - week(min(DATE)),
YEAR = year(DATE),
MONTH = as.character(lubridate::month(DATE,label=TRUE)))
## pad applied on the interval: day
head(daily_totals)
## # A tibble: 6 x 10
##   DATE       MANHATTAN BROOKLYN BRONX QUEENS STATEN_ISLAND DAILY_TOTAL  WEEK
##   <date>         <dbl>    <dbl> <dbl>  <dbl>         <dbl>       <dbl> <dbl>
## 1 2006-01-01         1        2     2      3             0           3     0
## 2 2006-01-02         0        3     0      0             1           1     0
## 3 2006-01-03         0        2     0      2             0           2     0
## 4 2006-01-04         0        1     1      2             0           2     0
## 5 2006-01-05         0        2     2      0             0           0     0
## 6 2006-01-06         0        0     3      1             0           1     0
## # ... with 2 more variables: YEAR <int>, MONTH <chr>
daily_totals %>%
ggplot(aes(x=DATE, y=DAILY_TOTAL)) +
geom_point(color="tomato", aes(y=DAILY_TOTAL)) +
labs(x="Date", y="Victims", title = "NYC Shooting Victims", subtitle ="Daily total" )

weekly_totals <- daily_totals %>%
select(WEEK,DATE, everything()) %>%
filter(WEEK != 0 & WEEK != 780) %>% #ignore partial weeks
group_by(WEEK) %>%
mutate(across(QUEENS:DAILY_TOTAL, sum)) %>%
rename(WEEKLY_TOTAL = DAILY_TOTAL) %>%
ungroup()
weekly_totals <- data.table(weekly_totals) %>%
unique(by=1)
weekly_totals %>%
ggplot(aes(x=DATE, y=WEEKLY_TOTAL)) +
geom_point(color="tomato", aes(y=WEEKLY_TOTAL)) +
labs(x="Week", y="Victims", title="NYC Shooting Victims", subtitle="Weekly total")

weekly_totals %>%
select(YEAR, WEEKLY_TOTAL) %>%
mutate(YEAR = as.factor(YEAR)) %>%
ggplot(aes(x=YEAR, y=WEEKLY_TOTAL)) +
geom_boxplot() +
labs(x="Year", y="Victims", title="NYC Shooting Victims", subtitle="Weekly totals")

#Includes monthly employment data for New York state from 2006-2020
labor_url <- "https://raw.githubusercontent.com/r-quillen/DTSA-5301/main/BLS_NY_employment.csv"
labor_data <- read_csv(labor_url, show_col_types = FALSE)
labor_data <- labor_data %>%
rename(YEAR = Year,
MONTH = Period,
UNEMP_RATE = `unemployment rate`) %>%
select(YEAR, MONTH, UNEMP_RATE) %>%
mutate(MONTH = as.character(MONTH),
YEAR = as.integer(YEAR))
head(labor_data)
## # A tibble: 6 x 3
##    YEAR MONTH UNEMP_RATE
##   <int> <chr>      <dbl>
## 1  2006 Jan          4.8
## 2  2006 Feb          4.7
## 3  2006 Mar          4.7
## 4  2006 Apr          4.7
## 5  2006 May          4.7
## 6  2006 Jun          4.6
monthly_totals <- daily_totals %>%
select(DATE, YEAR, MONTH, DAILY_TOTAL) %>%
group_by(YEAR, MONTH) %>%
mutate(DAILY_TOTAL = sum(DAILY_TOTAL)) %>%
rename(MONTHLY_TOTAL = DAILY_TOTAL)
monthly_totals <- data.table(monthly_totals) %>%
unique(by=c(2,3)) %>%
full_join(labor_data)
## Joining, by = c("YEAR", "MONTH")
pl1 <- monthly_totals %>%
ggplot(aes(DATE)) +
geom_point(color="tomato", aes(y=UNEMP_RATE)) +
labs(x="", y="Unemployment Rate (%)",
title="Unemployment Rate", subtitle="NY state - seasonally adjusted")
pl2 <- monthly_totals %>%
ggplot(aes(DATE)) +
geom_point(color="dodgerblue", aes(y=MONTHLY_TOTAL)) +
labs(x="", y="Victims",
title="NYC Shooting Victims", subtitle="Monthly totals", )
pl3 <- monthly_totals %>%
ggplot(aes(DATE)) +
geom_point(color="tomato", aes(y=(max(MONTHLY_TOTAL)*UNEMP_RATE/max(UNEMP_RATE)))) +
geom_point(color="dodgerblue", aes(y=MONTHLY_TOTAL)) +
labs(x="", y="", title="Unemployment rate & Observed victims", subtitle="") +
theme(axis.text.y = element_blank())
plot_grid(pl1, pl2, pl3, nrow=1)

model <- lm(MONTHLY_TOTAL ~ UNEMP_RATE,monthly_totals)
monthly_totals <- monthly_totals %>%
mutate(PRED = predict(model),
RESID = resid(model))
p_mod1 <- monthly_totals %>%
ggplot(aes(x=DATE)) +
geom_point(aes(color="Observed", y=MONTHLY_TOTAL)) +
geom_point(aes(color = "Predicted", y=PRED)) +
labs(x="Date", y="Victims", title="Predicted & Observed Victims")
p_mod2 <- monthly_totals %>%
ggplot(aes(x=DATE)) +
geom_point(color="dodgerblue", aes(y=RESID)) +
labs(x="Date", y="residuals", title="Model residuals")
summary(model)
## 
## Call:
## lm(formula = MONTHLY_TOTAL ~ UNEMP_RATE, data = monthly_totals)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -27.105  -6.733  -0.889   5.401  34.517 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  14.0624     1.9993   7.034 4.17e-11 ***
## UNEMP_RATE    1.4841     0.2954   5.023 1.23e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.254 on 178 degrees of freedom
## Multiple R-squared:  0.1242, Adjusted R-squared:  0.1192 
## F-statistic: 25.23 on 1 and 178 DF,  p-value: 1.228e-06
plot_grid(p_mod1, p_mod2, rel_widths=c(3,2))