1 A Socio-Economic Investigation into Crime

1.1 Introduction

This project provided us with the opportunity of showcasing many of the skills we have learned throughout this course and of applying them to an investigation into datasets of our choosing. We narrowed our scope to a few datasets containing information on social economic information, namely unemployment crime data in NYC. We hoped that this investigation would reveal valuable information that could be used to formulate policy proposals.

1.2 Sources

1.3 Environment Setup

source("environment_setup.R", echo = T, prompt.echo = "", spaced = F)

## if (!require("dplyr")) install.packages("dplyr")
## if (!require("RSocrata")) install.packages("RSocrata")
## if (!require("tidyverse")) install.packages("tidyverse")
## if (!require("ggplot2")) install.packages("ggplot2")
## if (!require("readxl")) install.packages("readxl")
## if (!require("plyr")) install.packages("plyr")
## if (!require("treemap")) install.packages("treemap")
## if (!require("leaflet")) install.packages("leaflet")
## if (!require("forcats")) install.packages("forcats")
## if (!require("ggExtra")) install.packages("ggExtra")
## if (!require("GGally")) install.packages("GGally")

1.4 NYPD Complaint Data Import via API

variable	description
`arrest_date`	Exact date of arrest for the reported event.
`ofns_desc`	Description of internal classification corresponding with KY code (more general category than PD description).
`arrest_boro`	Borough of arrest. B(Bronx), S(Staten Island), K(Brooklyn), M(Manhattan), Q(Queens)
`language`	language of school where professor received education: english or non-english.
`age_group`	Perpetrator’s age within a category.
`perp_sex`	Perpetrator’s sex description.
`perp_race`	Perpetrator’s race description.
`x_coord_cd`	Midblock X-coordinate for New York State Plane Coordinate System, Long Island Zone, NAD 83, units feet (FIPS 3104).
`y_coord_cd`	Midblock Y-coordinate for New York State Plane Coordinate System, Long Island Zone, NAD 83, units feet (FIPS 3104)
`latitude`	Latitude coordinate for Global Coordinate System, WGS 1984, decimal degrees (EPSG 4326)
`longitude`	Longitude coordinate for Global Coordinate System, WGS 1984, decimal degrees (EPSG 4326)

Load the data into R using tthe RSocratat API.

source("arrests_dataset.R", echo = F, prompt.echo = "", spaced = F)
head(arrests_df, 10)

1.5 2014-2018 Unemployment Data Import via .csv

source("unemployed_dataset.R", echo = F, prompt.echo = "", spaced = F)
head(bronx)

head(queens)

head(brooklyn)

head(manhattan)

head(staten)

2 Data Transformation

clean_table <- function (table) {
  table_content <- table %>%
    na.omit()
  colnames(table_content) = c("arrest_boro","year","month","labor_force","employed","unemployed","unemployment_rate")
  final_table <- table_content %>%
    select(arrest_boro, year, labor_force, employed, unemployed, unemployment_rate) 
  return(final_table)
}
bronx_income <- clean_table(bronx)
queens_income <- clean_table(queens)
brooklyn_income <- clean_table(brooklyn)
manhattan_income <- clean_table(manhattan)
staten_income <- clean_table(staten)
income_table <- Reduce(function(...) merge(..., all=T), list(bronx_income, queens_income, brooklyn_income, manhattan_income, staten_income))
income_table

2.1 Data Analysis

theme_set(theme_bw())
ggplot(income_table, aes(x=arrest_boro, y=unemployment_rate)) + 
    geom_boxplot(main = "Different boxplot for uneployment rate in the 5 counties over years",
        ylab = "Unemployment rate %",   
        xlab = "",  
        col = "blue",          
        border = "blue") +
facet_wrap(~year, scale="free") + 
  theme(axis.text.x = element_text(angle=60, vjust = 0.6))

## Warning: Ignoring unknown parameters: main, ylab, xlab, border

by_income <- income_table %>%
  group_by(arrest_boro, year) %>%
  dplyr::summarise(avg_unemployment_rate = max(unemployment_rate)) %>%
  arrange(desc(year))
by_income

## I need to change symbole to full name to be able to merge. 
arrests_df$arrest_boro <- revalue(arrests_df$arrest_boro, c("B"="Bronx", "Q"="Queens", "K"="Brooklyn", "S"="Staten Island", "M"="Manhattan")) 
head(arrests_df)

murder_counts <- arrests_df %>%
  group_by(arrest_boro, year, perp_race) %>%
  dplyr::summarise(murder_counts = n()) %>%
  arrange(desc(year))
murder_counts

merged <- Reduce(function(...) merge(..., all=T), list(murder_counts, by_income)) %>%
  na.omit() %>%
  arrange(desc(year))
merged

# Data Prep
merged$boro <- rownames(merged)  # create new column for boro names
merged$unemployment_z <- round((merged$avg_unemployment_rate - mean(merged$avg_unemployment_rate))/sd(merged$avg_unemployment_rate), 2)  # compute normalized 
merged$unemployyment_type <- ifelse(merged$unemployment_z < 0, "below", "above")  # above / below avg flag
merged <- merged[order(merged$unemployment_z), ]  # sort
# merged$arrest_boro <- factor(merged$arrest_boro, levels = merged$arrest_boro)  # convert to factor to retain sorted order in plot.
merged$arrest_boro <- factor(merged$arrest_boro, levels = rev(unique(merged$arrest_boro)), ordered=TRUE)
# Diverging Barcharts
ggplot(merged, aes(x=`arrest_boro`, y=unemployment_z, label=unemployment_z)) + 
  geom_bar(stat='identity', aes(fill=unemployyment_type), width=.5)  +
  scale_fill_manual(name="Unemployment", 
                    labels = c("Above Average", "Below Average"), 
                    values = c("above"="#f8766d", "below"="#00ba38")) + 
  labs(subtitle="Normalised unemployment rate from 'merged'", 
       title= "Diverging Bars of unemployment rates in boro") + 
  coord_flip()

The Bronx seems to have the largest amount of unemployment rate which is above the average and in the same time it has the highest crime amongst other counties.

# Data Prep
merged$boro <- rownames(merged)  # create new column for boro names
merged$crime_z <- round((merged$murder_counts - mean(merged$murder_counts))/sd(merged$murder_counts), 2)  # compute normalized 
merged$murder_type <- ifelse(merged$crime_z < 0, "below", "above")  # above / below avg flag
merged <- merged[order(merged$crime_z), ]  # sort
# merged$arrest_boro <- factor(merged$arrest_boro, levels = merged$arrest_boro)  # convert to factor to retain sorted order in plot.
merged$arrest_boro <- factor(merged$arrest_boro, levels = rev(unique(merged$arrest_boro)), ordered=TRUE)
# Diverging Barcharts
ggplot(merged, aes(x=`arrest_boro`, y=crime_z, label=crime_z)) + 
  geom_bar(stat='identity', aes(fill=murder_type), width=.5)  +
  scale_fill_manual(name="Crimes", 
                    labels = c("Above Average", "Below Average"), 
                    values = c("above"="#f8766d", "below"="#00ba38")) + 
  labs(subtitle="Normalised murders'", 
       title= "Diverging Bars of number of murders per county") + 
  coord_flip()

As illustrated from the diverge bar that Bronx also has above average number of crimes

theme_set(theme_classic())
# Plot
g <- ggplot(arrests_df, aes(arrest_boro))
g + geom_density(aes(fill=factor(age_group)), alpha=0.8) + 
    labs(title="Density plot", 
         subtitle="Number of crimes per boro per age group distribution",
         caption="Source: by_boro",
         x="Borough",
         fill="# crimes committed per age group")

We can see that the 25-44 age category is the most age category for committing a crime. As also demonstrated, the Bronx seems to have the highest density of crimes am

map <- murder_counts %>% filter(year == 2018)
treemap(map, #Your data frame object
        index=c("perp_race","arrest_boro"),  #A list of your categorical variables
        vSize = "murder_counts",  #This is your quantitative variable
        type="categorical", #Type sets the organization and color scheme of your treemap
        vColor = "arrest_boro", #Type sets the organization and color scheme of your treemap
        palette = "Set1",  #Select your color palette from the RColorBrewer presets or make your own.
        title="Crime distribution committed by different races - year 2018", #Customize your title
        fontsize.title = 14 #Change the font size of the title
        )

## Warning in `[.data.table`(dtfDT, , `:=`("c", fact), with = FALSE):
## with=FALSE ignored, it isn't needed when using :=. See ?':=' for examples.

Let’s study the evolution of crime over the period of interest (2014-2018).

grouped_boro <- arrests_df %>% 
  group_by(year, arrest_boro) %>% 
  dplyr::summarize(count = n()) %>% 
  arrange(desc(count))

What the plot below reveals is that overall crime is decreasing for all boroughs of NYC. The data year over year is very similar, appearing to simply scale down over time. What we can note as suprising is the fact that total crime between Manhattan and Brooklyn is at fairly similar levels. Total crime is aggregated without accounting for different types of crime so we will further our investigation by dissecting crime per borough.

ggplot(grouped_boro, aes(x = reorder(year, -count), y = count, fill = arrest_boro)) + 
  geom_bar(stat = 'identity', position = position_dodge()) +
  scale_y_continuous(labels=function(x) format(x, big.mark = ",", scientific = FALSE), breaks = seq(0,120000,10000)) +
  xlab("year") + ylab("total crime") + ggtitle("Crime by Borough Time Series") +
  scale_fill_brewer(palette="Blues") + theme_minimal()

grouped_offenses <- arrests_df %>% 
  group_by(year, arrest_boro, ofns_desc) %>% 
  dplyr::summarize(count = n())

t5 <- grouped_offenses %>% top_n(5)

## Selecting by count

ggplot(t5, aes(x = reorder(arrest_boro, -count), y = count, fill=ofns_desc)) + 
  geom_bar(stat = 'identity', position = position_dodge()) +
  scale_y_continuous(labels=function(x) format(x, big.mark = ",", scientific = FALSE), breaks = seq(0,80000,5000)) +
  xlab("borough") + ylab("crime rate") + ggtitle("Most Common Crimes by Borough 2014-2018") +
  scale_fill_brewer(palette="Blues") + theme_minimal()

Perhaps we can comment on effective crime measures by looking at the least common crimes. Fix this plot

ggplot(grouped_offenses %>% top_n(-25), aes(x = reorder(ofns_desc, -count), y = count)) + 
  geom_bar(stat = 'identity', fill= 'lightblue') + 
  coord_flip() + 
  xlab("offense") + ylab("count")  + ggtitle("Least Common Crimes 2014-2018") +
  theme(axis.text.x = element_text(size=10), axis.text.y = element_text(size=8))

drugs <- arrests_df %>% filter(ofns_desc == 'DANGEROUS DRUGS') %>% group_by(year, arrest_boro) %>% dplyr::summarize(count = n())

ggplot(drugs, aes(x = year, y = count, color = arrest_boro)) +
  geom_jitter() + 
  xlab("year") + ylab("count")  + ggtitle("Dangerous Drugs Crime by Bourough") +
  theme(axis.text.x = element_text(size=10), axis.text.y = element_text(size=8))

ggplot(arrests_df, aes(x = age_group, fill = perp_sex)) + 
  geom_histogram(stat = "count", position=position_dodge()) +
  scale_fill_brewer(palette="Blues") + 
  xlab("age group") + ylab("count")  + ggtitle("Perpetrator Age Group and Gender Distribution") +
  scale_y_continuous(labels=function(x) format(x, big.mark = ",", scientific = FALSE), breaks = seq(0,7000000,100000))

grouped_arrests <- arrests_df %>% group_by(year, arrest_boro, ofns_desc, age_group, perp_sex, perp_race) %>% 
  dplyr::summarize(count = n())

source("complain_dataset.R", echo = F, prompt.echo = "", spaced = F)
dat

res <- dat %>%
  group_by(year, boro_nm) %>%
  filter(boro_nm != "") %>%
  dplyr::summarise(crimes = n()) 
res

by_crime <- res %>% mutate_if(is.character, str_to_lower) 
by_crime

dat2 <- income_table
dat2$unemployed <- as.numeric(gsub(",", "", dat2$unemployed))
names(dat2)[1] <- "boro_nm"
subet <- dat2[c(1:2,5)]
by_unemployment <- subet %>%
  group_by(year, boro_nm) %>%
  dplyr::summarise(unemployed = round(mean(unemployed), 2)) %>%
  mutate_if(is.character, str_to_lower)
by_unemployment

joined <- Reduce(function(...) merge(..., all=T), list(by_crime, by_unemployment)) %>%
  na.omit()
joined

To decide whether we can make a predictive model, the first step is to see if there appears to be a relationship between our predictor and response variables (in this case crimes and unemployed). Let’s do some exploratory data visualization. I’ll use the ggpairs() function from the GGally package to create a plot matrix to see how the variables relate to one another.

ggpairs(data=joined, columns = 3:4, title="Crimes employed data")

There is an important aspect that correlation doesn’t imply causation, so I will use the regression to see if I can use this variable as a predictor.

theme_set(theme_bw())  # pre-set the bw theme.
g <- ggplot(joined, aes(unemployed, crimes)) + 
  geom_jitter() +
  geom_smooth(method="lm") +
  labs(subtitle="Crimes per unemployed: crimes vs unemployed numbers", 
       y="Crimes", 
       x="Unemployed", 
       title="Scatterplot with overlapping points", 
       caption="Source: joined")
ggMarginal(g, type = "boxplot", fill="transparent")

m_unemployed <- lm(crimes ~ unemployed, data = joined)
summary(m_unemployed)

## 
## Call:
## lm(formula = crimes ~ unemployed, data = joined)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -39272 -15713    742  16157  32022 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.690e+04  9.987e+03   1.692    0.109    
## unemployed  1.833e+00  2.083e-01   8.798 9.77e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20450 on 17 degrees of freedom
## Multiple R-squared:  0.8199, Adjusted R-squared:  0.8093 
## F-statistic:  77.4 on 1 and 17 DF,  p-value: 9.772e-08

As shown by the simple linear regression, unemployment has an R-squared ~0.8 which makes it a better predictor for the crimes number. However, may be there are other factors that can be a confounding factors that increase crimes, for instance race or sex. Let’s predict use linear prediction

predict(m_unemployed, data.frame(unemployed = 65783.31))

##        1 
## 137472.7

# 137472.7  actual 140338 from joined table

so close number; however, we need more variables to make the model predict better. Now, I will inspect the other categorical independent variables associated with the number of commited murders.

by_cat <- dat %>%
  group_by(year, boro_nm, susp_age_group, susp_sex, susp_race) %>%
  filter(boro_nm != "" & susp_age_group !="") %>%
  dplyr::summarise(crimes = n()) 
by_cat

Categorize age-group preparing for dummy variables

dummy_df <- by_cat
dummy_df$susp_age_group <- revalue(by_cat$susp_age_group, c("<18"="child", "18-24"="youth", "25-44"="adult", "45-64"="senior", "65+" = "senior")) 
dummy_df

Converting the independent variables into factors

#function to categorize and indicate the cofounder variable
#
# function to categorise - dummy variables
filtered <- dummy_df %>%
  filter(susp_age_group %in% c("child", "youth", "adult", "senior")) 
## change to factor level
filtered$boro_nm <- as.factor(filtered$boro_nm)
filtered$susp_age_group <- as.factor(filtered$susp_age_group)
filtered$susp_sex <- as.factor(filtered$susp_sex)
filtered$susp_race <- as.factor(filtered$susp_race)
# contrasts(filtered$susp_race)
# reference variable is AMERICAN INDIAN/ALASKAN NATIVE, we can rereference by using relevel(var, ref = "new_ref")

# contrasts(filtered$susp_sex) -> female is the reference
# filtered %>%
#   mutate(susp_race = relevel(susp_race, ref = "BLACK")) -> filtered
# contrasts(filtered$susp_race)  #- > now BLACK is the ref
m_race <- lm(crimes ~ susp_race, data = filtered)
summary(m_race) # no significance

## 
## Call:
## lm(formula = crimes ~ susp_race, data = filtered)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1222.0  -321.8  -102.8    -0.7 13004.0 
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                          15.01      68.18   0.220 0.825737    
## susp_raceASIAN / PACIFIC ISLANDER   124.78      89.45   1.395 0.163135    
## susp_raceBLACK                     1208.02      87.54  13.800  < 2e-16 ***
## susp_raceBLACK HISPANIC             199.50      89.50   2.229 0.025912 *  
## susp_raceUNKNOWN                     81.64      87.73   0.931 0.352126    
## susp_raceWHITE                      338.78      87.73   3.862 0.000116 ***
## susp_raceWHITE HISPANIC             618.94      87.87   7.044 2.45e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1052 on 2343 degrees of freedom
## Multiple R-squared:  0.1253, Adjusted R-squared:  0.123 
## F-statistic: 55.93 on 6 and 2343 DF,  p-value: < 2.2e-16

m_all <- lm(crimes ~ susp_age_group + susp_sex + susp_race + boro_nm, data = filtered)
anova(m_all)

Taking other variables (susp_age_group, susp_sex, and boro_nm) into account, it can be seen that the categorical variable boro_nm the most significant variable with lowest Sum Sq.

summary(m_all)

## 
## Call:
## lm(formula = crimes ~ susp_age_group + susp_sex + susp_race + 
##     boro_nm, data = filtered)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1552.2  -450.7   -81.4   277.9 11664.1 
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                         406.16      81.54   4.981 6.78e-07 ***
## susp_age_groupchild                -926.82      58.55 -15.829  < 2e-16 ***
## susp_age_groupsenior               -832.74      50.48 -16.496  < 2e-16 ***
## susp_age_groupyouth                -600.08      57.57 -10.423  < 2e-16 ***
## susp_sexM                           604.39      43.96  13.749  < 2e-16 ***
## susp_sexU                          -367.70      47.88  -7.679 2.33e-14 ***
## susp_raceASIAN / PACIFIC ISLANDER   304.60      77.46   3.932 8.65e-05 ***
## susp_raceBLACK                     1454.98      76.20  19.095  < 2e-16 ***
## susp_raceBLACK HISPANIC             374.76      77.55   4.833 1.43e-06 ***
## susp_raceUNKNOWN                    319.68      76.32   4.189 2.91e-05 ***
## susp_raceWHITE                      587.14      76.34   7.691 2.13e-14 ***
## susp_raceWHITE HISPANIC             853.51      76.41  11.170  < 2e-16 ***
## boro_nmBROOKLYN                      97.37      58.94   1.652   0.0987 .  
## boro_nmMANHATTAN                    -41.08      58.91  -0.697   0.4857    
## boro_nmQUEENS                       -86.35      58.82  -1.468   0.1422    
## boro_nmSTATEN ISLAND               -455.92      60.83  -7.495 9.37e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 906.9 on 2334 degrees of freedom
## Multiple R-squared:  0.3522, Adjusted R-squared:  0.3481 
## F-statistic:  84.6 on 15 and 2334 DF,  p-value: < 2.2e-16

It can be seen that being Black is significantly associated with an average increase of 1454 in crime compared to other races with R-squared ~ 0.35. Having some information shown on plots would give a good idea as to which categorical variables are good predictive features and can be used to build a machine learning model. Best plots for factor to factor variable comparison would be any of a jitter plot or heat map. I would use a jitter plot in this case for all of our factor-factor plots.

ggplot(data = filtered, mapping = aes(x = susp_race, y = crimes)) + 
  geom_jitter(aes(colour = susp_race))

ggplot(data = filtered, mapping = aes(x = susp_sex, y = crimes)) + 
  geom_jitter(aes(colour = susp_sex))

ggplot(data = filtered, mapping = aes(x = susp_age_group, y = crimes)) + 
  geom_jitter(aes(colour = susp_age_group))

ggplot(data = filtered, mapping = aes(x = boro_nm, y = crimes)) + 
  geom_jitter(aes(colour = boro_nm)) + 
  theme(axis.text.x = element_text(angle=65, vjust=0.6))

Building the predictive model to predict crimes

# # round 1
# crimeTrain <- createDataPartition(filtered$crimes, list = FALSE, times = 1, p = 0.67)
# filtered <- filtered[crimeTrain,]
# crimeTest <- filtered[-crimeTrain,]
# 
# crime_model <- train(crimes ~ ., data=filtered, method = "knn")
# crimeTestPred <- predict(crime_model, crimeTest) 
# newTest <- crimeTest
# newTest$crimes <- as.factor(newTest$crimes)
# newTest$year <- as.factor(newTest$year)
# confusionMatrix(crimeTestPred, newTest$crimes)
# #Accuracy -  0.5897

# library(caret)
# index = createDataPartition(y=filtered$susp_race, p=0.7, list=FALSE)
# train.set = filtered[index, ]
# test.set = filtered[-index, ]
# dim(train.set)
# dim(test.set)

# fitControl <- trainControl(method = "repeatedcv",   
#                            number = 10,     # number of folds
#                            repeats = 10,   # repeated ten times
#                            search = "random")    # Hyper-parameters random seach
# 
# model.cv <- train(crimes ~ .,
#                   data = filtered,
#                   method = "ridge",
#                   trControl = fitControl,
#                   preProcess = c('scale','center'))
# model.cv

# crime.pred = predict(crime.tree, newdata = test.set)
# table(crime.pred, test.set$susp_race)
# error.rate = round(mean(crime.pred != test.set$susp_race),2)
# error.rate

# plot(crime.tree$finalModel, uniform=TRUE,
#      main="Classification Tree")
# text(crime.tree$finalModel, use.n.=TRUE, all=TRUE, cex=.8)

statistical_exploratory_analysis

Salma Elshahawy

11/18/2019