This project gave us the opportunity to showcase many of the skills we have learned throughout this course and to apply them to an investigation of datasets of our choosing. We narrowed our scope to a few datasets containing socioeconomic information, namely unemployment and crime data in NYC. We hoped that this investigation would reveal valuable information that could be used to formulate policy proposals.
# Run the environment setup script, echoing its statements into the report
# without a prompt prefix or blank lines between them.
source("environment_setup.R", echo = TRUE, prompt.echo = "", spaced = FALSE)
## if (!require("dplyr")) install.packages("dplyr")
## if (!require("RSocrata")) install.packages("RSocrata")
## if (!require("tidyverse")) install.packages("tidyverse")
## if (!require("ggplot2")) install.packages("ggplot2")
## if (!require("readxl")) install.packages("readxl")
## if (!require("plyr")) install.packages("plyr")
## if (!require("treemap")) install.packages("treemap")
## if (!require("leaflet")) install.packages("leaflet")
## if (!require("forcats")) install.packages("forcats")
## if (!require("ggExtra")) install.packages("ggExtra")
## if (!require("GGally")) install.packages("GGally")
| variable | description |
|---|---|
| arrest_date | Exact date of arrest for the reported event. |
| ofns_desc | Description of internal classification corresponding with KY code (more general category than PD description). |
| arrest_boro | Borough of arrest. B(Bronx), S(Staten Island), K(Brooklyn), M(Manhattan), Q(Queens) |
| language | language of school where professor received education: english or non-english. |
| age_group | Perpetrator’s age within a category. |
| perp_sex | Perpetrator’s sex description. |
| perp_race | Perpetrator’s race description. |
| x_coord_cd | Midblock X-coordinate for New York State Plane Coordinate System, Long Island Zone, NAD 83, units feet (FIPS 3104). |
| y_coord_cd | Midblock Y-coordinate for New York State Plane Coordinate System, Long Island Zone, NAD 83, units feet (FIPS 3104). |
| latitude | Latitude coordinate for Global Coordinate System, WGS 1984, decimal degrees (EPSG 4326). |
| longitude | Longitude coordinate for Global Coordinate System, WGS 1984, decimal degrees (EPSG 4326). |
Load the data into R using the RSocrata API.
# Run the arrests script silently, then preview the first ten rows of the
# `arrests_df` table it is expected to define.
source("arrests_dataset.R", echo = FALSE, prompt.echo = "", spaced = FALSE)
head(arrests_df, 10)

# Run the unemployment script silently, then preview the per-borough tables
# it is expected to define (one table per borough).
source("unemployed_dataset.R", echo = FALSE, prompt.echo = "", spaced = FALSE)
head(bronx)
head(queens)
head(brooklyn)
head(manhattan)
head(staten)
#' Standardise one borough's unemployment table.
#'
#' Drops every row that contains an `NA`, renames the seven columns by
#' position, and returns the table without the `month` column.
#'
#' @param table A data frame with exactly seven columns. They are renamed
#'   positionally to `arrest_boro`, `year`, `month`, `labor_force`,
#'   `employed`, `unemployed` and `unemployment_rate`, so the input is
#'   assumed to hold those quantities in that order.
#' @return The complete rows of `table` with the columns `arrest_boro`,
#'   `year`, `labor_force`, `employed`, `unemployed` and
#'   `unemployment_rate`.
clean_table <- function(table) {
  column_names <- c(
    "arrest_boro", "year", "month", "labor_force",
    "employed", "unemployed", "unemployment_rate"
  )

  # Names are assigned by position, so a table of any other width would be
  # silently mislabelled; fail loudly instead.
  if (!is.data.frame(table) || ncol(table) != length(column_names)) {
    stop(
      "`table` must be a data frame with exactly ", length(column_names),
      " columns.",
      call. = FALSE
    )
  }

  table_content <- na.omit(table)
  colnames(table_content) <- column_names

  # Keep everything except the month column.
  table_content[, setdiff(column_names, "month"), drop = FALSE]
}
# Standardise each borough's unemployment table.
bronx_income <- clean_table(bronx)
queens_income <- clean_table(queens)
brooklyn_income <- clean_table(brooklyn)
manhattan_income <- clean_table(manhattan)
staten_income <- clean_table(staten)

# Fold the five cleaned tables into one. They share every column, so each
# pairwise merge joins on all columns, and all = TRUE keeps the rows of both
# sides.
income_table <- Reduce(
  function(left, right) merge(left, right, all = TRUE),
  list(
    bronx_income, queens_income, brooklyn_income,
    manhattan_income, staten_income
  )
)
income_table
# Use the black-and-white theme for the plots that follow.
theme_set(theme_bw())

# Box plots of the unemployment rate per borough, one panel per year with
# independent axes. The title and axis labels are set through labs():
# geom_boxplot() has no main/xlab/ylab/border parameters and would discard
# them with an "unknown parameters" warning.
ggplot(income_table, aes(x = arrest_boro, y = unemployment_rate)) +
  geom_boxplot(colour = "blue") +
  facet_wrap(~year, scales = "free") +
  labs(
    title = "Different boxplot for unemployment rate in the 5 counties over years",
    x = "",
    y = "Unemployment rate %"
  ) +
  theme(axis.text.x = element_text(angle = 60, vjust = 0.6))
# Average unemployment rate per borough and year, newest year first.
by_income <- income_table %>%
  group_by(arrest_boro, year) %>%
  # The column is named, and used downstream, as an average, so it is
  # aggregated with mean() rather than max().
  dplyr::summarise(avg_unemployment_rate = mean(unemployment_rate)) %>%
  # Drop the remaining grouping so later steps see a plain table.
  dplyr::ungroup() %>%
  arrange(desc(year))
by_income
# Replace the one-letter borough codes with full borough names so that
# `arrest_boro` can be used as a merge key later on.
arrests_df[["arrest_boro"]] <- revalue(
  arrests_df[["arrest_boro"]],
  replace = c(
    B = "Bronx",
    Q = "Queens",
    K = "Brooklyn",
    S = "Staten Island",
    M = "Manhattan"
  )
)
head(arrests_df)
# Number of rows in `arrests_df` per borough, year and perpetrator race,
# newest year first.
murder_counts <- arrests_df %>%
  group_by(arrest_boro, year, perp_race) %>%
  dplyr::summarise(murder_counts = n()) %>%
  # summarise() leaves the result grouped; drop the grouping so later steps
  # see a plain table.
  dplyr::ungroup() %>%
  arrange(desc(year))
murder_counts
# Full outer merge of the counts and the unemployment summary on their
# shared columns. na.omit() then removes every row with a missing value,
# which includes rows that were present in only one of the two tables.
merged <- merge(murder_counts, by_income, all = TRUE) %>%
  na.omit() %>%
  arrange(desc(year))
merged
# ---- Data preparation for the unemployment chart ----
# Copy the current row names into a `boro` column (these are the row labels
# of `merged`, not borough names).
merged[["boro"]] <- rownames(merged)

# z-score of the average unemployment rate over all rows, rounded to two
# decimals.
merged[["unemployment_z"]] <- with(
  merged,
  round(
    (avg_unemployment_rate - mean(avg_unemployment_rate)) /
      sd(avg_unemployment_rate),
    2
  )
)

# Flag rows whose z-score is negative as "below", all others as "above".
merged[["unemployyment_type"]] <- ifelse(
  merged[["unemployment_z"]] < 0,
  "below",
  "above"
)

# Sort rows by ascending z-score.
merged <- merged[order(merged[["unemployment_z"]]), ]

# Ordered factor whose levels are the boroughs in reverse order of first
# appearance after sorting; this fixes the axis order in the plot.
merged[["arrest_boro"]] <- factor(
  merged[["arrest_boro"]],
  levels = rev(unique(merged[["arrest_boro"]])),
  ordered = TRUE
)

# ---- Diverging bar chart ----
# NOTE(review): `merged` appears to hold several rows per borough (one per
# year and race), in which case each bar is a stack of those rows' z-scores
# -- confirm this is intended.
ggplot(
  data = merged,
  mapping = aes(x = arrest_boro, y = unemployment_z, label = unemployment_z)
) +
  geom_bar(aes(fill = unemployyment_type), stat = "identity", width = 0.5) +
  scale_fill_manual(
    name = "Unemployment",
    labels = c("Above Average", "Below Average"),
    values = c(above = "#f8766d", below = "#00ba38")
  ) +
  labs(
    title = "Diverging Bars of unemployment rates in boro",
    subtitle = "Normalised unemployment rate from 'merged'"
  ) +
  coord_flip()
The Bronx appears to have the highest unemployment rate, well above the average, and at the same time it has the highest crime among the five counties.
# ---- Data preparation for the crime chart ----
# Copy the current row names into a `boro` column (these are the row labels
# of `merged`, not borough names).
merged[["boro"]] <- rownames(merged)

# z-score of the per-row counts over all rows, rounded to two decimals.
merged[["crime_z"]] <- with(
  merged,
  round((murder_counts - mean(murder_counts)) / sd(murder_counts), 2)
)

# Flag rows whose z-score is negative as "below", all others as "above".
merged[["murder_type"]] <- ifelse(merged[["crime_z"]] < 0, "below", "above")

# Sort rows by ascending z-score.
merged <- merged[order(merged[["crime_z"]]), ]

# Ordered factor whose levels are the boroughs in reverse order of first
# appearance after sorting; this fixes the axis order in the plot.
merged[["arrest_boro"]] <- factor(
  merged[["arrest_boro"]],
  levels = rev(unique(merged[["arrest_boro"]])),
  ordered = TRUE
)

# ---- Diverging bar chart ----
# NOTE(review): `merged` appears to hold several rows per borough (one per
# year and race), in which case each bar is a stack of those rows' z-scores
# -- confirm this is intended.
ggplot(
  data = merged,
  mapping = aes(x = arrest_boro, y = crime_z, label = crime_z)
) +
  geom_bar(aes(fill = murder_type), stat = "identity", width = 0.5) +
  scale_fill_manual(
    name = "Crimes",
    labels = c("Above Average", "Below Average"),
    values = c(above = "#f8766d", below = "#00ba38")
  ) +
  labs(
    title = "Diverging Bars of number of murders per county",
    subtitle = "Normalised murders'"
  ) +
  coord_flip()
As the diverging bar chart illustrates, the Bronx also has an above-average number of crimes.
# Use the classic theme for the plots that follow.
theme_set(theme_classic())

# Plot
# Number of rows in `arrests_df` per borough, split by age group. The borough
# axis is categorical, so the rows are counted with geom_bar(); a kernel
# density estimate is only meaningful for a continuous x variable and would
# not reflect how many rows each borough has.
g <- ggplot(arrests_df, aes(arrest_boro))
g + geom_bar(aes(fill = factor(age_group)), position = "dodge", alpha = 0.8) +
  labs(
    title = "Bar chart",
    subtitle = "Number of crimes per boro per age group distribution",
    caption = "Source: by_boro",
    x = "Borough",
    y = "Number of crimes",
    fill = "# crimes committed per age group"
  )
We can see that the 25-44 age group is the most common age category for committing a crime. As also demonstrated, the Bronx seems to have the highest density of crimes among the boroughs.
# Keep only the 2018 rows of the per-borough, per-race counts.
map <- filter(murder_counts, year == 2018)

# Treemap of the 2018 counts: rectangles are nested by race and then borough,
# sized by the count, and coloured by borough.
treemap(
  map,
  index = c("perp_race", "arrest_boro"), # nesting order of the categories
  vSize = "murder_counts", # quantitative variable that sets rectangle area
  vColor = "arrest_boro", # variable that determines the rectangle colours
  type = "categorical", # colour by the categories of `vColor`
  palette = "Set1", # RColorBrewer palette name
  title = "Crime distribution committed by different races - year 2018",
  fontsize.title = 14 # font size of the title
)
## Warning in `[.data.table`(dtfDT, , `:=`("c", fact), with = FALSE):
## with=FALSE ignored, it isn't needed when using :=. See ?':=' for examples.