library(tidyverse)
library(ggmap)
library(rgdal)
library(readr)
library(dplyr)
library(highcharter)
library(ggplot2)
library(viridis)
library(lubridate)
library(xts)
library(purrr)
library(plotly)
library(knitr)
I am analyzing the Chicago Crimes data set published by the City of Chicago. I am first interested in the trend in overall crime from the start of this data, 2012, to 2017. Is it going up? Going down?
# loading
chi_crime_2012_16 <- read_csv("Chicago_Crimes_2012_to_2017.csv")

# naming and selecting columns: `Location` is split on the comma into a
# latitude piece and a longitude piece, then the kept columns get short names
crimeData <- chi_crime_2012_16 %>%
  separate(Location, into = c("lat", "lon"), sep = ",") %>%
  dplyr::select(`Case Number`, Date, `Primary Type`,
                `Location Description`, Ward, lon, lat, Arrest)
colnames(crimeData) <- c("case", "date", "offense", "setting",
                         "ward", "lon", "lat", "arrest")
crimeData$ward <- as.character(crimeData$ward)

# cleaning longitude, latitude data: parse_number() keeps only the numeric
# part of each piece (presumably stripping the parentheses that wrap the
# original "(lat, lon)" pair -- confirm against the raw file)
crimeData$lon <- parse_number(crimeData$lon, na = ")")
crimeData$lat <- parse_number(crimeData$lat, na = "(")

# analyzing for missing values - there's a lot; i'll filter where i see fitting
sum(is.na(crimeData))
## [1] 75839

# formatting date variables
# The timestamp is parsed once into a local POSIXlt value so its calendar
# fields can be read directly. It is deliberately NOT written back into the
# data frame: POSIXlt is a list-based type that is not meant to live in a
# data frame / tibble column.
reported_at <- as.POSIXlt(crimeData$date, format = "%m/%d/%Y %I:%M:%S %p")
crimeData$day <- as.factor(reported_at$mday)
crimeData$hour <- as.factor(reported_at$hour)
crimeData$month <- as.factor(reported_at$mon + 1)      # mon is 0-based
crimeData$year <- as.factor(reported_at$year + 1900)   # year counts from 1900
crimeData$weekday <- as.factor(reported_at$wday + 1)   # wday is 0-based, 0 = Sunday

# the date column needs to be a "date" type for visualizations; the calendar
# date is taken from the already-parsed timestamp (no format string needed)
crimeData$date <- as.Date(reported_at)
To do this, I will build an interactive time series plot of the data with the highcharter package, so that overall trends can be measured in a meaningful way.
# summarizing total reports by time - tseries1
# A daily count only needs the date, so a report is dropped only when its date
# is missing. Dropping every row with *any* missing field (as na.omit() on the
# whole table does) would also discard reports that merely lack coordinates,
# a setting or a ward, and would undercount the daily totals.
crime_by_date <- crimeData %>%
  filter(!is.na(date)) %>%
  group_by(date) %>%
  summarise(total = n())
tseries_crime <- xts(crime_by_date$total,
                     order.by = as.POSIXct(crime_by_date$date))

# summarizing arrests by time - tseries2
# The arrest flag is compared case-insensitively as text so the filter works
# whether the column was read as the strings "True"/"False" or as logicals
# (a logical TRUE never equals the string 'True').
arrests_by_date <- crimeData %>%
  filter(!is.na(date), toupper(as.character(arrest)) == "TRUE") %>%
  group_by(date) %>%
  summarise(total = n())
tseries_arrests <- xts(arrests_by_date$total,
                       order.by = as.POSIXct(arrests_by_date$date))

# visualizing time series: crimes as the base series, arrests layered on top
ts <- hchart(tseries_crime, name = "Crimes") %>%
  hc_add_series(tseries_arrests, name = "Arrests") %>%
  hc_add_theme(hc_theme_darkunica()) %>%
  hc_credits(
    enabled = TRUE,
    text = "Sources: City of Chicago Administration and The City of Chicago Police Department",
    style = list(fontSize = "12px")
  ) %>%
  hc_title(text = "Time Series Plot of Chicago Crime and Arrests") %>%
  hc_legend(enabled = TRUE)
ts
Looks like it is going down. There is a lot of jumping up and down, but the overall trend is decreasing. Also interesting: you can zoom in to see that January 1st and December 31st have some high spikes in crime reports. I’ll try to figure out which crimes are being committed on those days in a bit!
The gap between crimes reported and arrests made (represented by the two different lines) does not seem to vary, which serves as a quick check on whether police are arresting more or less often. Nothing appears to have changed in that policy, as far as the data reflects.
I will now look at these trends by crime type, to see if some crimes may be going up while others are decreasing. This might be a bit overwhelming because there are many different offenses recorded. I’ll leave out some less meaningful report types, but that will still leave us with 27 graphs from this output:
# summary table crime type and date
# Administrative / catch-all offense labels are removed up front (rows without
# an offense label are removed as well); what remains is counted per offense
# and day. The result stays grouped by offense.
crimes_by_type_day <- crimeData %>%
  filter(
    !is.na(offense),
    !(offense %in% c("CONCEALED CARRY LICENSE VIOLATION",
                     "NON-CRIMINAL",
                     "NON-CRIMINAL (SUBJECT SPECIFIED)",
                     "NON - CRIMINAL",
                     "OTHER NARCOTIC VIOLATION",
                     "OTHER OFFENSE"))
  ) %>%
  group_by(offense, date) %>%
  summarize(total = n())

# one small line chart per offense, each panel with its own axis ranges
crimes_broad <- ggplot(crimes_by_type_day, aes(x = date, y = total)) +
  geom_line(color = "red") +
  facet_wrap(~offense, scales = "free") +
  scale_y_continuous(name = "Reports of Violations") +
  ggtitle(label = "Breakdown of Crime Trends by Type of Offense")
crimes_broad
Yup! Pretty dazzling to the eyes, and I should do some work to clean up the lines in the denser plots. I’ll come back to this and change the scales. A few plots I want to zoom in on: Narcotics is going down tremendously! Grand theft auto is also going down! We have some funky graphs, like public indecency. I bet that’s because police are less likely to report that one violation specifically; it’ll most likely be tagged with peace violations? Arson has an interesting spike. Any significant increases in the trends? Nope!
# daily narcotics counts, pulled from the per-offense summary
narc_cases <- filter(crimes_by_type_day, offense == "NARCOTICS")

# line chart of narcotics reports per day
ggplot(narc_cases, aes(x = date, y = total)) +
  geom_line(color = "red") +
  scale_y_continuous(name = "Reports of Violations") +
  ggtitle(label = "Breakdown of Narcotic Cases Over Time")
An uplifting story! Narcotic cases have been declining over time, to almost nothing every day! Yay!
# daily motor vehicle theft counts, pulled from the per-offense summary
gta_cases <- crimes_by_type_day %>%
  filter(offense == "MOTOR VEHICLE THEFT")

# line chart of vehicle theft reports per day
# (plot title spelling corrected: "Vechile" -> "Vehicle")
gta_cases %>%
  ggplot(aes(x = date, y = total)) +
  geom_line(color = "red") +
  ggtitle(label = "Breakdown of Vehicle Theft Over Time") +
  scale_y_continuous(name = "Reports of Violations")
After 2013, some kind of policy change must’ve happened! That’s a notable transition.
# daily arson counts, pulled from the per-offense summary
arson_cases <- crimes_by_type_day %>%
  filter(offense == "ARSON")

# line graph
arson_cases %>%
  ggplot(aes(x = date, y = total)) +
  geom_line(color = "red") +
  ggtitle(label = "Breakdown of Arson Cases Over Time") +
  scale_y_continuous(name = "Reports of Violations")

# boxplot
# The red styling is set on geom_boxplot(): colour/fill arguments handed to
# ggplot() itself are not aesthetics and are silently ignored. The fill is
# made translucent so the red median line stays visible against it.
arson_cases %>%
  ggplot(aes(x = 1, y = total)) +
  geom_boxplot(color = "red", fill = "red", alpha = 0.4) +
  coord_flip() +
  ggtitle("Boxplot of Arson Daily Totals: Visualizing Counts of Extreme Arson")

# days with an unusually high number of arson reports (6 or more); the data
# is still grouped by offense, so that column is carried along into the table
arson_days <- arson_cases %>%
  filter(total >= 6) %>%
  dplyr::select(date, total)
kable(arson_days)
| offense | date | total |
|---|---|---|
| ARSON | 2012-01-31 | 6 |
| ARSON | 2013-08-10 | 6 |
| ARSON | 2014-05-28 | 11 |
| ARSON | 2014-06-10 | 8 |
| ARSON | 2015-07-05 | 8 |
| ARSON | 2015-09-14 | 8 |
| ARSON | 2015-11-03 | 6 |
| ARSON | 2016-09-10 | 6 |
| ARSON | 2016-09-16 | 7 |
| ARSON | 2016-10-29 | 8 |
Yup, there are some strange, arson-heavy days in the city! But not too bad. I mapped a boxplot to look at whether there are any clear outliers, and lo and behold, there were! The table accounts for the dates of all of these outlier days. I googled the date of our 11-count evening, but found nothing too surprising.
Next, let’s look at the spread of crime across the days of the week.
# number of reports for each weekday code (1-7)
crimes_by_day_of_week <- summarize(group_by(crimeData, weekday), total = n())

# bar chart of the weekday totals, expressed in units of 100,000 reports,
# with a red reference line at 2.0 (i.e. 200,000 reports)
ggplot(crimes_by_day_of_week, aes(x = weekday, y = total/100000)) +
  geom_col(fill = "darkolivegreen1") +
  geom_hline(yintercept = 2.0, color = "red") +
  scale_y_continuous(name = "Total Reports in One Hundred Thousands") +
  ggtitle(label = "Crimes Reported Across Days of the Week")
Sunday = 1, Saturday = 7. I will rename them when I have more time. Friday gets the most crime, which can be expected.
Now, let’s look at a map of the spread of crime!
#checking for na's in important values: ward and offense
which(is.na(crimeData$ward))
## [1] 43787 90749 94282 132445 186233 220292 221418 374948
## [9] 471538 559041 715003 739210 933405 1137039
which(is.na(crimeData$offense))
## integer(0)

#summary table - crimes by ward
# One row per ward with a count for each tracked offense category.
# TotalCrime is the sum of those twelve category counts. It is written out
# column by column on purpose: inside mutate(), `a:b` on two count columns is
# the numeric sequence operator (a, a+1, ..., b), not a column range, so
# sum(Weapon_Violations:Auto_Theft) does not add the columns together.
crimeWard <- crimeData %>%
  group_by(ward) %>%
  summarize(Weapon_Violations = sum(offense == "WEAPONS VIOLATION"),
            Battery = sum(offense == "BATTERY"),
            Narcotics = sum(offense == "NARCOTICS"),
            Criminal_Damage = sum(offense == "CRIMINAL DAMAGE"),
            Theft = sum(offense == "THEFT"),
            Deceptive_Practice = sum(offense == "DECEPTIVE PRACTICE"),
            Criminal_Trespass = sum(offense == "CRIMINAL TRESPASS"),
            Assault = sum(offense == "ASSAULT"),
            Burglary = sum(offense == "BURGLARY"),
            Robbery = sum(offense == "ROBBERY"),
            Interference_w_Public_Officer = sum(offense == "INTERFERENCE WITH PUBLIC OFFICER"),
            Auto_Theft = sum(offense == "MOTOR VEHICLE THEFT")) %>%
  mutate(TotalCrime = Weapon_Violations + Battery + Narcotics +
           Criminal_Damage + Theft + Deceptive_Practice +
           Criminal_Trespass + Assault + Burglary + Robbery +
           Interference_w_Public_Officer + Auto_Theft)

# removing NAs (this drops the row that collects reports with no ward)
crimeWard <- crimeWard[complete.cases(crimeWard), ]
# Ward Boundaries - reading shapefile and fortifying to dataframe
# NOTE(review): the shapefile location is a machine-specific absolute path.
ward_shapes <- readOGR(dsn = "C:/Users/Aawin11/Documents/Data Science/R/Projects/Chicago Crime", layer = "wardboundaries")
## OGR data source with driver: ESRI Shapefile
## Source: "C:/Users/Aawin11/Documents/Data Science/R/Projects/Chicago Crime", layer: "wardboundaries"
## with 50 features
## It has 3 fields

# fortify() labels every polygon with its feature ID (the `id` column), which
# is just the feature's position in the shapefile and NOT the ward number.
# Build an explicit feature-ID -> ward-number lookup from the attribute table
# so that crime counts are joined to the polygon of the right ward.
# NOTE(review): assumes one of the 3 attribute fields is named "ward"
# (any letter case); the script stops with an error if that is not so.
ward_field <- names(ward_shapes@data)[tolower(names(ward_shapes@data)) == "ward"]
if (length(ward_field) != 1) {
  stop("Expected exactly one 'ward' field in the ward boundary shapefile.",
       call. = FALSE)
}
ward_lookup <- data.frame(
  id = vapply(ward_shapes@polygons, function(p) p@ID, character(1)),
  # normalise to the same text form as crimeData$ward (e.g. "7", not "07")
  ward = as.character(as.integer(as.character(ward_shapes@data[[ward_field]]))),
  stringsAsFactors = FALSE
)

boundaries <- fortify(ward_shapes)
boundaries$ward <- ward_lookup$ward[match(boundaries$id, ward_lookup$id)]
boundaries <- boundaries[, c("long", "lat", "order", "hole", "piece",
                             "ward", "group")]
colnames(boundaries) <- c("b_long", "b_lat", "order",
                          "hole", "piece", "ward",
                          "group")

# removing boundaries that could not be tied to a ward number
boundaries <- filter(boundaries, !is.na(ward))

#merging crime by ward summary table to ward boundaries
crimeWard <- crimeWard %>%
  left_join(boundaries, by = c("ward"))
#google map image of chicago - base layer of map
c_map <- get_map(location = "chicago", zoom = 10)

# Draws one ward-level choropleth on top of the Chicago base map.
#   fill_var  - name (string) of the crimeWard column used to shade the wards
#   title     - plot title
#   base_map  - ggmap raster used as the background layer
#   ward_data - ward polygons joined with the per-ward crime counts
# Returns the ggplot object; at top level it is printed automatically.
plot_ward_crime <- function(fill_var, title,
                            base_map = c_map, ward_data = crimeWard) {
  ggmap(base_map) +
    geom_polygon(data = ward_data,
                 aes(x = b_long, y = b_lat, group = group,
                     fill = .data[[fill_var]]),
                 alpha = 0.9) +
    scale_x_continuous(name = "Longitude", limits = c(-87.9, -87.5)) +
    scale_y_continuous(name = "Latitude", limits = c(41.6, 42.1)) +
    scale_fill_continuous(name = "Number of Cases",
                          low = "dodgerblue", high = "dodgerblue4") +
    ggtitle(title)
}

#the map - visualizations
plot_ward_crime("TotalCrime", "Geospatial Mapping of Total Crime Across Chicago")
plot_ward_crime("Robbery", "Geospatial Mapping of Robbery Across Chicago")
plot_ward_crime("Narcotics", "Geospatial Mapping of Narcotics Across Chicago")
plot_ward_crime("Assault", "Geospatial Mapping of Assault Across Chicago")