library(tidyverse)
library(ggmap) 
library(rgdal)
library(readr)
library(dplyr)
library(highcharter)
library(ggplot2)
library(viridis)
library(lubridate)
library(xts)
library(purrr)
library(plotly)
library(knitr)

I am analyzing the Chicago Crimes data set published by the City of Chicago. I am primarily interested in the trend in overall crime from the start of this data, 2012, through 2017. Is it going up? Going down?

# loading
# Reads the raw export; the CSV is looked up relative to the working directory.
chi_crime_2012_16 <- read_csv("Chicago_Crimes_2012_to_2017.csv")

# naming and selecting columns
# `Location` is split on the comma into two text halves ("lat", "lon"); the
# columns used downstream are then kept and renamed to short names in one
# step (same names and column order as before).
crimeData <- chi_crime_2012_16 %>%
  separate(Location, into = c("lat", "lon"), sep = ",") %>%
  dplyr::select(
    case = `Case Number`,
    date = Date,
    offense = `Primary Type`,
    setting = `Location Description`,
    ward = Ward,
    lon,
    lat,
    arrest = Arrest
  )

# Ward is an identifier, not a quantity, so keep it as text.
crimeData$ward <- as.character(crimeData$ward)

# cleaning longitude, latitude data
# parse_number() extracts the numeric part of each half; a half that is only
# ")" or "(" is read as missing.
# NOTE(review): presumably Location looks like "(lat, lon)" - confirm.
crimeData$lon <- parse_number(crimeData$lon, na = ")")
crimeData$lat <- parse_number(crimeData$lat, na = "(")

#analyzing for missing values - there's a lot; i'll filter where i see fitting
sum(is.na(crimeData))
## [1] 75839
# formatting date variables
# Parse the timestamp text once into a stand-alone POSIXlt vector and read the
# calendar fields from that. POSIXlt is deliberately NOT stored in the data
# frame: it is a list-like type, and a tibble column may be coerced to
# POSIXct, after which `$mday`, `$hour`, ... no longer work.
# tz = "UTC" only makes the wall-clock fields independent of the machine's
# local time zone; no time-zone conversion is applied to the values.
date_parts <- as.POSIXlt(
  crimeData$date,
  format = "%m/%d/%Y %I:%M:%S %p",
  tz = "UTC"
)

crimeData$day <- as.factor(date_parts$mday)
crimeData$hour <- as.factor(date_parts$hour)
crimeData$month <- as.factor(date_parts$mon + 1)      # $mon is 0-based
crimeData$year <- as.factor(date_parts$year + 1900)   # $year counts from 1900
crimeData$weekday <- as.factor(date_parts$wday + 1)   # $wday: 0 = Sunday

# the date column needs to be a "date" type for visualizations
crimeData$date <- as.Date(date_parts)

To do this, I will build an interactive time series plot of the data with the highcharter package, so that overall trends can be explored in a meaningful way.

# summarizing total reports by time - tseries1
# NOTE(review): na.omit() drops a row if ANY column is missing (e.g. lon/lat),
# so reports without coordinates are not counted here, while the per-offense
# summaries later in the script count every row - confirm this is intended.
crime_by_date <- na.omit(crimeData) %>%
  group_by(date) %>%
  summarise(total = n())

tseries_crime <- xts(
  crime_by_date$total,
  order.by = as.POSIXct(crime_by_date$date)
)

# summarizing arrests by time - tseries2
# The arrest flag is normalised to upper-case text before comparing, so the
# filter works whether read_csv() delivered the column as text ("True") or as
# a logical (TRUE). Comparing a logical column to 'True' directly would match
# nothing. %in% never yields NA, so no all-NA rows are created by the subset.
is_arrest <- toupper(as.character(crimeData$arrest)) %in% "TRUE"

arrests_by_date <- na.omit(crimeData[is_arrest, ]) %>%
  group_by(date) %>%
  summarise(total = n())

tseries_arrests <- xts(
  arrests_by_date$total,
  order.by = as.POSIXct(arrests_by_date$date)
)

# visualizing time series
# Two daily series on one interactive chart: all reports, and the subset that
# ended in an arrest.
ts <- hchart(tseries_crime, name = "Crimes") %>%
  hc_add_series(tseries_arrests, name = "Arrests") %>%
  hc_add_theme(hc_theme_darkunica()) %>%
  hc_credits(
    enabled = TRUE,
    text = "Sources: City of Chicago Administration and The City of Chicago Police Department",
    style = list(fontSize = "12px")
  ) %>%
  hc_title(text = "Time Series Plot of Chicago Crime and Arrests") %>%
  hc_legend(enabled = TRUE)
ts

Looks like it is going down. There is a lot of jumping up and down, but the overall trend is decreasing. Also interesting: if you zoom in, you can see that January 1st and December 31st have high spikes in crime reports. I’ll try to figure out which crimes are being committed on those days in a bit!

The gap between crimes reported and arrests made (the two lines) does not seem to vary much over time, which suggests that police are not arresting more or less often than before. No change in arrest policy is reflected in the data.

I will now look at these trends by crime type, to see whether some crimes are going up while others are decreasing. This might be a bit overwhelming because there are many different offenses recorded. I’ll drop some uninformative categories, but that will still leave us with 27 graphs from this output:

# summary table crime type and date
# One row per (offense, date) with the number of reports; six offense labels
# are then excluded. The result stays grouped by offense, as summarize() only
# peels off the last grouping variable.
crimes_by_type_day <- crimeData %>%
  group_by(offense, date) %>%
  summarize(total = n()) %>%
  filter(
    !is.na(offense),
    !(offense %in% c(
      "CONCEALED CARRY LICENSE VIOLATION",
      "NON-CRIMINAL",
      "NON-CRIMINAL (SUBJECT SPECIFIED)",
      "NON - CRIMINAL",
      "OTHER NARCOTIC VIOLATION",
      "OTHER OFFENSE"
    ))
  )

# One small line chart per remaining offense; each panel gets its own axes.
crimes_broad <- ggplot(crimes_by_type_day, aes(x = date, y = total)) +
  geom_line(color = "red") +
  facet_wrap(~offense, scales = "free") +
  ggtitle(label = "Breakdown of Crime Trends by Type of Offense") +
  scale_y_continuous(name = "Reports of Violations")

crimes_broad

Yup! Pretty dazzling to the eyes, and I should do some work to clean up the lines in the denser plots. I’ll come back to this and change the scales. A few plots I want to zoom in on: Narcotics is going down tremendously! Motor vehicle theft is also going down! We have some funky graphs, like public indecency. I bet that’s because police are less likely to report that one violation specifically; it’ll most likely be tagged with peace violations? Arson has an interesting spike. Any significant increases in the trends? Nope!

# Daily report counts for the NARCOTICS offense only.
narc_cases <- filter(crimes_by_type_day, offense == "NARCOTICS")

# Line chart of daily narcotics reports over time.
ggplot(narc_cases, aes(x = date, y = total)) +
  geom_line(color = "red") +
  ggtitle(label = "Breakdown of Narcotic Cases Over Time") +
  scale_y_continuous(name = "Reports of Violations")

An uplifting story! Narcotic cases have been declining over time, to almost nothing every day! Yay!

# Daily report counts for the MOTOR VEHICLE THEFT offense only.
gta_cases <- crimes_by_type_day %>%
  filter(offense == "MOTOR VEHICLE THEFT")

# Line chart of daily vehicle-theft reports over time.
# The plot title previously read "Vechile"; the spelling is corrected here
# because it is shown to the reader.
gta_cases %>%
  ggplot(aes(x = date, y = total)) +
  geom_line(color = "red") +
  ggtitle(label = "Breakdown of Vehicle Theft Over Time") +
  scale_y_continuous(name = "Reports of Violations")

After 2013, some kind of policy must have been introduced! That’s a notable transition.

# Daily report counts for the ARSON offense only.
arson_cases <- crimes_by_type_day %>%
  filter(offense == "ARSON")

# line graph
arson_cases %>%
  ggplot(aes(x = date, y = total)) +
  geom_line(color = "red") +
  ggtitle(label = "Breakdown of Arson Cases Over Time") +
  scale_y_continuous(name = "Reports of Violations")

# boxplot
# color/fill belong on the geom: passed to ggplot() they are silently ignored
# and the box is drawn in the default colours. The fill is made translucent
# so the red median line stays visible against the red box.
arson_cases %>%
  ggplot(aes(x = 1, y = total)) +
  geom_boxplot(color = "red", fill = "red", alpha = 0.3) +
  coord_flip() +
  ggtitle("Boxplot of Arson Daily Totals: Visualizing Counts of Extreme Arson")

# Days with at least 6 arson reports. `offense` is selected explicitly: the
# data is grouped by offense, so dplyr would re-add that column anyway (with a
# message). dplyr::select is qualified, as in the column selection at the top
# of the script, to guard against masking by other attached packages.
arson_days <- arson_cases %>%
  filter(total >= 6) %>%
  dplyr::select(offense, date, total)

kable(arson_days)
offense date total
ARSON 2012-01-31 6
ARSON 2013-08-10 6
ARSON 2014-05-28 11
ARSON 2014-06-10 8
ARSON 2015-07-05 8
ARSON 2015-09-14 8
ARSON 2015-11-03 6
ARSON 2016-09-10 6
ARSON 2016-09-16 7
ARSON 2016-10-29 8

Yup, there are some strange, arson-heavy days in the city! But not too bad. I drew a boxplot to check for clear outliers, and lo and behold, there were some. The table lists the dates of all of these outlier days. I googled the date of our 11-count day, but found nothing too surprising.

Next, let’s look at the spread of crime across the days of the week.

# Total number of reports for each weekday code (one row per weekday level).
crimes_by_day_of_week <- summarize(group_by(crimeData, weekday), total = n())

# Bar per weekday, with totals rescaled to units of 100,000 reports and a red
# reference line drawn at 2.0 on that scale.
crimes_by_day_of_week %>%
  ggplot(aes(x = weekday, y = total/100000)) +
  geom_col(fill = "darkolivegreen1") +
  geom_hline(yintercept = 2.0, color = "red") +
  ggtitle(label = "Crimes Reported Across Days of the Week") +
  scale_y_continuous(name = "Total Reports in One Hundred Thousands")

Here Sunday = 1 and Saturday = 7; I will relabel the days when I have more time. Friday gets the most crime, which is to be expected.

Now, let’s look at a map of the spread of crime!

#checking for na's in important values: ward and offense
which(is.na(crimeData$ward))  
##  [1]   43787   90749   94282  132445  186233  220292  221418  374948
##  [9]  471538  559041  715003  739210  933405 1137039
which(is.na(crimeData$offense))   
## integer(0)
#summary table - crimes by ward
# One row per ward with the number of reports in each of twelve offense
# categories. `sum(offense == "X")` counts matching rows; offense has no NAs
# (checked above), so these sums are never NA.
crimeWard <- crimeData %>%
  group_by(ward) %>%
  summarize(Weapon_Violations = sum(offense == "WEAPONS VIOLATION"),
            Battery = sum(offense == "BATTERY"),
            Narcotics = sum(offense == "NARCOTICS"),
            Criminal_Damage = sum(offense ==  "CRIMINAL DAMAGE"),
            Theft = sum(offense == "THEFT"),
            Deceptive_Practice = sum(offense == "DECEPTIVE PRACTICE"),
            Criminal_Trespass = sum(offense == "CRIMINAL TRESPASS"),
            Assault = sum(offense == "ASSAULT"),
            Burglary = sum(offense == "BURGLARY"),
            Robbery = sum(offense == "ROBBERY"),
            Interference_w_Public_Officer = sum(offense == "INTERFERENCE WITH PUBLIC OFFICER"),
            Auto_Theft = sum(offense == "MOTOR VEHICLE THEFT")) %>%
  # TotalCrime is the sum of the twelve category counts above. The previous
  # `sum(Weapon_Violations:Auto_Theft)` used `:` as the base-R sequence
  # operator inside sum(), i.e. it added up every integer between the first
  # and last count instead of adding the columns. The columns are added
  # explicitly here; no per-ward regrouping is needed because summarize()
  # already returns exactly one row per ward.
  mutate(TotalCrime = Weapon_Violations + Battery + Narcotics +
           Criminal_Damage + Theft + Deceptive_Practice +
           Criminal_Trespass + Assault + Burglary + Robbery +
           Interference_w_Public_Officer + Auto_Theft)

# removing NAs
# Drops the summary row for reports whose ward is missing.
crimeWard <- crimeWard[complete.cases(crimeWard), ]

# Ward Boundaries - reading shapefile and fortifying to dataframe
# The layer "wardboundaries" is read from a fixed local folder, so this step
# only runs on a machine where that path exists.
boundaries <- readOGR(
  dsn = "C:/Users/Aawin11/Documents/Data Science/R/Projects/Chicago Crime",
  layer = "wardboundaries"
)
## OGR data source with driver: ESRI Shapefile 
## Source: "C:/Users/Aawin11/Documents/Data Science/R/Projects/Chicago Crime", layer: "wardboundaries"
## with 50 features
## It has 3 fields
boundaries <- fortify(boundaries)

# The seven fortified columns are renamed by position.
# NOTE(review): the sixth column (renamed "ward") is presumably fortify()'s
# polygon id rather than a ward attribute from the shapefile - confirm that
# these ids really line up with the ward numbers in the crime data.
names(boundaries) <- c(
  "b_long", "b_lat", "order",
  "hole", "piece", "ward",
  "group"
)

# removing bizarre ward boudaries
# NOTE(review): if `ward` is character here, `ward >= 1` is a text comparison
# against "1", not a numeric one - verify which rows this actually drops.
boundaries <- boundaries %>%
  filter(ward >= 1)

#merging crime by ward summary table to ward boundaries
# Each ward's summary row is repeated once per boundary vertex of that ward
# (a deliberate one-to-many join), so the counts can be used as polygon fill.
# NOTE(review): the join assumes `ward` holds the same identifiers, as text,
# in both tables - confirm against the boundary data.
crimeWard <- crimeWard %>%
  left_join(boundaries, by = c("ward"))

#google map image of chicago - base layer of map
c_map <- get_map(location = "chicago", zoom = 10)

# Draws the ward polygons over a base map, shaded by one count column.
#
# ward_polygons: data frame with b_long, b_lat, group and the count columns.
# fill_col:      name (string) of the column used for the fill colour.
# title:         plot title.
# basemap:       ggmap raster used as the background layer.
# Returns the ggplot object; at top level it is printed automatically.
plot_ward_map <- function(ward_polygons, fill_col, title, basemap) {
  ggmap(basemap) +
    geom_polygon(
      data = ward_polygons,
      # .data[[fill_col]] looks the column up by its string name.
      aes(x = b_long, y = b_lat, group = group, fill = .data[[fill_col]]),
      alpha = 0.9
    ) +
    scale_x_continuous(name = "Longitude", limits = c(-87.9, -87.5)) +
    scale_y_continuous(name = "Latitude", limits = c(41.6, 42.1)) +
    scale_fill_continuous(
      name = "Number of Cases",
      low = "dodgerblue", high = "dodgerblue4"
    ) +
    ggtitle(title)
}

#the map - visualizations
# Same map recipe for each measure; only the fill column and title differ.
plot_ward_map(
  crimeWard, "TotalCrime",
  "Geospatial Mapping of Total Crime Across Chicago", c_map
)

plot_ward_map(
  crimeWard, "Robbery",
  "Geospatial Mapping of Robbery Across Chicago", c_map
)

plot_ward_map(
  crimeWard, "Narcotics",
  "Geospatial Mapping of Narcotics Across Chicago", c_map
)

plot_ward_map(
  crimeWard, "Assault",
  "Geospatial Mapping of Assault Across Chicago", c_map
)