library(tidyverse)
library(lubridate)
library(caret)
library(e1071)
library(networkD3)
library(ggplot2)
library(plotly)
library(stringr)
library(janitor)
library(purrr)
library(readxl)
library(readr)
library(kableExtra)
library(flextable)
library(broom)
library(randomForest)
### Read in Datasets
weather_files <- list.files("Inputs/Weather")[-str_detect(string = list.files("Inputs/Weather"),pattern = "\\.zip")]
crime_files <- list.files("Inputs/Crime")[-str_detect(string = list.files("Inputs/Crime"),pattern = "\\.zip")]
crimes <- map(.x = paste0("Inputs/Crime/",crime_files),.f = ~read_csv(.x) %>% 
                 mutate(src = .x,
    type= str_replace_all(string = src,pattern = "Inputs/Crime/|\\.csv",replacement = "")) %>% 
      select(-src)) %>% 
  bind_rows() %>% 
  clean_names()
weather <- map(.x = paste0("Inputs/Weather/",weather_files),.f = ~read_csv(.x) %>% 
                 mutate(src = .x,
    type= str_replace_all(string = src,pattern = "Inputs/Weather/|\\.csv",replacement = "")) %>% 
      select(-src) %>% 
      clean_names()) 
city_attributes <- weather[[1]]
weather_df <- weather[-c(1,5)] %>% 
  bind_rows()
weather_desc <- weather[[5]]
crimes_clean <- crimes %>% 
  filter(!is.na(id)) %>% 
  mutate(day = day(as.POSIXlt(date, format="%m/%d/%Y %I:%M:%S %p")),
         mon =month(as.POSIXlt(date, format="%m/%d/%Y %I:%M:%S %p")),
         yr = year(as.POSIXlt(date, format="%m/%d/%Y %I:%M:%S %p")),
         day_of_wk = wday(as.POSIXlt(date, format="%m/%d/%Y %I:%M:%S %p"), label = TRUE),
          hr_of_day = hour(as.POSIXlt(date, format="%m/%d/%Y %I:%M:%S %p"))) %>% 
  mutate(date = make_date(year = yr,month = mon,day = day)) %>% 
  select(-yr) %>% 
  distinct() %>% 
  arrange(id,desc(latitude), desc(longitude)) %>% 
  distinct(id, .keep_all = T)

weather_clean <- weather_df %>% 
  filter(!is.na(chicago)) %>% 
  mutate(day = day(as.POSIXlt(datetime, format="%m/%d/%Y %I:%M:%S %p")),
         mon =month(as.POSIXlt(datetime, format="%m/%d/%Y %I:%M:%S %p")),
         mon_label =month(as.POSIXlt(datetime, format="%m/%d/%Y %I:%M:%S %p"), label = T),
         yr = year(as.POSIXlt(datetime, format="%m/%d/%Y %I:%M:%S %p")),
         day_of_wk = wday(as.POSIXlt(datetime, format="%m/%d/%Y %I:%M:%S %p"), label = TRUE),
          hr_of_day = hour(as.POSIXlt(datetime, format="%m/%d/%Y %I:%M:%S %p"))) %>% 
  mutate(date = make_date(year = yr,month = mon,day = day)) 

chicago_clean_day <- weather_clean %>% 
  group_by(date, yr, mon, mon_label, day,type) %>% 
  summarise(chicago = mean(chicago)) %>% 
  spread(key = type,value = chicago) 

chic_desc <- weather_desc %>% 
  filter(!is.na(chicago)) %>% 
  mutate(day = day(as.POSIXlt(datetime, format="%m/%d/%Y %I:%M:%S %p")),
         mon =month(as.POSIXlt(datetime, format="%m/%d/%Y %I:%M:%S %p")),
         mon_label =month(as.POSIXlt(datetime, format="%m/%d/%Y %I:%M:%S %p"), label = T),
         yr = year(as.POSIXlt(datetime, format="%m/%d/%Y %I:%M:%S %p")),
         day_of_wk = wday(as.POSIXlt(datetime, format="%m/%d/%Y %I:%M:%S %p"), label = TRUE),
          hr_of_day = hour(as.POSIXlt(datetime, format="%m/%d/%Y %I:%M:%S %p"))) %>% 
  mutate(date = make_date(year = yr,month = mon,day = day)) %>% 
  distinct(date, chicago) %>% 
  mutate(flag = 1) %>% 
  spread(key = chicago, value = flag, fill = 0) %>% 
  clean_names() 

chicago_weather <- chicago_clean_day %>% 
  left_join(chic_desc) %>% 
  mutate(rain_all = ifelse(thunderstorm == 1 | thunderstorm_with_drizzle == 1 | thunderstorm_with_heavy_rain == 1 |
                               thunderstorm_with_light_drizzle == 1 | thunderstorm_with_light_rain == 1 | thunderstorm_with_rain == 1 | very_heavy_rain ==1 |
           heavy_intensity_drizzle == 1 | heavy_intensity_rain == 1 | light_rain == 1 | moderate_rain == 1 |proximity_shower_rain == 1 | 
             proximity_thunderstorm ==1 |
           proximity_thunderstorm_with_drizzle == 1 | proximity_thunderstorm_with_rain == 1 | light_intensity_drizzle == 1 |squalls == 1,1,0),
         snow_all = ifelse(freezing_rain == 1 |light_snow == 1 | light_rain_and_snow == 1 |snow == 1|heavy_snow ==1,1,0)) %>% 
  select(-mon,-day)

Part I

Data Cleaning

kable(crimes %>% 
  select(everything()) %>% 
summarise_all(funs(sum(is.na(.)))) %>% 
gather(key = variable,value = missing) %>% 
  filter(missing != 0), format = "html",col.names = c("Column", "Missing (N)"),align = "c")
Column Missing (N)
id 1
case_number 7
location_description 1990
arrest 1
domestic 1
beat 1
district 92
ward 700225
community_area 702092
x_coordinate 105574
y_coordinate 105575
year 1
latitude 105575
longitude 105576
location 105574


* This table tells us what variables in the Crimes table have missing values. Upon further investigation, we will remove the row for the missing ID. The other rows do appear to depict actual crimes and should be considered in further analyses, and are likely mistakes in data entry.
* Most notably, there are over 100,000 observations without longitude/latitude data available. For any geospatial analyses, these observations need to be removed or imputed based on values from other crimes in the same district or community.
* For modeling purposes, it is prudent to remove observations with missing values when using the variable as a predictor for violent crime.
* In addition, there are roughly 1 million observations that appear to be duplicates, so these observations will be removed.

* In the weather data, we primarily care about the weather in Chicago. There are roughly 2000 observations where the value for Chicago is missing for a particular weather category. To consolidate the data, these are removed. * For summary statistics below, multiple measurements per day for each weather are considered. For predictive and causal analyses, daily averages are used because there is not a large amount of variance per day.

Violent Crimes

Table 1

violent_map <- crimes_clean %>% 
  distinct(primary_type,description) %>% 
  arrange(primary_type) %>% 
  mutate(violent = case_when(primary_type %in% c("ROBBERY", "CRIM SEXUAL ASSAULT", "HOMICIDE", "ASSAULT", "BATTERY", "KIDNAPPING", "RITUALISM") ~ 1,
                              str_detect(pattern = "SEX ABUSE|SEXUAL ABUSE|VIOLENCE|ASSLT|FORC|SEXUAL EXPLOIT", string = description) ~ 1,
                              TRUE ~ 0))

kable(violent_map %>% filter(violent == 1) %>% 
        select(-violent),
      format = "html",col.names = c("Crime Category", "Description"),align = "c") %>% 
  kable_styling(bootstrap_options = c("striped", "hover")) %>% 
  scroll_box(height = "4in") 
Crime Category Description
ASSAULT AGGRAVATED: HANDGUN
ASSAULT AGGRAVATED:KNIFE/CUTTING INSTR
ASSAULT SIMPLE
ASSAULT AGGRAVATED PO: OTHER DANG WEAP
ASSAULT AGGRAVATED: OTHER DANG WEAPON
ASSAULT AGGRAVATED: OTHER FIREARM
ASSAULT AGG PO HANDS NO/MIN INJURY
ASSAULT AGGRAVATED PO: HANDGUN
ASSAULT PRO EMP HANDS NO/MIN INJURY
ASSAULT AGGRAVATED PO:KNIFE/CUT INSTR
ASSAULT AGGRAVATED PO: OTHER FIREARM
ASSAULT AGG PRO.EMP: OTHER DANG WEAPON
ASSAULT AGG PRO.EMP: OTHER FIREARM
ASSAULT AGG PRO.EMP:KNIFE/CUTTING INST
ASSAULT AGG PRO.EMP: HANDGUN
BATTERY SIMPLE
BATTERY AGGRAVATED: HANDGUN
BATTERY AGGRAVATED:KNIFE/CUTTING INSTR
BATTERY AGGRAVATED: OTHER DANG WEAPON
BATTERY AGG PO HANDS NO/MIN INJURY
BATTERY AGG: HANDS/FIST/FEET NO/MINOR INJURY
BATTERY AGGRAVATED PO: OTHER DANG WEAP
BATTERY DOMESTIC BATTERY SIMPLE
BATTERY AGGRAVATED: OTHER FIREARM
BATTERY AGGRAVATED PO: HANDGUN
BATTERY AGG PRO.EMP: OTHER DANG WEAPON
BATTERY AGGRAVATED PO: KNIFE/CUT INSTR
BATTERY PRO EMP HANDS NO/MIN INJURY
BATTERY AGG PRO.EMP: HANDGUN
BATTERY AGG PRO.EMP:KNIFE/CUTTING INST
BATTERY AGGRAVATED PO: OTHER FIREARM
BATTERY AGGRAVATED DOMESTIC BATTERY: KNIFE/CUTTING INST
BATTERY AGGRAVATED DOMESTIC BATTERY
BATTERY AGG PRO.EMP: OTHER FIREARM
BATTERY AGGRAVATED OF A SENIOR CITIZEN
BATTERY AGGRAVATED OF A CHILD
BATTERY AGGRAVATED DOMESTIC BATTERY: OTHER DANG WEAPON
BATTERY OF UNBORN CHILD
BATTERY AGGRAVATED OF A UNBORN CHILD
BATTERY AGG PO HANDS ETC SERIOUS INJ
BATTERY AGG: HANDS/FIST/FEET SERIOUS INJURY
BATTERY AGGRAVATED DOMESTIC BATTERY: HANDS/FIST/FEET SERIOUS INJURY
BATTERY AGGRAVATED DOMESTIC BATTERY: HANDGUN
BATTERY AGG PRO EMP HANDS SERIOUS INJ
BATTERY AGGRAVATED DOMESTIC BATTERY: OTHER FIREARM
BURGLARY FORCIBLE ENTRY
BURGLARY ATTEMPT FORCIBLE ENTRY
CRIM SEXUAL ASSAULT NON-AGGRAVATED
CRIM SEXUAL ASSAULT AGGRAVATED: OTHER FIREARM
CRIM SEXUAL ASSAULT AGGRAVATED: OTHER DANG WEAPON
CRIM SEXUAL ASSAULT ATTEMPT NON-AGGRAVATED
CRIM SEXUAL ASSAULT AGGRAVATED: OTHER
CRIM SEXUAL ASSAULT AGGRAVATED: HANDGUN
CRIM SEXUAL ASSAULT PREDATORY
CRIM SEXUAL ASSAULT AGGRAVATED: KNIFE/CUT INSTR
CRIM SEXUAL ASSAULT ATTEMPT AGG: KNIFE/CUT INSTR
CRIM SEXUAL ASSAULT ATTEMPT AGG: HANDGUN
CRIM SEXUAL ASSAULT ATTEMPT AGG: OTHER
CRIM SEXUAL ASSAULT ATTEMPT AGG: OTHER DANG WEAPON
CRIM SEXUAL ASSAULT ATTEMPT AGG: OTHER FIREARM
DOMESTIC VIOLENCE DOMESTIC VIOLENCE
HOMICIDE FIRST DEGREE MURDER
HOMICIDE RECKLESS HOMICIDE
HOMICIDE INVOLUNTARY MANSLAUGHTER
KIDNAPPING UNLAWFUL INTERFERE/VISITATION
KIDNAPPING CHILD ABDUCTION/STRANGER
KIDNAPPING UNLAWFUL RESTRAINT
KIDNAPPING KIDNAPPING
KIDNAPPING AGGRAVATED
KIDNAPPING FORCIBLE DETENTION
OFFENSE INVOLVING CHILDREN AGG SEX ASSLT OF CHILD FAM MBR
OFFENSE INVOLVING CHILDREN SEX ASSLT OF CHILD BY FAM MBR
OFFENSE INVOLVING CHILDREN CRIM SEX ABUSE BY FAM MEMBER
OFFENSE INVOLVING CHILDREN AGG CRIM SEX ABUSE FAM MEMBER
OTHER OFFENSE VIO BAIL BOND: DOM VIOLENCE
PUBLIC PEACE VIOLATION ARMED VIOLENCE
RITUALISM AGG RITUAL MUT:KNIFE/CUTTING I
RITUALISM AGG RITUAL MUT:OTH DANG WEAPON
RITUALISM AGG RITUAL MUT:HANDGUN
RITUALISM AGG RIT MUT: HANDS/FIST/FEET NO/MINOR INJURY
RITUALISM AGG RIT MUT: HANDS/FIST/FEET SERIOUS INJURY
ROBBERY ARMED: OTHER DANGEROUS WEAPON
ROBBERY STRONGARM - NO WEAPON
ROBBERY AGGRAVATED VEHICULAR HIJACKING
ROBBERY ARMED: HANDGUN
ROBBERY ATTEMPT: ARMED-OTHER FIREARM
ROBBERY ATTEMPT: STRONGARM-NO WEAPON
ROBBERY ARMED:KNIFE/CUTTING INSTRUMENT
ROBBERY VEHICULAR HIJACKING
ROBBERY AGGRAVATED
ROBBERY ARMED: OTHER FIREARM
ROBBERY ATTEMPT: ARMED-HANDGUN
ROBBERY ATTEMPT: ARMED-KNIFE/CUT INSTR
ROBBERY ATTEMPT: ARMED-OTHER DANG WEAP
ROBBERY ATTEMPT: AGGRAVATED
SEX OFFENSE CRIMINAL SEXUAL ABUSE
SEX OFFENSE AGG CRIMINAL SEXUAL ABUSE
SEX OFFENSE ATT CRIM SEXUAL ABUSE
SEX OFFENSE ATT AGG CRIMINAL SEXUAL ABUSE
SEX OFFENSE ATT AGG CRIM SEXUAL ABUSE
SEX OFFENSE SEXUAL EXPLOITATION OF A CHILD


* This table depicts the crimes classified as violent, and will form the basis of subsequent descriptive and predictive analyses.
* Violent crimes were identified based on the definition from the National Institute of Justice.
* Any primary crime category that suggested an individual or group inflicting violence on themselves or others was categorized as violent. This included obvious crimes such as sexual assualt, homicide, robbery, and kidnapping in addition to more nuanced perpetrations of violence such as sexual abuse, forced entry, and ritualism.

Table 2

crimes_clean <- crimes_clean %>% 
  left_join(violent_map)
summarise_violent <- crimes_clean %>% 
  mutate(total = n()) %>% 
  group_by(primary_type, description,violent, total) %>% 
  summarise(n = n()) %>% 
  filter(violent == 1) %>% 
  mutate(pct = (n/total) * 100) %>% 
  group_by(primary_type) %>% 
  mutate(pct_cat = (sum(n)/total)*100,
         n_cat = sum(n)) %>% 
  arrange(desc(pct_cat), desc(pct)) %>% 
  ungroup() %>% 
  select(primary_type, description,pct, n, pct_cat,n_cat) 

all_violent <- summarise_violent %>% 
  summarise(pct = sum(pct),
            n = sum(n)) %>% 
  mutate(primary_type = "TOTAL VIOLENT CRIME")

category_violent <- summarise_violent %>% 
  distinct(primary_type, pct_cat,n_cat) %>% 
  mutate(n = n_cat,
                pct = pct_cat,
 description = "TOTAL")

summarise_violent <- summarise_violent %>% 
  bind_rows(category_violent) %>% 
  arrange(desc(n_cat),n) %>% 
  select(-n_cat,-pct_cat) %>% 
  bind_rows(all_violent) %>% 
  mutate(pct = paste0(format(round(pct,1),digits = 1,nsmall = 1),"%"))  


my_ft <- flextable(
  summarise_violent)
my_ft <- merge_v(my_ft,j = "primary_type")
my_ft <- theme_box(my_ft)
my_ft <- bold(my_ft, i = ~description == "TOTAL",j = 2:4)
my_ft <- set_header_labels(my_ft, primary_type = "Crime Category", description = "Description", pct = "Percent of All Crimes", n = "Number")
my_ft <- bold(my_ft, i=nrow(summarise_violent),j = 1:4)
my_ft <- align(my_ft, align = "center", part = "all")
my_ft

Crime Category

Description

Percent of All Crimes

Number

BATTERY

AGGRAVATED PO: OTHER FIREARM

0.0%

26

AGG PRO.EMP: OTHER FIREARM

0.0%

29

AGGRAVATED OF A UNBORN CHILD

0.0%

45

OF UNBORN CHILD

0.0%

67

AGG PRO.EMP: HANDGUN

0.0%

126

AGGRAVATED DOMESTIC BATTERY: OTHER FIREARM

0.0%

133

AGG PRO.EMP:KNIFE/CUTTING INST

0.0%

165

AGGRAVATED PO: KNIFE/CUT INSTR

0.0%

204

AGGRAVATED PO: HANDGUN

0.0%

298

AGGRAVATED DOMESTIC BATTERY: HANDGUN

0.0%

330

AGG PRO EMP HANDS SERIOUS INJ

0.0%

401

AGG PO HANDS ETC SERIOUS INJ

0.0%

403

AGGRAVATED OF A CHILD

0.0%

525

AGG PRO.EMP: OTHER DANG WEAPON

0.0%

1201

AGGRAVATED: OTHER FIREARM

0.0%

1281

AGGRAVATED PO: OTHER DANG WEAP

0.0%

1402

AGGRAVATED DOMESTIC BATTERY

0.0%

1546

AGGRAVATED OF A SENIOR CITIZEN

0.0%

1766

AGG: HANDS/FIST/FEET SERIOUS INJURY

0.0%

2008

AGGRAVATED DOMESTIC BATTERY: HANDS/FIST/FEET SERIOUS INJURY

0.0%

2495

AGG: HANDS/FIST/FEET NO/MINOR INJURY

0.1%

4529

AGGRAVATED DOMESTIC BATTERY: KNIFE/CUTTING INST

0.2%

11021

AGG PO HANDS NO/MIN INJURY

0.2%

11904

PRO EMP HANDS NO/MIN INJURY

0.2%

13516

AGGRAVATED DOMESTIC BATTERY: OTHER DANG WEAPON

0.3%

15918

AGGRAVATED:KNIFE/CUTTING INSTR

0.4%

24910

AGGRAVATED: HANDGUN

0.5%

32600

AGGRAVATED: OTHER DANG WEAPON

1.0%

63534

DOMESTIC BATTERY SIMPLE

7.6%

466318

SIMPLE

7.6%

467024

TOTAL

18.2%

1125725

ASSAULT

AGGRAVATED PO: OTHER FIREARM

0.0%

76

AGG PRO.EMP: OTHER FIREARM

0.0%

77

AGG PRO.EMP: HANDGUN

0.0%

275

AGG PRO.EMP:KNIFE/CUTTING INST

0.0%

406

AGGRAVATED PO:KNIFE/CUT INSTR

0.0%

482

AGGRAVATED PO: HANDGUN

0.0%

884

AGG PRO.EMP: OTHER DANG WEAPON

0.0%

955

AGGRAVATED PO: OTHER DANG WEAP

0.0%

1098

AGGRAVATED: OTHER FIREARM

0.0%

1777

AGG PO HANDS NO/MIN INJURY

0.1%

5804

PRO EMP HANDS NO/MIN INJURY

0.2%

12937

AGGRAVATED: OTHER DANG WEAPON

0.4%

22684

AGGRAVATED:KNIFE/CUTTING INSTR

0.4%

26566

AGGRAVATED: HANDGUN

0.6%

40032

SIMPLE

4.3%

262835

TOTAL

6.1%

376888

BURGLARY

ATTEMPT FORCIBLE ENTRY

0.3%

15863

FORCIBLE ENTRY

4.0%

247388

TOTAL

4.3%

263251

ROBBERY

ATTEMPT: ARMED-OTHER FIREARM

0.0%

266

ATTEMPT: AGGRAVATED

0.0%

798

ARMED: OTHER FIREARM

0.0%

1240

ATTEMPT: ARMED-OTHER DANG WEAP

0.0%

1850

ATTEMPT: ARMED-KNIFE/CUT INSTR

0.0%

1943

VEHICULAR HIJACKING

0.1%

4362

ATTEMPT: ARMED-HANDGUN

0.1%

7510

AGGRAVATED VEHICULAR HIJACKING

0.1%

8301

ATTEMPT: STRONGARM-NO WEAPON

0.1%

9111

ARMED:KNIFE/CUTTING INSTRUMENT

0.2%

10794

AGGRAVATED

0.2%

11481

ARMED: OTHER DANGEROUS WEAPON

0.2%

11666

STRONGARM - NO WEAPON

1.3%

81795

ARMED: HANDGUN

1.3%

82410

TOTAL

3.8%

233527

CRIM SEXUAL ASSAULT

ATTEMPT AGG: OTHER FIREARM

0.0%

3

AGGRAVATED: OTHER FIREARM

0.0%

32

ATTEMPT AGG: OTHER DANG WEAPON

0.0%

47

ATTEMPT AGG: HANDGUN

0.0%

132

ATTEMPT AGG: KNIFE/CUT INSTR

0.0%

168

ATTEMPT AGG: OTHER

0.0%

222

AGGRAVATED: OTHER DANG WEAPON

0.0%

265

AGGRAVATED: KNIFE/CUT INSTR

0.0%

970

ATTEMPT NON-AGGRAVATED

0.0%

1085

AGGRAVATED: HANDGUN

0.0%

1552

PREDATORY

0.0%

2289

AGGRAVATED: OTHER

0.1%

3979

NON-AGGRAVATED

0.2%

12863

TOTAL

0.4%

23607

SEX OFFENSE

ATT AGG CRIM SEXUAL ABUSE

0.0%

6

ATT AGG CRIMINAL SEXUAL ABUSE

0.0%

126

SEXUAL EXPLOITATION OF A CHILD

0.0%

558

ATT CRIM SEXUAL ABUSE

0.0%

780

AGG CRIMINAL SEXUAL ABUSE

0.1%

4988

CRIMINAL SEXUAL ABUSE

0.1%

8070

TOTAL

0.2%

14528

HOMICIDE

INVOLUNTARY MANSLAUGHTER

0.0%

3

RECKLESS HOMICIDE

0.0%

23

FIRST DEGREE MURDER

0.1%

8231

TOTAL

0.1%

8257

OFFENSE INVOLVING CHILDREN

SEX ASSLT OF CHILD BY FAM MBR

0.0%

1246

CRIM SEX ABUSE BY FAM MEMBER

0.0%

1764

AGG SEX ASSLT OF CHILD FAM MBR

0.0%

2385

AGG CRIM SEX ABUSE FAM MEMBER

0.0%

2544

TOTAL

0.1%

7939

KIDNAPPING

FORCIBLE DETENTION

0.0%

73

AGGRAVATED

0.0%

384

KIDNAPPING

0.0%

614

UNLAWFUL RESTRAINT

0.0%

1001

UNLAWFUL INTERFERE/VISITATION

0.0%

1970

CHILD ABDUCTION/STRANGER

0.0%

2204

TOTAL

0.1%

6246

OTHER OFFENSE

VIO BAIL BOND: DOM VIOLENCE

0.0%

580

TOTAL

0.0%

580

RITUALISM

AGG RIT MUT: HANDS/FIST/FEET NO/MINOR INJURY

0.0%

3

AGG RIT MUT: HANDS/FIST/FEET SERIOUS INJURY

0.0%

4

AGG RITUAL MUT:HANDGUN

0.0%

5

AGG RITUAL MUT:KNIFE/CUTTING I

0.0%

5

AGG RITUAL MUT:OTH DANG WEAPON

0.0%

5

TOTAL

0.0%

22

PUBLIC PEACE VIOLATION

ARMED VIOLENCE

0.0%

8

TOTAL

0.0%

8

DOMESTIC VIOLENCE

DOMESTIC VIOLENCE

0.0%

1

TOTAL

0.0%

1

TOTAL VIOLENT CRIME

33.4%

2060579


* This table summarizes the violent crimes in Chicago. Based on the above description classification, roughly 33% of all crimes are violent, and of these 18% (a little over half) are some form of battery. * There is not as clear of a pattern amongst the other types of violent crimes presented in this table.

Weather Patterns

Table

summary_weather <-weather_clean %>% 
        group_by(type,mon_label) %>% 
        summarise(avg = mean(chicago),
                  med = median(chicago),
                  sd = sd(chicago))
kable(summary_weather, col.names = c("Weather Stat", "Month", "Mean", "Median", "Std. Dev"), format = "html", digits = 2, align = "c") %>% 
  kable_styling(bootstrap_options = c("striped", "hover")) %>% 
  scroll_box(height = "2in")
Weather Stat Month Mean Median Std. Dev
humidity Jan 81.77 88.00 16.44
humidity Feb 74.85 76.00 16.96
humidity Mar 72.03 74.00 18.78
humidity Apr 66.39 69.00 19.48
humidity May 68.17 70.00 18.80
humidity Jun 69.24 72.00 19.02
humidity Jul 69.69 70.00 17.00
humidity Aug 73.39 77.00 16.93
humidity Sep 75.10 78.00 18.06
humidity Oct 79.63 86.00 19.68
humidity Nov 77.38 80.00 18.72
humidity Dec 83.36 87.00 14.35
pressure Jan 1020.25 1020.00 13.06
pressure Feb 1019.33 1019.00 12.55
pressure Mar 1021.47 1021.00 10.63
pressure Apr 1017.73 1018.00 9.41
pressure May 1018.30 1016.00 10.19
pressure Jun 1016.23 1015.00 8.78
pressure Jul 1015.23 1015.00 5.62
pressure Aug 1018.03 1018.00 6.02
pressure Sep 1020.22 1019.00 8.17
pressure Oct 1017.76 1018.00 10.11
pressure Nov 1021.38 1022.00 9.95
pressure Dec 1020.62 1020.00 12.15
temperature Jan 269.68 270.41 7.19
temperature Feb 269.47 269.76 7.63
temperature Mar 275.13 274.76 6.44
temperature Apr 282.19 281.15 5.80
temperature May 287.95 287.21 6.31
temperature Jun 293.06 292.86 5.14
temperature Jul 295.72 295.64 4.39
temperature Aug 295.29 295.34 3.93
temperature Sep 292.80 292.52 5.04
temperature Oct 286.09 285.90 5.07
temperature Nov 278.66 278.85 6.04
temperature Dec 273.68 274.06 6.46
wind_direction Jan 226.74 240.00 82.59
wind_direction Feb 223.63 241.00 91.60
wind_direction Mar 185.92 191.00 106.42
wind_direction Apr 170.91 170.00 104.85
wind_direction May 162.75 170.00 102.29
wind_direction Jun 168.84 181.00 97.28
wind_direction Jul 172.83 190.00 103.17
wind_direction Aug 169.64 188.00 95.18
wind_direction Sep 170.93 178.00 96.28
wind_direction Oct 199.98 204.00 92.25
wind_direction Nov 216.30 217.00 84.39
wind_direction Dec 214.23 230.00 87.31
wind_speed Jan 4.16 4.00 2.30
wind_speed Feb 4.61 4.00 2.58
wind_speed Mar 4.20 4.00 2.38
wind_speed Apr 4.18 4.00 2.33
wind_speed May 3.40 3.00 2.25
wind_speed Jun 2.81 3.00 1.89
wind_speed Jul 3.19 3.00 1.87
wind_speed Aug 2.81 3.00 1.67
wind_speed Sep 3.26 3.00 1.84
wind_speed Oct 4.13 4.00 2.26
wind_speed Nov 4.26 4.00 2.47
wind_speed Dec 3.99 4.00 2.21


Figure

f1 <- plot_ly(data = summary_weather %>% filter(type=="humidity"), x = ~mon_label, y = ~avg, type = "bar") %>% 
  layout(xaxis = list(title = "Month"), yaxis = list(title = "Average Humidity"))
f2 <- plot_ly(data = summary_weather %>% filter(type=="pressure"), x = ~mon_label, y = ~avg, type = "bar") %>% 
  layout(xaxis = list(title = "Month"), yaxis = list(title = "Average Pressure"))
f3 <- plot_ly(data = summary_weather %>% filter(type=="temperature"), x = ~mon_label, y = ~avg, type = "bar") %>% 
  layout(xaxis = list(title = "Month"), yaxis = list(title = "Average Temperature"))
f4 <- plot_ly(data = summary_weather %>% filter(type=="wind_direction"), x = ~mon_label, y = ~avg, type = "bar") %>% 
  layout(xaxis = list(title = "Month"), yaxis = list(title = "Average Wind Direction"))
f5 <- plot_ly(data = summary_weather %>% filter(type=="wind_speed"), x = ~mon_label, y = ~avg, type = "bar") %>% 
  layout(xaxis = list(title = "Month"), yaxis = list(title = "Average Wind Speed"))
f1


f2


f3


f4


f5

Explanation

  • This visualization and table shows us how the different weather statistics vary throughout the year in Chicago. Some characteristics such as temperature, and pressure are relatively stable each month, without drastic variations, but others like wind direction and speed are less predictable.
  • Most of the weather metrics appear to vary throughout the year based on seasonality except for air pressure.
  • It should be noted that units of measurement were not available in the Kaggle data description, so some of these may not be particularly interpretable. For example, temperature shows averages of >250, which is definitely not a F or C estimate.

Part II

Logistic Regression

  • Core predictors used in this model will be ones that offer additional insight into the nature of the crime that the city can use to drive decision-making.
  • These will assess the effect of different policing departments, the locations within the city, and geographic districts on perpetration of violent crime.
  • In addition, the effect of weather and the time of day will be analyzed. While the city cannot influence the weather or time directly, understanding what drives more violent crime will allow it to more knowledgably allocate resources.
  • Certain predictors have been consolidated and cleaned up to bring out the key patterns (i.e. combining all medical locations, grouping time of day, etc.). Only rows with no missing values in key variables for the model will be used to build the model to facilitate predictions in the third step. A limitation of this is that we lose quite a bit of data.
model_inputs <- crimes_clean %>% 
  select(-latitude,-longitude,-x1,-x_coordinate,-y_coordinate,-updated_on,-location,-type,-iucr,-block,-fbi_code) %>% 
  left_join(chicago_weather) %>% 
  group_by(location_description) %>% 
  mutate(n = n()) %>% 
  mutate(location_mod = case_when(str_detect(pattern = "AIRPORT", location_description) ~ "AIRPORT",
                                  str_detect(pattern = "HOSPITAL|MEDICAL|NURSING HOME", location_description) ~ "MEDICAL/NURSING FACILITIES",
                                  str_detect(pattern = "CHURCH|SPORT|LIBRARY|BARBER", location_description) ~ "PUBLIC PLACES",
                                  str_detect(pattern = "VEHICLE|TAXI|TRUCK|TRANSPORTATION", location_description) ~ "OTHER",
                                  str_detect(pattern = "CTA", location_description) ~ "PUBLIC TRANSPORT",
                                  str_detect(pattern = "^CHA ", location_description) ~ "CHICAGO HOUSING AUTHORITY PROPERTY",
                                  str_detect(pattern = "RESIDENTIAL|APARTMENT|RESIDENCE|HOUSE|PORCH", location_description) ~ "RESIDENTIAL",
                                  str_detect(pattern = "JAIL", location_description) ~ "JAIL",
                                  str_detect(pattern = "LIQUOR|TAVERN|CLUB", location_description) ~ "PLACES WITH ALCOHOL",
                                  str_detect(pattern = "STORE|SHOP|RESTAURANT", location_description) ~ "STORE/FOOD",
                                  str_detect(pattern = "SCHOOL|COLLEGE", location_description) ~ "SCHOOL/UNIVERSITY",
                                 str_detect(pattern = "RIVER|PRAIRIE|WOOD|FOREST|LAGOON|LAKE", location_description) ~ "NATURE",
                                str_detect(pattern = "POLICE|FEDERAL|GOVERNMENT|FIRE STATION", location_description) ~ "GOVERNMENT PROPERTY",
                                str_detect(pattern = "SIDEWALK|STREET", location_description) |location_description == "ALLEY" ~ "STREET/ALLEY",
                                str_detect(pattern = "OFFICE|FACTORY", location_description) ~ "OFFICE/FACTORY",
                                str_detect(pattern = "LOT|PARK PROPERTY", location_description) ~ "LOT/PARK",
                                str_detect(pattern = "BANK|CURRENCY|CREDIT|SAVINGS", location_description) ~ "FINANCIAL INSTITUTIONS",
                                str_detect(pattern = "GAS|CAR WASH", location_description) ~ "GAS STATION/CAR WASH",
                                str_detect(pattern = "RAILROAD", location_description) ~ "OTHER",
                                str_detect(pattern = "POOL|CONSTRUCTION|DAY CARE|HIGHWAY|COIN OPERATED|HOTEL", location_description) | location_description == "AUTO" ~ "OTHER",
                                n > 700 ~ location_description,
                                !is.na(location_description) ~ "OTHER"),
         hr_of_day_grp = case_when(hr_of_day %in% c(8:12) ~ "Morning (8AM-12PM)",
                                   hr_of_day %in% c(13:17) ~ "Afternoon (1-5 PM)",
                                   hr_of_day %in% c(18:21) ~ "Early Night (6-9 PM)",
                                   hr_of_day %in% c(22:23, 0:4) ~ "Late Night (10PM - 4 AM)",
                                   hr_of_day %in% c(5:7) ~ "Early Morning (5-7AM)"))

binary_loc <- varhandle::to.dummy(model_inputs$location_mod,prefix = "loc") %>% 
  as.tibble()
model_inputs <- model_inputs %>% 
  bind_cols(binary_loc) %>% 
  clean_names() 

model_inputs2 <- model_inputs %>% 
  select(violent, hr_of_day_grp, starts_with("loc"), snow_all, rain_all,temperature, day, mon, day_of_wk) %>% 
  na.omit

model_inputs3 <- model_inputs2 %>% 
  ungroup() %>% 
  select(violent, day, mon, day_of_wk, hr_of_day_grp, rain_all, temperature)
log_reg <- glm(formula = factor(violent) ~ factor(hr_of_day_grp) + factor(loc_residential) + factor(loc_school_university) + factor(loc_public_transport) + factor(loc_street_alley) + factor(loc_airport) + factor(loc_abandoned_building) + factor(loc_atm_automatic_teller_machine) +factor(loc_public_places) + factor(loc_places_with_alcohol) + factor(loc_store_food) +temperature + factor(rain_all) + factor(snow_all) +factor(loc_lot_park) + factor(loc_gas_station_car_wash),family = "binomial",data = model_inputs2)
summary_tbl <- tidy(log_reg) %>% 
  mutate(odds = exp(estimate)) %>% 
  select(term, estimate, odds, everything())
kable(summary_tbl %>% select(-std.error,-statistic), format = "html", digits = 2,col.names = c("Term", "Log Odds", "Odds", "P-Value"), align = "lccc") %>% 
  kable_styling(bootstrap_options = c("striped","hover")) %>% 
  scroll_box(height = "3.5in")
Term Log Odds Odds P-Value
(Intercept) -2.26 0.10 0.00
factor(hr_of_day_grp)Early Morning (5-7AM) 0.16 1.18 0.00
factor(hr_of_day_grp)Early Night (6-9 PM) -0.06 0.94 0.00
factor(hr_of_day_grp)Late Night (10PM - 4 AM) 0.25 1.29 0.00
factor(hr_of_day_grp)Morning (8AM-12PM) -0.23 0.80 0.00
factor(loc_residential)1 0.87 2.39 0.00
factor(loc_school_university)1 1.25 3.49 0.00
factor(loc_public_transport)1 0.35 1.41 0.00
factor(loc_street_alley)1 0.31 1.36 0.00
factor(loc_airport)1 -1.18 0.31 0.00
factor(loc_abandoned_building)1 -0.16 0.85 0.00
factor(loc_atm_automatic_teller_machine)1 -2.16 0.11 0.00
factor(loc_public_places)1 0.32 1.37 0.00
factor(loc_places_with_alcohol)1 0.11 1.11 0.00
factor(loc_store_food)1 -0.35 0.71 0.00
temperature 0.00 1.00 0.00
factor(rain_all)1 -0.02 0.98 0.00
factor(snow_all)1 0.01 1.01 0.32
factor(loc_lot_park)1 -0.09 0.92 0.00
factor(loc_gas_station_car_wash)1 0.03 1.03 0.13

Interpretation

  • This model considers certain key locations for the crime, the temperature of that day, the time of the day, and whether there was rain or snow that day.
  • Understandably so, the odds of a violent crime increase by 29% and 18% between the hours of 10PM and 4 AM and 5-7 AM respectively as compared to 1 PM to 5 PM. Night time is often considered prime time for crimes, and the data here confirms that violent crimes increase in the night-time and early morning hours. Interestingly, the early parts of the night are slightly less prone (6%) to violent crime than the peak day time afternoon hours even though it is nighttime.
  • Based on the model, public transport, schools and universities, streets and alleys, and residential areas increase the odds of violent crime the most.
  • While there is a slight decrease in the odds of violent crime when there is rain, it is only by 2%. Similarly, the snow and temperature doesn’t impact the odds of violent crime, even though the coefficients themselves are significant.

Part III

Models

model_inputs3$violent <- as.factor(model_inputs3$violent)
model_inputs3$day <- as.factor(model_inputs3$day)
model_inputs3$day_of_wk <- as.factor(model_inputs3$day_of_wk)
model_inputs3$hr_of_day_grp <- as.factor(model_inputs3$hr_of_day_grp)
model_inputs3$rain_all <- as.factor(model_inputs3$rain_all)
model_inputs3$mon <- as.factor(model_inputs3$mon)

n <- sample(1:nrow(model_inputs3),size = .3*nrow(model_inputs3))
testing <- model_inputs3[n,]
training <- model_inputs3[-n,]
  • The first step to building the model is to split the data into a training data set to help build the model, and a testing data to help test the model.
  • A logistic regression, Naive Bayes, and random forest classifier will be developed using the training data and evaluated for performance in terms of misclassification rate, Type I error, and Type II error.
miss_rate <- function(predicted, actual){
  val <- sum(actual != predicted, na.rm = T)/length(actual)
  return(val)
}
cm_mat <- function(predicted, actual){
  val <- confusionMatrix(data = predicted, reference = actual)
  return(val$table)
}

log_reg_pred <- glm(formula = violent ~ hr_of_day_grp + day_of_wk + mon +  day + temperature + rain_all,family = "binomial", data = training)
bayes <- naiveBayes(formula = violent ~ hr_of_day_grp + day_of_wk + mon +  day + temperature + rain_all,data = training)
rf <- randomForest(y = training$violent, x = training %>% ungroup() %>% select(-violent),ntree = 70)
print(paste0("A binomial logistic regression model gives a misclassification rate of ", 
             round(miss_rate(predicted = round(predict(log_reg_pred,newdata = testing,type = "response")),actual = testing$violent)*100,1), "%."))
## [1] "A binomial logistic regression model gives a misclassification rate of 33.4%."
cm_mat(predicted = factor(round(predict(log_reg_pred,newdata = testing,type = "response"))),actual = testing$violent)
##           Reference
## Prediction      0      1
##          0 239550 120091
##          1      0      0
print(paste0("A Naive Bayes model gives a misclassification rate of ", 
             round(miss_rate(predicted =predict(bayes,newdata = testing,threshold = .001),actual = testing$violent)*100,1), "%."))
## [1] "A Naive Bayes model gives a misclassification rate of 33.4%."
cm_mat(predicted =predict(bayes,newdata = testing,threshold = .001),actual = testing$violent)
##           Reference
## Prediction      0      1
##          0 239550 120091
##          1      0      0
print(paste0("A random forest model with 70 trees gives a misclassification rate of ", 
             round(miss_rate(predicted = predict(object = rf,newdata = testing), actual = testing$violent)*100,1), "%."))
## [1] "A random forest model with 70 trees gives a misclassification rate of 33.4%."
cm_mat(predicted = predict(object = rf,newdata = testing), actual = testing$violent)
##           Reference
## Prediction      0      1
##          0 238145 118823
##          1   1405   1268

Interpration

  • In this case, a false negative is more harmful than a false positive. Predicting that a violent crime will not occur when it actually does is more detrimental from a societal perspective. However, a high false positive rate might lead to resources being inefficiently allocated to certain high risk areas, leading to an economic burden for the city.
  • Based on this assumption, the Naive Bayes model and logistic regression are the best fit in this case because they have the lowest rate of Type II errors (false negatives).
  • If the mayor is interested in reducing Type I errors for the purpose of optimizing taxpayer funds, he or she should consider implementing the random forest regression because it offers the lowest Type I error (false positive) and misclassification rate.
  • Given more time, I would construct models with fewer dimensions to assess the impact of a simplistic model, and try other algorithms such as SVMs, KNN, and boosting techniques to improve the classification rate. Additionally, with increased computational power, I would build a random forest with more trees (N =500 is the default).