library(tidyverse)
library(lubridate)
library(caret)
library(e1071)
library(networkD3)
library(ggplot2)
library(plotly)
library(stringr)
library(janitor)
library(purrr)
library(readxl)
library(readr)
library(kableExtra)
library(flextable)
library(broom)
library(randomForest)
### Read in Datasets
weather_files <- list.files("Inputs/Weather")[-str_detect(string = list.files("Inputs/Weather"),pattern = "\\.zip")]
crime_files <- list.files("Inputs/Crime")[-str_detect(string = list.files("Inputs/Crime"),pattern = "\\.zip")]
crimes <- map(.x = paste0("Inputs/Crime/",crime_files),.f = ~read_csv(.x) %>%
mutate(src = .x,
type= str_replace_all(string = src,pattern = "Inputs/Crime/|\\.csv",replacement = "")) %>%
select(-src)) %>%
bind_rows() %>%
clean_names()
weather <- map(.x = paste0("Inputs/Weather/",weather_files),.f = ~read_csv(.x) %>%
mutate(src = .x,
type= str_replace_all(string = src,pattern = "Inputs/Weather/|\\.csv",replacement = "")) %>%
select(-src) %>%
clean_names())
city_attributes <- weather[[1]]
weather_df <- weather[-c(1,5)] %>%
bind_rows()
weather_desc <- weather[[5]]
crimes_clean <- crimes %>%
filter(!is.na(id)) %>%
mutate(day = day(as.POSIXlt(date, format="%m/%d/%Y %I:%M:%S %p")),
mon =month(as.POSIXlt(date, format="%m/%d/%Y %I:%M:%S %p")),
yr = year(as.POSIXlt(date, format="%m/%d/%Y %I:%M:%S %p")),
day_of_wk = wday(as.POSIXlt(date, format="%m/%d/%Y %I:%M:%S %p"), label = TRUE),
hr_of_day = hour(as.POSIXlt(date, format="%m/%d/%Y %I:%M:%S %p"))) %>%
mutate(date = make_date(year = yr,month = mon,day = day)) %>%
select(-yr) %>%
distinct() %>%
arrange(id,desc(latitude), desc(longitude)) %>%
distinct(id, .keep_all = T)
weather_clean <- weather_df %>%
filter(!is.na(chicago)) %>%
mutate(day = day(as.POSIXlt(datetime, format="%m/%d/%Y %I:%M:%S %p")),
mon =month(as.POSIXlt(datetime, format="%m/%d/%Y %I:%M:%S %p")),
mon_label =month(as.POSIXlt(datetime, format="%m/%d/%Y %I:%M:%S %p"), label = T),
yr = year(as.POSIXlt(datetime, format="%m/%d/%Y %I:%M:%S %p")),
day_of_wk = wday(as.POSIXlt(datetime, format="%m/%d/%Y %I:%M:%S %p"), label = TRUE),
hr_of_day = hour(as.POSIXlt(datetime, format="%m/%d/%Y %I:%M:%S %p"))) %>%
mutate(date = make_date(year = yr,month = mon,day = day))
chicago_clean_day <- weather_clean %>%
group_by(date, yr, mon, mon_label, day,type) %>%
summarise(chicago = mean(chicago)) %>%
spread(key = type,value = chicago)
chic_desc <- weather_desc %>%
filter(!is.na(chicago)) %>%
mutate(day = day(as.POSIXlt(datetime, format="%m/%d/%Y %I:%M:%S %p")),
mon =month(as.POSIXlt(datetime, format="%m/%d/%Y %I:%M:%S %p")),
mon_label =month(as.POSIXlt(datetime, format="%m/%d/%Y %I:%M:%S %p"), label = T),
yr = year(as.POSIXlt(datetime, format="%m/%d/%Y %I:%M:%S %p")),
day_of_wk = wday(as.POSIXlt(datetime, format="%m/%d/%Y %I:%M:%S %p"), label = TRUE),
hr_of_day = hour(as.POSIXlt(datetime, format="%m/%d/%Y %I:%M:%S %p"))) %>%
mutate(date = make_date(year = yr,month = mon,day = day)) %>%
distinct(date, chicago) %>%
mutate(flag = 1) %>%
spread(key = chicago, value = flag, fill = 0) %>%
clean_names()
chicago_weather <- chicago_clean_day %>%
left_join(chic_desc) %>%
mutate(rain_all = ifelse(thunderstorm == 1 | thunderstorm_with_drizzle == 1 | thunderstorm_with_heavy_rain == 1 |
thunderstorm_with_light_drizzle == 1 | thunderstorm_with_light_rain == 1 | thunderstorm_with_rain == 1 | very_heavy_rain ==1 |
heavy_intensity_drizzle == 1 | heavy_intensity_rain == 1 | light_rain == 1 | moderate_rain == 1 |proximity_shower_rain == 1 |
proximity_thunderstorm ==1 |
proximity_thunderstorm_with_drizzle == 1 | proximity_thunderstorm_with_rain == 1 | light_intensity_drizzle == 1 |squalls == 1,1,0),
snow_all = ifelse(freezing_rain == 1 |light_snow == 1 | light_rain_and_snow == 1 |snow == 1|heavy_snow ==1,1,0)) %>%
select(-mon,-day)
kable(crimes %>%
select(everything()) %>%
summarise_all(funs(sum(is.na(.)))) %>%
gather(key = variable,value = missing) %>%
filter(missing != 0), format = "html",col.names = c("Column", "Missing (N)"),align = "c")
| Column | Missing (N) |
|---|---|
| id | 1 |
| case_number | 7 |
| location_description | 1990 |
| arrest | 1 |
| domestic | 1 |
| beat | 1 |
| district | 92 |
| ward | 700225 |
| community_area | 702092 |
| x_coordinate | 105574 |
| y_coordinate | 105575 |
| year | 1 |
| latitude | 105575 |
| longitude | 105576 |
| location | 105574 |
* This table tells us what variables in the Crimes table have missing values. Upon further investigation, we will remove the row for the missing ID. The other rows do appear to depict actual crimes and should be considered in further analyses, and are likely mistakes in data entry.
* Most notably, there are over 100,000 observations without longitude/latitude data available. For any geospatial analyses, these observations need to be removed or imputed based on values from other crimes in the same district or community.
* For modeling purposes, it is prudent to remove observations with missing values when using the variable as a predictor for violent crime.
* In addition, there are roughly 1 million observations that appear to be duplicates, so these observations will be removed.
* In the weather data, we primarily care about the weather in Chicago. There are roughly 2000 observations where the value for Chicago is missing for a particular weather category. To consolidate the data, these are removed. * For summary statistics below, multiple measurements per day for each weather are considered. For predictive and causal analyses, daily averages are used because there is not a large amount of variance per day.
violent_map <- crimes_clean %>%
distinct(primary_type,description) %>%
arrange(primary_type) %>%
mutate(violent = case_when(primary_type %in% c("ROBBERY", "CRIM SEXUAL ASSAULT", "HOMICIDE", "ASSAULT", "BATTERY", "KIDNAPPING", "RITUALISM") ~ 1,
str_detect(pattern = "SEX ABUSE|SEXUAL ABUSE|VIOLENCE|ASSLT|FORC|SEXUAL EXPLOIT", string = description) ~ 1,
TRUE ~ 0))
kable(violent_map %>% filter(violent == 1) %>%
select(-violent),
format = "html",col.names = c("Crime Category", "Description"),align = "c") %>%
kable_styling(bootstrap_options = c("striped", "hover")) %>%
scroll_box(height = "4in")
| Crime Category | Description |
|---|---|
| ASSAULT | AGGRAVATED: HANDGUN |
| ASSAULT | AGGRAVATED:KNIFE/CUTTING INSTR |
| ASSAULT | SIMPLE |
| ASSAULT | AGGRAVATED PO: OTHER DANG WEAP |
| ASSAULT | AGGRAVATED: OTHER DANG WEAPON |
| ASSAULT | AGGRAVATED: OTHER FIREARM |
| ASSAULT | AGG PO HANDS NO/MIN INJURY |
| ASSAULT | AGGRAVATED PO: HANDGUN |
| ASSAULT | PRO EMP HANDS NO/MIN INJURY |
| ASSAULT | AGGRAVATED PO:KNIFE/CUT INSTR |
| ASSAULT | AGGRAVATED PO: OTHER FIREARM |
| ASSAULT | AGG PRO.EMP: OTHER DANG WEAPON |
| ASSAULT | AGG PRO.EMP: OTHER FIREARM |
| ASSAULT | AGG PRO.EMP:KNIFE/CUTTING INST |
| ASSAULT | AGG PRO.EMP: HANDGUN |
| BATTERY | SIMPLE |
| BATTERY | AGGRAVATED: HANDGUN |
| BATTERY | AGGRAVATED:KNIFE/CUTTING INSTR |
| BATTERY | AGGRAVATED: OTHER DANG WEAPON |
| BATTERY | AGG PO HANDS NO/MIN INJURY |
| BATTERY | AGG: HANDS/FIST/FEET NO/MINOR INJURY |
| BATTERY | AGGRAVATED PO: OTHER DANG WEAP |
| BATTERY | DOMESTIC BATTERY SIMPLE |
| BATTERY | AGGRAVATED: OTHER FIREARM |
| BATTERY | AGGRAVATED PO: HANDGUN |
| BATTERY | AGG PRO.EMP: OTHER DANG WEAPON |
| BATTERY | AGGRAVATED PO: KNIFE/CUT INSTR |
| BATTERY | PRO EMP HANDS NO/MIN INJURY |
| BATTERY | AGG PRO.EMP: HANDGUN |
| BATTERY | AGG PRO.EMP:KNIFE/CUTTING INST |
| BATTERY | AGGRAVATED PO: OTHER FIREARM |
| BATTERY | AGGRAVATED DOMESTIC BATTERY: KNIFE/CUTTING INST |
| BATTERY | AGGRAVATED DOMESTIC BATTERY |
| BATTERY | AGG PRO.EMP: OTHER FIREARM |
| BATTERY | AGGRAVATED OF A SENIOR CITIZEN |
| BATTERY | AGGRAVATED OF A CHILD |
| BATTERY | AGGRAVATED DOMESTIC BATTERY: OTHER DANG WEAPON |
| BATTERY | OF UNBORN CHILD |
| BATTERY | AGGRAVATED OF A UNBORN CHILD |
| BATTERY | AGG PO HANDS ETC SERIOUS INJ |
| BATTERY | AGG: HANDS/FIST/FEET SERIOUS INJURY |
| BATTERY | AGGRAVATED DOMESTIC BATTERY: HANDS/FIST/FEET SERIOUS INJURY |
| BATTERY | AGGRAVATED DOMESTIC BATTERY: HANDGUN |
| BATTERY | AGG PRO EMP HANDS SERIOUS INJ |
| BATTERY | AGGRAVATED DOMESTIC BATTERY: OTHER FIREARM |
| BURGLARY | FORCIBLE ENTRY |
| BURGLARY | ATTEMPT FORCIBLE ENTRY |
| CRIM SEXUAL ASSAULT | NON-AGGRAVATED |
| CRIM SEXUAL ASSAULT | AGGRAVATED: OTHER FIREARM |
| CRIM SEXUAL ASSAULT | AGGRAVATED: OTHER DANG WEAPON |
| CRIM SEXUAL ASSAULT | ATTEMPT NON-AGGRAVATED |
| CRIM SEXUAL ASSAULT | AGGRAVATED: OTHER |
| CRIM SEXUAL ASSAULT | AGGRAVATED: HANDGUN |
| CRIM SEXUAL ASSAULT | PREDATORY |
| CRIM SEXUAL ASSAULT | AGGRAVATED: KNIFE/CUT INSTR |
| CRIM SEXUAL ASSAULT | ATTEMPT AGG: KNIFE/CUT INSTR |
| CRIM SEXUAL ASSAULT | ATTEMPT AGG: HANDGUN |
| CRIM SEXUAL ASSAULT | ATTEMPT AGG: OTHER |
| CRIM SEXUAL ASSAULT | ATTEMPT AGG: OTHER DANG WEAPON |
| CRIM SEXUAL ASSAULT | ATTEMPT AGG: OTHER FIREARM |
| DOMESTIC VIOLENCE | DOMESTIC VIOLENCE |
| HOMICIDE | FIRST DEGREE MURDER |
| HOMICIDE | RECKLESS HOMICIDE |
| HOMICIDE | INVOLUNTARY MANSLAUGHTER |
| KIDNAPPING | UNLAWFUL INTERFERE/VISITATION |
| KIDNAPPING | CHILD ABDUCTION/STRANGER |
| KIDNAPPING | UNLAWFUL RESTRAINT |
| KIDNAPPING | KIDNAPPING |
| KIDNAPPING | AGGRAVATED |
| KIDNAPPING | FORCIBLE DETENTION |
| OFFENSE INVOLVING CHILDREN | AGG SEX ASSLT OF CHILD FAM MBR |
| OFFENSE INVOLVING CHILDREN | SEX ASSLT OF CHILD BY FAM MBR |
| OFFENSE INVOLVING CHILDREN | CRIM SEX ABUSE BY FAM MEMBER |
| OFFENSE INVOLVING CHILDREN | AGG CRIM SEX ABUSE FAM MEMBER |
| OTHER OFFENSE | VIO BAIL BOND: DOM VIOLENCE |
| PUBLIC PEACE VIOLATION | ARMED VIOLENCE |
| RITUALISM | AGG RITUAL MUT:KNIFE/CUTTING I |
| RITUALISM | AGG RITUAL MUT:OTH DANG WEAPON |
| RITUALISM | AGG RITUAL MUT:HANDGUN |
| RITUALISM | AGG RIT MUT: HANDS/FIST/FEET NO/MINOR INJURY |
| RITUALISM | AGG RIT MUT: HANDS/FIST/FEET SERIOUS INJURY |
| ROBBERY | ARMED: OTHER DANGEROUS WEAPON |
| ROBBERY | STRONGARM - NO WEAPON |
| ROBBERY | AGGRAVATED VEHICULAR HIJACKING |
| ROBBERY | ARMED: HANDGUN |
| ROBBERY | ATTEMPT: ARMED-OTHER FIREARM |
| ROBBERY | ATTEMPT: STRONGARM-NO WEAPON |
| ROBBERY | ARMED:KNIFE/CUTTING INSTRUMENT |
| ROBBERY | VEHICULAR HIJACKING |
| ROBBERY | AGGRAVATED |
| ROBBERY | ARMED: OTHER FIREARM |
| ROBBERY | ATTEMPT: ARMED-HANDGUN |
| ROBBERY | ATTEMPT: ARMED-KNIFE/CUT INSTR |
| ROBBERY | ATTEMPT: ARMED-OTHER DANG WEAP |
| ROBBERY | ATTEMPT: AGGRAVATED |
| SEX OFFENSE | CRIMINAL SEXUAL ABUSE |
| SEX OFFENSE | AGG CRIMINAL SEXUAL ABUSE |
| SEX OFFENSE | ATT CRIM SEXUAL ABUSE |
| SEX OFFENSE | ATT AGG CRIMINAL SEXUAL ABUSE |
| SEX OFFENSE | ATT AGG CRIM SEXUAL ABUSE |
| SEX OFFENSE | SEXUAL EXPLOITATION OF A CHILD |
* This table depicts the crimes classified as violent, and will form the basis of subsequent descriptive and predictive analyses.
* Violent crimes were identified based on the definition from the National Institute of Justice.
* Any primary crime category that suggested an individual or group inflicting violence on themselves or others was categorized as violent. This included obvious crimes such as sexual assualt, homicide, robbery, and kidnapping in addition to more nuanced perpetrations of violence such as sexual abuse, forced entry, and ritualism.
crimes_clean <- crimes_clean %>%
left_join(violent_map)
summarise_violent <- crimes_clean %>%
mutate(total = n()) %>%
group_by(primary_type, description,violent, total) %>%
summarise(n = n()) %>%
filter(violent == 1) %>%
mutate(pct = (n/total) * 100) %>%
group_by(primary_type) %>%
mutate(pct_cat = (sum(n)/total)*100,
n_cat = sum(n)) %>%
arrange(desc(pct_cat), desc(pct)) %>%
ungroup() %>%
select(primary_type, description,pct, n, pct_cat,n_cat)
all_violent <- summarise_violent %>%
summarise(pct = sum(pct),
n = sum(n)) %>%
mutate(primary_type = "TOTAL VIOLENT CRIME")
category_violent <- summarise_violent %>%
distinct(primary_type, pct_cat,n_cat) %>%
mutate(n = n_cat,
pct = pct_cat,
description = "TOTAL")
summarise_violent <- summarise_violent %>%
bind_rows(category_violent) %>%
arrange(desc(n_cat),n) %>%
select(-n_cat,-pct_cat) %>%
bind_rows(all_violent) %>%
mutate(pct = paste0(format(round(pct,1),digits = 1,nsmall = 1),"%"))
my_ft <- flextable(
summarise_violent)
my_ft <- merge_v(my_ft,j = "primary_type")
my_ft <- theme_box(my_ft)
my_ft <- bold(my_ft, i = ~description == "TOTAL",j = 2:4)
my_ft <- set_header_labels(my_ft, primary_type = "Crime Category", description = "Description", pct = "Percent of All Crimes", n = "Number")
my_ft <- bold(my_ft, i=nrow(summarise_violent),j = 1:4)
my_ft <- align(my_ft, align = "center", part = "all")
my_ft
Crime Category | Description | Percent of All Crimes | Number |
BATTERY | AGGRAVATED PO: OTHER FIREARM | 0.0% | 26 |
AGG PRO.EMP: OTHER FIREARM | 0.0% | 29 | |
AGGRAVATED OF A UNBORN CHILD | 0.0% | 45 | |
OF UNBORN CHILD | 0.0% | 67 | |
AGG PRO.EMP: HANDGUN | 0.0% | 126 | |
AGGRAVATED DOMESTIC BATTERY: OTHER FIREARM | 0.0% | 133 | |
AGG PRO.EMP:KNIFE/CUTTING INST | 0.0% | 165 | |
AGGRAVATED PO: KNIFE/CUT INSTR | 0.0% | 204 | |
AGGRAVATED PO: HANDGUN | 0.0% | 298 | |
AGGRAVATED DOMESTIC BATTERY: HANDGUN | 0.0% | 330 | |
AGG PRO EMP HANDS SERIOUS INJ | 0.0% | 401 | |
AGG PO HANDS ETC SERIOUS INJ | 0.0% | 403 | |
AGGRAVATED OF A CHILD | 0.0% | 525 | |
AGG PRO.EMP: OTHER DANG WEAPON | 0.0% | 1201 | |
AGGRAVATED: OTHER FIREARM | 0.0% | 1281 | |
AGGRAVATED PO: OTHER DANG WEAP | 0.0% | 1402 | |
AGGRAVATED DOMESTIC BATTERY | 0.0% | 1546 | |
AGGRAVATED OF A SENIOR CITIZEN | 0.0% | 1766 | |
AGG: HANDS/FIST/FEET SERIOUS INJURY | 0.0% | 2008 | |
AGGRAVATED DOMESTIC BATTERY: HANDS/FIST/FEET SERIOUS INJURY | 0.0% | 2495 | |
AGG: HANDS/FIST/FEET NO/MINOR INJURY | 0.1% | 4529 | |
AGGRAVATED DOMESTIC BATTERY: KNIFE/CUTTING INST | 0.2% | 11021 | |
AGG PO HANDS NO/MIN INJURY | 0.2% | 11904 | |
PRO EMP HANDS NO/MIN INJURY | 0.2% | 13516 | |
AGGRAVATED DOMESTIC BATTERY: OTHER DANG WEAPON | 0.3% | 15918 | |
AGGRAVATED:KNIFE/CUTTING INSTR | 0.4% | 24910 | |
AGGRAVATED: HANDGUN | 0.5% | 32600 | |
AGGRAVATED: OTHER DANG WEAPON | 1.0% | 63534 | |
DOMESTIC BATTERY SIMPLE | 7.6% | 466318 | |
SIMPLE | 7.6% | 467024 | |
TOTAL | 18.2% | 1125725 | |
ASSAULT | AGGRAVATED PO: OTHER FIREARM | 0.0% | 76 |
AGG PRO.EMP: OTHER FIREARM | 0.0% | 77 | |
AGG PRO.EMP: HANDGUN | 0.0% | 275 | |
AGG PRO.EMP:KNIFE/CUTTING INST | 0.0% | 406 | |
AGGRAVATED PO:KNIFE/CUT INSTR | 0.0% | 482 | |
AGGRAVATED PO: HANDGUN | 0.0% | 884 | |
AGG PRO.EMP: OTHER DANG WEAPON | 0.0% | 955 | |
AGGRAVATED PO: OTHER DANG WEAP | 0.0% | 1098 | |
AGGRAVATED: OTHER FIREARM | 0.0% | 1777 | |
AGG PO HANDS NO/MIN INJURY | 0.1% | 5804 | |
PRO EMP HANDS NO/MIN INJURY | 0.2% | 12937 | |
AGGRAVATED: OTHER DANG WEAPON | 0.4% | 22684 | |
AGGRAVATED:KNIFE/CUTTING INSTR | 0.4% | 26566 | |
AGGRAVATED: HANDGUN | 0.6% | 40032 | |
SIMPLE | 4.3% | 262835 | |
TOTAL | 6.1% | 376888 | |
BURGLARY | ATTEMPT FORCIBLE ENTRY | 0.3% | 15863 |
FORCIBLE ENTRY | 4.0% | 247388 | |
TOTAL | 4.3% | 263251 | |
ROBBERY | ATTEMPT: ARMED-OTHER FIREARM | 0.0% | 266 |
ATTEMPT: AGGRAVATED | 0.0% | 798 | |
ARMED: OTHER FIREARM | 0.0% | 1240 | |
ATTEMPT: ARMED-OTHER DANG WEAP | 0.0% | 1850 | |
ATTEMPT: ARMED-KNIFE/CUT INSTR | 0.0% | 1943 | |
VEHICULAR HIJACKING | 0.1% | 4362 | |
ATTEMPT: ARMED-HANDGUN | 0.1% | 7510 | |
AGGRAVATED VEHICULAR HIJACKING | 0.1% | 8301 | |
ATTEMPT: STRONGARM-NO WEAPON | 0.1% | 9111 | |
ARMED:KNIFE/CUTTING INSTRUMENT | 0.2% | 10794 | |
AGGRAVATED | 0.2% | 11481 | |
ARMED: OTHER DANGEROUS WEAPON | 0.2% | 11666 | |
STRONGARM - NO WEAPON | 1.3% | 81795 | |
ARMED: HANDGUN | 1.3% | 82410 | |
TOTAL | 3.8% | 233527 | |
CRIM SEXUAL ASSAULT | ATTEMPT AGG: OTHER FIREARM | 0.0% | 3 |
AGGRAVATED: OTHER FIREARM | 0.0% | 32 | |
ATTEMPT AGG: OTHER DANG WEAPON | 0.0% | 47 | |
ATTEMPT AGG: HANDGUN | 0.0% | 132 | |
ATTEMPT AGG: KNIFE/CUT INSTR | 0.0% | 168 | |
ATTEMPT AGG: OTHER | 0.0% | 222 | |
AGGRAVATED: OTHER DANG WEAPON | 0.0% | 265 | |
AGGRAVATED: KNIFE/CUT INSTR | 0.0% | 970 | |
ATTEMPT NON-AGGRAVATED | 0.0% | 1085 | |
AGGRAVATED: HANDGUN | 0.0% | 1552 | |
PREDATORY | 0.0% | 2289 | |
AGGRAVATED: OTHER | 0.1% | 3979 | |
NON-AGGRAVATED | 0.2% | 12863 | |
TOTAL | 0.4% | 23607 | |
SEX OFFENSE | ATT AGG CRIM SEXUAL ABUSE | 0.0% | 6 |
ATT AGG CRIMINAL SEXUAL ABUSE | 0.0% | 126 | |
SEXUAL EXPLOITATION OF A CHILD | 0.0% | 558 | |
ATT CRIM SEXUAL ABUSE | 0.0% | 780 | |
AGG CRIMINAL SEXUAL ABUSE | 0.1% | 4988 | |
CRIMINAL SEXUAL ABUSE | 0.1% | 8070 | |
TOTAL | 0.2% | 14528 | |
HOMICIDE | INVOLUNTARY MANSLAUGHTER | 0.0% | 3 |
RECKLESS HOMICIDE | 0.0% | 23 | |
FIRST DEGREE MURDER | 0.1% | 8231 | |
TOTAL | 0.1% | 8257 | |
OFFENSE INVOLVING CHILDREN | SEX ASSLT OF CHILD BY FAM MBR | 0.0% | 1246 |
CRIM SEX ABUSE BY FAM MEMBER | 0.0% | 1764 | |
AGG SEX ASSLT OF CHILD FAM MBR | 0.0% | 2385 | |
AGG CRIM SEX ABUSE FAM MEMBER | 0.0% | 2544 | |
TOTAL | 0.1% | 7939 | |
KIDNAPPING | FORCIBLE DETENTION | 0.0% | 73 |
AGGRAVATED | 0.0% | 384 | |
KIDNAPPING | 0.0% | 614 | |
UNLAWFUL RESTRAINT | 0.0% | 1001 | |
UNLAWFUL INTERFERE/VISITATION | 0.0% | 1970 | |
CHILD ABDUCTION/STRANGER | 0.0% | 2204 | |
TOTAL | 0.1% | 6246 | |
OTHER OFFENSE | VIO BAIL BOND: DOM VIOLENCE | 0.0% | 580 |
TOTAL | 0.0% | 580 | |
RITUALISM | AGG RIT MUT: HANDS/FIST/FEET NO/MINOR INJURY | 0.0% | 3 |
AGG RIT MUT: HANDS/FIST/FEET SERIOUS INJURY | 0.0% | 4 | |
AGG RITUAL MUT:HANDGUN | 0.0% | 5 | |
AGG RITUAL MUT:KNIFE/CUTTING I | 0.0% | 5 | |
AGG RITUAL MUT:OTH DANG WEAPON | 0.0% | 5 | |
TOTAL | 0.0% | 22 | |
PUBLIC PEACE VIOLATION | ARMED VIOLENCE | 0.0% | 8 |
TOTAL | 0.0% | 8 | |
DOMESTIC VIOLENCE | DOMESTIC VIOLENCE | 0.0% | 1 |
TOTAL | 0.0% | 1 | |
TOTAL VIOLENT CRIME | 33.4% | 2060579 |
summary_weather <-weather_clean %>%
group_by(type,mon_label) %>%
summarise(avg = mean(chicago),
med = median(chicago),
sd = sd(chicago))
kable(summary_weather, col.names = c("Weather Stat", "Month", "Mean", "Median", "Std. Dev"), format = "html", digits = 2, align = "c") %>%
kable_styling(bootstrap_options = c("striped", "hover")) %>%
scroll_box(height = "2in")
| Weather Stat | Month | Mean | Median | Std. Dev |
|---|---|---|---|---|
| humidity | Jan | 81.77 | 88.00 | 16.44 |
| humidity | Feb | 74.85 | 76.00 | 16.96 |
| humidity | Mar | 72.03 | 74.00 | 18.78 |
| humidity | Apr | 66.39 | 69.00 | 19.48 |
| humidity | May | 68.17 | 70.00 | 18.80 |
| humidity | Jun | 69.24 | 72.00 | 19.02 |
| humidity | Jul | 69.69 | 70.00 | 17.00 |
| humidity | Aug | 73.39 | 77.00 | 16.93 |
| humidity | Sep | 75.10 | 78.00 | 18.06 |
| humidity | Oct | 79.63 | 86.00 | 19.68 |
| humidity | Nov | 77.38 | 80.00 | 18.72 |
| humidity | Dec | 83.36 | 87.00 | 14.35 |
| pressure | Jan | 1020.25 | 1020.00 | 13.06 |
| pressure | Feb | 1019.33 | 1019.00 | 12.55 |
| pressure | Mar | 1021.47 | 1021.00 | 10.63 |
| pressure | Apr | 1017.73 | 1018.00 | 9.41 |
| pressure | May | 1018.30 | 1016.00 | 10.19 |
| pressure | Jun | 1016.23 | 1015.00 | 8.78 |
| pressure | Jul | 1015.23 | 1015.00 | 5.62 |
| pressure | Aug | 1018.03 | 1018.00 | 6.02 |
| pressure | Sep | 1020.22 | 1019.00 | 8.17 |
| pressure | Oct | 1017.76 | 1018.00 | 10.11 |
| pressure | Nov | 1021.38 | 1022.00 | 9.95 |
| pressure | Dec | 1020.62 | 1020.00 | 12.15 |
| temperature | Jan | 269.68 | 270.41 | 7.19 |
| temperature | Feb | 269.47 | 269.76 | 7.63 |
| temperature | Mar | 275.13 | 274.76 | 6.44 |
| temperature | Apr | 282.19 | 281.15 | 5.80 |
| temperature | May | 287.95 | 287.21 | 6.31 |
| temperature | Jun | 293.06 | 292.86 | 5.14 |
| temperature | Jul | 295.72 | 295.64 | 4.39 |
| temperature | Aug | 295.29 | 295.34 | 3.93 |
| temperature | Sep | 292.80 | 292.52 | 5.04 |
| temperature | Oct | 286.09 | 285.90 | 5.07 |
| temperature | Nov | 278.66 | 278.85 | 6.04 |
| temperature | Dec | 273.68 | 274.06 | 6.46 |
| wind_direction | Jan | 226.74 | 240.00 | 82.59 |
| wind_direction | Feb | 223.63 | 241.00 | 91.60 |
| wind_direction | Mar | 185.92 | 191.00 | 106.42 |
| wind_direction | Apr | 170.91 | 170.00 | 104.85 |
| wind_direction | May | 162.75 | 170.00 | 102.29 |
| wind_direction | Jun | 168.84 | 181.00 | 97.28 |
| wind_direction | Jul | 172.83 | 190.00 | 103.17 |
| wind_direction | Aug | 169.64 | 188.00 | 95.18 |
| wind_direction | Sep | 170.93 | 178.00 | 96.28 |
| wind_direction | Oct | 199.98 | 204.00 | 92.25 |
| wind_direction | Nov | 216.30 | 217.00 | 84.39 |
| wind_direction | Dec | 214.23 | 230.00 | 87.31 |
| wind_speed | Jan | 4.16 | 4.00 | 2.30 |
| wind_speed | Feb | 4.61 | 4.00 | 2.58 |
| wind_speed | Mar | 4.20 | 4.00 | 2.38 |
| wind_speed | Apr | 4.18 | 4.00 | 2.33 |
| wind_speed | May | 3.40 | 3.00 | 2.25 |
| wind_speed | Jun | 2.81 | 3.00 | 1.89 |
| wind_speed | Jul | 3.19 | 3.00 | 1.87 |
| wind_speed | Aug | 2.81 | 3.00 | 1.67 |
| wind_speed | Sep | 3.26 | 3.00 | 1.84 |
| wind_speed | Oct | 4.13 | 4.00 | 2.26 |
| wind_speed | Nov | 4.26 | 4.00 | 2.47 |
| wind_speed | Dec | 3.99 | 4.00 | 2.21 |
f1 <- plot_ly(data = summary_weather %>% filter(type=="humidity"), x = ~mon_label, y = ~avg, type = "bar") %>%
layout(xaxis = list(title = "Month"), yaxis = list(title = "Average Humidity"))
f2 <- plot_ly(data = summary_weather %>% filter(type=="pressure"), x = ~mon_label, y = ~avg, type = "bar") %>%
layout(xaxis = list(title = "Month"), yaxis = list(title = "Average Pressure"))
f3 <- plot_ly(data = summary_weather %>% filter(type=="temperature"), x = ~mon_label, y = ~avg, type = "bar") %>%
layout(xaxis = list(title = "Month"), yaxis = list(title = "Average Temperature"))
f4 <- plot_ly(data = summary_weather %>% filter(type=="wind_direction"), x = ~mon_label, y = ~avg, type = "bar") %>%
layout(xaxis = list(title = "Month"), yaxis = list(title = "Average Wind Direction"))
f5 <- plot_ly(data = summary_weather %>% filter(type=="wind_speed"), x = ~mon_label, y = ~avg, type = "bar") %>%
layout(xaxis = list(title = "Month"), yaxis = list(title = "Average Wind Speed"))
f1
f2
f3
f4
f5
model_inputs <- crimes_clean %>%
select(-latitude,-longitude,-x1,-x_coordinate,-y_coordinate,-updated_on,-location,-type,-iucr,-block,-fbi_code) %>%
left_join(chicago_weather) %>%
group_by(location_description) %>%
mutate(n = n()) %>%
mutate(location_mod = case_when(str_detect(pattern = "AIRPORT", location_description) ~ "AIRPORT",
str_detect(pattern = "HOSPITAL|MEDICAL|NURSING HOME", location_description) ~ "MEDICAL/NURSING FACILITIES",
str_detect(pattern = "CHURCH|SPORT|LIBRARY|BARBER", location_description) ~ "PUBLIC PLACES",
str_detect(pattern = "VEHICLE|TAXI|TRUCK|TRANSPORTATION", location_description) ~ "OTHER",
str_detect(pattern = "CTA", location_description) ~ "PUBLIC TRANSPORT",
str_detect(pattern = "^CHA ", location_description) ~ "CHICAGO HOUSING AUTHORITY PROPERTY",
str_detect(pattern = "RESIDENTIAL|APARTMENT|RESIDENCE|HOUSE|PORCH", location_description) ~ "RESIDENTIAL",
str_detect(pattern = "JAIL", location_description) ~ "JAIL",
str_detect(pattern = "LIQUOR|TAVERN|CLUB", location_description) ~ "PLACES WITH ALCOHOL",
str_detect(pattern = "STORE|SHOP|RESTAURANT", location_description) ~ "STORE/FOOD",
str_detect(pattern = "SCHOOL|COLLEGE", location_description) ~ "SCHOOL/UNIVERSITY",
str_detect(pattern = "RIVER|PRAIRIE|WOOD|FOREST|LAGOON|LAKE", location_description) ~ "NATURE",
str_detect(pattern = "POLICE|FEDERAL|GOVERNMENT|FIRE STATION", location_description) ~ "GOVERNMENT PROPERTY",
str_detect(pattern = "SIDEWALK|STREET", location_description) |location_description == "ALLEY" ~ "STREET/ALLEY",
str_detect(pattern = "OFFICE|FACTORY", location_description) ~ "OFFICE/FACTORY",
str_detect(pattern = "LOT|PARK PROPERTY", location_description) ~ "LOT/PARK",
str_detect(pattern = "BANK|CURRENCY|CREDIT|SAVINGS", location_description) ~ "FINANCIAL INSTITUTIONS",
str_detect(pattern = "GAS|CAR WASH", location_description) ~ "GAS STATION/CAR WASH",
str_detect(pattern = "RAILROAD", location_description) ~ "OTHER",
str_detect(pattern = "POOL|CONSTRUCTION|DAY CARE|HIGHWAY|COIN OPERATED|HOTEL", location_description) | location_description == "AUTO" ~ "OTHER",
n > 700 ~ location_description,
!is.na(location_description) ~ "OTHER"),
hr_of_day_grp = case_when(hr_of_day %in% c(8:12) ~ "Morning (8AM-12PM)",
hr_of_day %in% c(13:17) ~ "Afternoon (1-5 PM)",
hr_of_day %in% c(18:21) ~ "Early Night (6-9 PM)",
hr_of_day %in% c(22:23, 0:4) ~ "Late Night (10PM - 4 AM)",
hr_of_day %in% c(5:7) ~ "Early Morning (5-7AM)"))
binary_loc <- varhandle::to.dummy(model_inputs$location_mod,prefix = "loc") %>%
as.tibble()
model_inputs <- model_inputs %>%
bind_cols(binary_loc) %>%
clean_names()
model_inputs2 <- model_inputs %>%
select(violent, hr_of_day_grp, starts_with("loc"), snow_all, rain_all,temperature, day, mon, day_of_wk) %>%
na.omit
model_inputs3 <- model_inputs2 %>%
ungroup() %>%
select(violent, day, mon, day_of_wk, hr_of_day_grp, rain_all, temperature)
log_reg <- glm(formula = factor(violent) ~ factor(hr_of_day_grp) + factor(loc_residential) + factor(loc_school_university) + factor(loc_public_transport) + factor(loc_street_alley) + factor(loc_airport) + factor(loc_abandoned_building) + factor(loc_atm_automatic_teller_machine) +factor(loc_public_places) + factor(loc_places_with_alcohol) + factor(loc_store_food) +temperature + factor(rain_all) + factor(snow_all) +factor(loc_lot_park) + factor(loc_gas_station_car_wash),family = "binomial",data = model_inputs2)
summary_tbl <- tidy(log_reg) %>%
mutate(odds = exp(estimate)) %>%
select(term, estimate, odds, everything())
kable(summary_tbl %>% select(-std.error,-statistic), format = "html", digits = 2,col.names = c("Term", "Log Odds", "Odds", "P-Value"), align = "lccc") %>%
kable_styling(bootstrap_options = c("striped","hover")) %>%
scroll_box(height = "3.5in")
| Term | Log Odds | Odds | P-Value |
|---|---|---|---|
| (Intercept) | -2.26 | 0.10 | 0.00 |
| factor(hr_of_day_grp)Early Morning (5-7AM) | 0.16 | 1.18 | 0.00 |
| factor(hr_of_day_grp)Early Night (6-9 PM) | -0.06 | 0.94 | 0.00 |
| factor(hr_of_day_grp)Late Night (10PM - 4 AM) | 0.25 | 1.29 | 0.00 |
| factor(hr_of_day_grp)Morning (8AM-12PM) | -0.23 | 0.80 | 0.00 |
| factor(loc_residential)1 | 0.87 | 2.39 | 0.00 |
| factor(loc_school_university)1 | 1.25 | 3.49 | 0.00 |
| factor(loc_public_transport)1 | 0.35 | 1.41 | 0.00 |
| factor(loc_street_alley)1 | 0.31 | 1.36 | 0.00 |
| factor(loc_airport)1 | -1.18 | 0.31 | 0.00 |
| factor(loc_abandoned_building)1 | -0.16 | 0.85 | 0.00 |
| factor(loc_atm_automatic_teller_machine)1 | -2.16 | 0.11 | 0.00 |
| factor(loc_public_places)1 | 0.32 | 1.37 | 0.00 |
| factor(loc_places_with_alcohol)1 | 0.11 | 1.11 | 0.00 |
| factor(loc_store_food)1 | -0.35 | 0.71 | 0.00 |
| temperature | 0.00 | 1.00 | 0.00 |
| factor(rain_all)1 | -0.02 | 0.98 | 0.00 |
| factor(snow_all)1 | 0.01 | 1.01 | 0.32 |
| factor(loc_lot_park)1 | -0.09 | 0.92 | 0.00 |
| factor(loc_gas_station_car_wash)1 | 0.03 | 1.03 | 0.13 |
model_inputs3$violent <- as.factor(model_inputs3$violent)
model_inputs3$day <- as.factor(model_inputs3$day)
model_inputs3$day_of_wk <- as.factor(model_inputs3$day_of_wk)
model_inputs3$hr_of_day_grp <- as.factor(model_inputs3$hr_of_day_grp)
model_inputs3$rain_all <- as.factor(model_inputs3$rain_all)
model_inputs3$mon <- as.factor(model_inputs3$mon)
n <- sample(1:nrow(model_inputs3),size = .3*nrow(model_inputs3))
testing <- model_inputs3[n,]
training <- model_inputs3[-n,]
miss_rate <- function(predicted, actual){
val <- sum(actual != predicted, na.rm = T)/length(actual)
return(val)
}
cm_mat <- function(predicted, actual){
val <- confusionMatrix(data = predicted, reference = actual)
return(val$table)
}
log_reg_pred <- glm(formula = violent ~ hr_of_day_grp + day_of_wk + mon + day + temperature + rain_all,family = "binomial", data = training)
bayes <- naiveBayes(formula = violent ~ hr_of_day_grp + day_of_wk + mon + day + temperature + rain_all,data = training)
rf <- randomForest(y = training$violent, x = training %>% ungroup() %>% select(-violent),ntree = 70)
print(paste0("A binomial logistic regression model gives a misclassification rate of ",
round(miss_rate(predicted = round(predict(log_reg_pred,newdata = testing,type = "response")),actual = testing$violent)*100,1), "%."))
## [1] "A binomial logistic regression model gives a misclassification rate of 33.4%."
cm_mat(predicted = factor(round(predict(log_reg_pred,newdata = testing,type = "response"))),actual = testing$violent)
## Reference
## Prediction 0 1
## 0 239550 120091
## 1 0 0
print(paste0("A Naive Bayes model gives a misclassification rate of ",
round(miss_rate(predicted =predict(bayes,newdata = testing,threshold = .001),actual = testing$violent)*100,1), "%."))
## [1] "A Naive Bayes model gives a misclassification rate of 33.4%."
cm_mat(predicted =predict(bayes,newdata = testing,threshold = .001),actual = testing$violent)
## Reference
## Prediction 0 1
## 0 239550 120091
## 1 0 0
print(paste0("A random forest model with 70 trees gives a misclassification rate of ",
round(miss_rate(predicted = predict(object = rf,newdata = testing), actual = testing$violent)*100,1), "%."))
## [1] "A random forest model with 70 trees gives a misclassification rate of 33.4%."
cm_mat(predicted = predict(object = rf,newdata = testing), actual = testing$violent)
## Reference
## Prediction 0 1
## 0 238145 118823
## 1 1405 1268