Introduction

In the previous topics on crimes in Chicago we have explored dataset from gov.data and applied Pareto principle to select the most relevant types and locations of crimes in Chicago for estimation the difference in types of crimes versus day of the week and month. See https://rpubs.com/alex-lev/248923, https://rpubs.com/alex-lev/249124.

Research goal

Now we want to estimate criminal vilolence level in Chicago by comparing statistics for days of the week and months to the crime locations.

Data

For more about Chicago criminal data see https://catalog.data.gov/dataset/crimes-2001-to-present-398a4.

chicago_crime<-readRDS(file = "chicago_crime.rds") # it takes two minutes to load compressed file in memory

dim(chicago_crime)
## [1] 6263265      22
names(chicago_crime)
##  [1] "ID"                   "Case Number"          "Date"                
##  [4] "Block"                "IUCR"                 "Primary Type"        
##  [7] "Description"          "Location Description" "Arrest"              
## [10] "Domestic"             "Beat"                 "District"            
## [13] "Ward"                 "Community Area"       "FBI Code"            
## [16] "X Coordinate"         "Y Coordinate"         "Year"                
## [19] "Updated On"           "Latitude"             "Longitude"           
## [22] "Location"

Preparing data

We have to generate some additional variables for days of the week and month.

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
head(chicago_crime$Date)# date as character
## [1] "02/20/2011 02:05:00 PM" "02/20/2011 03:30:00 PM"
## [3] "02/20/2011 07:15:00 PM" "02/20/2011 05:10:00 PM"
## [5] "02/18/2011 07:00:00 AM" "02/20/2011 01:00:00 PM"
chicago_crime$Day_time=strptime(chicago_crime$Date, format="%m/%d/%Y %I:%M:%S %p")

chicago_crime$Day_week<-format(chicago_crime$Day_time,'%A')

table(chicago_crime$Day_week) # number of crimes by day of the week
## 
##    Friday    Monday  Saturday    Sunday  Thursday   Tuesday Wednesday 
##    943674    883342    894295    844193    894729    898632    904400
chicago_crime$Month<-format(chicago_crime$Day_time,'%B')

table(chicago_crime$Month) # number of crimes by month
## 
##     April    August  December  February   January      July      June 
##    516097    574234    460922    427162    505187    580322    551891 
##     March       May  November   October September 
##    515521    554846    490906    546844    539333
chicago_crime$M<-format(chicago_crime$Day_time,'%m')

chicago_crime$MM<-as.integer(chicago_crime$M) # month as integer

hist(chicago_crime$MM,main = "Number of crimes",xlab = "Number of month",breaks = 12)

Contigency tables

The main locations of crime in Chicago are STREET, RESIDENCE, APARTMENT, SIDEWALK, OTHER, PARKING LOT/GARAGE(NON.RESID.) and ALLEY - 80% of all crimes. Now we’ll try to estimate difference in number of crimes by location using contigency tables. See https://en.wikipedia.org/wiki/Contingency_table.

cr_m_dw_<-filter(chicago_crime[,-23]) %>% group_by(Month,Day_week) 
cr_md<-cr_m_dw_[,23:24]
head(cr_md,10)
## Source: local data frame [10 x 2]
## Groups: Month, Day_week [4]
## 
##     Day_week    Month
##        <chr>    <chr>
## 1     Sunday February
## 2     Sunday February
## 3     Sunday February
## 4     Sunday February
## 5     Friday February
## 6     Sunday February
## 7     Sunday February
## 8  Wednesday February
## 9     Sunday February
## 10  Saturday February
cr_md_prm<-cr_m_dw_[,c(8,23:24)] # plus locations of crime
cr_md_prm
## Source: local data frame [6,263,265 x 3]
## Groups: Month, Day_week [84]
## 
##            `Location Description`  Day_week    Month
##                             <chr>     <chr>    <chr>
## 1                       APARTMENT    Sunday February
## 2  PARKING LOT/GARAGE(NON.RESID.)    Sunday February
## 3                          STREET    Sunday February
## 4                        SIDEWALK    Sunday February
## 5  PARKING LOT/GARAGE(NON.RESID.)    Friday February
## 6                       APARTMENT    Sunday February
## 7                       RESIDENCE    Sunday February
## 8                        SIDEWALK Wednesday February
## 9                       RESIDENCE    Sunday February
## 10 PARKING LOT/GARAGE(NON.RESID.)  Saturday February
## # ... with 6,263,255 more rows
cr_md_prm_str<-filter(cr_md_prm,`Location Description`=="STREET")
cr_md_prm_str_<-cr_md_prm_str[,2:3]

cr_md_prm_res<-filter(cr_md_prm,`Location Description`=="RESIDENCE")
cr_md_prm_res_<-cr_md_prm_res[,2:3]


cr_md_prm_apt<-filter(cr_md_prm,`Location Description`=="APARTMENT")
cr_md_prm_apt_<-cr_md_prm_apt[,2:3]

cr_md_prm_sdw<-filter(cr_md_prm,`Location Description`=="SIDEWALK")
cr_md_prm_sdw_<-cr_md_prm_sdw[,2:3]

cr_md_prm_oth<-filter(cr_md_prm,`Location Description`=="OTHER")
cr_md_prm_oth_<-cr_md_prm_oth[,2:3]

cr_md_prm_prk<-filter(cr_md_prm,`Location Description`=="PARKING LOT/GARAGE(NON.RESID.)")
cr_md_prm_prk_<-cr_md_prm_prk[,2:3]


cr_md_prm_aly<-filter(cr_md_prm,`Location Description`=="ALLEY")
cr_md_prm_aly_<-cr_md_prm_aly[,2:3]


cr_md_prm_str_tab<-table(cr_md_prm_str_) #street

cr_md_prm_res_tab<-table(cr_md_prm_res_) #residence

cr_md_prm_apt_tab<-table(cr_md_prm_apt_) #apartment

cr_md_prm_sdw_tab<-table(cr_md_prm_sdw_) #sidewalk

cr_md_prm_oth_tab<-table(cr_md_prm_oth_) #other

cr_md_prm_prk_tab<-table(cr_md_prm_prk_) #parking 

cr_md_prm_aly_tab<-table(cr_md_prm_aly_) #alley

Chi-squared test

We use Chi-squared test for contingency tables which we have generated above. See https://en.wikipedia.org/wiki/Chi-squared_test

#street
ch.fit.str<-chisq.test(cr_md_prm_str_tab)
ch.fit.str
## 
##  Pearson's Chi-squared test
## 
## data:  cr_md_prm_str_tab
## X-squared = 591.01, df = 66, p-value < 2.2e-16
#residence
ch.fit.res<-chisq.test(cr_md_prm_res_tab)
ch.fit.res
## 
##  Pearson's Chi-squared test
## 
## data:  cr_md_prm_res_tab
## X-squared = 421.71, df = 66, p-value < 2.2e-16
#apartment
ch.fit.apt<-chisq.test(cr_md_prm_apt_tab)
ch.fit.apt
## 
##  Pearson's Chi-squared test
## 
## data:  cr_md_prm_apt_tab
## X-squared = 176.72, df = 66, p-value = 4.75e-12
#sidewalk
ch.fit.sdw<-chisq.test(cr_md_prm_sdw_tab)
ch.fit.sdw
## 
##  Pearson's Chi-squared test
## 
## data:  cr_md_prm_sdw_tab
## X-squared = 398.15, df = 66, p-value < 2.2e-16
#other
ch.fit.oth<-chisq.test(cr_md_prm_oth_tab)
ch.fit.oth
## 
##  Pearson's Chi-squared test
## 
## data:  cr_md_prm_oth_tab
## X-squared = 152.28, df = 66, p-value = 8.991e-09
#parking
ch.fit.prk<-chisq.test(cr_md_prm_prk_tab)
ch.fit.prk
## 
##  Pearson's Chi-squared test
## 
## data:  cr_md_prm_prk_tab
## X-squared = 141.04, df = 66, p-value = 2.254e-07
#alley
ch.fit.aly<-chisq.test(cr_md_prm_aly_tab)
ch.fit.aly
## 
##  Pearson's Chi-squared test
## 
## data:  cr_md_prm_aly_tab
## X-squared = 94.355, df = 66, p-value = 0.01257

Graphical interpretation

Here we produce charts for demonstating differences between locations of crimes using the results by Chi-squared test. Let’s see!

library(corrplot)
# Visualize the contribution of day of the week and month 

#street
contrib.str <- 100*ch.fit.str$residuals^2/ch.fit.str$statistic
round(contrib.str, 3)
##            Month
## Day_week    April August December February January  July  June March   May
##   Friday    0.756  0.159    0.003    3.444   0.179 7.716 1.155 0.754 0.237
##   Monday    0.001  0.015    0.145    0.003   0.000 1.645 0.833 0.038 0.124
##   Saturday  0.360  1.522    0.165    0.106   0.469 1.657 0.160 2.184 2.116
##   Sunday    0.492  0.546    1.419    1.010   3.572 3.024 5.844 5.028 1.304
##   Thursday  0.001  0.000    0.363    0.019   0.949 0.000 1.802 1.241 1.063
##   Tuesday   0.185  0.202    0.027    1.114   0.005 3.889 1.505 0.450 1.161
##   Wednesday 0.000  0.433    0.001    0.000   0.360 0.607 0.561 0.165 0.148
##            Month
## Day_week    November October September
##   Friday       0.277   0.598     1.097
##   Monday       0.561   0.015     1.026
##   Saturday     5.196   1.373     1.891
##   Sunday       0.521   0.048     4.981
##   Thursday     7.631   0.430     1.305
##   Tuesday      0.000   0.002     0.035
##   Wednesday    0.017   2.096     6.467
corrplot(contrib.str, is.cor = FALSE)

#residence
contrib.res <- 100*ch.fit.res$residuals^2/ch.fit.res$statistic
round(contrib.res, 3)
##            Month
## Day_week    April August December February January  July  June March   May
##   Friday    0.298  1.630    0.888    3.541   1.624 1.232 1.743 2.527 0.082
##   Monday    2.524  0.365    7.692    0.231   1.266 0.170 0.016 1.757 0.263
##   Saturday  2.520  0.290    0.047    0.149   0.035 0.219 0.513 4.388 1.297
##   Sunday    0.038  0.042    2.248    0.043   1.921 1.419 3.114 0.925 0.360
##   Thursday  0.643  0.108    0.080    0.018   3.762 0.362 1.206 0.529 0.162
##   Tuesday   0.023  1.362    0.195    1.941   5.636 1.521 0.178 0.602 3.866
##   Wednesday 0.012  0.805    0.333    0.184   0.100 0.398 0.266 1.892 0.160
##            Month
## Day_week    November October September
##   Friday       0.965   0.488     1.121
##   Monday       1.390   0.649     0.016
##   Saturday     1.294   0.145     0.992
##   Sunday       0.534   0.013     6.141
##   Thursday     1.145   2.105     0.143
##   Tuesday      0.330   0.355     0.355
##   Wednesday    0.515   5.908     1.638
corrplot(contrib.res, is.cor = FALSE)

#apartment
contrib.apt <- 100*ch.fit.apt$residuals^2/ch.fit.apt$statistic
round(contrib.apt, 3)
##            Month
## Day_week     April August December February January   July   June  March
##   Friday     0.311  0.002    0.096    1.082   1.422  0.025  0.752  0.362
##   Monday     0.090  0.172    1.389    0.013   0.297  0.092  1.166  1.242
##   Saturday   1.647  0.004    0.081    0.002   0.357  4.814  0.312  3.808
##   Sunday     0.841  0.329   10.724    8.737   1.834  0.197  3.539  0.005
##   Thursday   0.015  1.317    0.531    3.385   9.935  0.003  2.204  0.170
##   Tuesday    0.167  0.092    0.469    4.186   0.911  1.918  2.998  1.132
##   Wednesday  0.753  0.985    0.616    0.083   2.431  0.269  0.247  4.349
##            Month
## Day_week       May November October September
##   Friday     0.608    0.125   0.305     0.081
##   Monday     0.214    0.062   0.043     0.037
##   Saturday   0.924    0.156   0.923     0.075
##   Sunday     0.023    0.174   0.001     2.058
##   Thursday   0.772    0.004   0.012     0.031
##   Tuesday    0.126    0.308   2.038     0.149
##   Wednesday  0.516    1.265   1.824     2.238
corrplot(contrib.apt, is.cor = FALSE)

#sidewalk
contrib.sdw <- 100*ch.fit.sdw$residuals^2/ch.fit.sdw$statistic
round(contrib.sdw, 3)
##            Month
## Day_week     April August December February January   July   June  March
##   Friday     0.000  0.153    3.879    2.481   0.075  3.827  1.078  0.003
##   Monday     0.002  0.034    0.409    0.043   0.203  0.594  0.206  0.131
##   Saturday   0.096  0.002    0.470    1.413   0.132  0.556  0.144  0.389
##   Sunday     0.294  4.529    4.239    3.347   0.983 10.739  4.978  4.332
##   Thursday   0.701  0.033    0.073    0.062   0.019  0.058  0.217  2.728
##   Tuesday    0.701  1.276    0.127    2.356   0.003  0.677  0.401  1.021
##   Wednesday  0.677  0.295    0.005    0.258   0.001  0.108  0.017  0.126
##            Month
## Day_week       May November October September
##   Friday     0.345    1.631   0.002     0.018
##   Monday     1.137    0.003   0.076     0.265
##   Saturday   2.894    1.481   0.237     0.134
##   Sunday     6.469    0.028   0.281     1.583
##   Thursday   3.676    9.736   1.160     0.096
##   Tuesday    1.327    0.489   0.004     0.329
##   Wednesday  0.530    0.065   0.879     3.422
corrplot(contrib.sdw, is.cor = FALSE)

#other
contrib.oth <- 100*ch.fit.oth$residuals^2/ch.fit.oth$statistic
round(contrib.oth, 3)
##            Month
## Day_week    April August December February January  July  June March   May
##   Friday    0.042  1.868    0.618    2.166   2.457 0.870 0.017 0.132 0.431
##   Monday    4.276  0.099    1.253    0.007   0.567 0.091 0.122 0.137 3.911
##   Saturday  3.822  0.031    0.953    0.512   0.136 2.489 0.544 2.523 0.920
##   Sunday    2.191  0.002    4.318    1.809   0.535 0.498 2.841 2.130 0.045
##   Thursday  0.008  0.002    0.347    0.191   1.307 0.334 0.006 0.697 0.021
##   Tuesday   1.717  0.422    0.016    0.010   0.481 0.758 1.717 0.606 1.108
##   Wednesday 0.031  0.681    0.092    2.101   2.486 1.609 1.828 0.079 1.563
##            Month
## Day_week    November October September
##   Friday       3.102   2.333     1.212
##   Monday       0.036   0.290     1.591
##   Saturday     4.932   1.713     0.215
##   Sunday       5.790   0.615     2.278
##   Thursday     5.262   0.283     0.259
##   Tuesday      0.093   2.741     0.648
##   Wednesday    0.031   0.460     0.534
corrplot(contrib.oth, is.cor = FALSE)

#parking
contrib.prk <- 100*ch.fit.prk$residuals^2/ch.fit.prk$statistic
round(contrib.prk, 3)
##            Month
## Day_week     April August December February January   July   June  March
##   Friday     0.253  0.000    4.840    2.039   2.508  1.920  0.006  0.139
##   Monday     0.252  0.472    3.549    0.579   1.192  0.021  0.010  1.051
##   Saturday   0.070  0.376    0.266    2.291   2.367  1.120  0.039  0.985
##   Sunday     5.325  0.207    0.078    0.244   0.697  0.005  2.718 12.615
##   Thursday   6.444  0.041    0.567    0.885   0.019  0.221  1.438  1.711
##   Tuesday    0.288  0.087    0.656    1.625   0.012  3.283  0.233  0.491
##   Wednesday  0.240  0.174    0.555    0.334   0.360  0.042  0.437  0.038
##            Month
## Day_week       May November October September
##   Friday     0.062    0.399   0.013     0.001
##   Monday     0.010    0.234   0.173     3.732
##   Saturday   1.165    0.014   1.247     0.534
##   Sunday     0.342    3.652   1.892     4.121
##   Thursday   0.663    6.799   0.628     0.080
##   Tuesday    0.577    0.325   0.289     0.648
##   Wednesday  0.058    0.079   2.030     1.818
corrplot(contrib.prk, is.cor = FALSE)

#alley
contrib.aly <- 100*ch.fit.aly$residuals^2/ch.fit.aly$statistic
round(contrib.aly, 3)
##            Month
## Day_week    April August December February January  July  June March   May
##   Friday    0.126  1.201    0.105    0.952   0.402 0.028 0.707 0.340 1.735
##   Monday    0.780  0.224    0.100    0.667   0.327 0.005 0.006 0.046 4.868
##   Saturday  0.818  0.097    2.101    0.549   0.310 0.111 0.000 0.198 1.274
##   Sunday    2.100  0.008    0.067    0.006   4.386 0.009 9.990 3.611 0.131
##   Thursday  3.195  0.029    0.048    0.043   0.074 0.257 4.274 1.094 0.128
##   Tuesday   2.516  0.006    0.243    0.189   1.762 0.045 0.305 1.509 0.414
##   Wednesday 0.719  0.437    0.161    0.112   0.137 0.315 0.057 1.458 0.096
##            Month
## Day_week    November October September
##   Friday       1.335   0.465     3.643
##   Monday       0.003   0.992     4.447
##   Saturday     2.722   0.527     1.141
##   Sunday       0.668   0.059     6.366
##   Thursday     3.398   4.292     5.159
##   Tuesday      0.012   0.020     1.910
##   Wednesday    0.188   4.468     0.173
corrplot(contrib.aly, is.cor = FALSE)

Conclusions

  1. For STREET we can see Friday in July, Thursday in November and Wendsday in September as the most critical time, while Tuesday in February, June and March, Thursday in January and Friday in June contribute very little.
  2. For RESIDENCE we can see Monday in December as the most critical time, while Monday in January, March and November contribute very little.
  3. For APARTMENT we can see Sunday in December and February together with Thursday in January as the most critical time (maybe too cold). But Mondays in December, June and March are so weak.
  4. For SIDEWALK we can see Sunday in July and Thursday in November as the most critical time, while Tuesday in May and Saturday in November are so week.
  5. For OTHER we can see Saturday, Sunday and Thursday in November as critical time, while Tuesday in July and May, Thursday in March and Saturday in May are soo weak.
  6. For PARKING LOT/GARAGE(NON.RESID.) we can see Sunday in March as critical time, while Thursday in June and March are so weak.
  7. For ALLEY we can see Sunday in June as the most critical time, while Wednesday and Tuesday in March are so weak.