In the previous topics on crimes in Chicago we have explored dataset from gov.data and applied Pareto principle to select the most relevant types and locations of crimes in Chicago for estimation the difference in types of crimes versus day of the week and month. See https://rpubs.com/alex-lev/248923, https://rpubs.com/alex-lev/249124.
Now we want to estimate criminal vilolence level in Chicago by comparing statistics for days of the week and months to the crime locations.
For more about Chicago criminal data see https://catalog.data.gov/dataset/crimes-2001-to-present-398a4.
chicago_crime<-readRDS(file = "chicago_crime.rds") # it takes two minutes to load compressed file in memory
dim(chicago_crime)
## [1] 6263265 22
names(chicago_crime)
## [1] "ID" "Case Number" "Date"
## [4] "Block" "IUCR" "Primary Type"
## [7] "Description" "Location Description" "Arrest"
## [10] "Domestic" "Beat" "District"
## [13] "Ward" "Community Area" "FBI Code"
## [16] "X Coordinate" "Y Coordinate" "Year"
## [19] "Updated On" "Latitude" "Longitude"
## [22] "Location"
We have to generate some additional variables for days of the week and month.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
head(chicago_crime$Date)# date as character
## [1] "02/20/2011 02:05:00 PM" "02/20/2011 03:30:00 PM"
## [3] "02/20/2011 07:15:00 PM" "02/20/2011 05:10:00 PM"
## [5] "02/18/2011 07:00:00 AM" "02/20/2011 01:00:00 PM"
chicago_crime$Day_time=strptime(chicago_crime$Date, format="%m/%d/%Y %I:%M:%S %p")
chicago_crime$Day_week<-format(chicago_crime$Day_time,'%A')
table(chicago_crime$Day_week) # number of crimes by day of the week
##
## Friday Monday Saturday Sunday Thursday Tuesday Wednesday
## 943674 883342 894295 844193 894729 898632 904400
chicago_crime$Month<-format(chicago_crime$Day_time,'%B')
table(chicago_crime$Month) # number of crimes by month
##
## April August December February January July June
## 516097 574234 460922 427162 505187 580322 551891
## March May November October September
## 515521 554846 490906 546844 539333
chicago_crime$M<-format(chicago_crime$Day_time,'%m')
chicago_crime$MM<-as.integer(chicago_crime$M) # month as integer
hist(chicago_crime$MM,main = "Number of crimes",xlab = "Number of month",breaks = 12)
The main locations of crime in Chicago are STREET, RESIDENCE, APARTMENT, SIDEWALK, OTHER, PARKING LOT/GARAGE(NON.RESID.) and ALLEY - 80% of all crimes. Now we’ll try to estimate difference in number of crimes by location using contigency tables. See https://en.wikipedia.org/wiki/Contingency_table.
cr_m_dw_<-filter(chicago_crime[,-23]) %>% group_by(Month,Day_week)
cr_md<-cr_m_dw_[,23:24]
head(cr_md,10)
## Source: local data frame [10 x 2]
## Groups: Month, Day_week [4]
##
## Day_week Month
## <chr> <chr>
## 1 Sunday February
## 2 Sunday February
## 3 Sunday February
## 4 Sunday February
## 5 Friday February
## 6 Sunday February
## 7 Sunday February
## 8 Wednesday February
## 9 Sunday February
## 10 Saturday February
cr_md_prm<-cr_m_dw_[,c(8,23:24)] # plus locations of crime
cr_md_prm
## Source: local data frame [6,263,265 x 3]
## Groups: Month, Day_week [84]
##
## `Location Description` Day_week Month
## <chr> <chr> <chr>
## 1 APARTMENT Sunday February
## 2 PARKING LOT/GARAGE(NON.RESID.) Sunday February
## 3 STREET Sunday February
## 4 SIDEWALK Sunday February
## 5 PARKING LOT/GARAGE(NON.RESID.) Friday February
## 6 APARTMENT Sunday February
## 7 RESIDENCE Sunday February
## 8 SIDEWALK Wednesday February
## 9 RESIDENCE Sunday February
## 10 PARKING LOT/GARAGE(NON.RESID.) Saturday February
## # ... with 6,263,255 more rows
cr_md_prm_str<-filter(cr_md_prm,`Location Description`=="STREET")
cr_md_prm_str_<-cr_md_prm_str[,2:3]
cr_md_prm_res<-filter(cr_md_prm,`Location Description`=="RESIDENCE")
cr_md_prm_res_<-cr_md_prm_res[,2:3]
cr_md_prm_apt<-filter(cr_md_prm,`Location Description`=="APARTMENT")
cr_md_prm_apt_<-cr_md_prm_apt[,2:3]
cr_md_prm_sdw<-filter(cr_md_prm,`Location Description`=="SIDEWALK")
cr_md_prm_sdw_<-cr_md_prm_sdw[,2:3]
cr_md_prm_oth<-filter(cr_md_prm,`Location Description`=="OTHER")
cr_md_prm_oth_<-cr_md_prm_oth[,2:3]
cr_md_prm_prk<-filter(cr_md_prm,`Location Description`=="PARKING LOT/GARAGE(NON.RESID.)")
cr_md_prm_prk_<-cr_md_prm_prk[,2:3]
cr_md_prm_aly<-filter(cr_md_prm,`Location Description`=="ALLEY")
cr_md_prm_aly_<-cr_md_prm_aly[,2:3]
cr_md_prm_str_tab<-table(cr_md_prm_str_) #street
cr_md_prm_res_tab<-table(cr_md_prm_res_) #residence
cr_md_prm_apt_tab<-table(cr_md_prm_apt_) #apartment
cr_md_prm_sdw_tab<-table(cr_md_prm_sdw_) #sidewalk
cr_md_prm_oth_tab<-table(cr_md_prm_oth_) #other
cr_md_prm_prk_tab<-table(cr_md_prm_prk_) #parking
cr_md_prm_aly_tab<-table(cr_md_prm_aly_) #alley
We use Chi-squared test for contingency tables which we have generated above. See https://en.wikipedia.org/wiki/Chi-squared_test
#street
ch.fit.str<-chisq.test(cr_md_prm_str_tab)
ch.fit.str
##
## Pearson's Chi-squared test
##
## data: cr_md_prm_str_tab
## X-squared = 591.01, df = 66, p-value < 2.2e-16
#residence
ch.fit.res<-chisq.test(cr_md_prm_res_tab)
ch.fit.res
##
## Pearson's Chi-squared test
##
## data: cr_md_prm_res_tab
## X-squared = 421.71, df = 66, p-value < 2.2e-16
#apartment
ch.fit.apt<-chisq.test(cr_md_prm_apt_tab)
ch.fit.apt
##
## Pearson's Chi-squared test
##
## data: cr_md_prm_apt_tab
## X-squared = 176.72, df = 66, p-value = 4.75e-12
#sidewalk
ch.fit.sdw<-chisq.test(cr_md_prm_sdw_tab)
ch.fit.sdw
##
## Pearson's Chi-squared test
##
## data: cr_md_prm_sdw_tab
## X-squared = 398.15, df = 66, p-value < 2.2e-16
#other
ch.fit.oth<-chisq.test(cr_md_prm_oth_tab)
ch.fit.oth
##
## Pearson's Chi-squared test
##
## data: cr_md_prm_oth_tab
## X-squared = 152.28, df = 66, p-value = 8.991e-09
#parking
ch.fit.prk<-chisq.test(cr_md_prm_prk_tab)
ch.fit.prk
##
## Pearson's Chi-squared test
##
## data: cr_md_prm_prk_tab
## X-squared = 141.04, df = 66, p-value = 2.254e-07
#alley
ch.fit.aly<-chisq.test(cr_md_prm_aly_tab)
ch.fit.aly
##
## Pearson's Chi-squared test
##
## data: cr_md_prm_aly_tab
## X-squared = 94.355, df = 66, p-value = 0.01257
Here we produce charts for demonstating differences between locations of crimes using the results by Chi-squared test. Let’s see!
library(corrplot)
# Visualize the contribution of day of the week and month
#street
contrib.str <- 100*ch.fit.str$residuals^2/ch.fit.str$statistic
round(contrib.str, 3)
## Month
## Day_week April August December February January July June March May
## Friday 0.756 0.159 0.003 3.444 0.179 7.716 1.155 0.754 0.237
## Monday 0.001 0.015 0.145 0.003 0.000 1.645 0.833 0.038 0.124
## Saturday 0.360 1.522 0.165 0.106 0.469 1.657 0.160 2.184 2.116
## Sunday 0.492 0.546 1.419 1.010 3.572 3.024 5.844 5.028 1.304
## Thursday 0.001 0.000 0.363 0.019 0.949 0.000 1.802 1.241 1.063
## Tuesday 0.185 0.202 0.027 1.114 0.005 3.889 1.505 0.450 1.161
## Wednesday 0.000 0.433 0.001 0.000 0.360 0.607 0.561 0.165 0.148
## Month
## Day_week November October September
## Friday 0.277 0.598 1.097
## Monday 0.561 0.015 1.026
## Saturday 5.196 1.373 1.891
## Sunday 0.521 0.048 4.981
## Thursday 7.631 0.430 1.305
## Tuesday 0.000 0.002 0.035
## Wednesday 0.017 2.096 6.467
corrplot(contrib.str, is.cor = FALSE)
#residence
contrib.res <- 100*ch.fit.res$residuals^2/ch.fit.res$statistic
round(contrib.res, 3)
## Month
## Day_week April August December February January July June March May
## Friday 0.298 1.630 0.888 3.541 1.624 1.232 1.743 2.527 0.082
## Monday 2.524 0.365 7.692 0.231 1.266 0.170 0.016 1.757 0.263
## Saturday 2.520 0.290 0.047 0.149 0.035 0.219 0.513 4.388 1.297
## Sunday 0.038 0.042 2.248 0.043 1.921 1.419 3.114 0.925 0.360
## Thursday 0.643 0.108 0.080 0.018 3.762 0.362 1.206 0.529 0.162
## Tuesday 0.023 1.362 0.195 1.941 5.636 1.521 0.178 0.602 3.866
## Wednesday 0.012 0.805 0.333 0.184 0.100 0.398 0.266 1.892 0.160
## Month
## Day_week November October September
## Friday 0.965 0.488 1.121
## Monday 1.390 0.649 0.016
## Saturday 1.294 0.145 0.992
## Sunday 0.534 0.013 6.141
## Thursday 1.145 2.105 0.143
## Tuesday 0.330 0.355 0.355
## Wednesday 0.515 5.908 1.638
corrplot(contrib.res, is.cor = FALSE)
#apartment
contrib.apt <- 100*ch.fit.apt$residuals^2/ch.fit.apt$statistic
round(contrib.apt, 3)
## Month
## Day_week April August December February January July June March
## Friday 0.311 0.002 0.096 1.082 1.422 0.025 0.752 0.362
## Monday 0.090 0.172 1.389 0.013 0.297 0.092 1.166 1.242
## Saturday 1.647 0.004 0.081 0.002 0.357 4.814 0.312 3.808
## Sunday 0.841 0.329 10.724 8.737 1.834 0.197 3.539 0.005
## Thursday 0.015 1.317 0.531 3.385 9.935 0.003 2.204 0.170
## Tuesday 0.167 0.092 0.469 4.186 0.911 1.918 2.998 1.132
## Wednesday 0.753 0.985 0.616 0.083 2.431 0.269 0.247 4.349
## Month
## Day_week May November October September
## Friday 0.608 0.125 0.305 0.081
## Monday 0.214 0.062 0.043 0.037
## Saturday 0.924 0.156 0.923 0.075
## Sunday 0.023 0.174 0.001 2.058
## Thursday 0.772 0.004 0.012 0.031
## Tuesday 0.126 0.308 2.038 0.149
## Wednesday 0.516 1.265 1.824 2.238
corrplot(contrib.apt, is.cor = FALSE)
#sidewalk
contrib.sdw <- 100*ch.fit.sdw$residuals^2/ch.fit.sdw$statistic
round(contrib.sdw, 3)
## Month
## Day_week April August December February January July June March
## Friday 0.000 0.153 3.879 2.481 0.075 3.827 1.078 0.003
## Monday 0.002 0.034 0.409 0.043 0.203 0.594 0.206 0.131
## Saturday 0.096 0.002 0.470 1.413 0.132 0.556 0.144 0.389
## Sunday 0.294 4.529 4.239 3.347 0.983 10.739 4.978 4.332
## Thursday 0.701 0.033 0.073 0.062 0.019 0.058 0.217 2.728
## Tuesday 0.701 1.276 0.127 2.356 0.003 0.677 0.401 1.021
## Wednesday 0.677 0.295 0.005 0.258 0.001 0.108 0.017 0.126
## Month
## Day_week May November October September
## Friday 0.345 1.631 0.002 0.018
## Monday 1.137 0.003 0.076 0.265
## Saturday 2.894 1.481 0.237 0.134
## Sunday 6.469 0.028 0.281 1.583
## Thursday 3.676 9.736 1.160 0.096
## Tuesday 1.327 0.489 0.004 0.329
## Wednesday 0.530 0.065 0.879 3.422
corrplot(contrib.sdw, is.cor = FALSE)
#other
contrib.oth <- 100*ch.fit.oth$residuals^2/ch.fit.oth$statistic
round(contrib.oth, 3)
## Month
## Day_week April August December February January July June March May
## Friday 0.042 1.868 0.618 2.166 2.457 0.870 0.017 0.132 0.431
## Monday 4.276 0.099 1.253 0.007 0.567 0.091 0.122 0.137 3.911
## Saturday 3.822 0.031 0.953 0.512 0.136 2.489 0.544 2.523 0.920
## Sunday 2.191 0.002 4.318 1.809 0.535 0.498 2.841 2.130 0.045
## Thursday 0.008 0.002 0.347 0.191 1.307 0.334 0.006 0.697 0.021
## Tuesday 1.717 0.422 0.016 0.010 0.481 0.758 1.717 0.606 1.108
## Wednesday 0.031 0.681 0.092 2.101 2.486 1.609 1.828 0.079 1.563
## Month
## Day_week November October September
## Friday 3.102 2.333 1.212
## Monday 0.036 0.290 1.591
## Saturday 4.932 1.713 0.215
## Sunday 5.790 0.615 2.278
## Thursday 5.262 0.283 0.259
## Tuesday 0.093 2.741 0.648
## Wednesday 0.031 0.460 0.534
corrplot(contrib.oth, is.cor = FALSE)
#parking
contrib.prk <- 100*ch.fit.prk$residuals^2/ch.fit.prk$statistic
round(contrib.prk, 3)
## Month
## Day_week April August December February January July June March
## Friday 0.253 0.000 4.840 2.039 2.508 1.920 0.006 0.139
## Monday 0.252 0.472 3.549 0.579 1.192 0.021 0.010 1.051
## Saturday 0.070 0.376 0.266 2.291 2.367 1.120 0.039 0.985
## Sunday 5.325 0.207 0.078 0.244 0.697 0.005 2.718 12.615
## Thursday 6.444 0.041 0.567 0.885 0.019 0.221 1.438 1.711
## Tuesday 0.288 0.087 0.656 1.625 0.012 3.283 0.233 0.491
## Wednesday 0.240 0.174 0.555 0.334 0.360 0.042 0.437 0.038
## Month
## Day_week May November October September
## Friday 0.062 0.399 0.013 0.001
## Monday 0.010 0.234 0.173 3.732
## Saturday 1.165 0.014 1.247 0.534
## Sunday 0.342 3.652 1.892 4.121
## Thursday 0.663 6.799 0.628 0.080
## Tuesday 0.577 0.325 0.289 0.648
## Wednesday 0.058 0.079 2.030 1.818
corrplot(contrib.prk, is.cor = FALSE)
#alley
contrib.aly <- 100*ch.fit.aly$residuals^2/ch.fit.aly$statistic
round(contrib.aly, 3)
## Month
## Day_week April August December February January July June March May
## Friday 0.126 1.201 0.105 0.952 0.402 0.028 0.707 0.340 1.735
## Monday 0.780 0.224 0.100 0.667 0.327 0.005 0.006 0.046 4.868
## Saturday 0.818 0.097 2.101 0.549 0.310 0.111 0.000 0.198 1.274
## Sunday 2.100 0.008 0.067 0.006 4.386 0.009 9.990 3.611 0.131
## Thursday 3.195 0.029 0.048 0.043 0.074 0.257 4.274 1.094 0.128
## Tuesday 2.516 0.006 0.243 0.189 1.762 0.045 0.305 1.509 0.414
## Wednesday 0.719 0.437 0.161 0.112 0.137 0.315 0.057 1.458 0.096
## Month
## Day_week November October September
## Friday 1.335 0.465 3.643
## Monday 0.003 0.992 4.447
## Saturday 2.722 0.527 1.141
## Sunday 0.668 0.059 6.366
## Thursday 3.398 4.292 5.159
## Tuesday 0.012 0.020 1.910
## Wednesday 0.188 4.468 0.173
corrplot(contrib.aly, is.cor = FALSE)