In the previous topic on crimes in Chicago we have explored dataset from gov.data and applied Pareto principle to select the most relevant types and locations of crimes in Chicago. See https://rpubs.com/alex-lev/248923.
Now we want to estimate criminal vilolence level in Chicago by comparing statistics for days of the week and months.
For more about Chicago criminal data see https://catalog.data.gov/dataset/crimes-2001-to-present-398a4.
chicago_crime<-readRDS(file = "chicago_crime.rds") # it takes two minutes to load compressed file in memory
dim(chicago_crime)
## [1] 6263265 22
names(chicago_crime)
## [1] "ID" "Case Number" "Date"
## [4] "Block" "IUCR" "Primary Type"
## [7] "Description" "Location Description" "Arrest"
## [10] "Domestic" "Beat" "District"
## [13] "Ward" "Community Area" "FBI Code"
## [16] "X Coordinate" "Y Coordinate" "Year"
## [19] "Updated On" "Latitude" "Longitude"
## [22] "Location"
We have to generate some additional variables for days of the week and month.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
head(chicago_crime$Date)# date as character
## [1] "02/20/2011 02:05:00 PM" "02/20/2011 03:30:00 PM"
## [3] "02/20/2011 07:15:00 PM" "02/20/2011 05:10:00 PM"
## [5] "02/18/2011 07:00:00 AM" "02/20/2011 01:00:00 PM"
chicago_crime$Day_time=strptime(chicago_crime$Date, format="%m/%d/%Y %I:%M:%S %p")
chicago_crime$Day_week<-format(chicago_crime$Day_time,'%A')
table(chicago_crime$Day_week) # number of crimes by day of the week
##
## Friday Monday Saturday Sunday Thursday Tuesday Wednesday
## 943674 883342 894295 844193 894729 898632 904400
chicago_crime$Month<-format(chicago_crime$Day_time,'%B')
table(chicago_crime$Month) # number of crimes by month
##
## April August December February January July June
## 516097 574234 460922 427162 505187 580322 551891
## March May November October September
## 515521 554846 490906 546844 539333
chicago_crime$M<-format(chicago_crime$Day_time,'%m')
chicago_crime$MM<-as.integer(chicago_crime$M) # month as integer
hist(chicago_crime$MM,main = "Number of crimes",xlab = "Number of month",breaks = 12)
The main types of crime in Chicago are THEFT, BATTERY, CRIMINAL DAMAGE, NARCOTICS, OTHER OFFENSE, ASSAULT and BURGLARY - 80% of all crimes. Now we’ll try to estimate difference in number of crimes by types using contigency tables. See https://en.wikipedia.org/wiki/Contingency_table.
cr_m_dw_<-filter(chicago_crime[,-23]) %>% group_by(Month,Day_week)
cr_md<-cr_m_dw_[,23:24]
head(cr_md,10)
## Source: local data frame [10 x 2]
## Groups: Month, Day_week [4]
##
## Day_week Month
## <chr> <chr>
## 1 Sunday February
## 2 Sunday February
## 3 Sunday February
## 4 Sunday February
## 5 Friday February
## 6 Sunday February
## 7 Sunday February
## 8 Wednesday February
## 9 Sunday February
## 10 Saturday February
cr_md_prm<-cr_m_dw_[,c(6,23:24)] # plus types of crime
cr_md_prm
## Source: local data frame [6,263,265 x 3]
## Groups: Month, Day_week [84]
##
## `Primary Type` Day_week Month
## <chr> <chr> <chr>
## 1 BATTERY Sunday February
## 2 CRIMINAL DAMAGE Sunday February
## 3 NARCOTICS Sunday February
## 4 ASSAULT Sunday February
## 5 THEFT Friday February
## 6 BATTERY Sunday February
## 7 BATTERY Sunday February
## 8 ROBBERY Wednesday February
## 9 CRIMINAL DAMAGE Sunday February
## 10 THEFT Saturday February
## # ... with 6,263,255 more rows
cr_md_prm_theft<-filter(cr_md_prm,`Primary Type`=="THEFT")
cr_md_prm_theft_<-cr_md_prm_theft[,2:3]
cr_md_prm_bat<-filter(cr_md_prm,`Primary Type`=="BATTERY")
cr_md_prm_bat_<-cr_md_prm_bat[,2:3]
cr_md_prm_crm_dam<-filter(cr_md_prm,`Primary Type`=="CRIMINAL DAMAGE")
cr_md_prm_crm_dam_<-cr_md_prm_crm_dam[,2:3]
cr_md_prm_nar<-filter(cr_md_prm,`Primary Type`=="NARCOTICS")
cr_md_prm_nar_<-cr_md_prm_nar[,2:3]
cr_md_prm_crm_oth_off<-filter(cr_md_prm,`Primary Type`=="OTHER OFFENSE")
cr_md_prm_crm_oth_off_<-cr_md_prm_crm_oth_off[,2:3]
cr_md_prm_ass<-filter(cr_md_prm,`Primary Type`=="ASSAULT")
cr_md_prm_ass_<-cr_md_prm_ass[,2:3]
cr_md_prm_bug<-filter(cr_md_prm,`Primary Type`=="BURGLARY")
cr_md_prm_bug_<-cr_md_prm_bug[,2:3]
cr_md_tab<-table(cr_md) # all types of crime
cr_md_tab
## Month
## Day_week April August December February January July June March May
## Friday 78142 86764 69418 67075 75516 84607 81673 78668 84066
## Monday 73759 80792 66553 59584 70684 82583 78672 71884 77535
## Saturday 72583 82466 66123 60194 72084 82477 79232 74476 77071
## Sunday 68304 79972 60140 56288 65913 81969 77666 66144 72602
## Thursday 74130 81054 66149 61412 73982 81840 76879 75954 81607
## Tuesday 74460 80435 65787 60229 73862 84305 77902 74080 81166
## Wednesday 74719 82751 66752 62380 73146 82541 79867 74315 80799
## Month
## Day_week November October September
## Friday 74156 82872 80717
## Monday 69225 77200 74871
## Saturday 72058 77122 78409
## Sunday 66692 72972 75531
## Thursday 66972 77470 77280
## Tuesday 70979 78147 77280
## Wednesday 70824 81061 75245
cr_md_prm_theft_tab<-table(cr_md_prm_theft_) #theft
cr_md_prm_bat_tab<-table(cr_md_prm_bat_) #battery
cr_md_prm_crm_dam_tab<-table(cr_md_prm_crm_dam_) #criminal damage
cr_md_prm_nar_tab<-table(cr_md_prm_nar_) #narcotics
cr_md_prm_crm_oth_off_tab<-table(cr_md_prm_crm_oth_off_) #other offence
cr_md_prm_ass_tab<-table(cr_md_prm_ass_) #assault
cr_md_prm_bug_tab<-table(cr_md_prm_bug_) #burglary
We use Chi-squared test for contingency tables which we have generated above. See https://en.wikipedia.org/wiki/Chi-squared_test
#all types of crime
ch.fit<-chisq.test(cr_md_tab)
ch.fit
##
## Pearson's Chi-squared test
##
## data: cr_md_tab
## X-squared = 2272.6, df = 66, p-value < 2.2e-16
#theft
ch.fit.theft<-chisq.test(cr_md_prm_theft_tab)
ch.fit.theft
##
## Pearson's Chi-squared test
##
## data: cr_md_prm_theft_tab
## X-squared = 585.72, df = 66, p-value < 2.2e-16
#battery
ch.fit.bat<-chisq.test(cr_md_prm_bat_tab)
ch.fit.bat
##
## Pearson's Chi-squared test
##
## data: cr_md_prm_bat_tab
## X-squared = 1328.4, df = 66, p-value < 2.2e-16
#criminal damage
ch.fit.dam<-chisq.test(cr_md_prm_crm_dam_tab)
ch.fit.dam
##
## Pearson's Chi-squared test
##
## data: cr_md_prm_crm_dam_tab
## X-squared = 392.82, df = 66, p-value < 2.2e-16
#narcotics
ch.fit.nar<-chisq.test(cr_md_prm_nar_tab)
ch.fit.nar
##
## Pearson's Chi-squared test
##
## data: cr_md_prm_nar_tab
## X-squared = 692.64, df = 66, p-value < 2.2e-16
#other offence
ch.fit.off<-chisq.test(cr_md_prm_crm_oth_off_tab)
ch.fit.off
##
## Pearson's Chi-squared test
##
## data: cr_md_prm_crm_oth_off_tab
## X-squared = 142.24, df = 66, p-value = 1.615e-07
#assault
ch.fit.ass<-chisq.test(cr_md_prm_ass_tab)
ch.fit.ass
##
## Pearson's Chi-squared test
##
## data: cr_md_prm_ass_tab
## X-squared = 681.05, df = 66, p-value < 2.2e-16
#burglary
ch.fit.bug<-chisq.test(cr_md_prm_bug_tab)
ch.fit.bug
##
## Pearson's Chi-squared test
##
## data: cr_md_prm_bug_tab
## X-squared = 399.31, df = 66, p-value < 2.2e-16
Here we produce charts for demonstating differences between types of crimes using the results by Chi-squared test. Let’s see!
library(corrplot)
# Visualize the contribution of day of the week and month
#all types of crime
contrib <- 100*ch.fit$residuals^2/ch.fit$statistic
round(contrib, 3)
## Month
## Day_week April August December February January July June March May
## Friday 0.083 0.031 0.001 5.041 0.208 4.028 1.158 0.561 0.115
## Monday 0.570 0.021 1.619 0.319 0.197 0.292 0.395 0.410 0.290
## Saturday 0.732 0.121 0.065 0.459 0.001 0.078 0.104 0.450 2.573
## Sunday 1.001 3.767 2.792 1.266 3.067 7.913 6.362 7.066 2.803
## Thursday 0.097 0.512 0.062 0.110 2.007 0.598 2.145 3.188 3.054
## Tuesday 0.101 2.039 0.079 0.805 1.155 0.574 0.913 0.008 1.343
## Wednesday 0.023 0.015 0.025 0.348 0.024 0.828 0.017 0.009 0.254
## Month
## Day_week November October September
## Friday 0.022 0.123 0.160
## Monday 0.000 0.003 0.825
## Saturday 2.422 0.518 1.121
## Sunday 0.184 0.322 4.872
## Thursday 6.248 0.237 0.031
## Tuesday 0.186 0.055 0.006
## Wednesday 0.002 2.453 3.918
corrplot(contrib, is.cor = FALSE)
#theft
contrib.th <- 100*ch.fit.theft$residuals^2/ch.fit.theft$statistic
round(contrib.th, 3)
## Month
## Day_week April August December February January July June March May
## Friday 0.023 0.304 1.323 6.835 0.486 5.584 0.228 1.311 0.303
## Monday 1.074 0.055 2.138 0.094 0.025 0.115 0.038 0.497 4.631
## Saturday 0.609 0.034 0.419 0.181 0.127 3.672 0.030 3.232 0.218
## Sunday 2.114 0.853 0.349 1.927 1.025 5.285 4.112 8.590 0.973
## Thursday 0.043 0.235 0.010 0.053 0.858 0.275 0.974 0.320 3.537
## Tuesday 0.254 1.600 0.353 0.922 1.575 1.109 0.674 0.329 0.999
## Wednesday 0.046 0.139 0.108 0.002 0.106 0.083 0.153 0.869 0.007
## Month
## Day_week November October September
## Friday 0.040 0.017 0.030
## Monday 0.000 0.070 1.820
## Saturday 3.752 0.189 0.042
## Sunday 0.032 0.037 3.838
## Thursday 9.263 0.167 0.494
## Tuesday 0.370 0.727 0.329
## Wednesday 0.014 1.230 3.094
corrplot(contrib.th, is.cor = FALSE)
#battery
contrib.bat <- 100*ch.fit.bat$residuals^2/ch.fit.bat$statistic
round(contrib.bat, 3)
## Month
## Day_week April August December February January July June March May
## Friday 0.432 0.990 0.686 2.356 0.002 2.343 0.536 0.947 0.002
## Monday 0.068 0.716 0.025 0.432 0.002 0.288 0.058 1.933 3.328
## Saturday 0.537 6.979 0.378 1.301 0.428 1.786 1.328 0.242 5.339
## Sunday 1.070 7.265 3.220 0.235 1.712 3.459 4.601 4.738 3.389
## Thursday 0.019 2.044 0.862 0.699 0.748 1.392 2.690 3.697 0.941
## Tuesday 0.055 1.685 0.043 0.243 1.076 0.001 0.941 0.156 1.315
## Wednesday 0.388 1.512 0.248 0.253 0.062 1.552 0.239 1.009 0.399
## Month
## Day_week November October September
## Friday 0.038 0.000 0.198
## Monday 0.104 0.278 0.264
## Saturday 0.024 0.363 1.454
## Sunday 0.033 0.049 3.040
## Thursday 0.135 0.203 0.748
## Tuesday 0.000 0.254 0.444
## Wednesday 0.012 1.872 3.061
corrplot(contrib.bat, is.cor = FALSE)
#criminal damage
contrib.dam <- 100*ch.fit.dam$residuals^2/ch.fit.dam$statistic
round(contrib.dam, 3)
## Month
## Day_week April August December February January July June March May
## Friday 0.521 0.392 0.322 7.551 0.181 2.308 2.065 1.136 0.007
## Monday 0.044 0.429 1.501 0.076 0.414 1.988 0.757 0.564 0.152
## Saturday 0.041 3.395 0.146 0.139 0.080 4.651 0.557 1.046 2.252
## Sunday 1.975 0.388 1.658 0.467 0.357 0.004 0.352 3.039 0.017
## Thursday 1.756 0.085 0.002 0.771 0.061 0.330 0.012 0.586 0.555
## Tuesday 0.035 0.038 0.005 1.982 2.388 3.054 0.178 0.226 0.120
## Wednesday 0.116 4.700 0.045 2.157 0.128 0.070 1.258 0.722 0.007
## Month
## Day_week November October September
## Friday 0.054 0.062 0.009
## Monday 3.230 0.069 1.672
## Saturday 3.994 0.162 5.690
## Sunday 0.802 0.392 7.429
## Thursday 0.001 2.802 2.637
## Tuesday 1.240 1.329 0.407
## Wednesday 0.160 1.413 4.092
corrplot(contrib.dam, is.cor = FALSE)
#narcotics
contrib.nar <- 100*ch.fit.nar$residuals^2/ch.fit.nar$statistic
round(contrib.nar, 3)
## Month
## Day_week April August December February January July June March
## Friday 0.537 2.551 0.069 0.707 0.760 0.608 1.220 0.871
## Monday 0.155 0.000 0.147 0.024 0.000 0.310 0.831 1.027
## Saturday 0.075 7.097 2.388 0.374 6.159 0.419 1.610 0.470
## Sunday 0.025 0.755 0.278 0.014 0.052 3.295 0.005 0.047
## Thursday 0.108 0.166 0.401 0.222 0.028 2.419 0.011 0.118
## Tuesday 0.168 0.065 0.330 2.262 1.026 4.537 0.084 0.252
## Wednesday 0.242 0.000 0.388 0.337 0.260 0.056 0.993 0.102
## Month
## Day_week May November October September
## Friday 0.448 0.912 0.277 0.142
## Monday 0.018 0.002 0.051 0.287
## Saturday 0.472 2.869 0.486 0.549
## Sunday 9.711 4.242 4.143 0.147
## Thursday 2.360 15.348 3.141 4.439
## Tuesday 0.456 0.001 0.109 0.487
## Wednesday 0.415 0.191 0.070 0.776
corrplot(contrib.nar, is.cor = FALSE)
#other offence
contrib.off <- 100*ch.fit.off$residuals^2/ch.fit.off$statistic
round(contrib.off, 3)
## Month
## Day_week April August December February January July June March May
## Friday 0.340 1.108 0.347 1.934 0.275 0.058 0.789 0.534 0.632
## Monday 0.085 0.190 1.387 1.561 4.518 0.270 2.126 0.004 4.530
## Saturday 1.770 0.596 0.302 0.726 1.040 4.347 0.534 3.920 1.118
## Sunday 0.122 4.977 0.007 0.325 1.095 0.276 2.864 0.707 3.479
## Thursday 0.345 0.276 0.070 0.125 2.445 0.142 1.528 0.804 0.681
## Tuesday 2.158 0.177 0.136 2.996 0.119 1.286 0.580 0.001 6.679
## Wednesday 0.115 1.651 0.338 0.361 0.258 0.084 0.239 1.369 0.554
## Month
## Day_week November October September
## Friday 0.052 0.017 0.204
## Monday 0.191 2.439 0.009
## Saturday 5.323 0.501 0.010
## Sunday 3.980 0.882 0.065
## Thursday 3.228 0.858 0.572
## Tuesday 0.148 0.099 1.059
## Wednesday 4.741 1.209 0.007
corrplot(contrib.off, is.cor = FALSE)
#assault
contrib.ass <- 100*ch.fit.ass$residuals^2/ch.fit.ass$statistic
round(contrib.ass, 3)
## Month
## Day_week April August December February January July June March May
## Friday 0.088 0.001 0.000 2.190 0.000 2.579 0.128 0.412 0.013
## Monday 1.113 0.001 0.027 1.202 0.600 0.023 0.574 0.729 0.064
## Saturday 0.637 3.724 0.007 1.206 0.024 4.561 1.143 1.849 5.113
## Sunday 0.228 5.150 0.607 4.786 1.393 8.080 4.449 5.817 2.284
## Thursday 0.040 1.994 0.726 2.589 1.087 1.453 3.229 2.938 0.882
## Tuesday 0.024 2.209 0.018 0.008 0.161 0.082 1.158 0.593 1.681
## Wednesday 0.028 1.154 0.005 1.038 0.316 2.159 0.277 1.643 1.422
## Month
## Day_week November October September
## Friday 0.014 0.007 0.000
## Monday 0.898 0.178 0.003
## Saturday 0.384 1.208 0.769
## Sunday 0.643 0.000 1.747
## Thursday 1.065 0.000 0.198
## Tuesday 0.338 0.170 0.082
## Wednesday 0.043 0.942 1.599
corrplot(contrib.ass, is.cor = FALSE)
#burglary
contrib.bug <- 100*ch.fit.bug$residuals^2/ch.fit.bug$statistic
round(contrib.bug, 3)
## Month
## Day_week April August December February January July June March May
## Friday 0.216 0.013 0.068 0.915 0.023 0.306 0.178 1.771 0.496
## Monday 2.229 0.240 1.801 0.046 0.141 0.000 0.072 0.013 3.211
## Saturday 1.583 2.139 0.000 1.182 1.350 2.046 0.331 0.061 0.072
## Sunday 0.058 6.566 2.128 0.443 8.042 8.835 1.605 1.123 0.092
## Thursday 0.018 0.295 0.224 0.108 1.216 1.227 0.646 1.316 0.592
## Tuesday 0.070 4.121 0.027 0.336 1.154 0.753 0.300 0.000 0.106
## Wednesday 0.060 0.316 0.020 0.010 0.597 1.715 0.198 1.619 0.257
## Month
## Day_week November October September
## Friday 8.155 0.138 0.116
## Monday 2.568 0.097 3.393
## Saturday 0.120 0.101 0.048
## Sunday 1.873 2.611 2.508
## Thursday 0.536 0.095 0.396
## Tuesday 1.983 0.076 0.175
## Wednesday 4.435 2.682 1.196
corrplot(contrib.bug, is.cor = FALSE)