This document shows the steps taken to get the number of crimes per hour for 2014 in Dallas County.
This analysis uses Dallas Open Data (https://www.dallasopendata.com/) to calculate the number of crimes per hour in Dallas County. As noted on the Dallas Police Public Data website (http://www.dallaspolice.net/publicdata/), the data the police supply to the public is sample data, so it cannot be used to produce official statistics.
Load the required packages.
library(lubridate)
## Warning: package 'lubridate' was built under R version 3.2.5
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:lubridate':
##
## intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
Get the crime data from dallasopendata.com
crime.data.file <- "crime.csv"
if (!file.exists(crime.data.file)) {
  download.file("http://www.dallasopendata.com/api/views/ftja-9jxd/rows.csv?accessType=DOWNLOAD",
                destfile = crime.data.file)
}
Disclaimer: the data supplied by the Dallas Police Department is sampled and should not be used for statistical purposes, but it should still give an idea of when crimes are committed.
The Dallas Police Department implemented a new Records Management System on June 1, 2014. To get crime data for 2014, two datasets are needed.
rms.file <- "rms.csv"
if (!file.exists(rms.file)) {
  download.file("http://www.dallasopendata.com/api/views/tbnj-w5hb/rows.csv?accessType=DOWNLOAD",
                destfile = rms.file)
}
Read the crime data into data frames.
crime.data.part1 <- read.csv(crime.data.file, as.is = TRUE)
crime.data.part2 <- read.csv(rms.file, as.is = TRUE)
Get the columns that are needed for this analysis
crime.data <- dplyr::select(crime.data.part1, offensedate, offensetimedispatched)
temp <- dplyr::select(crime.data.part2, Date1, Time1)
Change the names of the columns to match the first set of data.
colnames(temp) <- c("offensedate", "offensetimedispatched")
Remove records before June 1, 2014 and use only 2014 data.
temp <- mutate(temp, tempdate = as.Date(offensedate, format = "%m/%d/%Y"))
temp <- temp[as.Date(temp$tempdate) >= as.Date("2014-06-01")
             & year(temp$tempdate) == 2014
             & !is.na(temp$tempdate), ]
Remove the tempdate column
tempdateindex <- grep("^tempdate$", colnames(temp))
temp <- temp[,-tempdateindex]
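If you prefer to stay within dplyr, the same step can be written by dropping the column by name (an equivalent sketch, shown as a comment rather than the approach used above):
# Alternative to the grep()/index step above (equivalent dplyr sketch):
# temp <- dplyr::select(temp, -tempdate)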
head(temp)
## offensedate offensetimedispatched
## 339 06/03/2014 12:00:00 AM 13:00
## 344 06/17/2014 12:00:00 AM 00:00
## 409 07/05/2014 12:00:00 AM 18:30
## 410 07/23/2014 12:00:00 AM 19:00
## 411 08/02/2014 12:00:00 AM 10:00
## 413 07/01/2014 12:00:00 AM 11:30
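A quick sanity check (a sketch; offensedate is still a character column at this point) to confirm that only June through December 2014 remains:
range(as.Date(temp$offensedate, format = "%m/%d/%Y"))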
Bind the two data sets
crime.data <- rbind(crime.data, temp)
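rbind() requires the two data frames to have matching column names, so a defensive check (a sketch that would normally sit just before the bind) could be added:
stopifnot(identical(colnames(crime.data), colnames(temp)))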
Check the date range of the data
crime.data$offensedate <- as.Date(crime.data$offensedate, format = "%m/%d/%Y")
paste("Min is ", min(crime.data$offensedate), sep=" ")
## [1] "Min is 1994-03-15"
paste("Max is ", max(crime.data$offensedate), sep=" ")
## [1] "Max is 2014-12-31"
Check whether the data looks as expected
crime.data <- mutate(crime.data, offenseyear = year(crime.data$offensedate))
crime <- group_by(crime.data, offenseyear)
summarize(crime, countsperyear = length(offenseyear))
## Source: local data frame [20 x 2]
##
## offenseyear countsperyear
## (dbl) (int)
## 1 1994 1
## 2 1995 1
## 3 1996 1
## 4 1997 1
## 5 1998 1
## 6 2000 2
## 7 2001 4
## 8 2002 4
## 9 2003 12
## 10 2004 21
## 11 2005 14
## 12 2006 14
## 13 2007 23
## 14 2008 33
## 15 2009 31
## 16 2010 60
## 17 2011 80
## 18 2012 270
## 19 2013 24124
## 20 2014 109754
Years prior to 2014 are clearly incomplete, so we will keep only the data for 2014.
crime.data <- crime.data[crime.data$offenseyear == 2014,]
Assuming “offensetimedispatched” is the time the offense occurred, we will use that column to determine the hour in which each crime happened.
crime.data$offensetimedispatched <- strptime(crime.data$offensetimedispatched, format = "%H:%M")
Add an hour column
merged.data <- mutate(crime.data, offensehour = hour(crime.data$offensetimedispatched))
merged.data$offensetimedispatched <- as.character(merged.data$offensetimedispatched)
merged.data$offensehour <- as.numeric(merged.data$offensehour)
head(merged.data)
## offensedate offensetimedispatched offenseyear offensehour
## 1 2014-05-31 2016-05-22 13:55:00 2014 13
## 2 2014-05-31 2016-05-22 20:50:00 2014 20
## 3 2014-05-31 2016-05-22 15:02:00 2014 15
## 4 2014-05-31 2016-05-22 14:18:00 2014 14
## 5 2014-05-31 2016-05-22 20:39:00 2014 20
## 6 2014-05-31 2016-05-22 16:09:00 2014 16
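Note that strptime() with only a "%H:%M" format fills in the current date, which is why the offensetimedispatched column above shows dates such as 2016-05-22; only the hour portion is used here. An equivalent way to get the hour directly from the original "HH:MM" strings (a sketch using lubridate, shown as a comment because the column has already been converted above) would be:
# offensehour.alt <- hour(hm(crime.data$offensetimedispatched))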
Group by date and then by hour
crime.data.hour <- group_by(merged.data, offensedate, offensehour)
head(crime.data.hour)
## Source: local data frame [6 x 4]
## Groups: offensedate, offensehour [5]
##
## offensedate offensetimedispatched offenseyear offensehour
## (date) (chr) (dbl) (dbl)
## 1 2014-05-31 2016-05-22 13:55:00 2014 13
## 2 2014-05-31 2016-05-22 20:50:00 2014 20
## 3 2014-05-31 2016-05-22 15:02:00 2014 15
## 4 2014-05-31 2016-05-22 14:18:00 2014 14
## 5 2014-05-31 2016-05-22 20:39:00 2014 20
## 6 2014-05-31 2016-05-22 16:09:00 2014 16
Get the crimes per hour
complete.sample <- summarise(crime.data.hour, CrimesPerHour = length(offensehour))
complete.sample
## Source: local data frame [8,693 x 3]
## Groups: offensedate [?]
##
## offensedate offensehour CrimesPerHour
## (date) (dbl) (int)
## 1 2014-01-01 0 6
## 2 2014-01-01 1 5
## 3 2014-01-01 2 11
## 4 2014-01-01 3 6
## 5 2014-01-01 4 10
## 6 2014-01-01 5 5
## 7 2014-01-01 6 2
## 8 2014-01-01 7 13
## 9 2014-01-01 8 16
## 10 2014-01-01 9 17
## .. ... ... ...
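As a quick consistency check (a sketch), the per-hour counts should sum back to the total number of 2014 records:
sum(complete.sample$CrimesPerHour) == nrow(merged.data)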
Apply a simple min-max normalization to the crimes per hour over time.
min.crimes <- min(complete.sample$CrimesPerHour)
max.crimes <- max(complete.sample$CrimesPerHour)
complete.sample$NormCrimesPerHour <- as.numeric(scale(complete.sample$CrimesPerHour,
                                                      center = min.crimes,
                                                      scale = max.crimes - min.crimes))
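Calling scale() with center = min.crimes and scale = max.crimes - min.crimes is just min-max scaling: each count x becomes (x - min) / (max - min), mapping the values onto [0, 1]. An equivalent direct computation (a sketch, shown as a comment) would be:
# (complete.sample$CrimesPerHour - min.crimes) / (max.crimes - min.crimes)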
temp <- group_by(complete.sample, offensehour)
temp <- summarise(temp, value = mean(NormCrimesPerHour))
temp
## Source: local data frame [24 x 2]
##
## offensehour value
## (dbl) (dbl)
## 1 0 0.03875302
## 2 1 0.02206281
## 3 2 0.01882410
## 4 3 0.01435176
## 5 4 0.03498621
## 6 5 0.01058002
## 7 6 0.01246759
## 8 7 0.02023829
## 9 8 0.03306516
## 10 9 0.03269741
## .. ... ...
ggplot(temp, aes(x = offensehour, y = value)) + geom_bar(stat = "identity")
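The same chart with axis labels is a bit easier to read (an optional variation, not part of the original plot call):
ggplot(temp, aes(x = offensehour, y = value)) +
  geom_bar(stat = "identity") +
  labs(x = "Hour of day", y = "Mean normalized crimes per hour")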
It looks like the 4th hour is a bit extreme, so let’s see if there are outliers. I prefer to use box plots to check for outliers.
fourth.hour <- complete.sample[complete.sample$offensehour == 4,]
qplot(offensehour, CrimesPerHour, data = fourth.hour, geom = "boxplot")
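To list the outlying values numerically rather than only visually, base R’s boxplot.stats() can be used (a sketch; it flags points beyond 1.5 times the interquartile range from the box):
# Values flagged as outliers by the standard boxplot rule (sketch)
boxplot.stats(fourth.hour$CrimesPerHour)$out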
There are quite a few outliers, so let us take a look at some of them.
fourth.hour[fourth.hour$CrimesPerHour > 50,]
## Source: local data frame [14 x 4]
## Groups: offensedate [14]
##
## offensedate offensehour CrimesPerHour NormCrimesPerHour
## (date) (dbl) (int) (dbl)
## 1 2014-02-08 4 142 0.3740053
## 2 2014-02-09 4 128 0.3368700
## 3 2014-03-04 4 65 0.1697613
## 4 2014-03-05 4 298 0.7877984
## 5 2014-03-06 4 131 0.3448276
## 6 2014-03-07 4 268 0.7082228
## 7 2014-03-08 4 122 0.3209549
## 8 2014-05-17 4 52 0.1352785
## 9 2014-05-18 4 84 0.2201592
## 10 2014-05-19 4 361 0.9549072
## 11 2014-05-20 4 378 1.0000000
## 12 2014-05-21 4 335 0.8859416
## 13 2014-05-22 4 345 0.9124668
## 14 2014-05-23 4 127 0.3342175
It looks like these outliers are grouped together, possibly a sting operation. For example, from May 19 to May 22 there were over 300 offenses in the 4th hour each day. We will leave that data in for this analysis.
The crimes per hour would look roughly normally distributed if hours 0 through 5 followed hour 23, so let’s reorder the hours.
# Reorder the hours so the x-axis runs 6, 7, ..., 23, 0, 1, ..., 5
hour.order <- c(6:23, 0:5)
temp$offensehour <- with(temp, reorder(offensehour, match(offensehour, hour.order)))
rm(hour.order)
ggplot(temp, aes(x = offensehour, y = value)) + geom_bar(stat = "identity")