This document shows the steps taken to get the number of crimes per hour for 2015 in Dallas County.
This analysis uses Dallas Open Data (https://www.dallasopendata.com/) to calculate the number of crimes per hour in Dallas County. As pointed out on the Dallas Police Public Data website (http://www.dallaspolice.net/publicdata/), the data that the police supply to the public is sample data, so it cannot be used to produce official statistics.
Add required packages.
library(lubridate)
library(dplyr)
library(ggplot2)
Get the crime data from dallasopendata.com
# Local cache of the export; the download is skipped when the file is
# already present in the working directory.
rms.file <- "rms.csv"
if (!file.exists(rms.file)) {
  download.file(
    url = "http://www.dallasopendata.com/api/views/tbnj-w5hb/rows.csv?accessType=DOWNLOAD",
    destfile = rms.file
  )
}
Read the crime data into a data.frame.
# Load the CSV, keeping text columns as character vectors (no factors).
crime.data <- read.csv(file = rms.file, stringsAsFactors = FALSE)
Get the columns that are needed for this analysis
# Keep only the date and time columns; everything else is unused here.
crime.data <- crime.data[, c("Date1", "Time1")]
Change the names of the dataset.
# Rename the two retained columns, in order: date first, then time.
names(crime.data) <- c("offensedate", "offensetimedispatched")
Use only 2015 data.
# Keep only offenses dated in 2015. The dates are parsed into a throwaway
# vector instead of a helper column, so nothing has to be removed from the
# data frame afterwards (the old negative-index removal would have dropped
# every column if the column-name pattern had matched nothing).
parsed.date <- as.Date(crime.data$offensedate, format = "%m/%d/%Y")
# Rows whose date fails to parse (NA) are excluded explicitly.
in.2015 <- !is.na(parsed.date) & year(parsed.date) == 2015
crime.data <- crime.data[in.2015, ]
rm(parsed.date, in.2015)
head(crime.data)
## offensedate offensetimedispatched
## 4 12/31/2015 12:00:00 AM 22:00
## 6 12/31/2015 12:00:00 AM 22:30
## 9 12/30/2015 12:00:00 AM 16:30
## 12 12/31/2015 12:00:00 AM 03:00
## 31 07/29/2015 12:00:00 AM 08:00
## 32 12/28/2015 12:00:00 AM 00:01
Check the date range of the data.
# Convert the "%m/%d/%Y" strings to Date so min()/max() work on real dates.
crime.data$offensedate <- as.Date(crime.data$offensedate, format = "%m/%d/%Y")
# The labels carry no trailing space: paste() already inserts one separator,
# so "Min is " with sep = " " produced two spaces before the value.
paste("Min is", min(crime.data$offensedate))
## [1] "Min is 2015-01-01"
paste("Max is", max(crime.data$offensedate))
## [1] "Max is 2015-12-31"
Check if the data is what is expected
# Sanity check: add the calendar year and count the rows for each year.
crime.data <- mutate(crime.data, offenseyear = year(offensedate))
crime <- group_by(crime.data, offenseyear)
summarize(crime, countsperyear = n())
## Source: local data frame [1 x 2]
##
## offenseyear countsperyear
## (dbl) (int)
## 1 2015 94272
Assuming “offensetimedispatched” is the time that the offense happened, we will use that column to determine when the crime happened.
# Parse the "HH:MM" strings and store them as POSIXct. strptime() returns
# POSIXlt, a list-based class that should not be kept as a data frame column.
# A fixed UTC zone keeps the parsed hour independent of local DST rules.
crime.data$offensetimedispatched <- as.POSIXct(
  strptime(crime.data$offensetimedispatched, format = "%H:%M", tz = "UTC")
)
Add an hour column
# Work on a copy so crime.data keeps its parsed time column.
merged.data <- crime.data
# Hour of day (0-23) as a double, appended as the last column.
merged.data$offensehour <- as.numeric(hour(crime.data$offensetimedispatched))
# Store the parsed time back as plain text.
merged.data$offensetimedispatched <- as.character(
  merged.data$offensetimedispatched
)
Group by date and then by hour
# One group per (date, hour-of-day) pair.
crime.data.hour <- group_by(
  merged.data,
  offensedate,
  offensehour
)
Get the crimes per hour
# Count the offenses in each (date, hour) group, then print the result.
complete.sample <- summarise(crime.data.hour, CrimesPerHour = n())
complete.sample
## Source: local data frame [8,737 x 3]
## Groups: offensedate [?]
##
## offensedate offensehour CrimesPerHour
## (date) (dbl) (int)
## 1 2015-01-01 0 43
## 2 2015-01-01 1 20
## 3 2015-01-01 2 22
## 4 2015-01-01 3 12
## 5 2015-01-01 4 12
## 6 2015-01-01 5 8
## 7 2015-01-01 6 4
## 8 2015-01-01 7 6
## 9 2015-01-01 8 15
## 10 2015-01-01 9 8
## .. ... ... ...
Average the crimes per hour across all dates.
# Mean of the per-date counts for each hour of the day.
# NOTE(review): complete.sample only has rows for (date, hour) pairs with at
# least one offense, so an hour with no offenses on a given date does not
# count as zero in this mean -- confirm that is intended.
temp <- complete.sample %>%
  group_by(offensehour) %>%
  summarise(value = mean(CrimesPerHour))
temp
## Source: local data frame [24 x 2]
##
## offensehour value
## (dbl) (dbl)
## 1 0 16.712329
## 2 1 9.255495
## 3 2 8.178571
## 4 3 6.848901
## 5 4 5.639665
## 6 5 5.169492
## 7 6 5.760989
## 8 7 7.076923
## 9 8 10.306849
## 10 9 8.863014
## .. ... ...
# Bar chart of the mean count for each hour of the day.
ggplot(data = temp, mapping = aes(x = offensehour, y = value)) +
  geom_bar(stat = "identity")
The data appear to follow a roughly normal distribution if hours 0 through 5 are placed after hour 23, so let’s reorder the hours so that the axis starts at 6.
# Reorder the hours so the x axis starts at 6 and wraps past midnight.
# Going through as.character() keeps this working if offensehour has already
# been turned into a factor by an earlier run of this step.
hour.value <- as.numeric(as.character(temp$offensehour))
# Rank of each hour in the new order: hours 6..23 map to 1..18 and hours
# 0..5 map to 19..24. R's %% returns a non-negative result for a positive
# modulus, so (0 - 6) %% 24 is 18.
hour.rank <- (hour.value - 6) %% 24 + 1
# reorder() makes offensehour a factor whose levels are sorted by rank;
# the factor level order is what sets the bar order in the plot.
temp$offensehour <- reorder(temp$offensehour, hour.rank)
rm(hour.value, hour.rank)
ggplot(temp, aes(x = offensehour, y = value)) +
  geom_bar(stat = "identity")