Crimes per Hour in Dallas 2015

Friday, April 29, 2016

Introduction

This document shows the steps taken to get the number of crimes per hour for 2015 in Dallas County.

This analysis uses Dallas Open Data (https://www.dallasopendata.com/) to calculate the number of crimes per hour in Dallas County. As pointed out on the Dallas Police Public Data website (http://www.dallaspolice.net/publicdata/), the data that the police supply to the public is sample data, so it cannot be used to produce official statistics.

Analysis

Requirements

Add required packages.

library(lubridate)
library(dplyr)
library(ggplot2)

Get the crime data from dallasopendata.com

# Local cache path for the Dallas RMS export.
rms.file <- "rms.csv"

# Only go to the network when no cached copy exists yet.
if (!file.exists(rms.file)) {
  download.file(
    url = "http://www.dallasopendata.com/api/views/tbnj-w5hb/rows.csv?accessType=DOWNLOAD",
    destfile = rms.file
  )
}

Read the crime data into a data.frame

# Load the cached CSV; as.is = TRUE keeps text columns as character
# vectors instead of converting them to factors.
crime.data <- read.csv(file = rms.file, as.is = TRUE)

Get the columns that are needed for this analysis

# Keep only the offense date (Date1) and time (Time1) columns.
crime.data <- dplyr::select(
  .data = crime.data,
  Date1,
  Time1
)

Change the names of the dataset.

# Give the two retained columns descriptive names (same order as selected).
names(crime.data) <- c("offensedate", "offensetimedispatched")

Use only 2015 data.

# Parse the leading month/day/year part of offensedate into a temporary
# Date column used only for filtering.
crime.data <- mutate(
  crime.data,
  tempdate = as.Date(offensedate, format = "%m/%d/%Y")
)

# Keep rows whose date parsed successfully and falls in 2015. Base
# subsetting is used so the original row names are preserved.
crime.data <- crime.data[
  !is.na(crime.data$tempdate) & year(crime.data$tempdate) == 2015,
]

# Remove the temporary tempdate column now that filtering is done.
# The removal is guarded: if no column matched, `-integer(0)` is an empty
# index and `crime.data[, -tempdateindex]` would select zero columns,
# silently discarding the whole data frame.
tempdateindex <- which(colnames(crime.data) == "tempdate")
if (length(tempdateindex) > 0) {
  # drop = FALSE keeps a data frame even if only one column remains.
  crime.data <- crime.data[, -tempdateindex, drop = FALSE]
}
head(crime.data)
##               offensedate offensetimedispatched
## 4  12/31/2015 12:00:00 AM                 22:00
## 6  12/31/2015 12:00:00 AM                 22:30
## 9  12/30/2015 12:00:00 AM                 16:30
## 12 12/31/2015 12:00:00 AM                 03:00
## 31 07/29/2015 12:00:00 AM                 08:00
## 32 12/28/2015 12:00:00 AM                 00:01

Check the date range of the data

# Convert the offense date strings (month/day/year) to Date values.
crime.data[["offensedate"]] <- as.Date(
  crime.data[["offensedate"]],
  format = "%m/%d/%Y"
)

# Report the earliest and latest offense dates as a sanity check.
paste("Min is ", min(crime.data[["offensedate"]]), sep = " ")
## [1] "Min is  2015-01-01"
paste("Max is ", max(crime.data[["offensedate"]]), sep = " ")
## [1] "Max is  2015-12-31"

Check if the data is what is expected

# Add the calendar year of each offense so rows can be counted per year.
crime.data <- mutate(crime.data, offenseyear = year(offensedate))

# Count rows per year; only 2015 should appear after the earlier filter.
crime <- group_by(crime.data, offenseyear)
summarize(
  crime,
  countsperyear = length(offenseyear)
)
## Source: local data frame [1 x 2]
## 
##   offenseyear countsperyear
##         (dbl)         (int)
## 1        2015         94272

Assuming “offensetimedispatched” is the time that the offense happened, we will use that column to determine when the crime happened.

# Parse the "HH:MM" time strings into date-times.
# strptime() returns POSIXlt, a list-based class that should not be stored
# in a data frame: the following mutate() call receives this data frame,
# and some dplyr versions reject POSIXlt columns outright. Convert to
# POSIXct instead.
# The times carry no date, so strptime() fills in the current date; parsing
# in UTC avoids a local daylight-saving gap on the run date turning a valid
# time into NA. Only the hour of day is used later, so the zone does not
# affect the results.
crime.data$offensetimedispatched <- as.POSIXct(
  strptime(
    crime.data$offensetimedispatched,
    format = "%H:%M",
    tz = "UTC"
  )
)

Add an hour column

# Derive the hour of day (0-23) from the parsed time, stored as numeric.
merged.data <- mutate(
  crime.data,
  offensehour = as.numeric(hour(crime.data$offensetimedispatched))
)

# From here on keep the parsed time as plain text.
merged.data$offensetimedispatched <-
  as.character(merged.data$offensetimedispatched)

Group by date and then by hour

# One group per (date, hour of day) combination.
crime.data.hour <- group_by(
  .data = merged.data,
  offensedate,
  offensehour
)

Get the crimes per hour

# Count the offenses in each (date, hour) group. Combinations with no
# offenses do not appear as rows in the result.
complete.sample <- summarise(
  .data = crime.data.hour,
  CrimesPerHour = length(offensehour)
)
complete.sample
## Source: local data frame [8,737 x 3]
## Groups: offensedate [?]
## 
##    offensedate offensehour CrimesPerHour
##         (date)       (dbl)         (int)
## 1   2015-01-01           0            43
## 2   2015-01-01           1            20
## 3   2015-01-01           2            22
## 4   2015-01-01           3            12
## 5   2015-01-01           4            12
## 6   2015-01-01           5             8
## 7   2015-01-01           6             4
## 8   2015-01-01           7             6
## 9   2015-01-01           8            15
## 10  2015-01-01           9             8
## ..         ...         ...           ...

Summarise the average number of crimes for each hour of the day across the year

# Average number of crimes for each hour of the day across all dates.
# complete.sample only has rows for date/hour pairs with at least one
# crime (8,737 rows rather than 365 * 24 = 8,760), so taking
# mean(CrimesPerHour) would skip the zero-crime hours and overstate the
# quiet early-morning averages. Dividing each hour's total by the number
# of distinct dates counts those empty hours as zero.
# NOTE(review): a date with no recorded crimes at all would still be
# missing from total.days; the hour-0 rows suggest all 365 dates are
# present, but confirm.
total.days <- length(unique(complete.sample$offensedate))
temp <- group_by(complete.sample, offensehour)
temp <- summarise(temp, value = sum(CrimesPerHour) / total.days)
# NOTE(review): the listing below was rederived by hand from the hourly
# totals implied by the previous output and 365 dates; re-knit to confirm.
temp
## Source: local data frame [24 x 2]
## 
##    offensehour     value
##          (dbl)     (dbl)
## 1            0 16.712329
## 2            1  9.230137
## 3            2  8.156164
## 4            3  6.830137
## 5            4  5.531507
## 6            5  5.013699
## 7            6  5.745205
## 8            7  7.057534
## 9            8 10.306849
## 10           9  8.863014
## ..         ...       ...
ggplot(temp, aes(x = offensehour, y = value)) + geom_bar(stat = "identity")

If hours 0 through 5 were placed after hour 23, the data would appear to follow a roughly normal distribution, so let’s reorder the hours to start at 6

# Build a sort key that rotates the day so that it starts at 06:00:
# hour 6 -> 1, ..., hour 23 -> 18, hour 0 -> 19, ..., hour 5 -> 24.
# One vectorised modular expression replaces 24 hand-written assignments.
foo <- (temp$offensehour - 6) %% 24 + 1
# A missing hour keeps key 0, so it still sorts first as it did before.
foo[is.na(foo)] <- 0

# Turn offensehour into a factor whose levels follow the rotated order,
# so ggplot draws the bars from hour 6 round to hour 5.
temp$offensehour <- reorder(temp$offensehour, foo)
rm(foo)

# Bar chart of the per-hour values, drawn in the level order of offensehour.
ggplot(data = temp, mapping = aes(x = offensehour, y = value)) +
  geom_bar(stat = "identity")