This document shows the steps taken to get the number of crimes per hour for 2014 in Dallas County.
This analysis uses Dallas Open Data (https://www.dallasopendata.com/) to calculate the number of crimes per hour in Dallas County. As noted on the Dallas Police Public Data website (http://www.dallaspolice.net/publicdata/), the data the police supply to the public is sample data, so it cannot be used to produce official statistics.
Load the required packages.
library(lubridate)
## Warning: package 'lubridate' was built under R version 3.2.5
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:lubridate':
##
## intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
Get the crime data from dallasopendata.com
crime.data.file <- "crime.csv"
if (!file.exists(crime.data.file)) {
  download.file("http://www.dallasopendata.com/api/views/ftja-9jxd/rows.csv?accessType=DOWNLOAD",
                destfile = crime.data.file)
}
Disclaimer: the data supplied by the Dallas Police Department is sampled and should not be used for statistical purposes, but it should still give an idea of when crimes are committed.
The Dallas Police Department implemented a new Records Management System on June 1, 2014. To get crime data for 2014, two datasets are needed.
rms.file <- "rms.csv"
if (!file.exists(rms.file)) {
  download.file("http://www.dallasopendata.com/api/views/tbnj-w5hb/rows.csv?accessType=DOWNLOAD",
                destfile = rms.file)
}
Read the crime data into data frames.
crime.data.part1 <- read.csv(crime.data.file, as.is = TRUE)
crime.data.part2 <- read.csv(rms.file, as.is = TRUE)
Get the columns that are needed for this analysis
crime.data <- dplyr::select(crime.data.part1, offensedate, offensetimedispatched)
temp <- dplyr::select(crime.data.part2, Date1, Time1)
Change the names of the columns to match the first set of data.
colnames(temp) <- c("offensedate", "offensetimedispatched")
Remove records before June 1, 2014 and use only 2014 data.
temp <- mutate(temp, tempdate = as.Date(offensedate, format = "%m/%d/%Y"))
temp <- temp[as.Date(temp$tempdate) >= as.Date("2014-06-01")
             & year(temp$tempdate) == 2014
             & !is.na(temp$tempdate), ]
Remove the tempdate column
tempdateindex <- grep("^tempdate$", colnames(temp))
temp <- temp[,-tempdateindex]
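If you prefer to stay within dplyr, the same step can be written by dropping the column by name (an equivalent sketch, shown as a comment rather than the approach used above):
# Alternative to the grep()/index step above (equivalent dplyr sketch):
# temp <- dplyr::select(temp, -tempdate)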
head(temp)
## offensedate offensetimedispatched
## 339 06/03/2014 12:00:00 AM 13:00
## 344 06/17/2014 12:00:00 AM 00:00
## 409 07/05/2014 12:00:00 AM 18:30
## 410 07/23/2014 12:00:00 AM 19:00
## 411 08/02/2014 12:00:00 AM 10:00
## 413 07/01/2014 12:00:00 AM 11:30
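A quick sanity check (a sketch; offensedate is still a character column at this point) to confirm that only June through December 2014 remains:
range(as.Date(temp$offensedate, format = "%m/%d/%Y"))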
Bind the two data sets
crime.data <- rbind(crime.data, temp)
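rbind() requires the two data frames to have matching column names, so a defensive check (a sketch that would normally sit just before the bind) could be added:
stopifnot(identical(colnames(crime.data), colnames(temp)))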
Check the date range of the data
crime.data$offensedate <- as.Date(crime.data$offensedate, format = "%m/%d/%Y")
paste("Min is ", min(crime.data$offensedate), sep=" ")
## [1] "Min is 1994-03-15"
paste("Max is ", max(crime.data$offensedate), sep=" ")
## [1] "Max is 2014-12-31"
Check whether the data looks as expected
crime.data <- mutate(crime.data, offenseyear = year(crime.data$offensedate))
crime <- group_by(crime.data, offenseyear)
summarize(crime, countsperyear = length(offenseyear))
## Source: local data frame [20 x 2]
##
## offenseyear countsperyear
## (dbl) (int)
## 1 1994 1
## 2 1995 1
## 3 1996 1
## 4 1997 1
## 5 1998 1
## 6 2000 2
## 7 2001 4
## 8 2002 4
## 9 2003 12
## 10 2004 21
## 11 2005 14
## 12 2006 14
## 13 2007 23
## 14 2008 33
## 15 2009 31
## 16 2010 60
## 17 2011 80
## 18 2012 270
## 19 2013 24124
## 20 2014 109754
Years prior to 2014 are clearly incomplete, so we will keep only the data for 2014.
crime.data <- crime.data[crime.data$offenseyear == 2014,]
Assuming “offensetimedispatched” is the time the offense occurred, we will use that column to determine the hour in which each crime happened.
crime.data$offensetimedispatched <- strptime(crime.data$offensetimedispatched, format = "%H:%M")
Add an hour column
merged.data <- mutate(crime.data, offensehour = hour(crime.data$offensetimedispatched))
merged.data$offensetimedispatched <- as.character(merged.data$offensetimedispatched)
merged.data$offensehour <- as.numeric(merged.data$offensehour)
head(merged.data)
## offensedate offensetimedispatched offenseyear offensehour
## 1 2014-05-31 2016-05-22 13:55:00 2014 13
## 2 2014-05-31 2016-05-22 20:50:00 2014 20
## 3 2014-05-31 2016-05-22 15:02:00 2014 15
## 4 2014-05-31 2016-05-22 14:18:00 2014 14
## 5 2014-05-31 2016-05-22 20:39:00 2014 20
## 6 2014-05-31 2016-05-22 16:09:00 2014 16
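Note that strptime() with only a "%H:%M" format fills in the current date, which is why the offensetimedispatched column above shows dates such as 2016-05-22; only the hour portion is used here. An equivalent way to get the hour directly from the original "HH:MM" strings (a sketch using lubridate, shown as a comment because the column has already been converted above) would be:
# offensehour.alt <- hour(hm(crime.data$offensetimedispatched))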
Group by date and then by hour
crime.data.hour <- group_by(merged.data, offensedate, offensehour)
head(crime.data.hour)
## Source: local data frame [6 x 4]
## Groups: offensedate, offensehour [5]
##
## offensedate offensetimedispatched offenseyear offensehour
## (date) (chr) (dbl) (dbl)
## 1 2014-05-31 2016-05-22 13:55:00 2014 13
## 2 2014-05-31 2016-05-22 20:50:00 2014 20
## 3 2014-05-31 2016-05-22 15:02:00 2014 15
## 4 2014-05-31 2016-05-22 14:18:00 2014 14
## 5 2014-05-31 2016-05-22 20:39:00 2014 20
## 6 2014-05-31 2016-05-22 16:09:00 2014 16
Get the crimes per hour
complete.sample <- summarise(crime.data.hour, CrimesPerHour = length(offensehour))
complete.sample
## Source: local data frame [8,693 x 3]
## Groups: offensedate [?]
##
## offensedate offensehour CrimesPerHour
## (date) (dbl) (int)
## 1 2014-01-01 0 6
## 2 2014-01-01 1 5
## 3 2014-01-01 2 11
## 4 2014-01-01 3 6
## 5 2014-01-01 4 10
## 6 2014-01-01 5 5
## 7 2014-01-01 6 2
## 8 2014-01-01 7 13
## 9 2014-01-01 8 16
## 10 2014-01-01 9 17
## .. ... ... ...
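As a quick consistency check (a sketch), the per-hour counts should sum back to the total number of 2014 records:
sum(complete.sample$CrimesPerHour) == nrow(merged.data)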
Apply a simple min-max normalization to the crimes per hour over time.
min.crimes <- min(complete.sample$CrimesPerHour)
max.crimes <- max(complete.sample$CrimesPerHour)
complete.sample$NormCrimesPerHour <- as.numeric(scale(complete.sample$CrimesPerHour,
                                                      center = min.crimes,
                                                      scale = max.crimes - min.crimes))
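Calling scale() with center = min.crimes and scale = max.crimes - min.crimes is just min-max scaling: each count x becomes (x - min) / (max - min), mapping the values onto [0, 1]. An equivalent direct computation (a sketch, shown as a comment) would be:
# (complete.sample$CrimesPerHour - min.crimes) / (max.crimes - min.crimes)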
temp <- group_by(complete.sample, offensehour)
temp <- summarise(temp, value = mean(NormCrimesPerHour))
temp
## Source: local data frame [24 x 2]
##
## offensehour value
## (dbl) (dbl)
## 1 0 0.03875302
## 2 1 0.02206281
## 3 2 0.01882410
## 4 3 0.01435176
## 5 4 0.03498621
## 6 5 0.01058002
## 7 6 0.01246759
## 8 7 0.02023829
## 9 8 0.03306516
## 10 9 0.03269741
## .. ... ...
ggplot(temp, aes(x = offensehour, y = value)) + geom_bar(stat = "identity")
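The same chart with axis labels is a bit easier to read (an optional variation, not part of the original plot call):
ggplot(temp, aes(x = offensehour, y = value)) +
  geom_bar(stat = "identity") +
  labs(x = "Hour of day", y = "Mean normalized crimes per hour")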
It looks like the 4th hour is a bit extreme, so let’s see if there are outliers. I prefer to use box plots to check for outliers.
fourth.hour <- complete.sample[complete.sample$offensehour == 4,]
qplot(offensehour, CrimesPerHour, data = fourth.hour, geom = "boxplot")
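To list the outlying values numerically rather than only visually, base R’s boxplot.stats() can be used (a sketch; it flags points beyond 1.5 times the interquartile range from the box):
# Values flagged as outliers by the standard boxplot rule (sketch)
boxplot.stats(fourth.hour$CrimesPerHour)$out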
There are quite a few outliers, so let us take a look at some of them.
fourth.hour[fourth.hour$CrimesPerHour > 50,]
## Source: local data frame [14 x 4]
## Groups: offensedate [14]
##
## offensedate offensehour CrimesPerHour NormCrimesPerHour
## (date) (dbl) (int) (dbl)
## 1 2014-02-08 4 142 0.3740053
## 2 2014-02-09 4 128 0.3368700
## 3 2014-03-04 4 65 0.1697613
## 4 2014-03-05 4 298 0.7877984
## 5 2014-03-06 4 131 0.3448276
## 6 2014-03-07 4 268 0.7082228
## 7 2014-03-08 4 122 0.3209549
## 8 2014-05-17 4 52 0.1352785
## 9 2014-05-18 4 84 0.2201592
## 10 2014-05-19 4 361 0.9549072
## 11 2014-05-20 4 378 1.0000000
## 12 2014-05-21 4 335 0.8859416
## 13 2014-05-22 4 345 0.9124668
## 14 2014-05-23 4 127 0.3342175
It looks like these outliers are grouped together, possibly a sting operation. For example, from May 19 to May 22 there were over 300 offenses in the 4th hour each day. We will leave that data in for this analysis.
The crimes per hour would look roughly normally distributed if hours 0 through 5 followed hour 23, so let’s reorder the hours.
# Reorder the hours so the x-axis runs 6, 7, ..., 23, 0, 1, ..., 5
hour.order <- c(6:23, 0:5)
temp$offensehour <- with(temp, reorder(offensehour, match(offensehour, hour.order)))
rm(hour.order)
ggplot(temp, aes(x = offensehour, y = value)) + geom_bar(stat = "identity")