Exploratory Data Analysis of Metropolitan Police recorded crime

This short tutorial provides the code for running basic exploratory data analysis of police recorded crime in R. The manual entry, importing, and cleaning of crime data will be covered along with the code for generating summary statistics, bar charts and choropleth maps.

Data sources: The crime data are derived from the Metropolitan Police website: http://maps.met.police.uk/tables.htm. They represent borough counts of crime by major category during 2013/14. London borough shapefiles are provided by the London DataStore: http://data.london.gov.uk/dataset/statistical-gis-boundary-files-london

Step 1: Set the working directory and load the necessary packages

# use the command setwd("c:/docs/mydir")
library(gdata)

## gdata: read.xls support for 'XLS' (Excel 97-2004) files ENABLED.
## 
## gdata: read.xls support for 'XLSX' (Excel 2007+) files ENABLED.
## 
## Attaching package: 'gdata'
## 
## The following object is masked from 'package:stats':
## 
##     nobs
## 
## The following object is masked from 'package:utils':
## 
##     object.size

library(plyr) 
library(reshape2)
library(ggplot2)
library(RColorBrewer)
library(maptools)

## Loading required package: sp
## Checking rgeos availability: TRUE

library(classInt)

Step 2a: Entering data manually

NB Go to step 2b if you prefer to import the data directly from the MPS website

# enter London borough names as a character vector called 'borough'
borough <- c("Barking & Dagenham", "Barnet", "Bexley", "Brent", "Bromley", "Camden",
             "Croydon", "Ealing", "Enfield", "Greenwich", "Hackney", "Hammersmith & Fulham",
             "Haringey", "Harrow", "Havering", "Hillingdon", "Hounslow",
             "Islington", "Kensington & Chelsea", "Kingston upon Thames", "Lambeth", "Lewisham",
             "Merton", "Newham", "Redbridge", "Richmond upon Thames", "Southwark", "Sutton",
             "Tower Hamlets", "Waltham Forest", "Wandsworth", "Westminster")
# enter borough counts of crime by major category as numeric vectors
vap <- c(4200,4557,2772,6410,4426,4638,6558,6283,4765,5410,6052,4042,4826,3200,3437,  
         5035,5225,5133,2659,2053,7342,6324,2534,6710,4552,1939,6401,2547,6644,5415,4243,7322)
sex <- c(300,357,248,366,309,350,554,483,355,350,436,284,363,219,215,313,337,332,221,
         150,557,473,231,540,340,139,494,220,429,400,376,595)
rob <- c(526,710,257,1124,539,954,1507,849,918,443,1015,514,933,430,272,505,452,897,
         533,142,2256,1170,370,2166,884,191,2058,251,1250,870,909,2357)
burg <- c(2006,3970,1783,2854,3066,3124,3806,3307,3454,2348,2888,1506,2907,2023,2395,
          2845,2212,2511,1744,1111,3796,3029,1709,2950,2891,1820,3430,1431,2621,2741,3019,3963)
theft <- c(5879,9670,4249,7778,7671,15398,10499,10398,8984,7279,11768,8955,9571,4007,6141,
           7045,7580,12232,10634,4109,13892,7161,4783,12283,7738,4488,13155,4047,10573,8948,10564,32440)
fraud <- c(14,35,11,25,11,29,19,30,27,26,15,18,12,16,10,12,17,25,23,9,32,25,15,30,16,24,30,
           8,16,14,20,58)
crimdam <- c(1647,1965,1398,2047,2047,1584,2837,2300,2035,1969,1663,1206,1906,1083,1468,2172,
             1965,1668,855,899,2429,2117,1179,2133,1486,1100,2408,1259,2126,1786,1735,2137)
drug <- c(1213,1129,886,3405,1098,2097,2016,1787,1710,1356,1792,2036,1234,1016,927,998,1546,
          1408,1063,702,2112,1659,609,1652,1783,668,2755,502,2970,1560,1024,4240)
other <- c(303,322,236,458,272,451,400,434,370,425,396,412,309,239,252,338,320,391,234,110,
           484,355,160,471,346,148,452,135,498,356,341,911)
# combine the character and numeric vectors in a data frame called 'crimestats'
crimestats <- data.frame(borough, vap, sex, rob, burg, theft, fraud, crimdam, drug, other)
head(crimestats,4) # check the first 4 rows of the 'crimestats' data frame

##              borough  vap sex  rob burg theft fraud crimdam drug other
## 1 Barking & Dagenham 4200 300  526 2006  5879    14    1647 1213   303
## 2             Barnet 4557 357  710 3970  9670    35    1965 1129   322
## 3             Bexley 2772 248  257 1783  4249    11    1398  886   236
## 4              Brent 6410 366 1124 2854  7778    25    2047 3405   458

str(crimestats) # check the class of each variable

## 'data.frame':    32 obs. of  10 variables:
##  $ borough: Factor w/ 32 levels "Barking & Dagenham",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ vap    : num  4200 4557 2772 6410 4426 ...
##  $ sex    : num  300 357 248 366 309 350 554 483 355 350 ...
##  $ rob    : num  526 710 257 1124 539 ...
##  $ burg   : num  2006 3970 1783 2854 3066 ...
##  $ theft  : num  5879 9670 4249 7778 7671 ...
##  $ fraud  : num  14 35 11 25 11 29 19 30 27 26 ...
##  $ crimdam: num  1647 1965 1398 2047 2047 ...
##  $ drug   : num  1213 1129 886 3405 1098 ...
##  $ other  : num  303 322 236 458 272 451 400 434 370 425 ...

Step 2b: Importing data from the MPS website

# import crime data from the MPS website and select the first worksheet of the spreadsheet which contains 2013/14 offences
crimestats <- read.xls("http://maps.met.police.uk/datatables/borough_2013-14.xls", sheet=1, header=T, skip=1)
head(crimestats, 3) # check the first 3 rows of the 'crimestats' data frame

##   Month.Year.D_Offences.       Borough.Name Murder Wounding.GBH
## 1             FYTD 13/14 Barking & Dagenham      3          510
## 2             FYTD 13/14             Barnet      2          511
## 3             FYTD 13/14             Bexley      0          324
##   Assault.With.Injury Common.Assault Offensive.Weapon Harassment
## 1                1173           1106               54       1137
## 2                1110           1217               84       1424
## 3                 725            704               46        805
##   Other.Violence Total Rape Other.Sexual Total.1 Personal.Property
## 1            217  4200  118          182     300               491
## 2            209  4557  113          244     357               634
## 3            168  2772   87          161     248               219
##   Business.Property Total.2 Burglary.in.A.Dwelling
## 1                35     526                   1470
## 2                76     710                   2731
## 3                38     257                   1148
##   Burglary.in.Other.Buildings Total.3 Theft.Taking.Of.Motor.Vehicle
## 1                         536    2006                           773
## 2                        1239    3970                           769
## 3                         635    1783                           489
##   Theft.From.Motor.Vehicle Motor.Vehicle.Interference...Tampering
## 1                     1588                                    126
## 2                     2889                                    232
## 3                     1115                                    108
##   Theft.From.Shops Theft.Person Theft.Taking.Of.Pedal.Cycles Other.Theft
## 1              720          349                          240        2011
## 2             1198          713                          226        3579
## 3              807          145                          117        1426
##   Handling.Stolen.Goods Total.4 Counted.Per.Victim Other.Fraud...Forgery
## 1                    72    5879                  0                    14
## 2                    64    9670                  0                    35
## 3                    42    4249                  0                    11
##   Total.5 Criminal.Damage.to.a.Dwelling Criminal.Damage.To.Other.Buildings
## 1      14                           458                                151
## 2      35                           407                                200
## 3      11                           299                                111
##   Criminal.Damage.To.Motor.Vehicle Other.Criminal.Damage Total.6
## 1                              692                   346    1647
## 2                              908                   450    1965
## 3                              688                   300    1398
##   Drug.Trafficking Possession.Of.Drugs Other.Drugs Total.7 Going.Equipped
## 1               87                1120           6    1213             11
## 2              103                1019           7    1129             24
## 3               91                 791           4     886              8
##   Other.Notifiable Total.8 Total.Notifiable.Offences
## 1              292     303                     16088
## 2              298     322                     22715
## 3              228     236                     11840

# select the columns containing the borough counts of crime by major category
crimestats <- crimestats[,c("Borough.Name","Total","Total.1", "Total.2", "Total.3", "Total.4", "Total.5", "Total.6", "Total.7", "Total.8")]
# rename the major crime category "Total" columns
crimestats <- rename(crimestats, c("Borough.Name"="borough","Total"="vap", "Total.1"="sex", "Total.2"="rob", "Total.3"="burg", "Total.4"="theft", "Total.5"="fraud", "Total.6"="crimdam","Total.7"="drugs", "Total.8"="other"))

Step 3: Data cleaning

# drop MPS and Heathrow Airport rows
crimestats <- subset(crimestats, borough!="MPS" & borough!="Heathrow Airport") 
crimestats <- droplevels(crimestats)
# convert 'crimestats' to a long form data frame
crimestats <- melt(crimestats,id = c("borough"))
names(crimestats) <- c("borough","category","count") # rename the columns
head(crimestats)

##              borough category count
## 1 Barking & Dagenham      vap  4200
## 2             Barnet      vap  4557
## 3             Bexley      vap  2772
## 4              Brent      vap  6410
## 5            Bromley      vap  4426
## 6             Camden      vap  4638

# rename major crime categories
levels(crimestats$category)

## [1] "vap"     "sex"     "rob"     "burg"    "theft"   "fraud"   "crimdam"
## [8] "drugs"   "other"

levels(crimestats$category) <- c("Violence Against The Person", "Sexual Offences", "Robbery", "Burglary", "Theft & Handling", "Fraud & Forgery", "Criminal Damage", "Drugs", "Other Notifiable Offences")
# arrange major crime categories in alphabethical order
crimestats$category <- factor(crimestats$category, levels=c("Burglary", "Criminal Damage", "Drugs", "Fraud & Forgery", "Robbery", "Other Notifiable Offences", "Sexual Offences", "Theft & Handling", "Violence Against The Person"), ordered=T)

Step 4: Summary statistics

# minimum borough count of offences by major crime category
aggregate(count ~ category, crimestats, min)

##                      category count
## 1                    Burglary  1111
## 2             Criminal Damage   855
## 3                       Drugs   502
## 4             Fraud & Forgery     8
## 5                     Robbery   142
## 6   Other Notifiable Offences   110
## 7             Sexual Offences   139
## 8            Theft & Handling  4007
## 9 Violence Against The Person  1939

# maximum borough count of offences by major crime category
aggregate(count ~ category, crimestats, max)

##                      category count
## 1                    Burglary  3970
## 2             Criminal Damage  2837
## 3                       Drugs  4240
## 4             Fraud & Forgery    58
## 5                     Robbery  2357
## 6   Other Notifiable Offences   911
## 7             Sexual Offences   595
## 8            Theft & Handling 32440
## 9 Violence Against The Person  7342

# mean borough count of offences by major crime category
ddply(crimestats, c("category"), summarise, round(mean = mean(count)))

##                      category  ..1
## 1                    Burglary 2664
## 2             Criminal Damage 1769
## 3                       Drugs 1592
## 4             Fraud & Forgery   21
## 5                     Robbery  883
## 6   Other Notifiable Offences  354
## 7             Sexual Offences  354
## 8            Theft & Handling 9372
## 9 Violence Against The Person 4802

Step 5: Box plots

# boxplot of borough counts of crime by major crime category (excluding Heathrow Airport)
crimestats$category = with(crimestats, factor(category, levels = rev(levels(category))))
ggplot(subset(crimestats), aes(category, count)) +
  geom_boxplot() +
  geom_jitter(colour="red", size=1) + # remove this line of code if preferred
  xlab("") + ylab("") +
  theme_bw() +
  coord_flip() +
  ggtitle(expression(atop("Borough counts of Metropolitan Police recorded crime by major crime category, 2013/14*",
                          atop(italic("* excluding Heathrow Airport")))))

# identify the outlier boroughs by major crime category
crimestats$category = with(crimestats, factor(category, levels = rev(levels(category))))
outliers <- boxplot(count~category, data = crimestats, plot=FALSE)$out
outliers_freq <- count(crimestats[crimestats$count %in% outliers,], c("borough", "category", "count"))
outliers_freq[order(outliers_freq$category,decreasing=FALSE),]

##       borough                  category count freq
## 1       Brent                     Drugs  3405    1
## 5 Westminster                     Drugs  4240    1
## 6 Westminster           Fraud & Forgery    58    1
## 2     Lambeth                   Robbery  2256    1
## 3      Newham                   Robbery  2166    1
## 4   Southwark                   Robbery  2058    1
## 7 Westminster                   Robbery  2357    1
## 8 Westminster Other Notifiable Offences   911    1
## 9 Westminster          Theft & Handling 32440    1

Step 6: Bar charts

# bar chart of borough counts of crime by major crime category (using a free scale)
ggplot(crimestats, aes(borough, count, fill= category)) +
  geom_bar(color="white", stat="identity") +
  scale_fill_brewer(palette="Paired") +
  theme(legend.position = "none") +
  ylab("") + xlab("") +
  facet_wrap(~category, scales = "free_y") +
  ggtitle ("Borough counts of police recorded crime by major crime category during 2013/14") +
  theme_bw() + theme(legend.position = "none") +
  theme(plot.title = element_text(size = 12,colour="black")) +
  theme(axis.text.x = element_text(angle=+90, hjust=1))

# bar chart of borough counts of crime by major crime category (using a fixed scale)
ggplot(crimestats, aes(borough, count, fill= category)) +
  geom_bar(color="white", stat="identity") +
  scale_fill_brewer(palette="Paired") +
  theme(legend.position = "none") +
  ylab("") + xlab("") +
  facet_wrap(~category) +
  coord_flip() +
  ggtitle ("Borough counts of police recorded crime by major crime category during 2013/14") +
  theme_bw() + theme(legend.position = "none") +
  theme(plot.title = element_text(size = 12,colour="black")) +
  theme(axis.text.x = element_text(angle=+90, hjust=1)) +
  scale_x_discrete(limits = rev(levels(crimestats$borough)))

Step 7: Choropleth mapping

# read the borough shapefile
boundary <- readShapePoly("London_Borough_Excluding_MHW.shp")
# confirm the projection as British National Grid
proj4string(boundary)<- CRS("+init=epsg:27700")
# check whether the borough names in the shapefile attribute table match with 'crimestats' data frame
crimestats$borough[!crimestats$borough %in% boundary$NAME]

##  [1] Barking & Dagenham   Hammersmith & Fulham Kensington & Chelsea
##  [4] Barking & Dagenham   Hammersmith & Fulham Kensington & Chelsea
##  [7] Barking & Dagenham   Hammersmith & Fulham Kensington & Chelsea
## [10] Barking & Dagenham   Hammersmith & Fulham Kensington & Chelsea
## [13] Barking & Dagenham   Hammersmith & Fulham Kensington & Chelsea
## [16] Barking & Dagenham   Hammersmith & Fulham Kensington & Chelsea
## [19] Barking & Dagenham   Hammersmith & Fulham Kensington & Chelsea
## [22] Barking & Dagenham   Hammersmith & Fulham Kensington & Chelsea
## [25] Barking & Dagenham   Hammersmith & Fulham Kensington & Chelsea
## 32 Levels: Barking & Dagenham Barnet Bexley Brent Bromley ... Westminster

# rename the boroughs, i.e. swap ampersands for and
crimestats$borough <- mapvalues(crimestats$borough, from = c("Barking & Dagenham", "Hammersmith & Fulham", "Kensington & Chelsea"), to = c("Barking and Dagenham", "Hammersmith and Fulham", "Kensington and Chelsea"))
# subset by crime type
levels(crimestats$category)

## [1] "Burglary"                    "Criminal Damage"            
## [3] "Drugs"                       "Fraud & Forgery"            
## [5] "Robbery"                     "Other Notifiable Offences"  
## [7] "Sexual Offences"             "Theft & Handling"           
## [9] "Violence Against The Person"

# create a data frame containg the borough count of Robbery offences
crimes_sub <- subset(crimestats, category=="Robbery")
# join the crime counts to the shapefile using 'borough' as the matching variable
boundary <- rename(boundary, replace = c(NAME = "borough"))
head(join(boundary@data, crimes_sub))

## Joining by: borough

##                borough  GSS_CODE  HECTARES NONLD_AREA ONS_INNER SUB_2009
## 1 Kingston upon Thames E09000021  3726.117      0.000         F     <NA>
## 2              Croydon E09000008  8649.441      0.000         F     <NA>
## 3              Bromley E09000006 15013.487      0.000         F     <NA>
## 4             Hounslow E09000018  5658.541     60.755         F     <NA>
## 5               Ealing E09000009  5554.428      0.000         F     <NA>
## 6             Havering E09000016 11445.735    210.763         F     <NA>
##   SUB_2006 category count
## 1     <NA>  Robbery   142
## 2     <NA>  Robbery  1507
## 3     <NA>  Robbery   539
## 4     <NA>  Robbery   452
## 5     <NA>  Robbery   849
## 6     <NA>  Robbery   272

boundary@data <- join(boundary@data, crimes_sub)

## Joining by: borough

# create a thematic range using natural breaks
classes_crimes <- classIntervals(boundary$count, n=5, style="jenks") # ignore the warning as values for City of London are missing

## Warning in classIntervals(boundary$count, n = 5, style = "jenks"): var has
## missing values, omitted in finding classes

classes_crimes <- classes_crimes$brks
# fortify the borough shapefile for plotting in ggplot2
boundary.f <- fortify(boundary, region="borough")
boundary.f <- merge(boundary.f, boundary@data, by.x="id", by.y="borough")
# create text labels
centroids <- as.data.frame(coordinates(boundary))
centroids$count <- boundary$count
# plot the results
ggplot(subset(boundary.f, boundary.f$count != "0"), aes(long,lat)) +
  geom_polygon(data=boundary.f, aes(group=group, fill=count)) +
  geom_polygon(data=boundary.f, aes(long,lat, group=group), colour = "white", fill = NA, inherit.aes = F) +
  geom_text(data = centroids, aes(V1, V2, label = count), colour="black", size=3) +
  scale_fill_gradientn('', colours=brewer.pal(5,"PuRd"), breaks=classes_crimes, guide="colourbar") +
  coord_equal() +
  scale_x_continuous("", breaks=NULL) + scale_y_continuous("", breaks=NULL) + theme(panel.grid=element_blank()) +
  ggtitle(expression(atop("Metropolitan Police recorded Robbery offences by borough", atop(italic("2013/14"))))) +
  theme_bw() +
  theme(plot.title = element_text(vjust = 1, face = c("bold"), size = 15, colour  = "black"), legend.position = "none")

## Warning: Removed 1 rows containing missing values (geom_text).