This short tutorial provides the code for running basic exploratory data analysis of police recorded crime in R. The manual entry, importing, and cleaning of crime data will be covered along with the code for generating summary statistics, bar charts and choropleth maps.
Data sources: The crime data are derived from the Metropolitan Police website: http://maps.met.police.uk/tables.htm. They represent borough counts of crime by major category during 2013/14. London borough shapefiles are provided by the London DataStore: http://data.london.gov.uk/dataset/statistical-gis-boundary-files-london
# use the command setwd("c:/docs/mydir")
library(gdata)
## gdata: read.xls support for 'XLS' (Excel 97-2004) files ENABLED.
##
## gdata: read.xls support for 'XLSX' (Excel 2007+) files ENABLED.
##
## Attaching package: 'gdata'
##
## The following object is masked from 'package:stats':
##
## nobs
##
## The following object is masked from 'package:utils':
##
## object.size
library(plyr)
library(reshape2)
library(ggplot2)
library(RColorBrewer)
library(maptools)
## Loading required package: sp
## Checking rgeos availability: TRUE
library(classInt)
NB Go to step 2b if you prefer to import the data directly from the MPS website
# enter London borough names as a character vector called 'borough'
borough <- c("Barking & Dagenham", "Barnet", "Bexley", "Brent", "Bromley", "Camden",
"Croydon", "Ealing", "Enfield", "Greenwich", "Hackney", "Hammersmith & Fulham",
"Haringey", "Harrow", "Havering", "Hillingdon", "Hounslow",
"Islington", "Kensington & Chelsea", "Kingston upon Thames", "Lambeth", "Lewisham",
"Merton", "Newham", "Redbridge", "Richmond upon Thames", "Southwark", "Sutton",
"Tower Hamlets", "Waltham Forest", "Wandsworth", "Westminster")
# enter borough counts of crime by major category as numeric vectors
vap <- c(4200,4557,2772,6410,4426,4638,6558,6283,4765,5410,6052,4042,4826,3200,3437,
5035,5225,5133,2659,2053,7342,6324,2534,6710,4552,1939,6401,2547,6644,5415,4243,7322)
sex <- c(300,357,248,366,309,350,554,483,355,350,436,284,363,219,215,313,337,332,221,
150,557,473,231,540,340,139,494,220,429,400,376,595)
rob <- c(526,710,257,1124,539,954,1507,849,918,443,1015,514,933,430,272,505,452,897,
533,142,2256,1170,370,2166,884,191,2058,251,1250,870,909,2357)
burg <- c(2006,3970,1783,2854,3066,3124,3806,3307,3454,2348,2888,1506,2907,2023,2395,
2845,2212,2511,1744,1111,3796,3029,1709,2950,2891,1820,3430,1431,2621,2741,3019,3963)
theft <- c(5879,9670,4249,7778,7671,15398,10499,10398,8984,7279,11768,8955,9571,4007,6141,
7045,7580,12232,10634,4109,13892,7161,4783,12283,7738,4488,13155,4047,10573,8948,10564,32440)
fraud <- c(14,35,11,25,11,29,19,30,27,26,15,18,12,16,10,12,17,25,23,9,32,25,15,30,16,24,30,
8,16,14,20,58)
crimdam <- c(1647,1965,1398,2047,2047,1584,2837,2300,2035,1969,1663,1206,1906,1083,1468,2172,
1965,1668,855,899,2429,2117,1179,2133,1486,1100,2408,1259,2126,1786,1735,2137)
drug <- c(1213,1129,886,3405,1098,2097,2016,1787,1710,1356,1792,2036,1234,1016,927,998,1546,
1408,1063,702,2112,1659,609,1652,1783,668,2755,502,2970,1560,1024,4240)
other <- c(303,322,236,458,272,451,400,434,370,425,396,412,309,239,252,338,320,391,234,110,
484,355,160,471,346,148,452,135,498,356,341,911)
# combine the character and numeric vectors in a data frame called 'crimestats'
crimestats <- data.frame(borough, vap, sex, rob, burg, theft, fraud, crimdam, drug, other)
head(crimestats,4) # check the first 4 rows of the 'crimestats' data frame
## borough vap sex rob burg theft fraud crimdam drug other
## 1 Barking & Dagenham 4200 300 526 2006 5879 14 1647 1213 303
## 2 Barnet 4557 357 710 3970 9670 35 1965 1129 322
## 3 Bexley 2772 248 257 1783 4249 11 1398 886 236
## 4 Brent 6410 366 1124 2854 7778 25 2047 3405 458
str(crimestats) # check the class of each variable
## 'data.frame': 32 obs. of 10 variables:
## $ borough: Factor w/ 32 levels "Barking & Dagenham",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ vap : num 4200 4557 2772 6410 4426 ...
## $ sex : num 300 357 248 366 309 350 554 483 355 350 ...
## $ rob : num 526 710 257 1124 539 ...
## $ burg : num 2006 3970 1783 2854 3066 ...
## $ theft : num 5879 9670 4249 7778 7671 ...
## $ fraud : num 14 35 11 25 11 29 19 30 27 26 ...
## $ crimdam: num 1647 1965 1398 2047 2047 ...
## $ drug : num 1213 1129 886 3405 1098 ...
## $ other : num 303 322 236 458 272 451 400 434 370 425 ...
# import crime data from the MPS website and select the first worksheet of the spreadsheet which contains 2013/14 offences
crimestats <- read.xls("http://maps.met.police.uk/datatables/borough_2013-14.xls", sheet=1, header=T, skip=1)
head(crimestats, 3) # check the first 3 rows of the 'crimestats' data frame
## Month.Year.D_Offences. Borough.Name Murder Wounding.GBH
## 1 FYTD 13/14 Barking & Dagenham 3 510
## 2 FYTD 13/14 Barnet 2 511
## 3 FYTD 13/14 Bexley 0 324
## Assault.With.Injury Common.Assault Offensive.Weapon Harassment
## 1 1173 1106 54 1137
## 2 1110 1217 84 1424
## 3 725 704 46 805
## Other.Violence Total Rape Other.Sexual Total.1 Personal.Property
## 1 217 4200 118 182 300 491
## 2 209 4557 113 244 357 634
## 3 168 2772 87 161 248 219
## Business.Property Total.2 Burglary.in.A.Dwelling
## 1 35 526 1470
## 2 76 710 2731
## 3 38 257 1148
## Burglary.in.Other.Buildings Total.3 Theft.Taking.Of.Motor.Vehicle
## 1 536 2006 773
## 2 1239 3970 769
## 3 635 1783 489
## Theft.From.Motor.Vehicle Motor.Vehicle.Interference...Tampering
## 1 1588 126
## 2 2889 232
## 3 1115 108
## Theft.From.Shops Theft.Person Theft.Taking.Of.Pedal.Cycles Other.Theft
## 1 720 349 240 2011
## 2 1198 713 226 3579
## 3 807 145 117 1426
## Handling.Stolen.Goods Total.4 Counted.Per.Victim Other.Fraud...Forgery
## 1 72 5879 0 14
## 2 64 9670 0 35
## 3 42 4249 0 11
## Total.5 Criminal.Damage.to.a.Dwelling Criminal.Damage.To.Other.Buildings
## 1 14 458 151
## 2 35 407 200
## 3 11 299 111
## Criminal.Damage.To.Motor.Vehicle Other.Criminal.Damage Total.6
## 1 692 346 1647
## 2 908 450 1965
## 3 688 300 1398
## Drug.Trafficking Possession.Of.Drugs Other.Drugs Total.7 Going.Equipped
## 1 87 1120 6 1213 11
## 2 103 1019 7 1129 24
## 3 91 791 4 886 8
## Other.Notifiable Total.8 Total.Notifiable.Offences
## 1 292 303 16088
## 2 298 322 22715
## 3 228 236 11840
# select the columns containing the borough counts of crime by major category
crimestats <- crimestats[,c("Borough.Name","Total","Total.1", "Total.2", "Total.3", "Total.4", "Total.5", "Total.6", "Total.7", "Total.8")]
# rename the major crime category "Total" columns
crimestats <- rename(crimestats, c("Borough.Name"="borough","Total"="vap", "Total.1"="sex", "Total.2"="rob", "Total.3"="burg", "Total.4"="theft", "Total.5"="fraud", "Total.6"="crimdam","Total.7"="drugs", "Total.8"="other"))
# drop MPS and Heathrow Airport rows
crimestats <- subset(crimestats, borough!="MPS" & borough!="Heathrow Airport")
crimestats <- droplevels(crimestats)
# convert 'crimestats' to a long form data frame
crimestats <- melt(crimestats,id = c("borough"))
names(crimestats) <- c("borough","category","count") # rename the columns
head(crimestats)
## borough category count
## 1 Barking & Dagenham vap 4200
## 2 Barnet vap 4557
## 3 Bexley vap 2772
## 4 Brent vap 6410
## 5 Bromley vap 4426
## 6 Camden vap 4638
# rename major crime categories
levels(crimestats$category)
## [1] "vap" "sex" "rob" "burg" "theft" "fraud" "crimdam"
## [8] "drugs" "other"
levels(crimestats$category) <- c("Violence Against The Person", "Sexual Offences", "Robbery", "Burglary", "Theft & Handling", "Fraud & Forgery", "Criminal Damage", "Drugs", "Other Notifiable Offences")
# arrange major crime categories in alphabethical order
crimestats$category <- factor(crimestats$category, levels=c("Burglary", "Criminal Damage", "Drugs", "Fraud & Forgery", "Robbery", "Other Notifiable Offences", "Sexual Offences", "Theft & Handling", "Violence Against The Person"), ordered=T)
# minimum borough count of offences by major crime category
aggregate(count ~ category, crimestats, min)
## category count
## 1 Burglary 1111
## 2 Criminal Damage 855
## 3 Drugs 502
## 4 Fraud & Forgery 8
## 5 Robbery 142
## 6 Other Notifiable Offences 110
## 7 Sexual Offences 139
## 8 Theft & Handling 4007
## 9 Violence Against The Person 1939
# maximum borough count of offences by major crime category
aggregate(count ~ category, crimestats, max)
## category count
## 1 Burglary 3970
## 2 Criminal Damage 2837
## 3 Drugs 4240
## 4 Fraud & Forgery 58
## 5 Robbery 2357
## 6 Other Notifiable Offences 911
## 7 Sexual Offences 595
## 8 Theft & Handling 32440
## 9 Violence Against The Person 7342
# mean borough count of offences by major crime category
ddply(crimestats, c("category"), summarise, round(mean = mean(count)))
## category ..1
## 1 Burglary 2664
## 2 Criminal Damage 1769
## 3 Drugs 1592
## 4 Fraud & Forgery 21
## 5 Robbery 883
## 6 Other Notifiable Offences 354
## 7 Sexual Offences 354
## 8 Theft & Handling 9372
## 9 Violence Against The Person 4802
# boxplot of borough counts of crime by major crime category (excluding Heathrow Airport)
crimestats$category = with(crimestats, factor(category, levels = rev(levels(category))))
ggplot(subset(crimestats), aes(category, count)) +
geom_boxplot() +
geom_jitter(colour="red", size=1) + # remove this line of code if preferred
xlab("") + ylab("") +
theme_bw() +
coord_flip() +
ggtitle(expression(atop("Borough counts of Metropolitan Police recorded crime by major crime category, 2013/14*",
atop(italic("* excluding Heathrow Airport")))))
# identify the outlier boroughs by major crime category
crimestats$category = with(crimestats, factor(category, levels = rev(levels(category))))
outliers <- boxplot(count~category, data = crimestats, plot=FALSE)$out
outliers_freq <- count(crimestats[crimestats$count %in% outliers,], c("borough", "category", "count"))
outliers_freq[order(outliers_freq$category,decreasing=FALSE),]
## borough category count freq
## 1 Brent Drugs 3405 1
## 5 Westminster Drugs 4240 1
## 6 Westminster Fraud & Forgery 58 1
## 2 Lambeth Robbery 2256 1
## 3 Newham Robbery 2166 1
## 4 Southwark Robbery 2058 1
## 7 Westminster Robbery 2357 1
## 8 Westminster Other Notifiable Offences 911 1
## 9 Westminster Theft & Handling 32440 1
# bar chart of borough counts of crime by major crime category (using a free scale)
ggplot(crimestats, aes(borough, count, fill= category)) +
geom_bar(color="white", stat="identity") +
scale_fill_brewer(palette="Paired") +
theme(legend.position = "none") +
ylab("") + xlab("") +
facet_wrap(~category, scales = "free_y") +
ggtitle ("Borough counts of police recorded crime by major crime category during 2013/14") +
theme_bw() + theme(legend.position = "none") +
theme(plot.title = element_text(size = 12,colour="black")) +
theme(axis.text.x = element_text(angle=+90, hjust=1))
# bar chart of borough counts of crime by major crime category (using a fixed scale)
ggplot(crimestats, aes(borough, count, fill= category)) +
geom_bar(color="white", stat="identity") +
scale_fill_brewer(palette="Paired") +
theme(legend.position = "none") +
ylab("") + xlab("") +
facet_wrap(~category) +
coord_flip() +
ggtitle ("Borough counts of police recorded crime by major crime category during 2013/14") +
theme_bw() + theme(legend.position = "none") +
theme(plot.title = element_text(size = 12,colour="black")) +
theme(axis.text.x = element_text(angle=+90, hjust=1)) +
scale_x_discrete(limits = rev(levels(crimestats$borough)))
# read the borough shapefile
boundary <- readShapePoly("London_Borough_Excluding_MHW.shp")
# confirm the projection as British National Grid
proj4string(boundary)<- CRS("+init=epsg:27700")
# check whether the borough names in the shapefile attribute table match with 'crimestats' data frame
crimestats$borough[!crimestats$borough %in% boundary$NAME]
## [1] Barking & Dagenham Hammersmith & Fulham Kensington & Chelsea
## [4] Barking & Dagenham Hammersmith & Fulham Kensington & Chelsea
## [7] Barking & Dagenham Hammersmith & Fulham Kensington & Chelsea
## [10] Barking & Dagenham Hammersmith & Fulham Kensington & Chelsea
## [13] Barking & Dagenham Hammersmith & Fulham Kensington & Chelsea
## [16] Barking & Dagenham Hammersmith & Fulham Kensington & Chelsea
## [19] Barking & Dagenham Hammersmith & Fulham Kensington & Chelsea
## [22] Barking & Dagenham Hammersmith & Fulham Kensington & Chelsea
## [25] Barking & Dagenham Hammersmith & Fulham Kensington & Chelsea
## 32 Levels: Barking & Dagenham Barnet Bexley Brent Bromley ... Westminster
# rename the boroughs, i.e. swap ampersands for and
crimestats$borough <- mapvalues(crimestats$borough, from = c("Barking & Dagenham", "Hammersmith & Fulham", "Kensington & Chelsea"), to = c("Barking and Dagenham", "Hammersmith and Fulham", "Kensington and Chelsea"))
# subset by crime type
levels(crimestats$category)
## [1] "Burglary" "Criminal Damage"
## [3] "Drugs" "Fraud & Forgery"
## [5] "Robbery" "Other Notifiable Offences"
## [7] "Sexual Offences" "Theft & Handling"
## [9] "Violence Against The Person"
# create a data frame containg the borough count of Robbery offences
crimes_sub <- subset(crimestats, category=="Robbery")
# join the crime counts to the shapefile using 'borough' as the matching variable
boundary <- rename(boundary, replace = c(NAME = "borough"))
head(join(boundary@data, crimes_sub))
## Joining by: borough
## borough GSS_CODE HECTARES NONLD_AREA ONS_INNER SUB_2009
## 1 Kingston upon Thames E09000021 3726.117 0.000 F <NA>
## 2 Croydon E09000008 8649.441 0.000 F <NA>
## 3 Bromley E09000006 15013.487 0.000 F <NA>
## 4 Hounslow E09000018 5658.541 60.755 F <NA>
## 5 Ealing E09000009 5554.428 0.000 F <NA>
## 6 Havering E09000016 11445.735 210.763 F <NA>
## SUB_2006 category count
## 1 <NA> Robbery 142
## 2 <NA> Robbery 1507
## 3 <NA> Robbery 539
## 4 <NA> Robbery 452
## 5 <NA> Robbery 849
## 6 <NA> Robbery 272
boundary@data <- join(boundary@data, crimes_sub)
## Joining by: borough
# create a thematic range using natural breaks
classes_crimes <- classIntervals(boundary$count, n=5, style="jenks") # ignore the warning as values for City of London are missing
## Warning in classIntervals(boundary$count, n = 5, style = "jenks"): var has
## missing values, omitted in finding classes
classes_crimes <- classes_crimes$brks
# fortify the borough shapefile for plotting in ggplot2
boundary.f <- fortify(boundary, region="borough")
boundary.f <- merge(boundary.f, boundary@data, by.x="id", by.y="borough")
# create text labels
centroids <- as.data.frame(coordinates(boundary))
centroids$count <- boundary$count
# plot the results
ggplot(subset(boundary.f, boundary.f$count != "0"), aes(long,lat)) +
geom_polygon(data=boundary.f, aes(group=group, fill=count)) +
geom_polygon(data=boundary.f, aes(long,lat, group=group), colour = "white", fill = NA, inherit.aes = F) +
geom_text(data = centroids, aes(V1, V2, label = count), colour="black", size=3) +
scale_fill_gradientn('', colours=brewer.pal(5,"PuRd"), breaks=classes_crimes, guide="colourbar") +
coord_equal() +
scale_x_continuous("", breaks=NULL) + scale_y_continuous("", breaks=NULL) + theme(panel.grid=element_blank()) +
ggtitle(expression(atop("Metropolitan Police recorded Robbery offences by borough", atop(italic("2013/14"))))) +
theme_bw() +
theme(plot.title = element_text(vjust = 1, face = c("bold"), size = 15, colour = "black"), legend.position = "none")
## Warning: Removed 1 rows containing missing values (geom_text).