… but that’s how things often are in reality
# First load some packages
library(dplyr) # Pipeline manipulation etc
library(Hmisc) # Reading in .mdb files
crimes <- NULL # Location for results
stems <- array(outer(c("jan","feb","mar","apr","may","jun","jul","aug","sep","oct","nov","dec"),
c("10","11","12","13"),paste0)) # 'stem' for filenames in URI
for (stem in stems) {
holder <- tempfile(fileext='.mdb') # Create a temporary file name ending in mdb
download.file(paste0('http://www.houstontx.gov/police/cs/mdbs/',stem,'.mdb'),
holder) # Download one months data into this file
df <- mdb.get(holder) # Read this temporary mdb file into a data frame
colnames(df) <- c("date","hour","offense","beat","premise",
"block_range","street","type","suffix","n_events") # Standardise column names
if (is.null(crimes)) { # Do this for first month - create the data frame
crimes<- df
} else { # Do this for the rest - append latest data
crimes <- rbind(crimes,df[,1:10])
}
}
crimes <- as.tbl(crimes) # Convert to more modern tbl form - useful later
"29-01-1977" or a factorcrimes %>% head(n=5)
# A tibble: 5 × 10
date hour offense beat premise block_range street
<fctr> <chr> <fctr> <fctr> <fctr> <fctr> <fctr>
1 01/08/10 00:00:00 22 Murder 11H30 20R 3000-3099 BROADWAY
2 01/17/10 00:00:00 18 Murder 10H50 18C 2800-2899 MCGOWEN
3 01/01/10 00:00:00 0 Murder 15E30 18A 9600-9699 MARLIVE
4 01/09/10 00:00:00 14 Murder 5F20 13R 9000-9099 LONG POINT
5 01/18/10 00:00:00 14 Murder 10H40 13R 4300-4399 GARROTT
# ... with 3 more variables: type <fctr>, suffix <fctr>, n_events <chr>
lubridate for datesas.character first:library(lubridate)
crimes %>%
transmute(date=mdy_hms(as.character(date)),day=wday(date,label=TRUE),d_o_y=yday(date)) %>%
head(n=4)
# A tibble: 4 × 3
date day d_o_y
<dttm> <ord> <dbl>
1 2010-01-08 Fri 8
2 2010-01-17 Sun 17
3 2010-01-01 Fri 1
4 2010-01-09 Sat 9
crimes %>% transmute(
date=mdy_hms(as.character(date)) + hours(as.numeric(as.character(hour)))) %>%
head(n=4)
Warning in period(hour = x): NAs introduced by coercion
# A tibble: 4 × 1
date
<dttm>
1 2010-01-08 22:00:00
2 2010-01-17 18:00:00
3 2010-01-01 00:00:00
4 2010-01-09 14:00:00
hour is a character variable, is made numeric before using hoursNA values for hour - time of event not knownhours function issues a warning in that casecrimes %>% mutate(date=mdy_hms(as.character(date)) + hours(as.numeric(as.character(hour)))) -> crimes
select) and NA values of date (filter)crimes %>% select(-hour,-beat,-premise,-block_range,-suffix,-n_events) %>% filter(!is.na(date)) -> crimes
street and type a single character variablecrimes %>% mutate_at(vars(street,type),as.character) %>%
mutate(addr=paste(street,type)) %>% select(-street,-type) -> crimes
crimes %>% head(n=3)
# A tibble: 3 × 3
date offense addr
<dttm> <fctr> <chr>
1 2010-01-08 22:00:00 Murder BROADWAY ST
2 2010-01-17 18:00:00 Murder MCGOWEN ST
3 2010-01-01 00:00:00 Murder MARLIVE LN
library(ggplot2)
crimes %>% ggplot(aes(x=offense)) + geom_bar()
ggplot2 - a library of plot routines based on The Grammar of Graphics
ggplot creates the ‘framework’ for a plotaes - aesthetics, mapping input data to plot characteristicsgeom - a geometry turns characteristics into visual entitystat - here it counts the number of observations in each x category (default for geom_bar, so nothing shown in code)1e+05 not 100,000factorcrimes %>% ggplot(aes(x=offense)) + geom_bar() + labs(x='Offense Type',y="Number of Offenses (1000's)") +
scale_y_continuous(breaks=c(0,100000,200000,300000),labels=c(0,100,200,300))
labs re-specifies axes labelsscale_y_continuous re-specifies points on the y axis where values are marked.
breaks - where the labels golabels what is written therelabs in conjunction with scale_y_continuous
forcats packagefct_infreq reorders factors by frequency of occurrence
crimeslibrary(forcats)
crimes %>% mutate(offense = fct_infreq(offense)) -> crimes
crimes %>% ggplot(aes(x=offense)) + geom_bar() + labs(x='Offense Type',y="Number of Offenses (1000's)") +
scale_y_continuous(breaks=c(0,100000,200000,300000),labels=c(0,100,200,300))
crimes %>% ggplot(aes(x=offense)) + geom_bar() -> cplot # A plot object not yet drawn
cplot + labs(x='Offense Type',y="Number of Offenses (1000's)") +
scale_y_continuous(breaks=c(0,100000,200000,300000),labels=c(0,100,200,300)) + theme_light() # Clean up/new theme
library(ggthemes)
cplot + labs(x='Offense Type',y="Number of Offenses (1000's)") +
scale_y_continuous(breaks=c(0,100000,200000,300000),labels=c(0,100,200,300)) + theme_economist() # Re-use cplot
crimes %>% ggplot(aes(x=hour(date))) + geom_bar() + labs(x='Time of Day',y="Number of Offenses") +
scale_x_continuous(breaks=c(3,9,12,15,21),labels=c("3am","9am","Noon","3pm","9pm")) -> crime_time
crime_time
crime_time + facet_wrap(~offense,scale='free')
crime_time + facet_wrap(~offense,scale='free') +
scale_x_continuous(breaks=c(3,9,15,21),labels=c("3am","9am","3pm","9pm"))
crimes %>% filter(offense=="Murder") %>% mutate(day=wday(date,label=TRUE)) %>%
filter(date %within% interval(dmy("01012010"),dmy("01012014"))) -> murders
murders %>% ggplot(aes(x=day)) + geom_bar() + labs(y='Number of Murders',x='Day of Week')
murders %>% ggplot(aes(x=hour(date))) + geom_bar() + labs(y='Number of Murders',x='Time of Day') +
scale_x_continuous(breaks=c(3,9,15,21),labels=c("3am","9am","3pm","9pm")) -> murder_time
murder_time
murder_time + facet_wrap(~day)
murder_time + facet_wrap(~day,nrow=2) + coord_polar(start=-pi/24)
Pie charts considered harmful!
crimes %>% ggplot(aes(x=factor(1),fill=offense)) + geom_bar(width=1) + scale_x_discrete(breaks=NULL) +
scale_y_continuous(breaks=seq(100000,600000,by=100000),labels=paste0(1:6,"00,000")) +
coord_polar(theta="y") + labs(x='',y='',fill="Offense Type") + scale_fill_brewer(type='qual')
tmap’s geocode_OSM functioncrimes %>% filter(offense=="Murder") %>% transmute(addr=paste0(addr,', Houston, TX')) -> addresses
addresses$addr %>% geocode_OSM -> crime_loc
head(crime_loc,n=3)
query lat lon lat_min lat_max lon_min
1 BROADWAY ST, Houston, TX 29.69464 -95.27777 29.68736 29.70153 -95.27795
2 MCGOWEN ST, Houston, TX 29.74280 -95.36977 29.74071 29.74536 -95.37403
3 MARLIVE LN, Houston, TX 29.67905 -95.43738 29.67694 29.67905 -95.43738
lon_max
1 -95.27759
2 -95.36631
3 -95.43736
library(sp)
crime_loc %>% group_by(query) %>% summarise(lat=first(lat),lon=first(lon)) %>%
transmute(addr=query,lat,lon) -> crime_loc2
crimes %>% mutate(addr=paste0(addr,', Houston, TX')) %>% filter(offense=="Murder") %>%
left_join(crime_loc2) %>% filter(!is.na(lon)) -> murders
SpatialPointsDataFrame(murders[,c("lon","lat")],murders) -> murders
SpatialPointsDataFrame (like a shapefile)library(tmap)
tmap_mode('view')
murders %>% qtm
# Compute the day of the week for the murder
murders@data %>% mutate(day=wday(date,label=TRUE)) -> murders@data
# Create the sub-layers. Easiset via a new R function
day_layer <- function(spdf,day) spdf[spdf$day==day,]
murders %>% day_layer("Sun") -> sun_murders
murders %>% day_layer("Mon") -> mon_murders
murders %>% day_layer("Tues") -> tue_murders
murders %>% day_layer("Wed") -> wed_murders
murders %>% day_layer("Thurs") -> thu_murders
murders %>% day_layer("Fri") -> fri_murders
murders %>% day_layer("Sat") -> sat_murders
tm_shape(sun_murders) + tm_dots(col='red') + tm_shape(mon_murders) + tm_dots(col='dodgerblue') +
tm_shape(tue_murders) + tm_dots(col='dodgerblue') + tm_shape(wed_murders) + tm_dots(col='dodgerblue') +
tm_shape(thu_murders) + tm_dots(col='dodgerblue') + tm_shape(fri_murders) + tm_dots(col='dodgerblue') +
tm_shape(sat_murders) + tm_dots(col='red')
day_point <- function(x) {
temp1 <- cut(hour(x),c(0,3,9,15,21,24))
temp1 <- recode(temp1, `(0,3]`='9pm-3am',`(3,9]`='3am-9am',`(9,15]`='9am-3pm',
`(15,21]`='3pm-9pm',`(21,24]`='9pm-3am')
return(factor(temp1))
}
murders@data %>% mutate(dpoint = day_point(date)) -> murders@data
tm_shape(murders) + tm_dots(col='dpoint',title='Time of Day')