Bruno Voisin, Irish Centre for High-End Computing - bruno.voisin@ichec.ie
29th September 2016
Good news!
However…
There are two different kind of maps that we'll use for plotting data:
library(ggmap)
ggmap( get_map("dublin, ireland", zoom=12, source="osm") )
# satellite pictures are available too
ggmap( get_map("ireland", zoom=7, maptype = "satellite") )
We'll enrich live register data with the coordinates (lon/lat) of all social welfare offices and plot them on the map.
lrm07 <- as.data.frame(read.px("data/LRM07.px"))
lwo <- read.csv("data/LWoffice_codes_coords.csv")
mylivereg <- left_join(lwo, lrm07, by=c("offices" = "Social.Welfare.Office"))
mylivereg <- filter(mylivereg, Age.Group=="All ages", Month=="2016M08", Sex=="Both sexes")
gmireland <- ggmap( get_map("ireland", zoom=7 ) )
gmireland + geom_point(data=mylivereg, aes(lon, lat))
# resize dots according to number of people
# on register in each office
gmireland + geom_point(data=mylivereg, aes(lon, lat, size=value))
# add a colour scale on that same number of
# people on register in each office
gmireland + geom_point(data=mylivereg, aes(lon, lat, size=value, colour=value))
gmdublin <- ggmap( get_map("dublin, ireland", zoom=11, source="osm") )
mylivereg <- left_join(lwo, lrm07, by=c("offices" = "Social.Welfare.Office"))
mylivereg <- filter(mylivereg, Month=="2016M08")
# plot in a grid with gender on the X axis and age on the Y axis
gmdublin + geom_point(data=mylivereg, aes(lon, lat, size=value, colour=Sex)) + facet_grid(Age.Group~Sex)
Census 2011 boundaries: http://census.cso.ie/censusasp/saps/boundaries/ED_SA%20Disclaimer1.htm
# load required libraries
library(rgeos)
library(maptools)
# read the shape file
spdf <- readShapePoly(
"data/Census2011_Admin_Counties.shp")
# make it ggplot-friendly
spdf@data$id <- rownames(spdf@data)
spdf.points <- fortify(spdf, region="id")
counties <- inner_join(spdf.points, spdf@data, by="id")
# plot it
ggplot(counties) + geom_polygon(colour="black", fill=NA, aes(x=long, y=lat, group=group)) + coord_fixed()
CSO boundaries coordinates use the TM65 projection (Irish Grid) http://spatialreference.org/ref/epsg/tm65-irish-grid/
The proj4 package can be used to reverse a projection:
library(proj4)
# store the 'proj4' string for TM65
# (can be found on spatialreference.org)
tm65 <- "+proj=tmerc +lat_0=53.5 +lon_0=-8 +k=1.000035 +x_0=200000 +y_0=250000 +a=6377340.189 +b=6356034.447938534 +units=m +no_defs "
# reverse the projection on long/lat
newlonglat <- project(
cbind(counties$long, counties$lat),
proj=tm65, inverse=TRUE)
# replace long/lat with the new ones
counties$long <- newlonglat[,1]
counties$lat <- newlonglat[,2]
(see additional code in the last few slides for an 'shp2df' function wrapping all the shape file loading and transforming for easy use of CSO boundaries)
# plot it
ggplot(counties) + geom_polygon(colour="black", fill=NA, aes(x=long, y=lat, group=group)) + coord_fixed()
Using the online map as a base layer will set the lon/lat axis to the “correct” scale:
gmireland + geom_polygon(data=counties, colour="black", fill=NA, aes(x=long, y=lat, group=group))
Shape files can also contain data attached to each region.
names(counties)
[1] "long" "lat" "order" "hole" "piece"
[6] "id" "group" "NUTS1" "NUTS1NAME" "NUTS2"
[11] "NUTS2NAME" "NUTS3" "NUTS3NAME" "COUNTY" "COUNTYNAME"
[16] "GEOGID" "MALE2011" "FEMALE2011" "TOTAL2011" "PPOCC2011"
[21] "UNOCC2011" "VACANT2011" "HS2011" "PCVAC20111" "LAND_AREA"
[26] "TOTAL_AREA" "CREATEDATE"
# fill each region according to
# the TOTAL2011 column value
ggplot(counties) + geom_polygon(colour="black", aes(x=long, y=lat, group=group, fill=TOTAL2011))
Using the Census 2016 data for Housing available on the Statbank, we'll need to change its shape in order to match it to the regions on our map. The Statbank data is in tall format, and we need it to be in wide format. Basically, we want to go from this:
Region Statistic Value
Galway stock 498
Galway vacant 14
Galway holiday 29
Galway other 8
Cork stock 903
Cork vacant 28
Cork holiday 57
Cork other 11
to this:
Region stock vacant holiday other
Galway 498 14 29 8
Cork 903 28 57 11
Also, we want to fix a number of small things like removing all non-administrative county regions, and ensuring that all administrative county names use the exact same spelling as those on the map.
It may look scary but it's actually pretty straightforward:
# load px file as data frame
ep007 <- as.data.frame(read.px("data/EP007.px"))
# make it wide format (requires reshape2 package)
ep007_wide <- dcast(ep007, Province.County.or.City~statistical.indicator, value.var = 'value')
# then remove non-admin counties aggregates: counties, state, provinces#
# requires (dplyr package)
ep007_wide_admin <- as.data.frame( filter(ep007_wide, !(Province.County.or.City %in%
c("Cork", "Dublin", "Galway", "Limerick", "Waterford", "State",
"Connacht", "Munster", "Leinster", "Ulster (part of)"))) )
# rename counties to match map nomenclature and remove unused factor levels
# (my function doing this is provided after the 'last' slide)
ep007_wide_admin$Province.County.or.City <- factor(factor_rename_counties_long(ep007_wide_admin$Province.County.or.City))
# strange errors on merge with factors, so forcing to string
counties$COUNTYNAME <- as.vector(counties$COUNTYNAME)
ep007_wide_admin$Province.County.or.City <- as.vector(ep007_wide_admin$Province.County.or.City)
# encoding problem with the 'ú' so overwriting for consistency
counties$COUNTYNAME[counties$COUNTYNAME == "D\xfan Laoghaire-Rathdown"] <- "Dún Laoghaire-Rathdown"
ep007_wide_admin$Province.County.or.City[ep007_wide_admin$Province.County.or.City == "Dún Laoghaire-Rathdown"] <- "Dún Laoghaire-Rathdown"
… aaaaand breathe!
We now have a single data row for each administrative county. Let's have a look at our available statistical indicators:
# check available data column names
colnames(ep007_wide_admin)
[1] "Province.County.or.City"
[2] "Housing Stock - 2011 (Number)"
[3] "Vacant holiday homes 2011 (Number)"
[4] "Other vacant dwellings 2011 (Number)"
[5] "Total vacant dwellings 2011 (Number)"
[6] "Vacancy Rate - 2011 (%)"
[7] "Housing Stock -2016 (Number)"
[8] "Vacant holiday homes 2016 (Number)"
[9] "Other vacant dwellings 2016 (Number)"
[10] "Total vacant dwellings 2016 (Number)"
[11] "Vacancy Rate - 2016 (%)"
[12] "Actual change in vacant dwellings (Number)"
[13] "Percentage change in vacant dwellings (%)"
We now need to merge these columns to our map data frame so they can be mapped to aesthetic attributes in our plot: region colour, region outline colour, etc.
Merging is straightforward. We will however reencode the COUNTYNAME column as a R factor as we broke that earlier to deal with the fada encoding issues (Thank you Dún Laoghaire…).
# merge map and data
ep007_map <- left_join(counties, ep007_wide_admin, by=c("COUNTYNAME" = "Province.County.or.City"))
ep007_map$COUNTYNAME <- factor(ep007_map$COUNTYNAME)
To reduce our effort and the code on the next few slides, we'll also define a base plot layer that we'll reuse instead of redefining it for each new plot:
# define a base map layer
housingplot_baselayer <- ggplot(ep007_map) +
aes(long, lat, group=group) +
geom_polygon(colour="grey")
We're ready, let's get a first plot:
# plot a data column (vacancy rates 2016 %)
housingplot_baselayer + aes(fill=`Vacancy Rate - 2016 (%)`)
# substracting both column to show evolution of housing stock between 2011 and 2016
housingplot_baselayer + aes(fill=`Housing Stock -2016 (Number)` - `Housing Stock - 2011 (Number)`) +
scale_fill_gradient2(limits=c(-350,2900), low="red", mid="white", high="blue") +
labs(fill="Housing Stock evolution 2011-2016")
# show counties by positive/negative housing stock evolution
housingplot_baselayer +
aes(fill=(`Housing Stock -2016 (Number)` - `Housing Stock - 2011 (Number)`)>0) +
labs(fill="Housing Stock increase 2011-2016")
p1 <- housingplot_baselayer + aes(fill=`Total vacant dwellings 2011 (Number)`) + scale_fill_gradient(limits=c(0,30000), low="white", high="blue") + labs(fill='Total, 2011')
p2 <- housingplot_baselayer + aes(fill=`Total vacant dwellings 2016 (Number)`) + scale_fill_gradient(limits=c(0,30000), low="white", high="blue") + labs(fill='Total, 2016')
multiplot(p1, p2, cols=2) # see additional code at the end for that function's definition
With a few recipes, it's fairly easy to:
Some effort will always be required to get the data in proper shape: tall vs wide format, cleanup of only the parts of the data you need, matching region names between map and data…
And of course, the key to it all is to find which visualisation best fits the point being made. A lot of tinkering may be involved there to work it out. See previous slide for an example of what not to do!
Presentation available online at http://rpubs.com/BrunoVoisin/csomaps
Social Welfare office coordinates courtesy of Paul Rockley, CSO.
Slides made with RStudio's tools for R presentations (https://support.rstudio.com/hc/en-us/articles/200486468).
factor_rename_counties_long <- function(myfactor){
recode(myfactor,
Carlow = 'Carlow County',
Dublin = 'Dublin County',
Kildare = 'Kildare County',
Kilkenny = 'Kilkenny County',
Laois = 'Laois County',
Longford = 'Longford County',
Louth = 'Louth County',
Meath = 'Meath County',
Offaly = 'Offaly County',
Westmeath = 'Westmeath County',
Wexford = 'Wexford County',
Wicklow = 'Wicklow County',
Clare = 'Clare County',
Cork = 'Cork County',
Kerry = 'Kerry County',
Limerick = 'Limerick County',
Waterford = 'Waterford County',
Galway = 'Galway County',
Leitrim = 'Leitrim County',
Mayo = 'Mayo County',
Roscommon = 'Roscommon County',
Sligo = 'Sligo County',
Cavan = 'Cavan County',
Donegal = 'Donegal County',
Monaghan = 'Monaghan County')
}
multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
library(grid)
# Make a list from the ... arguments and plotlist
plots <- c(list(...), plotlist)
numPlots = length(plots)
# If layout is NULL, then use 'cols' to determine layout
if (is.null(layout)) {
# Make the panel
# ncol: Number of columns of plots
# nrow: Number of rows needed, calculated from # of cols
layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
ncol = cols, nrow = ceiling(numPlots/cols))
}
if (numPlots==1) {
print(plots[[1]])
} else {
# Set up the page
grid.newpage()
pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
# Make each plot, in the correct location
for (i in 1:numPlots) {
# Get the i,j matrix positions of the regions that contain this subplot
matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
layout.pos.col = matchidx$col))
}
}
}
## shp2df: convenience function transforming an Esri shape file
## into a "simple" data frame that can be plotted with ggplot2.
## takes 2 parameters:
## - shapefilepath: path to .shp file (and corresponding .dbf and .shx)
## - projectionstring: proj4 format string defining the projection to reverse
## on the long/lat columns. If undefined, defaults to TM65 (Irish Grid).
## If set to FALSE, then the coordinates are left to the original values
## from the shape file.
shp2df <- function(shapefilepath, projectionstring){
# read shape file into SpatialPolygonsDataFrame object
spdf <- readShapePoly( shapefilepath )
# make a redundant but ggplot-friendly data frame out of it
spdf@data$id <- rownames(spdf@data)
spdf.points <- fortify(spdf, region="id")
spdf.df <- inner_join(spdf.points, spdf@data, by="id")
# default to TM65 (irish grid) if no transformation was specified
if (missing(projectionstring)){
projectionstring <- "+proj=tmerc +lat_0=53.5 +lon_0=-8 +k=1.000035 +x_0=200000 +y_0=250000 +a=6377340.189 +b=6356034.447938534 +units=m +no_defs "
}
# reverse transformation to get long/lat
if (projectionstring != FALSE){
longlat_transformed <- project(cbind(spdf.df$long, spdf.df$lat),
proj=projectionstring, inverse=TRUE)
spdf.df$long <- longlat_transformed[,1]
spdf.df$lat <- longlat_transformed[,2]
}
return(spdf.df)
}