Get cb_2014_us_state_5m.zip and unzip it into the data folder.1 This contains the cartographic boundary files.
Save the following as state_fips_postal.csv in the csv directory.2
Alabama,1,AL
Alaska,2,AK
Arizona,4,AZ
Arkansas,5,AR
California,6,CA
Colorado,8,CO
Connecticut,9,CT
Delaware,10,DE
District of Columbia,11,DC
Florida,12,FL
Georgia,13,GA
Hawaii,15,HI
Idaho,16,ID
Illinois,17,IL
Indiana,18,IN
Iowa,19,IA
Kansas,20,KS
Kentucky,21,KY
Louisiana,22,LA
Maine,23,ME
Maryland,24,MD
Massachusetts,25,MA
Michigan,26,MI
Minnesota,27,MN
Mississippi,28,MS
Missouri,29,MO
Montana,30,MT
Nebraska,31,NE
Nevada,32,NV
New Hampshire,33,NH
New Jersey,34,NJ
New Mexico,35,NM
New York,36,NY
North Carolina,37,NC
North Dakota,38,ND
Ohio,39,OH
Oklahoma,40,OK
Oregon,41,OR
Pennsylvania,42,PA
Rhode Island,44,RI
South Carolina,45,SC
South Dakota,46,SD
Tennessee,47,TN
Texas,48,TX
Utah,49,UT
Vermont,50,VT
Virginia,51,VA
Washington,53,WA
West Virginia,54,WV
Wisconsin,55,WI
Wyoming,56,WY
setwd("~/Desktop/gitmo/playpen/r") # Your directory here
getwd()
## [1] "/Users/rc/Desktop/gitmo/playpen/r"
x <- c("dplyr", "ggmap", "ggplot2", "RColorBrewer", "rgdal", "classInt", "RCurl")
lapply(x, library, character.only = TRUE)
## [[1]]
## [1] "dplyr" "stats" "graphics" "grDevices" "utils" "datasets"
## [7] "methods" "base"
##
## [[2]]
## [1] "ggmap" "ggplot2" "dplyr" "stats" "graphics"
## [6] "grDevices" "utils" "datasets" "methods" "base"
##
## [[3]]
## [1] "ggmap" "ggplot2" "dplyr" "stats" "graphics"
## [6] "grDevices" "utils" "datasets" "methods" "base"
##
## [[4]]
## [1] "RColorBrewer" "ggmap" "ggplot2" "dplyr"
## [5] "stats" "graphics" "grDevices" "utils"
## [9] "datasets" "methods" "base"
##
## [[5]]
## [1] "rgdal" "sp" "RColorBrewer" "ggmap"
## [5] "ggplot2" "dplyr" "stats" "graphics"
## [9] "grDevices" "utils" "datasets" "methods"
## [13] "base"
##
## [[6]]
## [1] "classInt" "rgdal" "sp" "RColorBrewer"
## [5] "ggmap" "ggplot2" "dplyr" "stats"
## [9] "graphics" "grDevices" "utils" "datasets"
## [13] "methods" "base"
##
## [[7]]
## [1] "RCurl" "bitops" "classInt" "rgdal"
## [5] "sp" "RColorBrewer" "ggmap" "ggplot2"
## [9] "dplyr" "stats" "graphics" "grDevices"
## [13] "utils" "datasets" "methods" "base"
This could also be accomplished one-at-a-time4
require("dplyr")
require("ggmap")
require("ggplot2")
require("RColorBrewer")
require("rgdal")
require("classInt")
require("RCurl")
In a future installment, a technique to inset Alaska and Hawaii will be shown.
remove.farflung = function(.df) {
subset(.df,
.df$id != "AS" &
.df$id != "MP" &
.df$id != "GU" &
.df$id != "PR" &
.df$id != "VI" &
.df$id != "AK" &
.df$id != "HI"
)
}
save(remove.farflung, file = "helpers/remove.farflung")
intervals = function(.df, ...){
argList = match.call(expand.dots=FALSE)$...
for(i in 1:length(argList)){
colName <- argList[[i]]
series_colName = eval(substitute(colName), envir=.df, enclos=parent.frame())
min <- min(series_colName)
max <- max(series_colName)
diff <- max - min
std <- sd(series_colName)
equal.interval <- seq(min, max, by = diff/6)
quantile.interval <- quantile(series_colName, probs = seq(0, 1, by = 1/6))
std.interval <- c(seq(min, max, by = std), max)
natural.interval <- classIntervals(series_colName, n = 6, style = 'jenks')$brks
.df$equal <- cut(series_colName, breaks = equal.interval, include.lowest = TRUE)
names(.df)[names(.df)=="equal"] <- paste(colName,".","equal", sep = '')
.df$quantile <- cut(series_colName, breaks = quantile.interval, include.lowest = TRUE)
names(.df)[names(.df)=="quantile"] <- paste(colName,".","quantile", sep = '')
.df$std <- cut(series_colName, breaks = std.interval, include.lowest = TRUE)
names(.df)[names(.df)=="std"] <- paste(colName,".","std", sep = '')
.df$natural <- cut(series_colName, breaks = natural.interval, include.lowest = TRUE)
names(.df)[names(.df)=="natural"] <- paste(colName,".","natural", sep = '')
}
return(.df)
}
save(intervals, file = "helpers/intervals")
bucket <- function(x,y) trunc(x/y)*y
save(intervals, file = "helpers/bucket")
no_ylab = ylab("")
no_xlab = xlab("")
plain_theme = theme(axis.text=element_blank()) + theme(panel.background = element_blank(), panel.grid = element_blank(), axis.ticks = element_blank())
poly = coord_map("polyconic")
pop_source = "https://www.census.gov/popest/data/national/totals/2014/files/NST_EST2014_ALLDATA.csv"
population = read.csv(textConnection(getURL(pop_source)))[-(1:5),]
save(population, file = "helpers/population")
dsn <- "data/cb_2014_us_state_5m"
layer <- "cb_2014_us_state_5m"
cb5 = readOGR(dsn, layer)
## OGR data source with driver: ESRI Shapefile
## Source: "data/cb_2014_us_state_5m", layer: "cb_2014_us_state_5m"
## with 56 features
## It has 9 fields
save(cb5, file ="data/cb5")
summary(cb5)
## Object of class SpatialPolygonsDataFrame
## Coordinates:
## min max
## x -179.14734 179.77847
## y -14.55255 71.35256
## Is projected: FALSE
## proj4string :
## [+proj=longlat +datum=NAD83 +no_defs +ellps=GRS80 +towgs84=0,0,0]
## Data attributes:
## STATEFP STATENS AFFGEOID GEOID STUSPS
## 01 : 1 00068085: 1 0400000US01: 1 01 : 1 AK : 1
## 02 : 1 00294478: 1 0400000US02: 1 02 : 1 AL : 1
## 04 : 1 00448508: 1 0400000US04: 1 04 : 1 AR : 1
## 05 : 1 00481813: 1 0400000US05: 1 05 : 1 AS : 1
## 06 : 1 00606926: 1 0400000US06: 1 06 : 1 AZ : 1
## 08 : 1 00662849: 1 0400000US08: 1 08 : 1 CA : 1
## (Other):50 (Other) :50 (Other) :50 (Other):50 (Other):50
## NAME LSAD ALAND AWATER
## Alabama : 1 00:56 Min. :1.584e+08 Min. :1.863e+07
## Alaska : 1 1st Qu.:2.483e+10 1st Qu.:1.502e+09
## American Samoa: 1 Median :1.285e+11 Median :3.705e+09
## Arizona : 1 Mean :1.635e+11 Mean :1.249e+10
## Arkansas : 1 3rd Qu.:2.008e+11 3rd Qu.:8.945e+09
## California : 1 Max. :1.478e+12 Max. :2.455e+11
## (Other) :50
us = fortify(cb5, region = "STUSPS")
us48 = remove.farflung(us)
pop = data.frame(population$NAME,population$POPESTIMATE2014)
names(pop) = c("state", "pop")
converter = read.csv("csv/state_fips_postal.csv", header = FALSE)
converter = rename(converter, state = V1, fips = V2, id = V3)
pop = merge(pop, converter)
pop$state = NULL
pop$fips = NULL
pop48 = remove.farflung(pop)
pop48 = intervals(pop48, pop)
map48 = merge(us48,pop48) # has to be geo first
b = ggplot(data = map48, aes(x=long, y=lat, group = group))
d = b + geom_polygon(aes(x=long, y=lat, group=group), color = "dark grey", size = 0.3)
d
w = d + geom_polygon(aes(x=long, y=lat, group=group), color = "dark grey", size = 0.3, fill="white")
w
p = w + poly
p
my_title = "Population Map of 48 Contiguous States and District of Columbia,\ndefault settings"
sources <- "Source: U.S. Census Bureau 2014 Population Estimates http://1.usa.gov/1LPM7hc\n and Cartographic Boundary File (States, cb_2014_us_state_5m) http://1.usa.gov/1IZgFLV\nPrepared by: @technocrat 2015-07-15"
p = b + geom_polygon(color = "dark grey", size = 0.3, aes(fill = pop)) + ggtitle(my_title) + annotate("text", x = -100, y = 22, label = sources, size = 3, family = "Times", colour = "black")
p
my_title = "Population Map of 48 Contiguous States and District of Columbia,\n no background or axis legends, polyconic projection"
p + plain_theme + no_ylab + no_xlab + poly + ggtitle(my_title) + annotate("text", x = -100, y = 22, label = sources, size = 3, family = "Times", colour = "black") # blues, reversed
buckets = bucket(pop48$pop,5000000)
popb = data.frame(pop48$id,buckets)
names(popb) = c("id","buckets")
map48b = merge(us48,popb)
my_title = "Population Map of 48 Contiguous States and District of Columbia,\n manual discrete scale, default colors"
m = b + geom_polygon(color = "dark grey", size = 0.3, aes(fill=factor(map48b$buckets)))
m + plain_theme + no_xlab + no_ylab + poly + ggtitle(my_title) + annotate("text", x = -100, y = 22, label = sources, size = 3, family = "Times", colour = "black")
my_title = "Population Map of 48 Contiguous States and District of Columbia,\n manual discrete scale, graded neutral colors"
m + plain_theme + no_xlab + no_ylab + poly + ggtitle(my_title) + annotate("text", x = -100, y = 22, label = sources, size = 3, family = "Times", colour = "black") + scale_fill_brewer(type = "seq", palette = "YlOrBr")
my_title = "Population Map of 48 Contiguous States and District of Columbia,\nwith improved legend"
m + plain_theme + no_xlab + no_ylab + poly + ggtitle(my_title) + annotate("text", x = -100, y = 22, label = sources, size = 3, family = "Times", colour = "black") + scale_fill_brewer(type = "seq", palette = "YlOrBr",name = "State Population", labels=c("Under 5 million","5-10 million","10-15 million","15-20 million","20-25 million","more than 35 million"))
my_title = "Population Map of 48 Contiguous States and District of Columbia,\nwith equal interval scale"
e = b + geom_polygon(color = "dark grey", size = 0.3, aes(fill = pop.equal)) + ggtitle(my_title) + annotate("text", x = -100, y = 22, label = sources, size = 3, family = "Times", colour = "black") + plain_theme + no_xlab + no_ylab + poly + scale_fill_brewer(type = "seq", palette = "YlOrBr",name = "State Population", labels=c("Under 6.95 million","6.95-13.3 million","19.7-26.1 million","26.1-32.4 million","34.2-38.8 million"))
e
my_title = "Population Map of 48 Contiguous States and District of Columbia,\nwith quantile interval scale"
q = b + geom_polygon(aes(x=long, y=lat, group=group, fill=pop.quantile), color = "dark grey", size = 0.3)
q = q +
plain_theme + no_ylab + no_xlab + poly + scale_fill_brewer(palette = "YlOrBr", name = "State Population", labels=c("Under 1.33 million","1.33-2.94 million","2.94-4.65 million","4.65-6.55 million","6.55-9.94 million", "9.94-38.8 million")) + ggtitle(my_title)
q
my_title = "Population Map of 48 Contiguous States and District of Columbia,\nwith standard deviation scale"
s = b + geom_polygon(aes(x=long, y=lat, group=group, fill=pop.std), color = "dark grey", size = 0.3)
s = s +
plain_theme + no_ylab + no_xlab + poly + scale_fill_brewer(palette = "YlOrBr", name = "State Population", labels=c("Under 7.77 million","7.77-15 million","15-22.2 million","22.2-29.3 million","36.5-38.8 million")) + ggtitle(my_title)
s
my_title = "Population Map of 48 Contiguous States and District of Columbia,\nwith natural scale"
n = b + geom_polygon(aes(x=long, y=lat, group=group, fill=pop.natural), color = "dark grey", size = 0.3)
n = n +
plain_theme + no_ylab + no_xlab + poly + scale_fill_brewer(palette = "YlOrBr", name = "State Population", labels=c("Under 3.6 million","3.6-7.06 million","7.06-12.9 million","12.9-19.9 million","19.9-27 million","27-38.8 million")) + ggtitle(my_title)
n
sessionInfo()
## R version 3.2.1 (2015-06-18)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: OS X 10.10.4 (Yosemite)
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] maptools_0.8-36 mapproj_1.2-3 maps_2.3-10
## [4] RCurl_1.95-4.7 bitops_1.0-6 classInt_0.1-22
## [7] rgdal_1.0-4 sp_1.1-1 RColorBrewer_1.1-2
## [10] ggmap_2.4 ggplot2_1.0.1 dplyr_0.4.2
##
## loaded via a namespace (and not attached):
## [1] Rcpp_0.11.6 formatR_1.2 plyr_1.8.3
## [4] class_7.3-13 tools_3.2.1 digest_0.6.8
## [7] evaluate_0.7 gtable_0.1.2 lattice_0.20-33
## [10] png_0.1-7 DBI_0.3.1 yaml_2.1.13
## [13] parallel_3.2.1 proto_0.3-10 e1071_1.6-4
## [16] stringr_1.0.0 knitr_1.10.5 rgeos_0.3-11
## [19] RgoogleMaps_1.2.0.7 grid_3.2.1 R6_2.1.0
## [22] jpeg_0.1-8 foreign_0.8-65 rmarkdown_0.7
## [25] RJSONIO_1.3-0 reshape2_1.4.1 magrittr_1.5
## [28] scales_0.2.5 htmltools_0.2.6 MASS_7.3-42
## [31] assertthat_0.1 colorspace_1.2-6 geosphere_1.4-3
## [34] labeling_0.3 stringi_0.5-5 lazyeval_0.1.10
## [37] munsell_0.4.2 rjson_0.2.15
The next installment, to be announced by @technocrat on twitter, will deal with insets (for Alaska and Hawaii) and outsets (for Delaware, the District of Columbia and Rhode Island) plus labeling of states. A future installment will discuss selection of data transforms, color and workflow recommendations for journalists preparing these types of thematic maps.
I am unable to answer questions relating to the use of this code under Windows.
Copyright (c) 2015, Richard Careaga
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
This could be done on demand within R; however, the data does not change frequently (annual editions). There are three versions of the cartographic boundary file. cb_2014_us_state_5m.zip is intermediate in resolution; cb_2014_us_state_500K is unsatisfactorily low-resolution; cb_2014_us_state_20m.zip is unnecessarily high-resolution for displaying thematic data.↩
These are the state name, the Census FIPS code and the postal abbreviation for each state. We will use these to match up how various data sources are indexed with the corresponding state codes in the base map data object.↩
Setting the working directory will save keystrokes and help ensure files are being accessed and saved to the right locations.↩
The advantage of creating an object, x, by using the c function to combine the names of the libraries, is that it is easily saved together with the following line as a snippet and keeps all the required libraries in a single unit. The second line calls the lapply function to apply the library function to each element of the object x. Later, we’ll combine this and other timesavers into a standard script.↩
The advantages of saving objects this way are minimizing cut-and-paste or transcription errors and reducing effort to locate the object.↩
Too many thematic maps simply show data that is distributed substantially identically to population.↩
This requires looking at the population object, with head(population) and picking the field with the data of interest. This may, of course, require going back to the source for documentation of the field names.↩
Everyone has their own way of naming; give consideration to using names that are easy to associate with content.↩
If you try to plot this object, by entering its name, b, you will get an error message, Error: No layers in plot because there is nothing to indicate what type of plot it should be.↩
See R-Cookbook for a rundown of ggplot2 color.↩
A map projection is to compensate for the distortion of displaying a portion of a sphere on a flat surface.↩
The continuous scale is difficult to read. Converting it to a discrete scale requires deciding on the appropriate intervals into which to divide the data, in this case population. Example, states under 1 million, 1-5 million, 5-10 million, 10-20 million, 20-30 million, 30 million and up. More than 9 categories are difficult to display effectively; fewer than 6 categories may conceal important differences.↩
A quantile interval scale divides the data into categories based on equal percentages of states within each category.↩
A standard deviation scale divides the data according to how many standard deviations each state is from the mean population of all states.↩
A natural scale divides the data according to how the states cluster near each other.↩