County-level analysis by individual states. The framework uses spatial polygons for organizing the data.
Decide on domain.
FP1 = 55; AB1 = "WI"
FP2 = 19; AB2 = "IA"
FP3 = 17; AB3 = "IL"
FP4 = 27; AB4 = "MN"
FP5 = 38; AB5 = "ND"
FP6 = 46; AB6 = "SD"
FP7 = 31; AB7 = "NE"
FP8 = 20; AB8 = "KS"
FP9 = 29; AB9 = "MO"
FP10 = 40; AB10 = "OK"
FP11 = 18; AB11 = "IN"
FP12 = 26; AB12 = "MI"
FP13 = 5; AB13 = "AR"
FP14 = 8; AB14 = "CO"
FP15 = 35; AB15 = "NM"
FP16 = 48; AB16 = "TX"
FP17 = 22; AB17 = "LA"
FP18 = 28; AB18 = "MS"
FP19 = 1; AB19 = "AL"
FP20 = 47; AB20 = "TN"
FP21 = 21; AB21 = "KY"
FP22 = 39; AB22 = "OH"
FP23 = 13; AB23 = "GA"
FP24 = 56; AB24 = "WY"
FP = c(FP1, FP2, FP3, FP4, FP5, FP6, FP7, FP8, FP9, FP10, FP11,
FP12, FP13, FP14, FP15, FP16, FP17, FP18, FP19, FP20, FP21, FP22, FP23, FP24)
AB = c(AB1, AB2, AB3, AB4, AB5, AB6, AB7, AB8, AB9, AB10, AB11,
AB12, AB13, AB14, AB15, AB16, AB17, AB18, AB19, AB20, AB21, AB22, AB23, AB24)
The model is built using data from the SPC tornado dataset. Data are stored as a spatial lines data frame. Transform the native coordinates to a Mercator projection.
#setwd("~/Dropbox/Tyler")
if(!file.exists("torn")){
download.file(url = "http://www.spc.noaa.gov/gis/svrgis/zipped/tornado.zip",
destfile = "tornado.zip")
unzip("tornado.zip")
}
library("rgdal")
## Loading required package: sp
## rgdal: version: 1.0-4, (SVN revision 548)
## Geospatial Data Abstraction Library extensions to R successfully loaded
## Loaded GDAL runtime: GDAL 1.11.2, released 2015/02/10
## Path to GDAL shared files: /Library/Frameworks/R.framework/Versions/3.2/Resources/library/rgdal/gdal
## Loaded PROJ.4 runtime: Rel. 4.9.1, 04 March 2015, [PJ_VERSION: 491]
## Path to PROJ.4 shared files: /Library/Frameworks/R.framework/Versions/3.2/Resources/library/rgdal/proj
## Linking to sp version: 1.1-1
TornL = readOGR(dsn = "torn", layer = "torn",
stringsAsFactors = FALSE)
## OGR data source with driver: ESRI Shapefile
## Source: "torn", layer: "torn"
## with 58959 features
## It has 21 fields
TornL = spTransform(TornL, CRS("+init=epsg:3857")) #Mercator projection
Boundaries are from https://www.census.gov/geo/maps-data/data/cbf/cbf_counties.html. The 20m = 1:20,000,000 low resolution boundaries. Here we use the 5m = 1:5,000,000.
if(!file.exists("cb_2014_us_county_5m.shp")){
download.file("http://www2.census.gov/geo/tiger/GENZ2014/shp/cb_2014_us_county_5m.zip",
"cb_2014_us_county_5m.zip", mode = "wb")
unzip("cb_2014_us_county_5m.zip")
}
US.sp = readOGR(dsn = ".", layer = "cb_2014_us_county_5m",
stringsAsFactors = FALSE)
## OGR data source with driver: ESRI Shapefile
## Source: ".", layer: "cb_2014_us_county_5m"
## with 3233 features
## It has 9 fields
## Warning in readOGR(dsn = ".", layer = "cb_2014_us_county_5m",
## stringsAsFactors = FALSE): Z-dimension discarded
The warning message about the Z-dimension can be safely ignored.
Extract the state using the state Federal Information Processing Standard (FIPS). Print out the number of counties and the area of state in square km. Check with Wolfram alpha.
library("rgeos")
## rgeos version: 0.3-11, (SVN revision 479)
## GEOS runtime version: 3.4.2-CAPI-1.8.2 r3921
## Linking to sp version: 1.1-0
## Polygon checking: TRUE
ST.sp = US.sp[as.integer(US.sp$STATEFP) %in% FP, ]
nrow(ST.sp) #number of counties
## [1] 2168
county = paste(ST.sp$STATEFP, ST.sp$COUNTYFP, sep = "")
countiesun.sp = geometry(spChFIDs(ST.sp, county))
counties.sp = spTransform(countiesun.sp, CRS(proj4string(TornL)))
gArea(counties.sp) / 10^6 #total area of the state in square km
## [1] 7593453
Use the population estimates from the Census Bureau from 1970-2012 see http://www.nber.org/data/census-intercensal-county-population.html. Use the 2012 value for 2013 & 2014. Census U.S. Intercensal County Population Data, 1970-2014 from http://www.nber.org/data/census-intercensal-county-population.html.
library("reshape2")
library("dplyr")
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:rgeos':
##
## intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
L = "http://myweb.fsu.edu/jelsner/data/county_population.csv"
pop = read.csv(L) %>%
filter(state_fips %in% FP, county_fips != 0) %>%
mutate(pop2013 = pop2012,
pop2014 = pop2012,
county = as.character(fips)) %>%
select(county, pop1970:pop2012, pop2013, pop2014)
Pop.df = melt(pop, id.vars = "county") %>%
mutate(yr = as.numeric(substring(variable, first = 4, last = 7))) %>%
rename(pop = value) %>%
mutate(lpop = log10(pop)) %>%
select(county, pop, yr, lpop)
Pop.df$ID = match(Pop.df$county, as.integer(county)) #generate spatial ID
Pop.df = Pop.df[order(Pop.df$ID), ] #order by spatial ID
The county population data are available through 2014 but the raw file requires some manual manipulation.
PC.df = Pop.df %>%
group_by(ID) %>%
summarize(Change = (pop[yr == max(yr)] - pop[yr == min(yr)])/pop[yr == max(yr)] * 100) %>%
data.frame()
row.names(PC.df) = county
spdf = SpatialPolygonsDataFrame(counties.sp, PC.df)
spdf$Name = ST.sp$NAME
spdf$area = rgeos::gArea(counties.sp, byid = TRUE)
Latest population values by county are needed for correlation and choropleth. The log of the population density is the common log.
poplatest = subset(Pop.df, yr == max(yr))
spdf$pop = poplatest$pop
spdf$Lpop = log10(spdf$pop)
spdf$density = spdf$pop/spdf$area * 10^6
spdf$Ldensity = log10(spdf$density)
Add a buffer to the tracks based on the width specified in the attribute table to create a spatial polygons data frame of tornado paths. Overlay paths onto the extent and return a vector indicating either NA (is not inside extent) or the county number. Remove duplicate paths. Start with 1970 for a 45-year climatology. capStyle is set to “FLAT” so the track is not made longer.
syr = 1970
eyr = 2014
Width = TornL$wid * .9144
sum(Width[Width == 0])
## [1] 0
TornP = gBuffer(TornL, byid = TRUE, width = Width/2, capStyle = "FLAT")
tc = over(TornP, counties.sp)
TornP2 = subset(TornP, !is.na(tc))
TornP2 = subset(TornP2, yr >= syr)
df = data.frame(SLAT = TornP2$slat,
SLON = TornP2$slon,
ELAT = TornP2$elat,
ELON = TornP2$elon,
DATE = TornP2$date)
dup = duplicated(df)
sum(dup)
## [1] 405
sum(dup)/dim(TornP2@data)[1] * 100
## [1] 1.055897
TornP3 = subset(TornP2, !dup)
dim(TornP3@data)
## [1] 37951 21
All EF0+ tornadoes since 1970. Overlay tornado paths on counties. Each element of the list is a county subset of the tornado track file. The data frame nTor.df contains the per-county tornado count, area, and rate.
The PathsAll.df data frame contains the tornado path attributes listed by county. Information is repeated for each county in cases where the path intersects more than one county. Padding is needed for counties without tornadoes.
TornP3 = subset(TornP3, mag >= 0) #EF0+
ct = over(counties.sp, TornP3, returnList = TRUE)
area = gArea(counties.sp, byid = TRUE)
names(ct) = county
nT = sapply(ct, function(x) nrow(x))
nTd = sapply(ct, function(x) length(unique(x$date)))
Nyears = diff(range(TornP3$yr)) + 1
nTor.df = data.frame(county = county,
Nyears = Nyears,
nT = nT,
area = area/1000000,
nTd = nTd,
rate = nT/area * 10^6 * 100 * 100 / Nyears,
stringsAsFactors = FALSE, ID = 1:length(county)
)
spdf$county = county
spdf$Nyears = nTor.df$Nyears
spdf$nT = nTor.df$nT
spdf$nTd = nTor.df$nTd
spdf$area = nTor.df$area
spdf$rate = nTor.df$rate
PathsAll.df = plyr::ldply(ct, data.frame)
colnames(PathsAll.df)[1] = "county"
none = nTor.df$county[nTor.df$nT == 0] # for counties without tornadoes
if(length(none) > 0){
tmp = PathsAll.df[1:length(none), ]
tmp[!is.na(tmp)] = NA
tmp$county = none
PathsAll.df = rbind(PathsAll.df, tmp)
}
Area is now given in square km. Rate is annual number of tornadoes per sq 100 km. Choropleth map of tornado counts & tornado days.
library("maps")
##
## # ATTENTION: maps v3.0 has an updated 'world' map. #
## # Many country borders and names have changed since 1990. #
## # Type '?world' or 'news(package="maps")'. See README_v3. #
library("maptools")
## Checking rgeos availability: TRUE
##
## Attaching package: 'maptools'
## The following object is masked from 'package:sp':
##
## nowrapSpatialLines
states = map('state', fill = TRUE,
region = c("wisconsin", "iowa", "illinois", "minnesota",
"north dakota", "south dakota", "nebraska",
"kansas", "missouri", "oklahoma", "arkansas",
"michigan", "colorado", "new mexico", "texas",
"indiana", "louisiana", "mississippi", "alabama",
"tennessee", "kentucky", "georgia", "ohio", "wyoming"),
plot = FALSE)
IDs = sapply(strsplit(states$names, ":"), function(x) x[1])
ll = "+proj=longlat +datum=WGS84"
states.sl = map2SpatialLines(states, IDs = IDs, proj4string = CRS(ll))
states.sl = spTransform(states.sl, CRS(proj4string(TornL)))
library("wesanderson")
range(spdf$nT)
## [1] 0 217
rng = seq(0, 240, 40)
crq = wes_palette(6, name = "Zissou", type = "continuous")
spplot(spdf, "nT", col = "white", at = rng,
sp.layout = list("sp.lines", states.sl),
col.regions = crq,
colorkey = list(space = "bottom", labels = paste(rng)),
par.settings = list(axis.line = list(col = NA)),
sub = "Number of EF0+ Tornadoes (1970-2014)")
library("RColorBrewer")
range(spdf$nTd)
## [1] 0 135
rng = seq(0, 140, 20)
crq = brewer.pal(7, "GnBu")
spplot(spdf, "nTd", col = "white", at = rng,
sp.layout = list("sp.lines", states.sl),
col.regions = crq,
colorkey = list(space = "bottom", labels = paste(rng)),
par.settings = list(axis.line = list(col = NA)),
sub = "Number of EF0+ Tornado Days (1970-2014)")
The data frame nTor.year is the number of tornadoes affecting the region by year.
nTor.year = PathsAll.df %>%
group_by(yr) %>%
summarize(nT = n()) %>%
data.frame()
nTor.year$yr2 = nTor.year$yr
sum(nTor.year$nT)
## [1] 43598
The data frame nTor.day0 contains the tornado county and tornado day county by year and county.
nTor.day0 = PathsAll.df %>%
group_by(county, yr) %>%
summarize(nT = length(county),
nTd = length(unique(date)))
nTor.day0$county = as.factor(nTor.day0$county)
nTor.day0$yr = as.factor(nTor.day0$yr)
nTor.day0 = left_join(expand.grid(county = levels(as.factor(PathsAll.df$county)),
yr = levels(as.factor(PathsAll.df$yr))),
nTor.day0)
## Joining by: c("county", "yr")
nTor.day0[is.na(nTor.day0)] = 0
nTor.day0$county = as.character(nTor.day0$county)
nTor.day0$yr = as.integer(as.character(nTor.day0$yr))
nTor.day = base::merge(nTor.df[c("county", "area", "ID")],
nTor.day0) %>%
arrange(ID, yr)
Merge tornado and population data.
nTor.day$county = as.integer(nTor.day$county)
popNtor = base::merge(nTor.day, Pop.df, all = FALSE) %>%
mutate(density = pop/area,
Ldensity = log2(density),
yr2 = yr) %>% #needed for the INLA models
arrange(ID, yr)
Some global controls for INLA. Use as needed.
#source("http://www.math.ntnu.no/inla/givemeINLA.R")
library("INLA")
## Loading required package: Matrix
## Loading required package: splines
control = list(
predictor = list(compute = TRUE, link = 1),
inla = list(strategy = "laplace",
fast = FALSE,
stencil = 7,
npoints = 198,
int.strategy = "grid",
dz = .5),
results = list(return.marginals.random = TRUE),
compute = list(config = TRUE, mlik = TRUE, cpo = TRUE, dic = TRUE, po = TRUE),
family = list(variant = 1, hyper = list(theta = list(prior = "loggamma", param = c(1, 1)))))
Spatial neighborhood definition as an inla graph
library("spdep")
nb = poly2nb(spdf)
nb2INLA("tornb.inla", nb)
tornb.inla = inla.read.graph("tornb.inla")
The area multiplied by Nyears indicates the square-meter years exposed to tornadoes (E). Scale the exposure to have a mean close to unity. The model has a spatial id. E is scaled to have a mean of one. constr = constant rate. The DIC measures how well the model fits the data; the larger this is, the worse the fit. Year is centered on 1991.
To use the model for prediction, pad the data set by one year and put NAs for the number of tornadoes that year. Use latest population for estimate population. This gives the expected observed frequencies for any year, not the expected total frequencies, which is had only by estimating a larger population.
popNtor$ID2 = popNtor$ID
tmp = subset(popNtor, yr == 1991)
tmp$Ldensity = max(popNtor$Ldensity)
tmp$yr = 2015
tmp$yr2 = tmp$yr
tmp$nT = NA
popNtor2 = rbind(tmp, popNtor)
Model with spatial term. Determine the model that results in the lowest DIC.
mean(popNtor2$area)
## [1] 3502.515
formula = nT ~ f(ID, model = "besag", graph = tornb.inla, constr = TRUE) +
Ldensity +
Ldensity:I(yr - 1991) + I(yr - 1991)
model = inla(formula = formula, family = "nbinomial", E = area/3500,
data = popNtor2,
quantiles = c(.05, .5, .95),
control.compute = control$compute,
control.predictor = control$predictor,
control.results = control$results,
control.family = control$family
)
df = model$summary.fitted.values[1:nrow(tmp), ] * 100 * 100 / 3500 #per sq 100 km
#df = model$summary.fitted.values[1:nrow(tmp), ] * 100 * 100 / 2000 / .386102 #per sq 100 miles
row.names(df) = county
The predicted values give the annual expected rate for each county. The rate is the number of tornadoes per 100km by 100km square region per year.
Model diagnostics. The predictive quality of the model is assessed by the cross-validated log score. A smaller value of the score indicates better prediction quality (Gneiting and Raftery 2007).
cpopit = data.frame(model$cpo,model$summary.fitted.values)[!is.na(model$cpo$pit), ]
modPIT = cpopit$pit - runif(nrow(cpopit)) * cpopit$cpo
goftest::ad.test(modPIT)
##
## Anderson-Darling test of goodness-of-fit
## Null hypothesis: uniform distribution
##
## data: modPIT
## An = 1.5482, p-value = 0.1654
-mean(log(cpopit$cpo)) #equivalent to a predictive mean square error (lower is better).
## [1] 0.8296817
cor.test(popNtor$nT, cpopit$mean, conf.level = .9)
##
## Pearson's product-moment correlation
##
## data: popNtor$nT and cpopit$mean
## t = 77.111, df = 97558, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 90 percent confidence interval:
## 0.2347129 0.2446402
## sample estimates:
## cor
## 0.2396828
#plot(popNtor$nT + runif(n = length(popNtor$nT)), cpofit$mean)
Brier score
brier.score <- function(x, m){
with(m, {mean(x^2) - 2 * mean(x * mean) + mean(mean^2 + sd^2)})
}
brier.score(popNtor[["nT"]], cpopit) # (lower is better)
## [1] 0.9810801
library("ggplot2")
ggplot() +
geom_histogram(aes(x = modPIT), color = "white",
fill = "grey", binwidth = .05) +
xlab("Probability") +
ylab("Frequency") +
ggtitle("Distribution of modified PIT") #looks good! Model fits the data well over/under fit & bias not a problem
spdfR = spCbind(spdf, df)
range(log2(spdfR$mean))
## [1] -4.501836 2.070215
rng = seq(-5, 3, 1)
crq = brewer.pal(8, "Oranges")
as.character(round(2^rng, 2))
## [1] "0.03" "0.06" "0.12" "0.25" "0.5" "1" "2" "4" "8"
labs = c("0.03", "0.06", "0.12", "0.25", "0.5", "1", "2", "4", "8")
centroids = SpatialPointsDataFrame(gCentroid(spdfR, byid = TRUE), data.frame(spdfR))
cxy = coordinates(centroids)
spdfR$mean2 = log2(spdfR$mean)
spplot(spdfR, "mean2", col = "transparent", at = rng,
sp.layout = list("sp.lines", states.sl),
col.regions = crq,
colorkey = list(space = "bottom", labels = labs),
par.settings = list(axis.line = list(col = NA)),
sub = expression("Expected Annual Tornado (EF0+) Occurrence Rate [per 100 km square region]"))
range(spdfR$sd)
rngs = seq(0, 1, .2)
crqs = brewer.pal(5, "Greens")
spplot(spdfR, "sd", col = "transparent", at = rngs,
sp.layout = list("sp.lines", states.sl),
col.regions = crqs,
colorkey = list(space = "bottom", labels = paste(rngs)),
par.settings = list(axis.line = list(col = NA)),
sub = expression("Standard Error"))
Write out the shapefiles.
writeOGR(spdfR, dsn = "All", layer = "Rates", overwrite_layer = TRUE,
driver = "ESRI Shapefile")