Long-Term View of Tornado Risk: County-Level Tornado Rates Adjusted for Population & Exposure

James B. Elsner, Thomas H. Jagger, and Tyler Fricker

County-level analysis by individual states. The framework uses spatial polygons for organizing the data.

Decide on a spatial domain.

FP1 = 55; AB1 = "WI"
FP2 = 19; AB2 = "IA"
FP3 = 17; AB3 = "IL"
FP4 = 27; AB4 = "MN"
FP5 = 38; AB5 = "ND"
FP6 = 46; AB6 = "SD"
FP7 = 31; AB7 = "NE"
FP8 = 20; AB8 = "KS"
FP9 = 29; AB9 = "MO"
FP10 = 40; AB10 = "OK"
FP11 = 18; AB11 = "IN"
FP12 = 26; AB12 = "MI"
FP13 = 5; AB13 = "AR"
FP14 = 8; AB14 = "CO"
FP15 = 35; AB15 = "NM"
FP16 = 48; AB16 = "TX"
FP17 = 22; AB17 = "LA"
FP18 = 28; AB18 = "MS"
FP19 = 1; AB19 = "AL"
FP20 = 47; AB20 = "TN"
FP21 = 21; AB21 = "KY"
FP22 = 39; AB22 = "OH"
FP23 = 13; AB23 = "GA"
FP24 = 56; AB24 = "WY"
FP = c(FP1, FP2, FP3, FP4, FP5, FP6, FP7, FP8, FP9, FP10, FP11, 
       FP12, FP13, FP14, FP15, FP16, FP17, FP18, FP19, FP20, FP21, FP22, FP23, FP24)
AB = c(AB1, AB2, AB3, AB4, AB5, AB6, AB7, AB8, AB9, AB10, AB11, 
       AB12, AB13, AB14, AB15, AB16, AB17, AB18, AB19, AB20, AB21, AB22, AB23, AB24)

Get tornado data

The model is built using data from the SPC tornado dataset. Data are stored as a spatial lines data frame. Transform the native coordinates to a Mercator projection.

if(!file.exists("torn")){ 
  download.file(url = "http://www.spc.noaa.gov/gis/svrgis/zipped/tornado.zip",
              destfile = "tornado.zip")
  unzip("tornado.zip")
  }
library("rgdal")

## Loading required package: sp

## rgdal: version: 1.0-4, (SVN revision 548)
##  Geospatial Data Abstraction Library extensions to R successfully loaded
##  Loaded GDAL runtime: GDAL 1.11.2, released 2015/02/10
##  Path to GDAL shared files: /Library/Frameworks/R.framework/Versions/3.2/Resources/library/rgdal/gdal
##  Loaded PROJ.4 runtime: Rel. 4.9.1, 04 March 2015, [PJ_VERSION: 491]
##  Path to PROJ.4 shared files: /Library/Frameworks/R.framework/Versions/3.2/Resources/library/rgdal/proj
##  Linking to sp version: 1.1-1

TornL = readOGR(dsn = "torn", layer = "torn", 
                stringsAsFactors = FALSE)

## OGR data source with driver: ESRI Shapefile 
## Source: "torn", layer: "torn"
## with 60114 features
## It has 22 fields

TornL = spTransform(TornL, CRS("+init=epsg:3857")) #Mercator projection

Get County Boundaries

Boundaries are from https://www.census.gov/geo/maps-data/data/cbf/cbf_counties.html. The 20m = 1:20,000,000 low resolution boundaries. Here we use the 5m = 1:5,000,000.

if(!file.exists("cb_2014_us_county_5m.shp")){
  download.file("http://www2.census.gov/geo/tiger/GENZ2014/shp/cb_2014_us_county_5m.zip",
               "cb_2014_us_county_5m.zip", mode = "wb")
  unzip("cb_2014_us_county_5m.zip")
  }
US.sp = readOGR(dsn = ".", layer = "cb_2014_us_county_5m", 
                                    stringsAsFactors = FALSE)

## OGR data source with driver: ESRI Shapefile 
## Source: ".", layer: "cb_2014_us_county_5m"
## with 3233 features
## It has 9 fields

## Warning in readOGR(dsn = ".", layer = "cb_2014_us_county_5m",
## stringsAsFactors = FALSE): Z-dimension discarded

The warning message about the Z-dimension can be safely ignored.

Extract the state using the state Federal Information Processing Standard (FIPS). Print out the number of counties and the area of state in square km. Check with Wolfram alpha.

library("rgeos")

## rgeos version: 0.3-11, (SVN revision 479)
##  GEOS runtime version: 3.4.2-CAPI-1.8.2 r3921 
##  Linking to sp version: 1.1-0 
##  Polygon checking: TRUE

ST.sp = US.sp[as.integer(US.sp$STATEFP) %in% FP, ]
nrow(ST.sp) #number of counties

## [1] 2168

county = paste(ST.sp$STATEFP, ST.sp$COUNTYFP, sep = "")
countiesun.sp = geometry(spChFIDs(ST.sp, county)) 
counties.sp = spTransform(countiesun.sp, CRS(proj4string(TornL)))
gArea(counties.sp) / 10^6 #total area of the spatial domain in square km

## [1] 7593453

Organize county-level population data

Use the population estimates from the Census Bureau from 1970-2012 see http://www.nber.org/data/census-intercensal-county-population.html. Use the 2012 value for 2013 & 2014. Census U.S. Intercensal County Population Data, 1970-2014 from http://www.nber.org/data/census-intercensal-county-population.html.

library("reshape2")
library("dplyr")

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:rgeos':
## 
##     intersect, setdiff, union

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

L = "http://myweb.fsu.edu/jelsner/data/county_population.csv"
pop = read.csv(L) %>%
  filter(state_fips %in% FP, county_fips != 0) %>%
  mutate(pop2013 = pop2012,
         pop2014 = pop2012,
         pop2015 = pop2012,
         county = as.character(fips)) %>%
  select(county, pop1970:pop2012, pop2013, pop2014, pop2015)

Pop.df = melt(pop, id.vars = "county") %>%
  mutate(yr = as.numeric(substring(variable, first = 4, last = 7))) %>%
  rename(pop = value) %>%
  mutate(lpop = log10(pop)) %>%
  select(county, pop, yr, lpop)

Pop.df$ID = match(Pop.df$county, as.integer(county)) #generate spatial ID
Pop.df = Pop.df[order(Pop.df$ID), ] #order by spatial ID

The county population data are available through 2014 but the raw file requires some manual manipulation.

PC.df = Pop.df %>%
  group_by(ID) %>%
  summarize(Change = (pop[yr == max(yr)] - pop[yr == min(yr)])/pop[yr == max(yr)] * 100) %>%
  data.frame() 

row.names(PC.df) = county
spdf = SpatialPolygonsDataFrame(counties.sp, PC.df)
spdf$Name = ST.sp$NAME
spdf$area = rgeos::gArea(counties.sp, byid = TRUE)

Latest population values by county are needed for correlation and choropleth. The log of the population density is the common log.

poplatest = subset(Pop.df, yr == max(yr))
spdf$pop = poplatest$pop
spdf$Lpop = log10(spdf$pop)
spdf$density = spdf$pop/spdf$area * 10^6
spdf$Ldensity = log10(spdf$density)

Buffer tornado tracks to make tornado paths

Add a buffer to the tracks based on the width specified in the attribute table to create a spatial polygons data frame of tornado paths. Overlay paths onto the extent and return a vector indicating either NA (is not inside extent) or the county number. Remove duplicate paths. Start with 1970 for a 45-year climatology. capStyle is set to “FLAT” so the track is not made longer.

syr = 1970
eyr = 2015
Width = TornL$wid * .9144
sum(Width[Width == 0])

## [1] 0

TornP = gBuffer(TornL, byid = TRUE, width = Width/2, capStyle = "FLAT")
tc = over(TornP, counties.sp)
TornP2 = subset(TornP, !is.na(tc))
TornP2 = subset(TornP2, yr >= syr, yr <= eyr)

df = data.frame(SLAT = TornP2$slat, 
                SLON = TornP2$slon, 
                ELAT = TornP2$elat, 
                ELON = TornP2$elon,
                DATE = TornP2$date)
dup = duplicated(df)
sum(dup)

## [1] 405

sum(dup)/dim(TornP2@data)[1] * 100

## [1] 1.027397

TornP3 = subset(TornP2, !dup)
dim(TornP3@data)

## [1] 39015    22

All EF0+ tornadoes since 1970. Overlay tornado paths on counties. Each element of the list is a county subset of the tornado track file. The data frame nTor.df contains the per-county tornado count, area, and rate.

df = as.data.frame(TornP3) %>%
  group_by(yr) %>%
  summarize(nT = n())

library("ggplot2")
library("ggrepel")
ggplot(df, aes(x = yr, y = nT)) + 
  geom_point() +
  geom_smooth() +
#  geom_text(aes(label = nT), vjust = -.5, size = 4, color = "darkblue") +
  geom_text_repel(aes(label = nT), color = "darkblue") +
  xlab("Year") + ylab("Annual Number of EF0+ Tornadoes") +
  scale_y_continuous(limits = c(0, 1600)) +
#  scale_y_continuous(limits = c(0, 500)) +
  theme(axis.text.x  = element_text(size = 7), 
        legend.position = "none")

Figure: Time series tornado counts over the domain

The PathsAll.df data frame contains the tornado path attributes listed by county. Information is repeated for each county in cases where the path intersects more than one county. Padding is needed for counties without tornadoes.

TornP3 = subset(TornP3, mag >= 0) #EF0+
ct = sp::over(counties.sp, TornP3, returnList = TRUE)
area = gArea(counties.sp, byid = TRUE)
names(ct) = county
nT = sapply(ct, function(x) nrow(x))
nTd = sapply(ct, function(x) length(unique(x$date)))
Nyears = diff(range(TornP3$yr)) + 1
nTor.df = data.frame(county = county, 
                     Nyears =  Nyears,
                     nT = nT, 
                     area = area/1000000, 
                     nTd = nTd,
                     rate = nT/area * 10^6 * 100 * 100 / Nyears,
                     stringsAsFactors = FALSE, ID = 1:length(county)
                     )
spdf$county = county
spdf$Nyears = nTor.df$Nyears
spdf$nT = nTor.df$nT
spdf$nTd = nTor.df$nTd
spdf$area = nTor.df$area
spdf$rate = nTor.df$rate
PathsAll.df = plyr::ldply(ct, data.frame)
colnames(PathsAll.df)[1] = "county"

none = nTor.df$county[nTor.df$nT == 0] # for counties without tornadoes
if(length(none) > 0){
  tmp = PathsAll.df[1:length(none), ]
  tmp[!is.na(tmp)] = NA
  tmp$county = none
  PathsAll.df = rbind(PathsAll.df, tmp)
}
length(none)/dim(pop)[1] * 100

## [1] 0.8763838

Area is now given in square km. Rate is annual number of tornadoes per sq 100 km. Choropleth map of tornado counts & tornado days. Less than 1% of all counties have not experienced a tornado.

library("maps")

## 
##  # ATTENTION: maps v3.0 has an updated 'world' map.        #
##  # Many country borders and names have changed since 1990. #
##  # Type '?world' or 'news(package="maps")'. See README_v3. #

library("maptools")

## Checking rgeos availability: TRUE

states = map('state', fill = TRUE, 
             region = c("wisconsin", "iowa", "illinois", "minnesota", 
                        "north dakota", "south dakota", "nebraska", 
                        "kansas", "missouri", "oklahoma", "arkansas",
                        "michigan", "colorado", "new mexico", "texas",
                        "indiana", "louisiana", "mississippi", "alabama",
                        "tennessee", "kentucky", "georgia", "ohio", "wyoming"), 
             plot = FALSE)
IDs = sapply(strsplit(states$names, ":"), function(x) x[1])
ll = "+proj=longlat +datum=WGS84"
states.sl = map2SpatialLines(states, IDs = IDs, proj4string = CRS(ll))
states.sl = spTransform(states.sl, CRS(proj4string(TornL)))

library("wesanderson")
range(spdf$nT)

## [1]   0 219

rng = seq(0, 240, 40)
crq = wes_palette(6, name = "Zissou", type = "continuous")
spplot(spdf, "nT", col = "transparent", at = rng, 
       sp.layout = list("sp.lines", states.sl),
       col.regions = crq,
       colorkey = list(space = "bottom", labels = paste(rng)),
       par.settings = list(axis.line = list(col = NA)),
       sub = "Number of EF0+ Tornadoes (1970-2015)")

df = as.data.frame(spdf) %>%
  arrange(desc(rate))
df[1:10, ]

##      ID   Change       Name      area     pop     Lpop   density Ldensity
## 1  1324 43.48717  Galveston 1484.2562  300484 5.477821 202.44754 2.306312
## 2   712 50.82998  Lafayette  940.5175  227055 5.356131 241.41497 2.382764
## 3  1996       NA Broomfield  146.9212   58298 4.765654 396.79784 2.598569
## 4   861 59.04946     Harris 6131.8439 4253700 6.628767 693.70651 2.841176
## 5  1751 15.83376     Acadia 2288.5315   61912 4.791775  27.05316 1.432218
## 6  1654 18.85442     Denver  683.4865  634265 5.802271 927.98468 2.967541
## 7    24 70.17160    Johnson 2678.8557  153441 5.185941  57.27856 1.757992
## 8  1998 59.57576      Adams 5213.0624  459598 5.662378  88.16277 1.945285
## 9   385 28.85811   Oklahoma 2819.1453  741781 5.870276 263.12266 2.420158
## 10  933 69.19153  Cleveland 2165.7340  265638 5.424290 122.65495 2.088685
##    county Nyears  nT nTd      rate
## 1   48167     46  74  66 10.838396
## 2   22055     46  33  28  7.627623
## 3   08014     46   5   4  7.398230
## 4   48201     46 192 124  6.806946
## 5   22001     46  71  49  6.744405
## 6   08031     46  21  20  6.679309
## 7   48251     46  75  42  6.086310
## 8   08001     46 141 102  5.879879
## 9   40109     46  75  47  5.783437
## 10  40027     46  57  42  5.721526

library("RColorBrewer")
range(spdf$nTd)

## [1]   0 137

rng = seq(0, 140, 20)
crq = brewer.pal(7, "GnBu")
spplot(spdf, "nTd", col = "transparent", at = rng, 
       sp.layout = list("sp.lines", states.sl),
       col.regions = crq,
       colorkey = list(space = "bottom", labels = paste(rng)),
       par.settings = list(axis.line = list(col = NA)),
       sub = "Number of EF0+ Tornado Days (1970-2015)")

range(log2(spdf$pop))

## [1]  6.149747 22.318752

rng = seq(6, 24, 3)
cr = brewer.pal(6, "Blues")
labs = as.character(round(2^rng))
spdf$pop2 = log2(spdf$pop)

spplot(spdf, "pop2", col = "transparent", at = rng, 
       sp.layout = list("sp.lines", states.sl),
       col.regions = cr,
       colorkey = list(space = "bottom", labels = labs),
       par.settings = list(axis.line = list(col = NA)),
       sub = "Population (2012)")

Annual counts by county

The data frame nTor.year is the number of tornadoes affecting the region by year. Need to filter by != NA to remove the row containing the count of counties without tornadoes as the year is set to NA.

nTor.year = PathsAll.df %>%
  group_by(yr) %>%
  summarize(nT = n()) %>%
  filter(yr != "NA") %>%   
  data.frame()
nTor.year$yr2 = nTor.year$yr
sum(nTor.year$nT)

## [1] 44789

The data frame nTor.day0 contains the tornado county and tornado day county by year and county.

nTor.day0 = PathsAll.df %>%
  group_by(county, yr) %>%
  summarize(nT = length(county),
            nTd = length(unique(date)))

nTor.day0$county = as.factor(nTor.day0$county)
nTor.day0$yr = as.factor(nTor.day0$yr)

nTor.day0 = left_join(expand.grid(county = levels(as.factor(PathsAll.df$county)), 
                                  yr = levels(as.factor(PathsAll.df$yr))), 
                      nTor.day0)

## Joining by: c("county", "yr")

nTor.day0[is.na(nTor.day0)] = 0
nTor.day0$county = as.character(nTor.day0$county)
nTor.day0$yr = as.integer(as.character(nTor.day0$yr))

nTor.day = base::merge(nTor.df[c("county", "area", "ID")],
                 nTor.day0) %>%
  arrange(ID, yr)

Merge tornado and population data.

nTor.day$county = as.integer(nTor.day$county)
popNtor = base::merge(nTor.day, Pop.df, all = FALSE) %>%
  mutate(density = pop/area,
         Ldensity = log2(density),
         yr2 = yr) %>% #needed for the INLA models
  arrange(ID, yr)

Spatial model for county-level tornado counts

Some global controls for INLA. Use as needed.

#source("http://www.math.ntnu.no/inla/givemeINLA.R")
library("INLA")

## Loading required package: Matrix

## Loading required package: splines

control = list(
  predictor = list(compute = TRUE, link = 1),
  inla = list(strategy = "laplace", 
              fast = FALSE,
              stencil = 7,
              npoints = 198,
              int.strategy = "grid", 
              dz = .5),
  results = list(return.marginals.random = TRUE),
  compute = list(config = TRUE, mlik = TRUE, cpo = TRUE, dic = TRUE, po = TRUE),
  family = list(variant = 1, hyper = list(theta = list(prior = "loggamma", param = c(1, 1)))))

Spatial neighborhood definition as an inla graph

library("spdep")
nb = poly2nb(spdf)
nb2INLA("tornb.inla", nb)
tornb.inla = inla.read.graph("tornb.inla")

The area multiplied by Nyears indicates the square-meter years exposed to tornadoes (E). Scale the exposure to have a mean close to unity. The model has a spatial id. E is scaled to have a mean of one. constr = constant rate. The DIC measures how well the model fits the data; the larger this is, the worse the fit. Year is centered on 1991.

To use the model for prediction, pad the data set by one year and put NAs for the number of tornadoes that year. Use latest population for estimate population. This gives the expected observed frequencies for any year, not the expected total frequencies, which is had only by estimating a larger population.

popNtor$ID2 = popNtor$ID
tmp = subset(popNtor, yr == 1991)
tmp$Ldensity = max(popNtor$Ldensity)
tmp$yr = 2016
tmp$yr2 = tmp$yr
tmp$nT = NA
popNtor2 = rbind(tmp, popNtor)

Model with spatial term. Determine the model that results in the lowest DIC. 40 minutes to run on Mac pro.

mean(popNtor2$area)

## [1] 3502.515

formula = nT ~ f(ID, model = "besag", graph = tornb.inla, constr = TRUE) + 
               Ldensity + 
               Ldensity:I(yr - 1991) + I(yr - 1991)
model = inla(formula = formula, family = "nbinomial", E = area/3500,
             data = popNtor2,
             quantiles = c(.05, .5, .95),
             control.compute = control$compute,
             control.predictor = control$predictor,
             control.results = control$results,
             control.family = control$family
             )
df = model$summary.fitted.values[1:nrow(tmp), ] * 100 * 100 / 3500   #per sq 100 km
#df = model$summary.fitted.values[1:nrow(tmp), ] * 100 * 100 / 2000 / .386102  #per sq 100 miles
row.names(df) = county

The predicted values give the annual expected rate for each county. The rate is the number of tornadoes per 100km by 100km square region per year.

Model diagnostics. The predictive quality of the model is assessed by the cross-validated log score. A smaller value of the score indicates better prediction quality (Gneiting and Raftery 2007).

cpopit = data.frame(model$cpo,model$summary.fitted.values)[!is.na(model$cpo$pit), ]
modPIT = cpopit$pit - runif(nrow(cpopit)) * cpopit$cpo
goftest::ad.test(modPIT)

## 
##  Anderson-Darling test of goodness-of-fit
##  Null hypothesis: uniform distribution
## 
## data:  modPIT
## An = 2.8838, p-value = 0.03136

-mean(log(cpopit$cpo)) #equivalent to a predictive mean square error (lower is better).

## [1] 0.8314674

cor.test(popNtor$nT, cpopit$mean, conf.level = .95)

## 
##  Pearson's product-moment correlation
## 
## data:  popNtor$nT and cpopit$mean
## t = 78.994, df = 99726, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.2368168 0.2484987
## sample estimates:
##       cor 
## 0.2426665

cor(popNtor$nT, cpopit$mean, method = "s")

## [1] 0.2223245

#plot(popNtor$nT + runif(n = length(popNtor$nT)), cpofit$mean)

Brier score

brier.score <- function(x, m){
  with(m, {mean(x^2) - 2 * mean(x * mean) + mean(mean^2 + sd^2)})
}
brier.score(popNtor[["nT"]], cpopit) # (lower is better)

## [1] 0.9884705

library("ggplot2")
ggplot() + 
  geom_histogram(aes(x = modPIT), color = "white",
                 fill = "grey", binwidth = .05) + 
  xlab("Probability") +
  ylab("Frequency")

#  ggtitle("Distribution of modified PIT") #looks good! Model fits the data well over/under fit & bias not a problem

spdfR = spCbind(spdf, df)
cor(spdfR$rate, spdfR$mean)

## [1] 0.7940903

ggplot(as.data.frame(spdfR), aes(x = rate, y = mean)) +
  geom_point() +
  geom_abline(slope = 1) +
  scale_x_continuous(limits = c(0, 11), breaks = seq(0, 11, 2)) +
  scale_y_continuous(limits = c(0, 11), breaks = seq(0, 11, 2)) +
  geom_smooth(method = lm, se = FALSE, color = "red") +
  xlab("Raw Rate\n [Tornadoes/100 km square region]") +
  ylab("Expected Rate\n [Tornadoes/100 km square region]")

range(log2(spdfR$mean))

## [1] -4.500615  2.108635

rng = seq(-5, 3, 1)
crq = brewer.pal(8, "Oranges")
as.character(round(2^rng, 2))

## [1] "0.03" "0.06" "0.12" "0.25" "0.5"  "1"    "2"    "4"    "8"

labs = c("0.03", "0.06", "0.12", "0.25", "0.5", "1", "2", "4", "8")

centroids = SpatialPointsDataFrame(gCentroid(spdfR, byid = TRUE), data.frame(spdfR))
cxy = coordinates(centroids)
spdfR$mean2 = log2(spdfR$mean)
spplot(spdfR, "mean2", col = "transparent", at = rng,
       sp.layout = list("sp.lines", states.sl),
       col.regions = crq,
       colorkey = list(space = "bottom", labels = labs),
       par.settings = list(axis.line = list(col = NA)),
       sub = expression("Expected Annual Tornado (EF1+) Occurrence Rate [per 100 km square region]"))

range(spdfR$sd)

## [1] 0.01457663 0.92964066

rngs = seq(0, 1, .2)
crqs = brewer.pal(5, "Greens")
spplot(spdfR, "sd", col = "transparent", at = rngs,
       sp.layout = list("sp.lines", states.sl),
       col.regions = crqs,
       colorkey = list(space = "bottom", labels = paste(rngs)),
       par.settings = list(axis.line = list(col = NA)),
       sub = expression("Standard Error"))

Write out the shapefiles.

writeOGR(spdfR, dsn = "All", layer = "Rates", overwrite_layer = TRUE,
         driver = "ESRI Shapefile")

Repeat for EF1+ tornadoes.

TornP3 = subset(TornP3, mag >= 1) #EF1+
ct = sp::over(counties.sp, TornP3, returnList = TRUE)
area = gArea(counties.sp, byid = TRUE)
names(ct) = county
nT = sapply(ct, function(x) nrow(x))
nTd = sapply(ct, function(x) length(unique(x$date)))
Nyears = diff(range(TornP3$yr)) + 1
nTor.df = data.frame(county = county, 
                     Nyears =  Nyears,
                     nT = nT, 
                     area = area/1000000, 
                     nTd = nTd,
                     rate = nT/area * 10^6 * 100 * 100 / Nyears,
                     stringsAsFactors = FALSE, ID = 1:length(county)
                     )
spdf$county = county
spdf$Nyears = nTor.df$Nyears
spdf$nT = nTor.df$nT
spdf$nTd = nTor.df$nTd
spdf$area = nTor.df$area
spdf$rate = nTor.df$rate
PathsAll.df = plyr::ldply(ct, data.frame)
colnames(PathsAll.df)[1] = "county"

none = nTor.df$county[nTor.df$nT == 0] # for counties without tornadoes
if(length(none) > 0){
  tmp = PathsAll.df[1:length(none), ]
  tmp[!is.na(tmp)] = NA
  tmp$county = none
  PathsAll.df = rbind(PathsAll.df, tmp)
}
length(none)/dim(pop)[1] * 100

## [1] 2.306273

nTor.year = PathsAll.df %>%
  group_by(yr) %>%
  summarize(nT = n()) %>%
  filter(yr != "NA") %>%   
  data.frame()
nTor.year$yr2 = nTor.year$yr
sum(nTor.year$nT)

## [1] 24266

nTor.day0 = PathsAll.df %>%
  group_by(county, yr) %>%
  summarize(nT = length(county),
            nTd = length(unique(date)))

nTor.day0$county = as.factor(nTor.day0$county)
nTor.day0$yr = as.factor(nTor.day0$yr)

nTor.day0 = left_join(expand.grid(county = levels(as.factor(PathsAll.df$county)), 
                                  yr = levels(as.factor(PathsAll.df$yr))), 
                      nTor.day0)

## Joining by: c("county", "yr")

nTor.day0[is.na(nTor.day0)] = 0
nTor.day0$county = as.character(nTor.day0$county)
nTor.day0$yr = as.integer(as.character(nTor.day0$yr))

nTor.day = base::merge(nTor.df[c("county", "area", "ID")],
                 nTor.day0) %>%
  arrange(ID, yr)

nTor.day$county = as.integer(nTor.day$county)
popNtor = base::merge(nTor.day, Pop.df, all = FALSE) %>%
  mutate(density = pop/area,
         Ldensity = log2(density),
         yr2 = yr) %>% #needed for the INLA models
  arrange(ID, yr)

popNtor$ID2 = popNtor$ID
tmp = subset(popNtor, yr == 1991)
tmp$Ldensity = max(popNtor$Ldensity)
tmp$yr = 2016
tmp$yr2 = tmp$yr
tmp$nT = NA
popNtor2 = rbind(tmp, popNtor)

mean(popNtor2$area)

## [1] 3502.515

formula = nT ~ f(ID, model = "besag", graph = tornb.inla, constr = TRUE) + 
               Ldensity + 
               I(yr - 1991)
model = inla(formula = formula, family = "nbinomial", E = area/3500,
             data = popNtor2,
             quantiles = c(.05, .5, .95),
             control.compute = control$compute,
             control.predictor = control$predictor,
             control.results = control$results,
             control.family = control$family
             )
df = model$summary.fitted.values[1:nrow(tmp), ] * 100 * 100 / 3500   #per sq 100 km
#df = model$summary.fitted.values[1:nrow(tmp), ] * 100 * 100 / 2000 / .386102  #per sq 100 miles
row.names(df) = county

cpopit = data.frame(model$cpo,model$summary.fitted.values)[!is.na(model$cpo$pit), ]
modPIT = cpopit$pit - runif(nrow(cpopit)) * cpopit$cpo
goftest::ad.test(modPIT)

## 
##  Anderson-Darling test of goodness-of-fit
##  Null hypothesis: uniform distribution
## 
## data:  modPIT
## An = 2.3749, p-value = 0.05766

-mean(log(cpopit$cpo)) #equivalent to a predictive mean square error (lower is better).

## [1] 0.5774859

cor.test(popNtor$nT, cpopit$mean, conf.level = .95)

## 
##  Pearson's product-moment correlation
## 
## data:  popNtor$nT and cpopit$mean
## t = 64.778, df = 99726, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1949795 0.2068911
## sample estimates:
##       cor 
## 0.2009427

brier.score(popNtor[["nT"]], cpopit) # (lower is better)

## [1] 0.431243

spdfR = spCbind(spdf, df)

ggplot(as.data.frame(spdfR), aes(x = rate, y = mean)) +
  geom_point() +
  geom_abline(slope = 1) +
  scale_x_continuous(limits = c(0, 5), breaks = seq(0, 5, 1)) +
  scale_y_continuous(limits = c(0, 5), breaks = seq(0, 5, 1)) +
  geom_smooth(method = lm, se = FALSE, color = "red") +
  xlab("Raw Rate\n [Tornadoes/100 km square region]") +
  ylab("Expected Rate\n [Tornadoes/100 km square region]")

cor(spdfR$rate, spdfR$mean)

## [1] 0.8438553

range(log2(spdfR$mean))

## [1] -6.2270547  0.8163864

rng = seq(-7, 1, 1)
crq = brewer.pal(8, "Oranges")
as.character(round(2^rng, 3))

## [1] "0.008" "0.016" "0.031" "0.062" "0.125" "0.25"  "0.5"   "1"     "2"

labs = as.character(round(2^rng, 3))

spdfR$mean2 = log2(spdfR$mean)
spplot(spdfR, "mean2", col = "transparent", at = rng,
       sp.layout = list("sp.lines", states.sl),
       col.regions = crq,
       colorkey = list(space = "bottom", labels = labs),
       par.settings = list(axis.line = list(col = NA)),
       sub = expression("Expected Annual Tornado (EF1+) Occurrence Rate [per 100 km square region]"))

Write out the shapefiles.

writeOGR(spdfR, dsn = "All1", layer = "Rates", overwrite_layer = TRUE,
         driver = "ESRI Shapefile")