County Level Tornado Activity - Ohio

Tyler Fricker

Load packages.

setwd("~/Documents/Advance Spatial Statistics (ASS)/Class Project")
library(sp)
library(rgeos)

## rgeos version: 0.3-2, (SVN revision 413M) GEOS runtime version:
## 3.3.3-CAPI-1.7.4 Polygon checking: TRUE

library(rgdal)

## rgdal: version: 0.8-14, (SVN revision 496) Geospatial Data Abstraction
## Library extensions to R successfully loaded Loaded GDAL runtime: GDAL
## 1.9.2, released 2012/10/08 Path to GDAL shared files:
## /Library/Frameworks/R.framework/Versions/3.0/Resources/library/rgdal/gdal
## Loaded PROJ.4 runtime: Rel. 4.8.0, 6 March 2012, [PJ_VERSION: 480] Path to
## PROJ.4 shared files:
## /Library/Frameworks/R.framework/Versions/3.0/Resources/library/rgdal/proj

library(maptools)

## Checking rgeos availability: TRUE

library(plyr)
library(maps)
library(ggplot2)
library(spdep)

## Loading required package: Matrix

library(mapproj)
library(RColorBrewer)
library(sm)

## Package `sm', version 2.2-5: type help(sm) for summary information

library(PBSmapping)

## ----------------------------------------------------------- PBS Mapping
## 2.67.60 -- Copyright (C) 2003-2013 Fisheries and Oceans Canada
## 
## PBS Mapping comes with ABSOLUTELY NO WARRANTY; for details see the file
## COPYING. This is free software, and you are welcome to redistribute it
## under certain conditions, as outlined in the above file.
## 
## A complete user guide 'PBSmapping-UG.pdf' is located at
## /Library/Frameworks/R.framework/Versions/3.0/Resources/library/PBSmapping/doc/PBSmapping-UG.pdf
## 
## Packaged on 2014-03-27 Pacific Biological Station, Nanaimo
## 
## All available PBS packages can be found at
## http://code.google.com/p/pbs-software/
## 
## To see demos, type '.PBSfigs()'.
## -----------------------------------------------------------

library(spatstat)

## spatstat 1.36-0 (nickname: 'Intense Scrutiny') For an introduction to
## spatstat, type 'beginner'

Part 0: Get tornado data

Data from the Storm Prediction Center (SPC) http://www.spc.noaa.gov/gis/svrgis/.

download.file("http://myweb.fsu.edu/jelsner/data/tornado.zip", "tornado.zip", 
    mode = "wb")

Read the tornado data.

unzip("tornado.zip")
TornL = readOGR(dsn = ".", layer = "tornado", stringsAsFactors = FALSE)

## OGR data source with driver: ESRI Shapefile 
## Source: ".", layer: "tornado"
## with 57150 features and 21 fields
## Feature type: wkbLineString with 2 dimensions

Change the data types in the data slot.

TornL$Year = as.integer(TornL$YR)
TornL$Month = as.integer(TornL$MO)
TornL$EF = as.integer(TornL$MAG)
TornL$Date = as.Date(TornL$DATE)
TornL$Length = as.numeric(TornL$LEN) * 1609.34
TornL$Width = as.numeric(TornL$WID) * 0.9144
TornL$FAT = as.integer(TornL$FAT)
TornL$SLON = as.numeric(TornL$SLON)
TornL$SLAT = as.numeric(TornL$SLAT)
TornL$ELON = as.numeric(TornL$ELON)
TornL$ELAT = as.numeric(TornL$ELAT)
TornL$INJ = as.numeric(TornL$INJ)

Part 1: Overlay tornado tracks on counties

There are three steps: (1) Use map algebra to determine tornado counts per county polygon and county centroids, (2) merge counts per polgyon data with county borders data frame, and (3) create choropleth map.

counties = maps::map("county", "ohio", fill = TRUE, plot = FALSE)
ll = "+proj=longlat +datum=WGS84"
IDs = sapply(strsplit(counties$names, ","), function(x) x[2])
counties.spll = maptools::map2SpatialPolygons(counties, IDs = IDs, proj4string = CRS(ll))
counties.spT = sp::spTransform(counties.spll, CRS(proj4string(TornL)))
centers.spT = rgeos::gCentroid(counties.spT, byid = TRUE)
centers.spll = sp::spTransform(centers.spT, CRS(ll))
centers.dfll = as.data.frame(coordinates(centers.spll))
names(centers.dfll) = c("long", "lat")
all(rownames(centers.dfll) == names(counties.spT))

## [1] TRUE

TornL1 = subset(TornL, Year >= 1954 & EF >= 1)
# ct = over(counties.spT, as(TornL1, 'SpatialLines'), returnList = TRUE)
ct = over(counties.spT, TornL1, returnList = TRUE)
nT = sapply(ct, nrow)

county = rownames(centers.dfll)
area = rgeos::gArea(counties.spT, byid = TRUE)
nTor.df = data.frame(state = "OH", county = county, nT = nT, area = area, rate = nT/area * 
    10^6 * 100, stringsAsFactors = FALSE)

borders.df = ggplot2::map_data("county", region = "ohio")
names(borders.df) = c("long", "lat", "group", "order", "state", "county")
df2 = merge(borders.df, nTor.df, by = "county")
all(rownames(centers.dfll) == nTor.df$county)

## [1] TRUE

centers.dfll$nT = nTor.df$nT
centers.dfll$rate = round(nTor.df$rate, 2)

mapT = ggplot(df2, aes(x = long, y = lat, group = county, fill = nT)) + geom_polygon(colour = "gray", 
    size = 0.2) + coord_map(project = "polyconic") + theme(panel.grid.minor = element_blank(), 
    panel.grid.major = element_blank(), panel.background = element_blank(), 
    axis.ticks = element_blank(), axis.text = element_blank(), legend.position = "none") + 
    xlab("") + ylab("") + scale_fill_gradient() + geom_text(data = centers.dfll, 
    aes(x = long, y = lat, group = NULL, label = nT), vjust = 0.5, size = 5, 
    color = "white")
mapT

plot of chunk stepThree

This is a choropleth map showing number of tornadoes from ggplot.

Create a spatial polygons data frame.

spdf = SpatialPolygonsDataFrame(counties.spT, nTor.df)
spplot(spdf, "nT")

plot of chunk createSPDF

This is a choropleth map showing number of tornadoes from spplot.

Part 2: Find and add population and elevation data.

Population data from American Fact Finder.

pop = read.csv("PEP_2012_PEPANNRES/PEP_2012_PEPANNRES_with_ann.csv", stringsAsFactors = FALSE)
ctys = sapply(strsplit(pop$GEO.display.label, ","), function(x) x[1])
ctys2 = tolower(sapply(strsplit(ctys, " "), function(x) x[1]))
pop = pop[order(ctys2), ]
all(tolower(substring(pop$GEO.display.label, 1, nchar(spdf$county))) == spdf$county)

## [1] TRUE

spdf$pop = pop$respop72012

Add elevation data from http://www.ngs.noaa.gov/cgi-bin/sf_archive.prl ftp://ftp.ngs.noaa.gov/pub/DS_ARCHIVE/ShapeFiles/ NOAA survey markers.

Elev = readOGR(dsn = "OH", layer = "oh", stringsAsFactors = FALSE)

## OGR data source with driver: ESRI Shapefile 
## Source: "OH", layer: "oh"
## with 14797 features and 35 fields
## Feature type: wkbPoint with 2 dimensions

Elev.df = as.data.frame(Elev)
Elev.df$Elev = as.numeric(Elev.df$ELEVATION)
CE.df = ddply(Elev.df, .(COUNTY), summarize, mEl = mean(Elev, na.rm = TRUE), 
    sEl = sd(Elev, na.rm = TRUE))
all(spdf$county == tolower(CE.df$COUNTY))

## [1] TRUE

spdf$elev = CE.df$mEl
spdf$elevS = CE.df$sEl

Moran's I

spdf.nb = poly2nb(spdf)
spdf.wts = nb2listw(spdf.nb)
m = length(spdf$nT)
s = Szero(spdf.wts)
moran(spdf$nT, spdf.wts, n = m, S0 = s)

## $I
## [1] 0.2963
## 
## $K
## [1] 2.368

Is the Moran's I significant?

moran.test(spdf$nT, spdf.wts, random = FALSE)

## 
##  Moran's I test under normality
## 
## data:  spdf$nT  
## weights: spdf.wts  
## 
## Moran I statistic standard deviate = 4.703, p-value = 1.283e-06
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic       Expectation          Variance 
##          0.296347         -0.011494          0.004285

sm.density(spdf$nT, xlab = "Number of Tornadoes per County", model = "Normal")

plot of chunk significance

There is moderate spatial autocorrelation with a Moran's I of 0.296. These results are significant with a p-value of 1.283e-06, showing no problems with normality.

Spatial Lag Variable

number = spdf$nT
Wnumber = lag.listw(spdf.wts, number)
df = data.frame(number = number, Wnumber = Wnumber)
ggplot(df, aes(x = number, y = Wnumber)) + geom_point() + geom_smooth(method = lm) + 
    xlab("Number of Tornadoes") + ylab("Spatial Lag of Number of Tornadoes")

plot of chunk spatiallag

The scatterplot shows the moderate spatial autocorrelation. Counties with higher numbers of tornadoes tend to be surrounded by counties with higher numbers of tornadoes (at least in part of the state).

Check the slope of the Moran's I

lm(Wnumber ~ number, data = df)

## 
## Call:
## lm(formula = Wnumber ~ number, data = df)
## 
## Coefficients:
## (Intercept)       number  
##       6.338        0.296

Everything is consistent (Moran's I of .296).

Spatial Regression

spdf.lm = lm(nT ~ pop + elevS, data = spdf)
summary(spdf.lm)

## 
## Call:
## lm(formula = nT ~ pop + elevS, data = spdf)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -8.22  -3.30  -0.37   2.84  10.38 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.03e+01   1.09e+00    9.45  6.5e-15 ***
## pop          6.72e-06   2.28e-06    2.95   0.0041 ** 
## elevS       -7.25e-02   3.12e-02   -2.32   0.0227 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.5 on 85 degrees of freedom
## Multiple R-squared:  0.129,  Adjusted R-squared:  0.109 
## F-statistic: 6.32 on 2 and 85 DF,  p-value: 0.00277

spdf$residuals = residuals(spdf.lm)
range(spdf$residuals)

## [1] -8.225 10.376

rng = seq(-10, 10, 5)
cls = brewer.pal(4, "RdGy")
spplot(spdf, "residuals", col.regions = cls, at = rng, colorkey = list(space = "bottom"), 
    sub = "Tornado Residuals\n (Tornadoes by County)")

plot of chunk spatialregression

This graph shows the model residuals for the regression analysis. The areas of black shade indicate underprediction (positive residuals) and the areas of red shade indicate overprediction (negative residuals).

Next, I looked at the Moran's I of the residuals

lm.morantest(spdf.lm, spdf.wts)

## 
##  Global Moran's I for regression residuals
## 
## data:  
## model: lm(formula = nT ~ pop + elevS, data = spdf)
## weights: spdf.wts
## 
## Moran I statistic standard deviate = 3.582, p-value = 0.0001702
## alternative hypothesis: greater
## sample estimates:
## Observed Moran's I        Expectation           Variance 
##           0.208093          -0.021549           0.004109

Moran's I on the residuals is 0.208. This compares with the value of 0.296 for the number of tornadoes alone. Thus, part of the spatial autocorrelation is “absorbed” by the explanatory variables. With a p-value of .0001, I can reject the null hypothesis of no spatial autocorrelation in the residuals and conclude that a spatial regression model is better. Knowing that the spatial regression model is warranted, I must decide on which model to use (Spatial Lag or Spatial Error).

Lagrange multiplier test for spatial autocorrelation

lm.LMtests(spdf.lm, spdf.wts, test = c("LMerr", "LMlag"))

## 
##  Lagrange multiplier diagnostics for spatial dependence
## 
## data:  
## model: lm(formula = nT ~ pop + elevS, data = spdf)
## weights: spdf.wts
## 
## LMerr = 9.51, df = 1, p-value = 0.002043
## 
## 
##  Lagrange multiplier diagnostics for spatial dependence
## 
## data:  
## model: lm(formula = nT ~ pop + elevS, data = spdf)
## weights: spdf.wts
## 
## LMlag = 11.76, df = 1, p-value = 0.0006044

lm.LMtests(spdf.lm, spdf.wts, test = c("RLMerr", "RLMlag"))

## 
##  Lagrange multiplier diagnostics for spatial dependence
## 
## data:  
## model: lm(formula = nT ~ pop + elevS, data = spdf)
## weights: spdf.wts
## 
## RLMerr = 0.7307, df = 1, p-value = 0.3927
## 
## 
##  Lagrange multiplier diagnostics for spatial dependence
## 
## data:  
## model: lm(formula = nT ~ pop + elevS, data = spdf)
## weights: spdf.wts
## 
## RLMlag = 2.983, df = 1, p-value = 0.08415

Both the spatial error model and spatial lag model are significant in the first test, however, the spatial error model is not significant in the second (robust) attempt, while the spatial lag error is.

I move on using the spatial lag model.

spdf.lag = lagsarlm(nT ~ pop + elevS, data = spdf, listw = spdf.wts, tol.solve = 4.28669e-13)
summary(spdf.lag)

## 
## Call:lagsarlm(formula = nT ~ pop + elevS, data = spdf, listw = spdf.wts, 
##     tol.solve = 4.28669e-13)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -8.54712 -2.82469 -0.35303  2.35359  9.11954 
## 
## Type: lag 
## Coefficients: (asymptotic standard errors) 
##                Estimate  Std. Error z value  Pr(>|z|)
## (Intercept)  5.2797e+00  1.5724e+00  3.3578 0.0007856
## pop          5.1469e-06  2.0721e-06  2.4839 0.0129950
## elevS       -3.9622e-02  2.8737e-02 -1.3787 0.1679743
## 
## Rho: 0.4653, LR test value: 10.79, p-value: 0.0010217
## Asymptotic standard error: 0.1224
##     z-value: 3.803, p-value: 0.00014309
## Wald statistic: 14.46, p-value: 0.00014309
## 
## Log likelihood: -250.2 for lag model
## ML residual variance (sigma squared): 16.44, (sigma: 4.055)
## Number of observations: 88 
## Number of parameters estimated: 5 
## AIC: 510.4, (AIC for lm: 519.2)
## LM test for residual autocorrelation
## test value: 0.00131, p-value: 0.97113

x = 2 * (logLik(spdf.lag) - logLik(spdf.lm))/49
x[1]

## [1] 0.2202

So there's a large improvement in the model. I find a lag likelihood of -250.2214 and an AIC of 510.44 for the spatial lag model.

Can we make predictions?

pre = predict(spdf.lag)
str(pre)

## Class 'sarlm.pred'  atomic [1:88] 8.36 10.03 10.4 6.53 5.84 ...
##   ..- attr(*, "trend")= num [1:88] 3.48 5.19 4.35 3.89 4.21 ...
##   ..- attr(*, "signal")= num [1:88] 4.89 4.84 6.05 2.64 1.63 ...

spdf$fit = as.numeric(pre)
spdf$trend = attr(pre, "trend")
spdf$signal = attr(pre, "signal")
range(spdf$fit)

## [1]  2.887 15.690

rng = seq(0, 20, 5)
cls = brewer.pal(4, "RdGy")
spplot(spdf, "fit", col.regions = cls, at = rng, colorkey = list(space = "bottom"), 
    sub = "Predicted Tornadoes by County")

plot of chunk predictions

range(spdf$trend)

## [1]  1.802 10.261

rng = seq(0, 10, 2)
cls = brewer.pal(5, "RdGy")
spplot(spdf, "trend", col.regions = cls, at = rng, colorkey = list(space = "bottom"), 
    sub = "Predicted Tornadoes by County (Trend)")

plot of chunk predictions

range(spdf$signal)

## [1] 0.5816 6.3278

rng = seq(0, 8, 2)
cls = brewer.pal(4, "RdGy")
spplot(spdf, "signal", col.regions = cls, at = rng, colorkey = list(space = "bottom"), 
    sub = "Predicted Tornadoes by County (Signal)")

plot of chunk predictions

Above is a collection of maps for predicitons (using the predicted, trend, and signal)

Compare Model Residuals

spdf$residualsL = residuals(spdf.lag)
range(spdf$residualsL)

## [1] -8.547  9.120

rng = seq(-10, 10, 5)
cls = brewer.pal(4, "RdGy")
spplot(spdf, "residualsL", col.regions = cls, at = rng, colorkey = list(space = "bottom"), 
    sub = "Tornado Residuals (Spatial Regression)")

plot of chunk compare

spplot(spdf, "residuals", col.regions = cls, at = rng, colorkey = list(space = "bottom"), 
    sub = "Tornado Residuals")

plot of chunk compare

Moving forward, it would be interesting to take this county level lattice data and link it with point pattern data around major cities in Ohio. If populations are increasing in areas with a high risk for tornadoes, the relative risk of individuals will increase. Looking at this Bullseye's effect (Ashley et al., 2014) may proove to be a new hot topic for tornado research.

For Fun:

Read in new data

download.file("http://myweb.fsu.edu/jelsner/data/tornado2011.zip", "tornado.zip", 
    mode = "wb")
unzip("tornado.zip")
Torn = readOGR(dsn = ".", layer = "tornado")

## OGR data source with driver: ESRI Shapefile 
## Source: ".", layer: "tornado"
## with 56221 features and 28 fields
## Feature type: wkbPoint with 2 dimensions

Torn = subset(Torn, FSCALE >= 0 & YEAR >= 1954)
Torn$EF = factor(Torn$FSCALE)

Download county level data

download.file("http://myweb.fsu.edu/jelsner/data/tl_2013_us_county.zip", "tl_2013_us_county.zip", 
    mode = "wb")
unzip("tl_2013_us_county.zip")
USc = readOGR(dsn = ".", layer = "tl_2013_us_county", stringsAsFactors = FALSE)

## OGR data source with driver: ESRI Shapefile 
## Source: ".", layer: "tl_2013_us_county"
## with 3234 features and 17 fields
## Feature type: wkbPolygon with 2 dimensions

USc$StFP = as.numeric(as.character(USc$STATEFP))
OH = subset(USc, StFP == 39)
OH = spTransform(OH, CRS(proj4string(Torn)))
cc = coordinates(OH)
IDBins = cut(cc[, 1], range(cc[, 1]), include.lowest = TRUE)
OHw = unionSpatialPolygons(OH, IDBins)
plot(OHw)

plot of chunk countyleveldata

Next I summarize the point pattern object

W = as(OHw, "owin")
All.ppp = as.ppp(Torn["EF"])
All.ppp = All.ppp[W]
unitname(All.ppp) = c("meter", "meters")
summary(All.ppp)

## Marked planar point pattern: 920 points
## Average intensity 8.01e-09 points per square meter  
## 
## *Pattern contains duplicated points*
## 
## Coordinates are given to 2 decimal places 
## i.e. rounded to the nearest multiple of 0.01 meters  
## 
## Multitype:
##   frequency proportion intensity
## 0       294    0.32000  2.56e-09
## 1       390    0.42400  3.39e-09
## 2       180    0.19600  1.57e-09
## 3        40    0.04350  3.48e-10
## 4        13    0.01410  1.13e-10
## 5         3    0.00326  2.61e-11
## 
## Window: polygonal boundary
## single connected closed polygon with 15749 vertices
## enclosing rectangle: [925371, 1296618] x [20355, 475642] meters 
## Window area =  1.14923e+11 square meters 
## Unit of length: 1 meter

TT = All.ppp
plot(split(TT))

plot of chunk ppo

Plot the touchdown locations

H = unmark(TT[TT$marks == 2 | TT$marks == 1 | TT$marks == 0])
I = unmark(TT[TT$marks == 3 | TT$marks == 4 | TT$marks == 5])
T2 = superimpose(H = H, I = I)

## Warning: data contain duplicated points

plot(I, pch = 25, cols = "red", main = "")
plot(OH, add = TRUE, lwd = 0.1)

plot of chunk touchdownlocs

Compute spatial densities and relative risk (use 6.4% due to that being the relative risk for intense and violent tornadoes).

eps = 64
dAll = density(unmark(T2), dimyx = c(eps, eps))
mean(dAll$v, na.rm = TRUE)

## [1] 8.176e-09

dH = density(H, dimyx = c(eps, eps))
dI = density(I, dimyx = c(eps, eps))
plot(dI)
plot(OH, add = TRUE, lwd = 0.1)

plot of chunk rr

So, the SW part of the state appears to be at the most risk (including my hometown).

Now I'll look at relative risk for all storms.

rr = relrisk(T2, dimyx = c(eps, eps))
plot(rr)
plot(OH, add = TRUE, lwd = 0.1)

plot of chunk rrall

Now both the SW and NE part of the state are at the most risk. These include both Cincinnati and Cleveland, so if those cities are increasing in population, we may see more injuries and devastation than maybe previously thought.