Casualty vulnerability from U.S. tornadoes is highest in urbanized areas across the Mid South

James B. Elsner & Tyler Fricker

Set working directory and load packages.

suppressMessages(library("ggplot2"))
suppressMessages(library("raster"))
suppressMessages(library("ggmap"))
suppressMessages(library("dplyr"))
suppressMessages(library("tidyr"))
suppressMessages(library("rgdal"))
suppressMessages(library("rgeos"))
suppressMessages(library("ggthemes"))
suppressMessages(library("RColorBrewer"))
suppressMessages(library("scales"))
suppressMessages(library("broom"))
suppressMessages(library("maps"))
suppressMessages(library("sf"))
suppressMessages(library("xtable"))
suppressMessages(library("lubridate"))

Data and Variables

Tornado casualties

Data from the Storm Prediction Center (SPC) http://www.spc.noaa.gov/gis/svrgis/.

download.file("http://www.spc.noaa.gov/gis/svrgis/zipped/tornado.zip",
              "tornado.zip", mode = "wb")
unzip("tornado.zip")

Read the tornado data.

Torn.sfdf = st_read(dsn = "torn", 
                   layer = "torn", 
                   stringsAsFactors = FALSE)
## Reading layer `torn' from data source `/Users/jelsner/Dropbox/Tornadoes/Casualties/torn' using driver `ESRI Shapefile'
## converted into: LINESTRING
## Simple feature collection with 61092 features and 22 fields
## geometry type:  LINESTRING
## dimension:      XY
## bbox:           xmin: -6279240 ymin: -1835231 xmax: 3405399 ymax: 3888106
## epsg (SRID):    NA
## proj4string:    +proj=lcc +lat_1=33 +lat_2=45 +lat_0=39 +lon_0=-96 +x_0=0 +y_0=0 +datum=NAD83 +units=m +no_defs
as.data.frame(Torn.sfdf) %>%
  mutate(cas = fat + inj) %>%
  summarize(nT = sum(cas < 20),
            nAll = length(fat),
            nT/nAll * 100)
##      nT  nAll nT/nAll * 100
## 1 60225 61092      98.58083

Compare with Table 4 in Merrell et al. (2005).

as.data.frame(Torn.sfdf) %>%
  filter(yr >= 1990 & yr <= 2001) %>%
  summarize(nF = sum(fat))
##    nF
## 1 662
Ages = c("0-9", "10-19", "20-29", "30-39", "40-49", "50-59", "60-69", "70-79", "80+")
Census2009 = c(6.9 + 6.6, 6.8 + 7.1, 7.0 + 6.9, 6.5 + 6.9, 7.3 + 7.5, 6.9 + 6.0, 4.8 + 3.6, 2.9 + 2.5, 1.9 + 1.7)/100
Table4 = c(66, 51, 60, 91, 98, 83, 75, 81, 50)
sum(Table4)
## [1] 655

Add new columns and filter. We only consider tornadoes occurring within the conterminous United States.

Torn.sfdf = Torn.sfdf %>%
  mutate(Date = as.Date(date),
         Year = yr,
         cas = inj + fat,
         Length = len * 1609.34,
         Width = wid * .9144,
         Width = ifelse(Width == 0, min(Width[Width > 0]), Width),
         AreaPath = Length * Width,
         AreaVortex = pi * (Width/2)^2,
         Ma = factor(month.abb[mo], levels = month.abb[1:12])) %>%
  filter(Year >= 1990, st != "HI", st != "AK", st != "PR")

Warnings? Try

assign("last.warning", NULL, envir = baseenv())
Torn.sfdf %>%
  filter(cas > 0) %>%
  summarize(nT = n(),
            nC = sum(cas),
            pF = sum(fat)/nC * 100)
## Simple feature collection with 1 feature and 3 fields
## geometry type:  MULTILINESTRING
## dimension:      XY
## bbox:           xmin: -2279065 ymin: -1429838 xmax: 2130657 ymax: 1210879
## epsg (SRID):    NA
## proj4string:    +proj=lcc +lat_1=33 +lat_2=45 +lat_0=39 +lon_0=-96 +x_0=0 +y_0=0 +datum=NAD83 +units=m +no_defs
##     nT    nC       pF                       geometry
## 1 2784 31587 6.214582 MULTILINESTRING((-223554.61...

During the 22-year period 1995–2016 there are 26,863 tornadoes recorded in the conterminous United States. Of these, 2,208 are linked to 25,968 casualties. Only 6.7 percent of all casualties lead to death.

df = as.data.frame(Torn.sfdf) %>%
  arrange(desc(cas)) %>%
  dplyr::select(date, time, st, mag, inj, fat, cas)
head(df)
##         date     time st mag  inj fat  cas
## 1 2011-04-27 15:43:00 AL   4 1500  64 1564
## 2 2011-05-22 16:34:00 MO   5 1150 158 1308
## 3 1999-05-03 17:26:00 OK   5  583  36  619
## 4 2015-12-26 18:46:00 TX   4  468  10  478
## 5 1990-08-28 14:30:00 IL   5  350  29  379
## 6 2008-05-10 16:20:00 OK   4  350  21  371

Distribution of casualties by tornado count.

df = as.data.frame(Torn.sfdf) %>%
  filter(cas > 0) %>%
  group_by(cas) %>%
  summarize(nT = n()) %>%
  arrange(desc(cas))
head(df)
## # A tibble: 6 x 2
##     cas    nT
##   <int> <int>
## 1  1564     1
## 2  1308     1
## 3   619     1
## 4   478     1
## 5   379     1
## 6   371     1
ggplot(df, aes(x = cas, y = nT)) +
      geom_point() +
      scale_y_log10(breaks = c(1, 10, 100, 1000)) + 
      scale_x_log10(breaks = c(1, 10, 100, 1000)) +
      xlab("Number of Casualties") + ylab("Number of Tornadoes") +
      theme_minimal() +
      ggtitle("Casualty-Producing Tornadoes in the United States\n(1995-2016)") 

On average a casualty-producing EF0 tornado results in two casualties, a casualty-producing EF1 tornado results in 3.1 casualties, and a casualty-producing EF2 tornado results in 6.5 casualties.

as.data.frame(Torn.sfdf) %>%
  filter(cas > 0) %>%
  group_by(mag) %>%
  summarize(cas = sum(cas),
            nT = n(),
            fat = sum(fat),
            inj = sum(inj),
            avgcas = cas/nT) %>%
xtable(., digits = 1)
## % latex table generated in R 3.4.1 by xtable 1.8-2 package
## % Tue Aug 22 08:51:13 2017
## \begin{table}[ht]
## \centering
## \begin{tabular}{rrrrrrr}
##   \hline
##  & mag & cas & nT & fat & inj & avgcas \\ 
##   \hline
## 1 &  0 & 481 & 237 & 11 & 470 & 2.0 \\ 
##   2 &  1 & 3016 & 963 & 97 & 2919 & 3.1 \\ 
##   3 &  2 & 6159 & 923 & 251 & 5908 & 6.7 \\ 
##   4 &  3 & 9343 & 488 & 591 & 8752 & 19.1 \\ 
##   5 &  4 & 8571 & 154 & 532 & 8039 & 55.7 \\ 
##   6 &  5 & 4017 & 19 & 481 & 3536 & 211.4 \\ 
##    \hline
## \end{tabular}
## \end{table}

Table 1 Summary statistics of tornado casualties by EF rating.

Casualties by location and size.

df = as.data.frame(Torn.sfdf) %>%
  filter(cas > 0) %>%
  mutate(label = paste("EF", mag, sep = ""))

states.df = map_data("state") %>%
  filter(region != 'alaska',  region != 'district of columbia')

ggplot(states.df, aes(x = long, y = lat, group = group)) +
  geom_polygon(fill = "white") +
  geom_path(color = "gray85") +
  coord_map(project = "polyconic") + 
  geom_point(aes(x = slon, y = slat, size = cas, group = om), 
             alpha = .3, data = df) +
  xlab("") + ylab("") +
  facet_wrap(~ label, ncol = 2) +
  theme_minimal() +
  theme(panel.grid.minor = element_blank(), 
        panel.grid.major = element_blank(), 
        panel.background = element_blank(),
        axis.ticks = element_blank(),
        axis.text = element_blank(),
        legend.position = "bottom") +
  scale_size_continuous("Number of\n Casualties")

Figure 1 Location of casualty-producing tornadoes (1995–2016) by EF rating.

Alabama had the most casualties (3,937) followed by Oklahoma (2,500), Missouri (2,025), Arkansas (1,759), Texas (1,704), and Tennessee (1,678).

sfdf = Torn.sfdf %>%
  filter(cas > 0) %>%
  group_by(st) %>%
  summarize(nT = n(),
            nC = sum(cas),
            nF = sum(fat),
            ratio = nC/nT) %>%
  arrange(desc(nC))
st_geometry(sfdf) = NULL #removes the geometry column
sfdf
## # A tibble: 44 x 5
##       st    nT    nC    nF     ratio
##  * <chr> <int> <int> <int>     <dbl>
##  1    AL   191  4407   376 23.073298
##  2    OK   142  2794   142 19.676056
##  3    GA   175  2218   126 12.674286
##  4    MO   124  2153   223 17.362903
##  5    TX   245  2063   109  8.420408
##  6    TN   148  1905   161 12.871622
##  7    MS   161  1854   123 11.515528
##  8    AR   138  1804   116 13.072464
##  9    IL   135  1334    67  9.881481
## 10    NC   104  1122    48 10.788462
## # ... with 34 more rows

Make a choropleth map. Use geom_polygon and coord_map from ggplot.

states.df = map_data("state") %>%
  filter(region != 'alaska',  region != 'district of columbia') %>%
  mutate(st = state.abb[match(region, tolower(state.name))]) %>%
  left_join(sfdf, by = "st") %>%
  arrange(order)

pp = ggplot(states.df, aes(x = long, y = lat, group = group, fill = log10(nC))) +
  geom_polygon() +
  geom_path(color = "gray75") +
  coord_map(project = "polyconic") + 
#  ggtitle("Tornado Casualties [1995-2016]") +
  theme(panel.grid.minor = element_blank(), 
        panel.grid.major = element_blank(), 
        panel.background = element_blank(),
        axis.ticks = element_blank(),
        axis.text = element_blank(),
        legend.position = "bottom") +
        xlab("") + ylab("") +
  scale_fill_continuous(low = "#fee6ce", high = "#e6550d",
                        "Number of\n Casualties",
                        breaks = 1:3, 
                        labels = c("10", "100", "1000"))
pp

The tmap package is a flexible, layer-based, and easy to use approach to create thematic maps including choropleths and bubble maps. It is based on the grammar of graphics, and resembles the syntax of ggplot2.

library(tmap)
library(albersusa)
us_sf = usa_sf("aeqd") %>%
  mutate(st = as.character(iso_3166_2))

us_sf = us_sf %>%
  filter(st != "AK" & st != "HI")


sfdf2 = Torn.sfdf %>%
  group_by(st) %>%
  summarize(nT = n(),
            nC = sum(cas),
            nF = sum(fat),
            nI = sum(inj))
st_geometry(sfdf2) = NULL

sfdf3 = left_join(sfdf2, 
                  as.data.frame(us_sf), by = "st") %>%
         sf::st_as_sf() %>%
         select(nT, nC, nF, nI)

tm1 = tm_shape(sfdf3) +
  tm_polygons("nI", 
              border.col = NULL,
              title = "Injuries",
              palette = "Purples") +
  tm_text("nI", size = .5) +
tm_shape(us_sf) + tm_borders(alpha = .2) +
  tm_compass() + tm_scale_bar(lwd = .5) +
  tm_format_Europe2(legend.position = c("left", "bottom"),
                attr.position = c("left", "bottom")) 

tm2 = tm_shape(sfdf3) +
  tm_polygons("nF", 
              border.col = NULL,
              title = "Fatalities",
              palette = "Oranges") +
  tm_text("nF", size = .5) +
tm_shape(us_sf) + tm_borders(alpha = .2) +
  tm_compass() + tm_scale_bar(lwd = .5) +
  tm_format_Europe2(legend.position = c("left", "bottom"),
                attr.position = c("left", "bottom")) 

tmap_arrange(tm1, tm2, asp = NA, ncol = 1)

Figure 2 Tornado casualties (top: injuries, bottom: fatalities) by state (1995-2016). All casualties are assigned to the state in which genesis occurred.

Create tornado paths and extract the average population density (people per sq. km under the path). The population data (Gridded Population of the World Volume 4–2010) are on a raster. With the extract method and weights = TRUE the calculation is done based on percentage of cell under the path. This takes more time. Multiply the average population density by the path area (L x W) to estimate the number of people under the path.

First convert the tornado simple feature data frame to a spatial lines file. Then buffer the track to make a path.

TornL.sfdf = Torn.sfdf %>%
  filter(cas > 0)
TornL = as(TornL.sfdf, "Spatial")
TornP = gBuffer(TornL, byid = TRUE, width = TornL$Width/2, capStyle = "FLAT")
#TornP$AreaPath = gArea(TornP, byid = TRUE)

Population density

Population data are obtained from the Gridded Population of the World, version four (GPW, v4) from the Socioeconomic Data and Applications Center at Columbia University, USA. The database contain decennial census density estimates for 1990, 2000, and 2010 represented as people per square kilometer. Densities are based on residential population. The native cell resolution is .0083\(^{\circ}\) latitude/longitude, which at 36\(^{\circ}\) N latitude means a cell having the dimension of .9 km in the north-south direction and .7 km in the east-west direction.

Load the population raster(s) and crop to defined extent. Use the extract function to obtain the population density. The projectRaster and extract functions take about 10 minutes per raster. Group years by mid-year of the census year. Assign the population density using the first decennial estimate then successively replace with later estimates by grouped years.

stime = proc.time()
PopD2000 = raster("gpw-v4-population-density_2000.tif")
PopD2010 = raster("gpw-v4-population-density_2010.tif")
ext = raster::extent(c(-125, -67, 24, 50))
PopD2000 = crop(PopD2000, ext)
PopD2010 = crop(PopD2010, ext)

PopD2000p = projectRaster(PopD2000, crs = proj4string(TornP))
PopD2010p = projectRaster(PopD2010, crs = proj4string(TornP))
TornP$popD2000 = raster::extract(PopD2000p, TornP, fun = mean, na.rm = TRUE,
                                       weights = TRUE, normalizeWeights = FALSE)[, 1]
TornP$popD2010 = raster::extract(PopD2010p, TornP, fun = mean, na.rm = TRUE,
                                       weights = TRUE, normalizeWeights = FALSE)[, 1]

TornP$popD = TornP$popD2000
yearGroup = TornP$yr > 2005
TornP$popD[yearGroup] = TornP$popD2010[yearGroup]

TornP$pop = TornP$popD * TornP$AreaPath/10^6

Torn.df = as.data.frame(TornP)
proc.time() - stime
##    user  system elapsed 
## 736.443  19.534 757.062

Note: The warning results from tornadoes with paths over areas with no people. It can be safely ignored.

The popD attribute for each tornado corresponds to the population density from the nearest decennial estimate so that a tornado that occurred in 1996 uses the 2000 decennial estimate while a tornado that occurred in 2012 uses the 2010 estimate.

For the set of 2208 tornadoes with at least one casualty the median population density per tornado is 31.3 people per square kilometer with an inter-quartile range between 9.6 and 136 people per square kilometer.

1 mi^2 = 2.589988 km^2 http://mcdc.missouri.edu/TenThings/urbanrural.shtml

Torn.df %>%
  summarize(nT = n(),
            avgPopD = mean(popD),
            medianPopD = median(popD),
            q25 = quantile(popD, probs = .25),
            q75 = quantile(popD, probs = .75),
            maxPopD = max(popD),
            minPopD = min(popD)) %>%
  t()
##                    [,1]
## nT          2784.000000
## avgPopD      224.088765
## medianPopD    30.954250
## q25            9.031192
## q75          144.605224
## maxPopD    13949.001032
## minPopD        0.000000
Torn.df %>%
  dplyr::select(date, st, mag, inj, fat, cas, popD) %>%
  arrange(desc(popD)) %>%
  head(., n = 10) %>%
  xtable(., digits = 0)
## % latex table generated in R 3.4.1 by xtable 1.8-2 package
## % Tue Aug 22 09:04:08 2017
## \begin{table}[ht]
## \centering
## \begin{tabular}{rllrrrrr}
##   \hline
##  & date & st & mag & inj & fat & cas & popD \\ 
##   \hline
## 1 & 2007-08-08 & NY & 2 & 9 & 0 & 9 & 13949 \\ 
##   2 & 1998-01-09 & CA & 1 & 1 & 0 & 1 & 12249 \\ 
##   3 & 1999-01-18 & PA & 0 & 18 & 0 & 18 & 12220 \\ 
##   4 & 2010-09-16 & NY & 1 & 1 & 1 & 2 & 9524 \\ 
##   5 & 2003-03-27 & FL & 2 & 14 & 1 & 15 & 6588 \\ 
##   6 & 2010-07-25 & NY & 1 & 7 & 0 & 7 & 6094 \\ 
##   7 & 2013-07-19 & FL & 0 & 3 & 0 & 3 & 5224 \\ 
##   8 & 2014-07-28 & MA & 2 & 2 & 0 & 2 & 4440 \\ 
##   9 & 1998-05-05 & CA & 2 & 1 & 0 & 1 & 4340 \\ 
##   10 & 2015-05-24 & TX & 0 & 4 & 0 & 4 & 4047 \\ 
##    \hline
## \end{tabular}
## \end{table}
sum(Torn.df$popD == 0)
## [1] 21
df = Torn.df %>%
  filter(yr >= 1990 & yr <= 2001) %>%
  summarize(totalPop = sum(pop))

Annual average number of people exposed to tornadoes that caused at least one casualty.

df = Torn.df %>%
  filter(cas > 0) %>%
  group_by(Year) %>%
  summarize(nT = n(),
            mPop = mean(pop),
            sdPop = sd(pop),
            sePop = sdPop/sqrt(nT),
            mPopm = mPop - sePop,
            mPopp = mPop + sePop)
A1 = ggplot(df, aes(x = Year, y = mPop)) +
  geom_point() +
#  geom_smooth(method = lm) +
  geom_errorbar(aes(ymin = mPopm, ymax = mPopp), width = .2) +
  scale_y_continuous(limits = c(0, 2500)) +
  scale_x_continuous(breaks = seq(1995, 2016, 4)) +
  xlab("Year") + ylab("Number of People Exposed") +
  theme_minimal()
#  ggtitle("Tornadoes Resulting in at Least One Casualty")

Distribution of population and population density by EF rating.

PeopleByEF = Torn.df %>%
  group_by(mag) %>%
  summarize(nT = n(),
            nP = round(sum(pop), 0),
            avgP = round(mean(pop), 0),
            medP = round(median(pop), 0),
            avgPD = mean(popD),
            medPD = median(popD))
sum(PeopleByEF$nP)
## [1] 1753011
xtable(PeopleByEF, digits = 0)
## % latex table generated in R 3.4.1 by xtable 1.8-2 package
## % Tue Aug 22 09:04:08 2017
## \begin{table}[ht]
## \centering
## \begin{tabular}{rrrrrrrr}
##   \hline
##  & mag & nT & nP & avgP & medP & avgPD & medPD \\ 
##   \hline
## 1 & 0 & 237 & 10193 & 43 & 2 & 424 & 46 \\ 
##   2 & 1 & 963 & 246273 & 256 & 14 & 289 & 37 \\ 
##   3 & 2 & 923 & 442369 & 479 & 67 & 198 & 30 \\ 
##   4 & 3 & 488 & 552440 & 1132 & 239 & 91 & 24 \\ 
##   5 & 4 & 154 & 393415 & 2555 & 680 & 104 & 24 \\ 
##   6 & 5 & 19 & 108321 & 5701 & 3307 & 106 & 27 \\ 
##    \hline
## \end{tabular}
## \end{table}
PeopleByEFc= Torn.df %>%
  filter(cas > 0) %>%
  group_by(mag) %>%
  summarize(nT = n(),
            nP = round(sum(pop), 0),
            avgP = round(mean(pop), 0),
            medP = round(median(pop), 0),
            avgPD = mean(popD),
            medPD = median(popD))
sum(PeopleByEFc$nP)
## [1] 1753011
xtable(PeopleByEFc, digits = 0)
## % latex table generated in R 3.4.1 by xtable 1.8-2 package
## % Tue Aug 22 09:04:08 2017
## \begin{table}[ht]
## \centering
## \begin{tabular}{rrrrrrrr}
##   \hline
##  & mag & nT & nP & avgP & medP & avgPD & medPD \\ 
##   \hline
## 1 & 0 & 237 & 10193 & 43 & 2 & 424 & 46 \\ 
##   2 & 1 & 963 & 246273 & 256 & 14 & 289 & 37 \\ 
##   3 & 2 & 923 & 442369 & 479 & 67 & 198 & 30 \\ 
##   4 & 3 & 488 & 552440 & 1132 & 239 & 91 & 24 \\ 
##   5 & 4 & 154 & 393415 & 2555 & 680 & 104 & 24 \\ 
##   6 & 5 & 19 & 108321 & 5701 & 3307 & 106 & 27 \\ 
##    \hline
## \end{tabular}
## \end{table}

Table 2 Distribution of population and population density in the path of tornadoes by EF rating.

Distribution of population density in the path of casualty-producing tornadoes.

df = Torn.df %>%
  filter(cas > 0) %>%
  filter(popD > 0) %>%
  group_by(mag) %>%
  summarize(pop = sum(pop),
            nT = n(),
            avgpop = pop/nT)

df = Torn.df %>%
  filter(cas > 0)
A2 = ggplot(df[df$popD > 0, ], aes(popD)) +
  geom_histogram(binwidth = .5, color = "white") +
  scale_x_log10(breaks = c(.01, .1, 1, 10, 100, 1000, 10000), 
                labels = c(".01", ".1", "1", "10", "100", "1000", "10,000")) +
  xlab("Population Density [people/sq. km]") +
  scale_y_continuous(limits = c(0, 1000)) +
  ylab("Number of Tornadoes") +
  theme_minimal()

Table 3: Selected towns and cities by state. Population density is given in parentheses.

Energy dissipation

Empirical model for tornado winds by EF rating taken from Table 3-1 of NRC 2007. Percent area by EF rating for each EF category. Threshold wind speeds (m/s) are a lower bound 3 sec gusts on the operational EF Scale (Table 2-1 of NRC2007).

perc = c(1, 0, 0, 0, 0, 0, 
         .772, .228, 0, 0, 0, 0,
         .616, .268, .115, 0, 0, 0,
         .529, .271, .133, .067, 0, 0,
         .543, .238, .131, .056, .032, 0,
         .538, .223, .119, .07, .033, .017)
percM = matrix(perc, ncol = 6, byrow = TRUE)

Compute energy dissipation. \(E = A_p \rho \sum_{j=0}^{J} w_j v_j^{3},\) where \(A_p\) is the area of the path, \(\rho\) is area density [1 kg/m^3] \(v_j\) is the midpoint wind speed for each rating, and \(w_j\) is the corresponding fraction of path area by EF rating. With no upper bound on the EF5 wind speeds, the midpoint wind speed is set at 97 m~s\(^{-1}\) (7.5 m~s\(^{-1}\) above the threshold wind speed consistent with the EF4 midpoint speed relative to its threshold).

Torn.df = Torn.df %>%
  filter(cas > 0)
threshW = c(29.06, 38.45, 49.62, 60.8, 74.21, 89.41)
midptW = c(diff(threshW)/2 + threshW[-length(threshW)], threshW[length(threshW)] + 7.5)
ef = Torn.df$mag + 1
EW3 = numeric()
for(i in 1:length(ef)) EW3[i] = midptW^3 %*% percM[ef[i], ]
Torn.df = Torn.df %>%
  mutate(ED = EW3 * AreaPath)

Annual energy dissipation of tornadoes with at least one casualty.

df = Torn.df %>%
  group_by(Year) %>%
  summarize(nT = n(),
            mED = mean(ED),
            sdED = sd(ED),
            seED = sdED/sqrt(nT),
            mEDm = mED - seED,
            mEDp = mED + seED)

B1 = ggplot(df, aes(x = Year, y = mED/10^12)) +
  geom_point() +
  geom_smooth(method = lm) +
  geom_errorbar(aes(ymin = mEDm/10^12, ymax = mEDp/10^12), width = .2) +
  scale_y_continuous(limits = c(-.15, NA)) +
  scale_x_continuous(breaks = seq(1995, 2016, 4)) +
  xlab("Year") + ylab("Energy Dissipation [PW]") +
  theme_minimal() 
#  ggtitle("Tornadoes Resulting in at Least One Casualty")

Distribution of energy dissipation.

B2 = ggplot(Torn.df[Torn.df$cas > 0,], aes(ED)) +
  geom_histogram(binwidth = .5, color = "white") +
  scale_x_log10(breaks = trans_breaks("log10", function(x) 10^x),
                     labels = trans_format("log10", math_format(10^.x))) +
  xlab("Energy Dissipation [Watts]") +
  scale_y_continuous(limits = c(0, 1000)) +
  ylab("Number of Tornadoes") +
  theme_minimal()

Combine figures.

source("multiplot.txt")
mtrx = matrix(c(1, 2), nrow = 2, byrow = TRUE)
A1 = A1 + ggtitle("A") + 
  theme(plot.title = element_text(hjust = -.03))
B1 = B1 + ggtitle("C") + 
  theme(plot.title = element_text(hjust = 0))
#multiplot(A1, B1, layout = mtrx)
mtrx = matrix(c(1, 2, 3, 4), nrow = 2, byrow = TRUE)
A2 = A2 + ggtitle("B") + 
  theme(plot.title = element_text(hjust = 0))
B2 = B2 + ggtitle("D") + 
  theme(plot.title = element_text(hjust = 0))
multiplot(A1, A2, B1, B2, layout = mtrx)
## Loading required package: grid

Figure 3 People exposed and energy dissipation for all casualty-producing tornadoes (1995-2016). (A) Annual average number of people exposed per tornado, (B) distribution of per tornado population density, (C) annual average energy dissipation per tornado, and (D) distribution of per tornado energy dissipation.

For the set of 2208 tornadoes with at least one casualty the median energy dissipation is 95.5 gigawatt with an inter-quartile range between 14 and 511 GW. The Tallulah-Yazoo City-Durant tornado (Louisiana and Mississippi) of 24 April 2010 that killed ten and injured 146 has the highest energy dissipation at 66.2 PW.

Torn.df %>%
  summarize(nT = n(),
            avgED = mean(ED)/10^9,
            medianED = median(ED)/10^9,
            q25 = quantile(ED, probs = .25)/10^9,
            q75 = quantile(ED, probs = .75)/10^9,
            maxED = max(ED)/10^9,
            minED = min(ED)/10^9)
##     nT    avgED medianED      q25      q75    maxED       minED
## 1 2784 791.6291 77.61539 10.85139 445.4082 66169.66 0.005659764
Torn.df %>%
  dplyr::select(date, st, mag, inj, fat, cas, popD, ED) %>%
  arrange(desc(ED)) %>%
  head(., n = 10) %>%
  xtable(., digits = 1)
## % latex table generated in R 3.4.1 by xtable 1.8-2 package
## % Tue Aug 22 09:04:09 2017
## \begin{table}[ht]
## \centering
## \begin{tabular}{rllrrrrrr}
##   \hline
##  & date & st & mag & inj & fat & cas & popD & ED \\ 
##   \hline
## 1 & 2010-04-24 & LA &  4 & 146 & 10 & 156 & 8.5 & 66169658959790.3 \\ 
##   2 & 2011-04-27 & AL &  5 & 145 & 72 & 217 & 52.5 & 49073840644331.3 \\ 
##   3 & 2004-05-22 & NE &  4 & 38 &  1 & 39 & 13.0 & 34265399398044.4 \\ 
##   4 & 2011-04-27 & AL &  4 & 1500 & 64 & 1564 & 132.7 & 30251617428152.3 \\ 
##   5 & 2011-04-27 & AL &  4 & 54 & 13 & 67 & 21.5 & 25950329144119.0 \\ 
##   6 & 2011-04-27 & AL &  4 & 85 & 22 & 107 & 32.1 & 24704083877123.4 \\ 
##   7 & 2008-02-05 & AR &  4 & 139 & 13 & 152 & 10.3 & 23193868125876.3 \\ 
##   8 & 1998-04-16 & TN &  5 & 36 &  3 & 39 & 18.4 & 20640754792221.8 \\ 
##   9 & 2008-05-10 & OK &  4 & 350 & 21 & 371 & 15.1 & 19165780063306.2 \\ 
##   10 & 2011-04-25 & AR &  2 & 16 &  4 & 20 & 37.2 & 19160328444788.8 \\ 
##    \hline
## \end{tabular}
## \end{table}

Results

Descriptive statistics

Summary statistics. Descriptive statistics for dependent and independent variables used in the regresion models, 1995–2016 (N = 2,192)

Torn.df %>%
  filter(popD > 0) %>%
  summarize(nT = n(),
            casM = mean(cas),
            casMax = max(cas),
            casMin = min(cas),
            casSD = sd(cas),
            popDM = mean(popD),
            popDMax = max(popD),
            popDMin = min(popD),
            popDSD = sd(popD),
            EDM = mean(ED),
            EDMax = max(ED),
            EDMin = min(ED),
            EDSD = sd(ED)) %>%
  t() %>%
xtable(., digits = 0)
## % latex table generated in R 3.4.1 by xtable 1.8-2 package
## % Tue Aug 22 09:04:09 2017
## \begin{table}[ht]
## \centering
## \begin{tabular}{rr}
##   \hline
##  & x \\ 
##   \hline
## nT & 2763 \\ 
##   casM & 11 \\ 
##   casMax & 1564 \\ 
##   casMin & 1 \\ 
##   casSD & 49 \\ 
##   popDM & 226 \\ 
##   popDMax & 13949 \\ 
##   popDMin & 0 \\ 
##   popDSD & 667 \\ 
##   EDM & 797564860290 \\ 
##   EDMax & 66169658959790 \\ 
##   EDMin & 5659764 \\ 
##   EDSD & 2720667862347 \\ 
##    \hline
## \end{tabular}
## \end{table}

Population and casualties and energy dissipation and casualties. There are seven orders of magnitude separating the lowest from the highest per-tornado energy dissipation. There are five orders of magnitude separating the lowest from highest per-tornado population density.

A3 = ggplot(Torn.df[Torn.df$popD > 0, ], aes(x = popD, y = cas)) +
  geom_point() +
  scale_x_log10(breaks = c(.01, 1, 100, 10000), 
                labels = c(".01", "1", "100", "10,000")) +
  scale_y_log10(breaks = c(1, 10, 100, 1000)) +
  xlab("Population Density [people/sq. km]") +
  ylab("Number of Casualties") +
  theme_minimal()

B3 = ggplot(Torn.df, aes(x = ED, y = cas)) +
  geom_point() +
  scale_x_log10(breaks = c(10^7, 10^9, 10^11, 10^13),
                labels = c(".001", ".1",  "10", "1,000")) +
  scale_y_log10(breaks = c(1, 10, 100, 1000)) +
  xlab("Energy Dissipation [GW]") +
  ylab("Number of Casualties") +
  theme_minimal()

mtrx = matrix(c(1, 2), nrow = 1, byrow = TRUE)
A3 = A3 + ggtitle("A") + 
  theme(plot.title = element_text(hjust = 0))
B3 = B3 + ggtitle("B") + 
  theme(plot.title = element_text(hjust = 0))
multiplot(A3, B3, layout = mtrx)

Figure 4 Casualties versus population density (A) and energy dissipation (B).

Correlation between ED and popD.

Torn.df %>%
  group_by(mag) %>%
  summarize(nT = n(),
            totalED = sum(ED)/10^12,
            avgED = mean(ED)/10^12,
            medED = median(ED)/10^12)
## # A tibble: 6 x 5
##     mag    nT    totalED       avgED       medED
##   <int> <int>      <dbl>       <dbl>       <dbl>
## 1     0   237   2.204371 0.009301145 0.001414941
## 2     1   963  72.875709 0.075675710 0.017000513
## 3     2   923 375.868959 0.407225307 0.123699523
## 4     3   488 839.176917 1.719624830 0.841821518
## 5     4   154 726.489552 4.717464627 2.240879245
## 6     5    19 187.280005 9.856842348 6.758806234
Torn.df %>%
  filter(popD > 0) %>%
  mutate(lED = log10(ED),
         lpopD = log10(popD)) %>%
  summarize(r1 = cor(ED, popD, method = "p"),
            r2 = cor(ED, popD, method = "s"),
            r3 = cor(lED, lpopD))
##            r1         r2         r3
## 1 -0.07129524 -0.2012558 -0.1828996

Casualty model

Mean and variance of casualty counts.

Torn.df %>%
  filter(popD > 0) %>%
  summarize(meanCas = mean(cas),
            varCas = var(cas),
            ratio = varCas/meanCas)
##    meanCas   varCas    ratio
## 1 11.41585 2387.651 209.1523

For the set of tornadoes with at least one casualty, the mean and variance of the counts are 11.8 and 2826, respectively suggesting a negative binomial model for counts.

Use the glm.nb function from the MASS package to fit the model and get the estimate for theta.

df = Torn.df %>%
  filter(popD > 0) %>%
  mutate(lpopD = log10(popD),
         lpop = log10(pop),
         lED = log10(ED))

formula = cas ~ lpopD + lED + lpopD:lED

suppressMessages(library(MASS))
model0 = glm.nb(formula, data = df, link = log, init.theta = 1)
model1 = glm(formula, data = df, family = neg.bin(theta = model0$theta))
summary(model1)
## 
## Call:
## glm(formula = formula, family = neg.bin(theta = model0$theta), 
##     data = df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0763  -1.0004  -0.5362   0.0699   7.8741  
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3.08017    0.74271  -4.147 3.47e-05 ***
## lpopD       -1.59540    0.40936  -3.897 9.96e-05 ***
## lED          0.39146    0.06768   5.784 8.13e-09 ***
## lpopD:lED    0.19533    0.03824   5.109 3.47e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial family taken to be 3.066176)
## 
##     Null deviance: 5225.3  on 2762  degrees of freedom
## Residual deviance: 2872.1  on 2759  degrees of freedom
## AIC: 16661
## 
## Number of Fisher Scoring iterations: 17
xtable(model1)
## % latex table generated in R 3.4.1 by xtable 1.8-2 package
## % Tue Aug 22 09:04:10 2017
## \begin{table}[ht]
## \centering
## \begin{tabular}{rrrrr}
##   \hline
##  & Estimate & Std. Error & t value & Pr($>$$|$t$|$) \\ 
##   \hline
## (Intercept) & -3.0802 & 0.7427 & -4.15 & 0.0000 \\ 
##   lpopD & -1.5954 & 0.4094 & -3.90 & 0.0001 \\ 
##   lED & 0.3915 & 0.0677 & 5.78 & 0.0000 \\ 
##   lpopD:lED & 0.1953 & 0.0382 & 5.11 & 0.0000 \\ 
##    \hline
## \end{tabular}
## \end{table}

Table 4 Coefficients of a negative binomial regression model of tornado casualties.

Use the interplot package.

suppressMessages(library(interplot))
A4 = interplot(m = model1, var1 = "lED", var2 = "lpopD", hist = TRUE) +
    scale_x_continuous(breaks = c(-2, 0, 2, 4), 
                       labels = c(".01", "1", "100", "10,000")) +
    scale_y_continuous(breaks = pretty_breaks()) +
    geom_hline(yintercept = 0, linetype = "dashed") +
    xlab("Population Density [people/sq. km]") +
    ylab("Energy Elasticity") +
    theme_minimal() 

B4 = interplot(m = model1, var1 = "lpopD", var2 = "lED", hist = TRUE) +
    scale_x_continuous(breaks = c(8, 10, 12, 14),
                       labels = c(".1", "10", "1000", "100,000")) +
    scale_y_continuous(breaks = pretty_breaks()) +
    geom_hline(yintercept = 0, linetype = "dashed") +
    xlab("Energy Dissipation [GW]") +
    ylab("Population Elasticity") +
    theme_minimal()

mtrx = matrix(c(1, 2), nrow = 2, byrow = TRUE)
A4 = A4 + ggtitle("A") + 
  theme(plot.title = element_text(hjust = 0))
B4 = B4 + ggtitle("B") + 
  theme(plot.title = element_text(hjust = 0))
multiplot(A4, B4, layout = mtrx)

Figure 6 Marginal effects plots. Relative percentage of observations are shown as a histogram along the horizontal axis.

Use the effects package to get marginal predictions. theta in the neg.bin argument is determined by the glm.nb function above.

suppressMessages(library(effects))
eff = allEffects(model1, xlevels = list(lED = seq(8, 13, 1), 
                                       lpopD = seq(0, 3, 1)))
results1 = as.data.frame(eff[[1]]) %>%
  mutate(popD = 10^lpopD,
         ED = 10^lED,
         lbl = paste("Population Density", popD, "[people/sq. km]", sep = " "))

ggplot(results1, aes(x = ED, y = fit)) +
  geom_point() +
  geom_errorbar(aes(ymax = upper, ymin = lower), width = .1) +
  scale_x_log10(breaks = c(10^8, 10^10, 10^12),
                labels = c(".1", "10", "1000")) +
  scale_y_continuous(limits = c(0, NA), breaks = pretty_breaks()) +
  facet_wrap(~ lbl, scales = "free_y") +
  xlab("Energy Dissipation [GW]") +
  ylab("Casualty Rate") +
  theme_minimal()

Figure 7 Predicted effect of energy dissipation on casualties for given levels of population density.

Model adequacy

pchisq(model1$deviance, model1$df.residual, lower.tail = FALSE) # model is adequate p-value = .07.
## [1] 0.06533669

Variation in the population/energy interaction

Regionally

df0 = Torn.df %>%
  group_by(st) %>%
  summarize(nT = n())
sts = df0$st[df0$nT >= 70]

df = Torn.df %>%
  filter(popD > 0) %>%
  mutate(lcas = log10(cas),
         lpopD = log10(popD),
         lpop = log10(pop),
         lED = log10(ED))

formula = cas ~ lpopD + lED + lpopD:lED
results0 = df %>%
  filter(st %in% sts) %>%
  group_by(st) %>%
#  do(mod0 = glm.nb(formula, data = ., link = log, 
#                   epsilon = 1e-5, start = c(0, 1, 1, 0))) %>%
  do(mod0 = glm.nb(formula, data = ., link = log, init.theta = 1)) %>%
  summarize(st = first(st),
            theta = mod0$theta,
            pEl = coef(mod0)[2],
            eEl = coef(mod0)[3],
            mEf = coef(mod0)[4],
            sig = coef(summary(mod0))[4, 4])
## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge
## Warning in glm.nb(formula, data = ., link = log, init.theta = 1):
## alternation limit reached
pEl = NULL
eEl = NULL
mEf = NULL
sig = NULL
for(i in 1:length(sts)){
  df1 = df[df$st == sts[i], ]
  mod1 = glm(formula, data = df1, 
                family = neg.bin(theta = results0$theta[i]), 
                epsilon = 1e-5)
  pEl = c(pEl, coef(mod1)[2])
  eEl = c(eEl, coef(mod1)[3])
  mEf = c(mEf, coef(mod1)[4])
  sig = c(sig, coef(summary(mod1))[4, 4])
}
## Warning: glm.fit: algorithm did not converge
results1 = data.frame(sts, pEl, eEl, mEf, sig)
results1 %>%
  arrange(desc(mEf)) %>%
  xtable(., digits = 3)
## % latex table generated in R 3.4.1 by xtable 1.8-2 package
## % Tue Aug 22 09:04:13 2017
## \begin{table}[ht]
## \centering
## \begin{tabular}{rlrrrr}
##   \hline
##  & sts & pEl & eEl & mEf & sig \\ 
##   \hline
## 1 & AR & -9.510 & -0.075 & 0.872 & 0.000 \\ 
##   2 & MO & -5.818 & 0.069 & 0.597 & 0.002 \\ 
##   3 & IL & -4.675 & 0.161 & 0.486 & 0.001 \\ 
##   4 & KY & -4.875 & 0.193 & 0.478 & 0.042 \\ 
##   5 & TN & -4.066 & 0.078 & 0.439 & 0.020 \\ 
##   6 & AL & -3.851 & 0.281 & 0.414 & 0.009 \\ 
##   7 & OK & -2.288 & 0.375 & 0.293 & 0.030 \\ 
##   8 & MS & -2.465 & 0.526 & 0.265 & 0.137 \\ 
##   9 & NC & -2.079 & 0.154 & 0.214 & 0.553 \\ 
##   10 & KS & -1.742 & 0.788 & 0.214 & 0.233 \\ 
##   11 & TX & -1.854 & 0.286 & 0.211 & 0.004 \\ 
##   12 & LA & -0.865 & 0.689 & 0.113 & 0.441 \\ 
##   13 & GA & -0.323 & 0.581 & 0.059 & 0.779 \\ 
##   14 & IN & 0.036 & 0.606 & 0.055 & 0.813 \\ 
##   15 & FL & 1.116 & 0.998 & -0.091 & 0.536 \\ 
##   16 & SC & 1.703 & 0.380 & -0.112 & 0.637 \\ 
##    \hline
## \end{tabular}
## \end{table}

Table 5 Interaction coefficients and associated \(p\)-value by state.

Compare conditional energy elasticities between states with substantive and significant interaction (AR, TN, MO) and those with insignificant interaction. Leave out states where there is significant, but weaker interactions.

formula = cas ~ lpopD + lED + lpopD:lED

df = Torn.df %>%
  filter(popD > 0) %>%
#  filter(yr >= 2006 & yr <= 2016) %>%
#  filter(st == "IL") %>%
  filter(st == "AR" | st == "TN" | st == "MO") %>%
#  filter(st == "AR" | st == "TN" | st == "MO" | st == "KY" | st == "IL" | st == "OK" |
#         st == "AL" | st == "MS" | st == "TX" | st == "LA") %>%
  mutate(lpopD = log10(popD),
         lpop = log10(pop),
         lED = log10(ED))

range(df$lED)
## [1]  8.167772 13.365373
labs = (2^seq(-.5, 2.5, .5) - 1) * 100
model0A = glm.nb(formula, data = df, link = log, init.theta = 1)
model1A = glm(formula, data = df, family = neg.bin(theta = model0A$theta), epsilon = 1e-5)
A5 = interplot(m = model1A, var1 = "lED", var2 = "lpopD", hist = FALSE) +
    scale_x_continuous(limits = c(0, 3.5),
                       breaks = c(0, 1, 2, 3), 
                       labels = c("1", "10", "100", "1000")) +
#    scale_y_continuous(limits = c(-.4, 2.7), breaks = pretty_breaks()) +
    scale_y_continuous(limits = c(-.4, 2.7),
                       breaks = c(0, 1, 2), 
                       labels = c("0", "100 %", "200 %")) +
    geom_hline(yintercept = 0, linetype = "dashed") +
    xlab("Population Density [people/sq. km]") +
    ylab("Increase in Casualties\nPer Doubling of Energy") +
    theme_minimal() 

df = Torn.df %>%
  filter(popD > 0) %>%
#  filter(yr >= 2006 & yr <= 2016) %>%
#  filter(st == "AL") %>%
#  filter(st == "GA" | st == "FL" | st == "LA") %>%
  filter(st != "AR" & st != "TN" & st != "MO" & st != "KY" & st != "IL" & 
        st != "OK" & st != "AL" & st != "MS" & st != "TX" & st != "LA") %>%
  mutate(lpopD = log10(popD),
         lpop = log10(pop),
         lED = log10(ED))

range(df$lED)
## [1]  6.752798 13.534856
model0B = glm.nb(formula, data = df, link = log, init.theta = 1)
model1B = glm(formula, data = df, family = neg.bin(theta = model0B$theta), epsilon = 1e-5)
B5 = interplot(m = model1B, var1 = "lED", var2 = "lpopD", hist = FALSE) +
    scale_x_continuous(limits = c(0, 3.5),
                       breaks = c(0, 1, 2, 3), 
                       labels = c("1", "10", "100", "1000")) +
      scale_y_continuous(limits = c(-.4, 2.7),
                         breaks = c(0, 1, 2), 
                         labels = c("0", "100 %", "200 %")) +
#    scale_y_continuous(limits = c(-.4, 2.7), breaks = pretty_breaks()) +
    geom_hline(yintercept = 0, linetype = "dashed") +
    xlab("Population Density [people/sq. km]") +
    ylab("Increase in Casualties\nPer Doubling of Energy") +
    theme_minimal()

mtrx = matrix(c(1, 2), nrow = 1, byrow = TRUE)
A5 = A5 + ggtitle("A") + 
  theme(plot.title = element_text(hjust = 0))
B5 = B5 + ggtitle("B") + 
  theme(plot.title = element_text(hjust = 0))
multiplot(A5, B5, layout = mtrx)
## Warning: Removed 5 rows containing missing values (geom_path).
## Warning: Removed 48 rows containing missing values (geom_path).

Figure 8 The effect of energy on casualties conditional on population across the Mid South (A) and elsewhere (B).

Diurnally

Night vs day. Overall statistics.

suppressMessages(library(maptools))
tzone = "Etc/GMT+6"
p4s = "+proj=longlat +datum=WGS84"

loc = matrix(c(Torn.df$slon, Torn.df$slat), ncol = 2)
genesisTime = ymd_hms(paste(Torn.df$date, Torn.df$time), tz = tzone)
ssetTime = sunriset(crds = loc, genesisTime, 
                proj4string = CRS("+proj=longlat +datum=WGS84"), 
                direction = "sunset", POSIXct = TRUE)$time
sriseTime = sunriset(crds = loc, genesisTime, 
                proj4string = CRS("+proj=longlat +datum=WGS84"), 
                direction = "sunrise", POSIXct = TRUE)$time

loc2 = matrix(c(Torn.sfdf$slon, Torn.sfdf$slat), ncol = 2)
genesisTime2 = ymd_hms(paste(Torn.sfdf$date, Torn.sfdf$time), tz = tzone)
ssetTime2 = sunriset(crds = loc2, genesisTime2, 
                proj4string = CRS("+proj=longlat +datum=WGS84"), 
                direction = "sunset", POSIXct = TRUE)$time
sriseTime2 = sunriset(crds = loc2, genesisTime2, 
                proj4string = CRS("+proj=longlat +datum=WGS84"), 
                direction = "sunrise", POSIXct = TRUE)$time
Torn.df %>%
#  mutate(DateTime = as.POSIXct(paste(date, time), format = "%Y-%m-%d %H:%M:%S"),
#         hr = hour(DateTime),
#         Daytime = hr > 10 & hr <= 22) %>%
  mutate(genesisTime = genesisTime,
         sriseTime = sriseTime,
         ssetTime = ssetTime,
         Daytime = genesisTime > sriseTime & genesisTime < ssetTime) %>%
  filter(yr <= 2007) %>%
  group_by(Daytime) %>%
  summarize(nT = n(),
            nC = sum(cas),
            nF = sum(fat),
            casRate = nC/nT,
            fatRate = nF/nT)
## # A tibble: 2 x 6
##   Daytime    nT    nC    nF   casRate   fatRate
##     <lgl> <int> <int> <int>     <dbl>     <dbl>
## 1   FALSE   767  7884   508 10.279009 0.6623207
## 2    TRUE  1179 11034   484  9.358779 0.4105174
as.data.frame(Torn.sfdf) %>%
#  mutate(DateTime = as.POSIXct(paste(date, time), format = "%Y-%m-%d %H:%M:%S"),
#         hr = hour(DateTime),
#         Daytime = hr > 10 & hr <= 22) %>%
    mutate(genesisTime = genesisTime2,
           sriseTime = sriseTime2,
           ssetTime = ssetTime2,
           Daytime = genesisTime > sriseTime & genesisTime < ssetTime) %>%
  group_by(Daytime) %>%
  summarize(nT = n(),
            nTC = sum(cas > 0),
            nTF = sum(fat > 0),
            nC = sum(cas),
            pTC = nTC/nT,
            pTF = nTF/nT)
## # A tibble: 2 x 7
##   Daytime    nT   nTC   nTF    nC        pTC        pTF
##     <lgl> <int> <int> <int> <int>      <dbl>      <dbl>
## 1   FALSE  8950  1093   259 10954 0.12212291 0.02893855
## 2    TRUE 23729  1691   300 20633 0.07126301 0.01264276

Predicted casualty rates for tornadoes with at least one casualty. Mid South versus all states.

formula = cas ~ lpopD + lED + lpopD:lED

df = Torn.df %>%
  mutate(genesisTime = genesisTime,
         sriseTime = sriseTime,
         ssetTime = ssetTime,
         Daytime = genesisTime > sriseTime & genesisTime < ssetTime) %>%
  filter(popD > 0) %>%
#  filter(cas < 1000) %>%
#  filter(st == "AR" | st == "TN" | st == "MO" | st == "KY" | st == "IL" | st == "OK" |
#         st == "AL" | st == "MS" | st == "TX" | st == "LA") %>%
  filter(st == "AR" | st == "TN" | st == "MO") %>%
  filter(Daytime) %>%
  mutate(lpopD = log10(popD),
         lpop = log10(pop),
         lED = log10(ED))

model0Day = glm.nb(formula, data = df, link = log, init.theta = 1, epsilon = 1e-5)
model1Day = glm(formula, data = df, 
             family = neg.bin(theta = model0Day$theta), 
             epsilon = 1e-5)
effDay = allEffects(model1Day, xlevels = list(lED = seq(9, 12, 1), 
                       lpopD = c(log10(1), log10(10), log10(100), log10(1000))),
                    confidence.level = .95)
resultsDay = as.data.frame(effDay[[1]]) %>%
  mutate(popD = 10^lpopD,
         ED = 10^lED,
         lbl = paste(popD, "[people/sq. km]", sep = " "),
         Time = "Daytime")

df = Torn.df %>%
    mutate(genesisTime = genesisTime,
         sriseTime = sriseTime,
         ssetTime = ssetTime,
         Daytime = genesisTime > sriseTime & genesisTime < ssetTime) %>%
  filter(popD > 0) %>%
#  filter(cas < 1000) %>%
#  filter(st == "AR" | st == "TN" | st == "MO" | st == "KY" | st == "IL" | st == "OK" |
#         st == "AL" | st == "MS" | st == "TX" | st == "LA") %>%
  filter(st == "AR" | st == "TN" | st == "MO") %>%
  filter(!Daytime) %>%
  mutate(lpopD = log10(popD),
         lpop = log10(pop),
         lED = log10(ED))
model0Night = glm.nb(formula, data = df, link = log, init.theta = 1, epsilon = 1e-5)
model1Night = glm(formula, data = df, 
             family = neg.bin(theta = model0Night$theta), 
             epsilon = 1e-5)
effNight = allEffects(model1Night, xlevels = list(lED = seq(9, 12, 1), 
                       lpopD = c(log10(1), log10(10), log10(100), log10(1000))),
                      confidence.level = .95)
resultsNight = as.data.frame(effNight[[1]]) %>%
  mutate(popD = 10^lpopD,
         ED = 10^lED,
         lbl = paste(popD, "[people/sq. km]", sep = " "),
         Time = "Nighttime")

results = rbind(resultsDay, resultsNight)

dodge = position_dodge(width = .5)
B6 = ggplot(results, aes(x = ED, y = fit, color = Time)) +
  geom_point(position = dodge) +
  geom_errorbar(aes(ymax = upper, ymin = lower), width = .1, position = dodge) +
  scale_x_log10(breaks = c(10^9, 10^10, 10^11, 10^12),
                labels = c("1", "10", "100", "1000")) +
  scale_y_continuous(limits = c(0, NA), breaks = pretty_breaks()) +
  geom_errorbar(aes(ymax = upper, ymin = lower), width = .1, position = dodge) +
  facet_wrap(~ lbl, scales = "free_y") +
  xlab("Energy Dissipation [GW]") +
  ylab("Casualty Rate") +
  scale_color_discrete("") +
  theme_minimal() +
    theme(legend.position="bottom")

df = Torn.df %>%
  mutate(genesisTime = genesisTime,
         sriseTime = sriseTime,
         ssetTime = ssetTime,
         Daytime = genesisTime > sriseTime & genesisTime < ssetTime) %>%
  filter(popD > 0) %>%
#  filter(cas < 1000) %>%
#  filter(st != "AR" & st != "TN" & st != "MO" & st != "KY" & st != "IL" & st != "OK" &
#         st != "AL" & st != "MS" & st != "TX" & st != "LA") %>%
  filter(Daytime) %>%
  mutate(lpopD = log10(popD),
         lpop = log10(pop),
         lED = log10(ED))

model0Day = glm.nb(formula, data = df, link = log, init.theta = 1, epsilon = 1e-5)
model1Day = glm(formula, data = df, 
             family = neg.bin(theta = model0Day$theta), 
             epsilon = 1e-5)
effDay = allEffects(model1Day, xlevels = list(lED = seq(9, 12, 1), 
                       lpopD = c(log10(1), log10(10), log10(100), log10(1000))),
                    confidence.level = .95)
resultsDay = as.data.frame(effDay[[1]]) %>%
  mutate(popD = 10^lpopD,
         ED = 10^lED,
         lbl = paste(popD, "[people/sq. km]", sep = " "),
         Time = "Daytime")

df = Torn.df %>%
  mutate(genesisTime = genesisTime,
         sriseTime = sriseTime,
         ssetTime = ssetTime,
         Daytime = genesisTime > sriseTime & genesisTime < ssetTime) %>%
  filter(popD > 0) %>%
#  filter(cas < 1000) %>%
#  filter(st != "AR" & st != "TN" & st != "MO" & st != "KY" & st != "IL" & st != "OK" &
#         st != "AL" & st != "MS" & st != "TX" & st != "LA") %>%
  filter(!Daytime) %>%
  mutate(lpopD = log10(popD),
         lpop = log10(pop),
         lED = log10(ED))

model0Night = glm.nb(formula, data = df, link = log, init.theta = 1, epsilon = 1e-5)
model1Night = glm(formula, data = df, 
             family = neg.bin(theta = model0Night$theta), 
             epsilon = 1e-5)
effNight = allEffects(model1Night, xlevels = list(lED = seq(9, 12, 1), 
                       lpopD = c(log10(1), log10(10), log10(100), log10(1000))),
                      confidence.level = .95)
resultsNight = as.data.frame(effNight[[1]]) %>%
  mutate(popD = 10^lpopD,
         ED = 10^lED,
         lbl = paste(popD, "[people/sq. km]", sep = " "),
         Time = "Nighttime")

results = rbind(resultsDay, resultsNight)

dodge = position_dodge(width = .5)
A6 = ggplot(results, aes(x = ED, y = fit, color = Time)) +
  geom_point(position = dodge) +
  geom_errorbar(aes(ymax = upper, ymin = lower), width = .1, position = dodge) +
  scale_x_log10(breaks = c(10^9, 10^10, 10^11, 10^12),
                labels = c("1", "10", "100", "1000")) +
  scale_y_continuous(limits = c(0, NA), breaks = pretty_breaks()) +
  geom_errorbar(aes(ymax = upper, ymin = lower), width = .1, position = dodge) +
  facet_wrap(~ lbl, scales = "free_y") +
  xlab("Energy Dissipation [GW]") +
  ylab("Casualty Rate") +
  scale_color_discrete("") +
  theme_minimal() +
    theme(legend.position="bottom")

mtrx = matrix(c(1, 2), nrow = 1, byrow = TRUE)
A6 = A6 + ggtitle("B       Everywhere") + 
  theme(plot.title = element_text(hjust = 0))
B6 = B6 + ggtitle("A       Mid South") + 
  theme(plot.title = element_text(hjust = 0))
multiplot(B6, A6, layout = mtrx)

Figure 9 Predicted casualty rates daytime versus nighttime for the Mid South (A) and all states (B).

Daily

Weekday versus weekend

formula = cas ~ lpopD + lED + lpopD:lED

df = Torn.df %>%
  mutate(DateTime = as.POSIXct(paste(date, time), format = "%Y-%m-%d %H:%M:%S"),
         dow = wday(DateTime, label = TRUE, abbr = TRUE)) %>%
  filter(popD > 0) %>%
  filter(st == "AR" | st == "TN" | st == "MO" | st == "KY" | st == "OK" | st == "AL" | st == "MS" | st == "TX" | st == "LA") %>%
#  filter(st == "AR" | st == "TN" | st == "MO") %>%
  filter(dow == "Sun" | dow == "Sat") %>%
  mutate(lpopD = log10(popD),
         lpop = log10(pop),
         lED = log10(ED))

model0Weekend = glm.nb(formula, data = df, link = log, init.theta = 1, epsilon = 1e-5)

model1Weekend = glm(formula, data = df, 
             family = neg.bin(theta = model0Weekend$theta), 
             epsilon = 1e-5)
effWeekend = allEffects(model1Weekend, xlevels = list(lED = seq(9, 12, 1), 
                        lpopD = c(log10(1), log10(10), log10(100), log10(1000))),
                 confidence.level = .95)
resultsWeekend = as.data.frame(effWeekend[[1]]) %>%
  mutate(popD = round(10^lpopD),
         ED = 10^lED,
         Density = paste(popD, "[people/sq. km]", sep = " "),
         Day = "Weekend")

df = Torn.df %>%
  mutate(DateTime = as.POSIXct(paste(date, time), format = "%Y-%m-%d %H:%M:%S"),
         dow = wday(DateTime, label = TRUE, abbr = TRUE)) %>%
  filter(popD > 0) %>%
  filter(st == "AR" | st == "TN" | st == "MO" | st == "KY" | st == "OK" | 
         st == "AL" | st == "MS" | st == "TX" | st == "LA") %>%
#  filter(st == "AR" | st == "TN" | st == "MO") %>%
  filter(dow != "Sun" & dow != "Sat") %>%
  mutate(lpopD = log10(popD),
         lpop = log10(pop),
         lED = log10(ED))

model0Weekday = glm.nb(formula, data = df, link = log, init.theta = 1, epsilon = 1e-5)
model1Weekday = glm(formula, data = df, 
             family = neg.bin(theta = model0Weekday$theta), 
             epsilon = 1e-5)
effWeekday = allEffects(model1Weekday, xlevels = list(lED = seq(9, 12, 1), 
                        lpopD = c(log10(1), log10(10), log10(100), log10(1000))),
                 confidence.level = .95)
resultsWeekday = as.data.frame(effWeekday[[1]]) %>%
  mutate(popD = round(10^lpopD),
         ED = 10^lED,
         Density = paste(popD, "[people/sq. km]", sep = " "),
         Day = "Weekday")

results = rbind(resultsWeekend, resultsWeekday)

A7 = ggplot(results, aes(x = ED, y = fit, color = Day)) +
  geom_point(position = dodge) +
  scale_x_log10(breaks = c(10^9, 10^10, 10^11, 10^12),
                labels = c("1", "10", "100", "1000")) +
  scale_y_continuous(limits = c(0, NA), breaks = pretty_breaks()) +
  facet_wrap(~ Density, scales = "free_y", nrow = 2) +
  geom_errorbar(aes(ymax = upper, ymin = lower), width = 0, position = dodge) +
  xlab("Energy Dissipation [GW]") +
  ylab("Casualty Rate") +
  scale_color_discrete("") +
  theme_minimal() +
    theme(legend.position="bottom") 

df = Torn.df %>%
  mutate(DateTime = as.POSIXct(paste(date, time), format = "%Y-%m-%d %H:%M:%S"),
         dow = wday(DateTime, label = TRUE, abbr = TRUE)) %>%
  filter(popD > 0) %>%
  filter(st != "AR" & st != "TN" & st != "MO" & st != "KY" & st != "OK" &
          st != "AL" & st != "MS" & st != "TX" & st != "LA") %>%
  filter(dow == "Sun" | dow == "Sat") %>%
  mutate(lpopD = log10(popD),
         lpop = log10(pop),
         lED = log10(ED))

model0Weekend = glm.nb(formula, data = df, link = log, init.theta = 1, epsilon = 1e-5)

model1Weekend = glm(formula, data = df, 
             family = neg.bin(theta = model0Weekend$theta), 
             epsilon = 1e-5)
effWeekend = allEffects(model1Weekend, xlevels = list(lED = seq(9, 12, 1), 
                        lpopD = c(log10(1), log10(10), log10(100), log10(1000))),
                 confidence.level = .95)
resultsWeekend = as.data.frame(effWeekend[[1]]) %>%
  mutate(popD = round(10^lpopD),
         ED = 10^lED,
         Density = paste(popD, "[people/sq. km]", sep = " "),
         Day = "Weekend")

df = Torn.df %>%
  mutate(DateTime = as.POSIXct(paste(date, time), format = "%Y-%m-%d %H:%M:%S"),
         dow = wday(DateTime, label = TRUE, abbr = TRUE)) %>%
  filter(popD > 0) %>%
  filter(st != "AR" & st != "TN" & st != "MO" & st != "KY" & st != "OK" & 
          st != "AL" & st != "MS" & st != "TX" & st != "LA") %>%
  filter(dow != "Sun" & dow != "Sat") %>%
  mutate(lpopD = log10(popD),
         lpop = log10(pop),
         lED = log10(ED))

model0Weekday = glm.nb(formula, data = df, link = log, init.theta = 1, epsilon = 1e-5)
model1Weekday = glm(formula, data = df, 
             family = neg.bin(theta = model0Weekday$theta), 
             epsilon = 1e-5)
effWeekday = allEffects(model1Weekday, xlevels = list(lED = seq(9, 12, 1), 
                        lpopD = c(log10(1), log10(10), log10(100), log10(1000))),
                 confidence.level = .95)
resultsWeekday = as.data.frame(effWeekday[[1]]) %>%
  mutate(popD = round(10^lpopD),
         ED = 10^lED,
         Density = paste(popD, "[people/sq. km]", sep = " "),
         Day = "Weekday")

results = rbind(resultsWeekend, resultsWeekday)

B7 = ggplot(results, aes(x = ED, y = fit, color = Day)) +
  geom_point(position = dodge) +
  scale_x_log10(breaks = c(10^9, 10^10, 10^11, 10^12),
                labels = c("1", "10", "100", "1000")) +
  scale_y_continuous(limits = c(0, NA), breaks = pretty_breaks()) +
  facet_wrap(~ Density, scales = "free_y", nrow = 2) +
  geom_errorbar(aes(ymax = upper, ymin = lower), width = 0, position = dodge) +
  xlab("Energy Dissipation [GW]") +
  ylab("Casualty Rate") +
  scale_color_discrete("") +
  theme_minimal() +
    theme(legend.position="bottom") 

mtrx = matrix(c(1, 2), nrow = 1, byrow = TRUE)
A7 = A7 + ggtitle("A       Greater Mid South") + 
  theme(plot.title = element_text(hjust = 0))
B7 = B7 + ggtitle("B       Elsewhere") + 
  theme(plot.title = element_text(hjust = 0))
multiplot(A7, B7, layout = mtrx)

Figure 10 Predicted casualty rates weekday versus weekend.

From FEMA: Safer, Stronger, Smarter: A Guide to Improving School Natural Hazard Safety June 2017. In May 2013, an EF5 tornado struck Moore, Oklahoma and resulted in 24 fatalities, including seven children at the Plaza Towers Elementary School. In April 2014, an EF4 tornado leveled a brand-new school still under construction in Vilonia, asuburb of Little Rock, Arkansas. While schools generally have some shortterm notification of a tornado warning, and tornado safe rooms are becoming an accepted standard of care, and are now a requirement for new schools in certain locations under the 2015 International Building Code, many schools remain vulnerable to tornadoes with no safe haven for students or staff.

Percent elderly: State Level https://www.census.gov/quickfacts/fact/table/

state = c("Arkansas", "Tennessee", "Missouri", "Kentucky", "Illinois", "Oklahoma",
          "Alabama", "Mississippi", "Texas", "North Carolina", "Louisiana",
          "Georgia")
Interaction = c(1.023, .717, .607, .493, .417, .404, .347, .329, 
                .308, .286, .246, .128)
Elderly2010 = c(14.4, 13.4, 14, 13.3, 12.5, 13.5, 13.8, 12.8, 
                10.3, 12.9, 12.3, 10.7)
Elderly2016 = c(16.3, 15.7, 16.1, 15.6, 16.1, 15, 16.1, 15.1,
                12, 15.5, 14.4, 13.1)
BachDegrees = c(21.1, 24.9, 27.1, 22.3, 32.3, 24.1, 23.5, 
                20.7, 27.6, 28.4, 22.5, 28.8)
Disability = c(12.3, 11.2, 10.4, 12.9, 7.1, 11.3, 11.8, 11.9, 8.1, 
               9.6, 11.0, 8.8)
Rent = c(677, 764, 746, 675, 907, 727, 717, 717, 882, 797, 788, 879)
Income = c(41371, 45219, 48173, 43740, 57574, 46879, 43623, 
           39665, 53207, 46868, 45047, 49620)
Black = c(15.4, 16.7, 11.6, 7.8, 14.5, 7.4, 26.2, 37, 11.8, 21.5, 
          32, 30.5)
White = c(74.5, 75.6, 81, 86.3, 63.7, 68.7, 67, 58, 45.3, 65.3, 
          60.3, 55.9)

cor.test(Interaction, Elderly2010)
## 
##  Pearson's product-moment correlation
## 
## data:  Interaction and Elderly2010
## t = 2.9821, df = 10, p-value = 0.01376
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1850385 0.9040284
## sample estimates:
##       cor 
## 0.6860824
dg = data.frame(state, Interaction, Elderly2010, Elderly2016)
xtable(dg)
## % latex table generated in R 3.4.1 by xtable 1.8-2 package
## % Tue Aug 22 09:04:20 2017
## \begin{table}[ht]
## \centering
## \begin{tabular}{rlrrr}
##   \hline
##  & state & Interaction & Elderly2010 & Elderly2016 \\ 
##   \hline
## 1 & Arkansas & 1.02 & 14.40 & 16.30 \\ 
##   2 & Tennessee & 0.72 & 13.40 & 15.70 \\ 
##   3 & Missouri & 0.61 & 14.00 & 16.10 \\ 
##   4 & Kentucky & 0.49 & 13.30 & 15.60 \\ 
##   5 & Illinois & 0.42 & 12.50 & 16.10 \\ 
##   6 & Oklahoma & 0.40 & 13.50 & 15.00 \\ 
##   7 & Alabama & 0.35 & 13.80 & 16.10 \\ 
##   8 & Mississippi & 0.33 & 12.80 & 15.10 \\ 
##   9 & Texas & 0.31 & 10.30 & 12.00 \\ 
##   10 & North Carolina & 0.29 & 12.90 & 15.50 \\ 
##   11 & Louisiana & 0.25 & 12.30 & 14.40 \\ 
##   12 & Georgia & 0.13 & 10.70 & 13.10 \\ 
##    \hline
## \end{tabular}
## \end{table}

Table 6 Percent elderly and model interaction term.

r(Elderly2010, Interaction) = .69 Variable r Elderly2010: %65+ in 2010 +.69 Percent White alone +.62 Percent Disabled <65 +.42 Median Gross Rent 2011-2015 -.53 Percent Black alone -.46 Percent Bachelor Degrees -.33 Median Household Income -.31

Speculation: Elderly whites: Greater degree of fatalism, passivity (physically harder to do something), and inattention to warnings.

Percent elderly: City Level Wikipedia: city name %65+ popD Memphis, TN 15.4 770
Hot Springs, AR 23.2 392 Little Rock, AR 11.6 640 Fort Smith, AR 13.7 537 Joplin, MO 13.2 540 Bella Vista, AR 41.9 234

Gainesville, GA 10.5 468 Albany, GA 11.3 535 Alexandria, LA 15.1 680 Birmingham, AL 13.5 551

The elderly are moving to the suburbs and exurbs. Hot Springs, Arkansas is a hotspot for old people.

Things are getting worse

Predicted casualty rates using the Mid-South and Elsewhere models fit to data in overlapping windows of W consecutive years.

formula = cas ~ lpopD + lED + lpopD:lED

W = 10 # number of years in the window
S = 13 # tornado strength log energy dissipation
P = log10(500) # population density log scale

df = Torn.df %>%
  filter(popD > 0) %>%
  filter(st == "AR" | st == "TN" | st == "MO" | st == "KY" | st == "OK" |
         st == "AL" | st == "MS" | st == "TX" | st == "LA") %>%
  mutate(lpopD = log10(popD),
         lpop = log10(pop),
         lED = log10(ED))
fit = numeric()
UL = numeric(); UU = numeric()
for(decadeEndYr in 2011:2016){
  df2 = df %>%
    filter(yr > decadeEndYr - W & yr <= decadeEndYr)
    
  model0A = glm.nb(formula, data = df2, link = log, init.theta = 1)
  model1A = glm(formula, data = df2, family = neg.bin(theta = model0A$theta), epsilon = 1e-5)
  
  pre = predict.glm(model1A, newdata = data.frame(lED = S, lpopD = P), type = "response", se.fit = TRUE)
  fit = c(fit, pre$fit)
  UL = c(UL, pre$fit - pre$se.fit)
  UU = c(UU, pre$fit + pre$se.fit)
}

resultsMS = data.frame(Year = 2011:2016, fit, UL, UU, Region = "Greater Mid South")

df = Torn.df %>%
  filter(popD > 0) %>%
  filter(st != "AR" & st != "TN" & st != "MO" & st != "KY" & 
        st != "OK" & st != "AL" & st != "MS" & st != "TX" & st != "LA") %>%
  mutate(lpopD = log10(popD),
         lpop = log10(pop),
         lED = log10(ED))
fit = numeric()
UL = numeric(); UU = numeric()
for(decadeEndYr in 2011:2016){
  df2 = df %>%
    filter(yr > decadeEndYr - W & yr <= decadeEndYr)
    
  model0B = glm.nb(formula, data = df2, link = log, init.theta = 1, epsilon = 1e-5)
  model1B = glm(formula, data = df2, family = neg.bin(theta = model0B$theta), epsilon = 1e-5)
  
  pre = predict.glm(model1B, newdata = data.frame(lED = S, lpopD = P), type = "response", se.fit = TRUE)
  fit = c(fit, pre$fit)
  UL = c(UL, pre$fit - pre$se.fit)
  UU = c(UU, pre$fit + pre$se.fit)
}
## Warning: glm.fit: algorithm did not converge
## Warning in glm.nb(formula, data = df2, link = log, init.theta = 1, epsilon
## = 1e-05): alternation limit reached
resultsElsewhere = data.frame(Year = 2011:2016, fit, UL, UU, Region = "Elsewhere")

results = rbind(resultsMS, resultsElsewhere)

ggplot(results, aes(x = Year, y = fit, color = Region)) +
  geom_point() +
  geom_errorbar(aes(ymin = UL, ymax = UU), width = .1) +
  scale_y_continuous(limits = c(0, NA)) +
  scale_x_continuous(breaks = 2011:2016) +
  ylab("Casualty Rate") +
  theme_minimal()

Figure 11 Predicted casualty rates for a tornado with 1 GW of energy dissipation over an area with a population density of 300 people/sq. km.

https://www.census.gov/population/socdemo/statbriefs/agebrief.html

April 2011 outbreak. https://www.cdc.gov/mmwr/preview/mmwrhtml/mm6128a3.htm

Stan model

To check the model we use some Bayesian formulations.

library(brms)
options(mc.cores = parallel::detectCores())

Model

df = Torn.df %>%
  filter(popD > 0) %>%
  mutate(lpopD = log10(popD),
         lED = log10(ED),
         cas = cas - 1)

formula = cas ~ lpopD + lED + lpopD:lED
#formula = fat  ~ lpopD + lED + lpopD:lED

family = negbinomial

betas = set_prior("normal(0,5)", class = "b")

model1 = brm(family = family,  # likelihood
           formula = formula,    # predictors
           data = df,  # data
           prior = betas, # prior
           warmup = 1000, iter = 2000, chains = 2,
           control = list(adapt_delta = .9))

fixef(model1)

pp_check(model1, type = "dens") + scale_x_log10()

yrep = posterior_predict(model1)
df.yrep = as.data.frame(t(yrep))
df.out = reshape2::melt(df.yrep) %>%
  group_by(variable) %>%
  summarize(mx = max(value),
            mn = mean(value))
ggplot(df.out, aes(mx)) + 
  geom_histogram() +
  geom_vline(xintercept = max(df$cas), color = "red") +
  ylab("Posterior Frequency") +
  xlab("Worst Case Per Tornado Loss (Number of Casaulties)") +
  theme_minimal()
max(yrep)

Another view of the posterior predictive samples is the distribution of maximum losses. Maximum of the max compared to the data max. Looks good.

set.seed(344562)
mx = numeric()
for(i in 1:200){
yrep = posterior_predict(model1)
df.yrep = as.data.frame(t(yrep))
df.out = reshape2::melt(df.yrep) %>%
  group_by(variable) %>%
  summarize(mx = max(value),
            mn = mean(value))
mx = c(mx, max(df.out$mx))
}

ggplot(data.frame(mx), aes(mx)) +
  geom_density() +
  geom_vline(xintercept = max(df$cas), color = "red") +
    ylab("Posterior Frequency") +
  xlab("Worst Case Per Tornado Loss (Number of Casaulties)") +
  theme_minimal()

Census data

Need 3.4.0 or higher R for this.

library(UScensus2010)
library(UScensus2000cdp)
citation("UScensus2010")

data(arkansas.cdp)
df = as.data.frame(arkansas.cdp)
df = df[!duplicated(df), ]
dfAR = df %>%
  filter(pop2000 > 10000) %>%
  dplyr::select(name, pop2000, age.65.up) %>%
  mutate(percElderly = age.65.up/pop2000 * 100) %>%
  filter(percElderly > 18) %>%
  arrange(desc(percElderly))
dfAR$State = "Arkansas"

data(tennessee.cdp)
df = as.data.frame(tennessee.cdp)
df = df[!duplicated(df), ]
dfTN = df %>%
  filter(pop2000 > 10000) %>%
  dplyr::select(name, pop2000, age.65.up) %>%
  mutate(percElderly = age.65.up/pop2000 * 100) %>%
  filter(percElderly > 18) %>%
  arrange(desc(percElderly))
dfTN$State = "Tennessee"

data(missouri.cdp)
df = as.data.frame(missouri.cdp)
df = df[!duplicated(df), ]
dfMO = df %>%
  filter(pop2000 > 10000) %>%
  dplyr::select(name, pop2000, age.65.up) %>%
  mutate(percElderly = age.65.up/pop2000 * 100) %>%
  filter(percElderly > 18) %>%
  arrange(desc(percElderly))
dfMO$State = "Missouri"

data(kentucky.cdp)
df = as.data.frame(kentucky.cdp)
df = df[!duplicated(df), ]
dfKY = df %>%
  filter(pop2000 > 10000) %>%
  dplyr::select(name, pop2000, age.65.up) %>%
  mutate(percElderly = age.65.up/pop2000 * 100) %>%
  filter(percElderly > 18) %>%
  arrange(desc(percElderly))
dfKY$State = "Kentucky"

dfMidSouth = rbind(dfAR, dfTN, dfMO, dfKY)
dfMidSouth$Region = "MidSouth"

#data(illinois.cdp)
#df = as.data.frame(illinois.cdp)
#df = df[!duplicated(df), ]
#df %>%
#  filter(pop2000 > 10000) %>%
#  dplyr::select(name, pop2000, age.65.up) %>%
#  mutate(percElderly = age.65.up/pop2000 * 100) %>%
#  arrange(desc(percElderly))

data(georgia.cdp)
df = as.data.frame(georgia.cdp)
df = df[!duplicated(df), ]
dfGA = df %>%
  filter(pop2000 > 10000) %>%
  dplyr::select(name, pop2000, age.65.up) %>%
  mutate(percElderly = age.65.up/pop2000 * 100) %>%
  filter(percElderly > 18) %>%
  arrange(desc(percElderly))
dfGA$State = "Georgia"

data(alabama.cdp)
df = as.data.frame(alabama.cdp)
df = df[!duplicated(df), ]
dfAL = df %>%
  filter(pop2000 > 10000) %>%
  dplyr::select(name, pop2000, age.65.up) %>%
  mutate(percElderly = age.65.up/pop2000 * 100) %>%
  filter(percElderly > 18) %>%
  arrange(desc(percElderly))
dfAL$State = "Alabama"

data(mississippi.cdp)
df = as.data.frame(mississippi.cdp)
df = df[!duplicated(df), ]
dfMS = df %>%
  filter(pop2000 > 10000) %>%
  dplyr::select(name, pop2000, age.65.up) %>%
  mutate(percElderly = age.65.up/pop2000 * 100) %>%
  filter(percElderly > 18) %>%
  arrange(desc(percElderly))
dfMS$State = "Mississippi"

data(louisiana.cdp)
df = as.data.frame(louisiana.cdp)
df = df[!duplicated(df), ]
dfLA = df %>%
  filter(pop2000 > 10000) %>%
  dplyr::select(name, pop2000, age.65.up) %>%
  mutate(percElderly = age.65.up/pop2000 * 100) %>%
  filter(percElderly > 18) %>%
  arrange(desc(percElderly))
dfLA$State = "Louisiana"

dfSouth = rbind(dfGA, dfAL, dfMS, dfLA)
dfSouth$Region = "South"

dfAll = rbind(dfMidSouth, dfSouth)
dfAll %>%
  group_by(Region) %>%
  summarize(nCities = n(),
            Pop = sum(pop2000),
            PopElderly = sum(age.65.up),
            percElderly = PopElderly/Pop * 100)