## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## 
## Loading required package: grid

Synopsys

This report summarizes the top weather events in the United States between the years 1950 and 2011 as reported on the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database. The analysis determined the most severe types of events according to their effects on population health and economic cost finding Tornado type of events with the highest impact across all effects under consideration.

Data Processsing.

The data is freely available from the NOOA site. We first proceed to download the data, select the relevant fields and inspect the format.

DBREMOTEURL <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
DBLOCALPATH <- "repdata.csv.bz2"
if (! file.exists(DBLOCALPATH)) {
  download.file(url = DBREMOTEURL, destfile = DBLOCALPATH)
}
DBDATA <- read.csv(bzfile(DBLOCALPATH)) %>% 
  select(c(EVTYPE, STATE, COUNTY, COUNTYNAME, FATALITIES, INJURIES, LATITUDE, LONGITUDE, PROPDMG, CROPDMG))
head(DBDATA)
##    EVTYPE STATE COUNTY COUNTYNAME FATALITIES INJURIES LATITUDE LONGITUDE
## 1 TORNADO    AL     97     MOBILE          0       15     3040      8812
## 2 TORNADO    AL      3    BALDWIN          0        0     3042      8755
## 3 TORNADO    AL     57    FAYETTE          0        2     3340      8742
## 4 TORNADO    AL     89    MADISON          0        2     3458      8626
## 5 TORNADO    AL     43    CULLMAN          0        2     3412      8642
## 6 TORNADO    AL     77 LAUDERDALE          0        6     3450      8748
##   PROPDMG CROPDMG
## 1    25.0       0
## 2     2.5       0
## 3    25.0       0
## 4     2.5       0
## 5     2.5       0
## 6     2.5       0

The number of NA values is sow low that we’ll remove those cases from consideration. After this we introduce a helper column for the total damages for each event.

colMeans(apply(DBDATA, 2, is.na))
##       EVTYPE        STATE       COUNTY   COUNTYNAME   FATALITIES 
## 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 
##     INJURIES     LATITUDE    LONGITUDE      PROPDMG      CROPDMG 
## 0.000000e+00 5.208928e-05 0.000000e+00 0.000000e+00 0.000000e+00
DBDATA <- DBDATA %>% filter(complete.cases(.)) %>% 
  mutate(DAMAGES = PROPDMG + CROPDMG, COUNT = 1)

We check the number of events present in the data

str(DBDATA)
## 'data.frame':    902250 obs. of  12 variables:
##  $ EVTYPE    : Factor w/ 985 levels "   HIGH SURF ADVISORY",..: 834 834 834 834 834 834 834 834 834 834 ...
##  $ STATE     : Factor w/ 72 levels "AK","AL","AM",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ COUNTY    : num  97 3 57 89 43 77 9 123 125 57 ...
##  $ COUNTYNAME: Factor w/ 29601 levels "","5NM E OF MACKINAC BRIDGE TO PRESQUE ISLE LT MI",..: 13513 1873 4598 10592 4372 10094 1973 23873 24418 4598 ...
##  $ FATALITIES: num  0 0 0 0 0 0 0 0 1 0 ...
##  $ INJURIES  : num  15 0 2 2 2 6 1 0 14 0 ...
##  $ LATITUDE  : num  3040 3042 3340 3458 3412 ...
##  $ LONGITUDE : num  8812 8755 8742 8626 8642 ...
##  $ PROPDMG   : num  25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
##  $ CROPDMG   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ DAMAGES   : num  25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
##  $ COUNT     : num  1 1 1 1 1 1 1 1 1 1 ...

We see a very high number of EVTYPE levels. We try to determine subgroup of the rows that lets reduce the dimensionality of the data while still keeping a reasonable degree of accuracy

SUMMRY <- ddply(DBDATA, .(EVTYPE), colwise(sum, .(DAMAGES, FATALITIES, INJURIES, COUNT)))
TOTALS <- apply(select(SUMMRY, c(DAMAGES, FATALITIES, INJURIES, COUNT)), 2, sum)
CUMDMG <- data.frame(
  records = cumsum(rep(1, dim(SUMMRY)[1])), 
  cumdmgpct = cumsum(sort(SUMMRY$DAMAGES, decreasing = T) / TOTALS["DAMAGES"]),
  cumfatpct = cumsum(sort(SUMMRY$FATALITIES, decreasing = T) / TOTALS["FATALITIES"]), 
  cuminjpct = cumsum(sort(SUMMRY$INJURIES, decreasing = T) / TOTALS["INJURIES"]),
  cumcntpct = cumsum(sort(SUMMRY$COUNT, decreasing = T) / TOTALS["COUNT"]))
pd0 <- ggplot(CUMDMG, aes(x = records, color = Effect)) +
  geom_line(aes(y = cumdmgpct, color = "Damages")) +
  geom_line(aes(y = cumfatpct, color = "Fatalities")) +
  geom_line(aes(y = cuminjpct, color = "Injuries")) +
  geom_line(aes(y = cuminjpct, color = "Events")) +
  xlab("Number of event categories") +
  ylab("Cummulative effect percentage") +
  ggtitle("Cumulative effects vs Number of event categories")
pd1 <- pd0 + xlim(0, 250)
pd2 <- pd0 + xlim(0, 25)
grid.arrange(pd1, pd2, ncol = 1)

We see that in all cases we cummulative distribution of the effects obeys an exponential relationship to the number of event categories. This means that going forward we’ll be able to focus on the top 25 event categories for each variable under study (total damages, fatalities and injuries) while still accounting for over 90% of the effects in each case.

We want to characterize the nominal effect of the events under each EVTYPE level for each variable along with their relative frequency.

We start first by retrieving the top event categories for each effect variable

NUMCAT <- 25
EVTYPE <- head(SUMMRY %>% arrange(desc(DAMAGES)), NUMCAT) %>%
  join(head(SUMMRY %>% arrange(desc(FATALITIES)), NUMCAT), by = 'EVTYPE', type = 'full') %>%
  join(head(SUMMRY %>% arrange(desc(INJURIES)), NUMCAT), by = 'EVTYPE', type = 'full')
EVTYPE
##                     EVTYPE    DAMAGES FATALITIES INJURIES  COUNT
## 1                  TORNADO 3312276.68       5633    91346  60652
## 2              FLASH FLOOD 1598825.05        978     1777  54262
## 3                TSTM WIND 1445168.21        504     6957 219940
## 4                     HAIL 1268289.66         15     1361 288661
## 5                    FLOOD 1067976.36        470     6789  25326
## 6        THUNDERSTORM WIND  943635.62        133     1488  82563
## 7                LIGHTNING  606812.39        816     5230  15751
## 8       THUNDERSTORM WINDS  464978.11         64      908  20843
## 9                HIGH WIND  342014.77        248     1137  20212
## 10            WINTER STORM  134699.58        206     1321  11433
## 11              HEAVY SNOW  124417.71        127     1021  15708
## 12                WILDFIRE   88823.54         75      911   2761
## 13               ICE STORM   67689.62         89     1975   2006
## 14             STRONG WIND   64610.71        103      280   3566
## 15              HEAVY RAIN   61964.94         98      251  11695
## 16              HIGH WINDS   57384.60         35      302   1533
## 17          TROPICAL STORM   54322.80         58      340    690
## 18        WILD/FOREST FIRE   43534.49         12      545   1457
## 19                 DROUGHT   37997.67          0        4   2488
## 20          FLASH FLOODING   33623.20         19        8    682
## 21    URBAN/SML STREAM FLD   28845.74         28       79   3392
## 22                BLIZZARD   25490.48        101      805   2719
## 23               HURRICANE   20852.99         61       46    174
## 24       FLOOD/FLASH FLOOD   20580.95         17       15    624
## 25             STORM SURGE   19398.49         13       38    261
## 26          EXCESSIVE HEAT    1954.40       1903     6525   1678
## 27                    HEAT     961.20        937     2100    767
## 28             RIP CURRENT       1.00        368      232    470
## 29               AVALANCHE    1623.90        224      170    386
## 30            RIP CURRENTS     162.00        204      297    304
## 31               HEAT WAVE    1524.55        172      309     74
## 32            EXTREME COLD   13778.68        160      231    655
## 33 EXTREME COLD/WIND CHILL    2704.00        125       24   1002
## 34               HIGH SURF    3041.62        101      152    725
## 35            EXTREME HEAT      10.11         96      155     22
## 36         COLD/WIND CHILL    2590.00         95       12    539
## 37       HURRICANE/TYPHOON   10637.85         64     1275     88
## 38                     FOG    8849.81         62      734    538
## 39              DUST STORM    6651.00         22      440    427
## 40          WINTER WEATHER   11989.90         33      398   7026
## 41               DENSE FOG    8225.45         18      342   1293

We observe there’s further opportunities to consolidate the number of EVTYPE factors

EVTYPE$EVTYPE <- revalue(EVTYPE$EVTYPE, c("FLASH FLOOD" = "FLOOD/FLASH FLOOD",
                                          "FLOOD" = "FLOOD/FLASH FLOOD",
                                          "FLASH FLOODING" = "FLOOD/FLASH FLOOD",
                                          "URBAN/SML STREAM FLD" = "FLOOD/FLASH FLOOD",
                                          "TSTM WIND" = "THUNDERSTORM WIND",
                                          "THUNDERSTORM WINDS" = "THUNDERSTORM WIND",
                                          "STRONG WIND" = "HIGH WIND",
                                          "HIGH WINDS" = "HIGH WIND",
                                          "WILDFIRE" = "WILD/FOREST FIRE",
                                          "EXCESSIVE HEAT" = "HEAT",
                                          "HEAT WAVE" = "HEAT",
                                          "EXTREME HEAT" = "HEAT",
                                          "HURRICANE" = "HURRICANE/TYPHOON",
                                          "DENSE FOG" = "FOG",
                                          "EXTREME COLD" = "COLD/WIND CHILL",
                                          "EXTREME COLD/WIND CHILL" = "COLD/WIND CHILL",
                                          "RIP CURRENTS" = "RIP CURRENT"))
EVTYPE <- ddply(EVTYPE, .(EVTYPE), colwise(sum, .(DAMAGES, FATALITIES, INJURIES, COUNT)))
EVTYPE
##               EVTYPE    DAMAGES FATALITIES INJURIES  COUNT
## 1          AVALANCHE    1623.90        224      170    386
## 2           BLIZZARD   25490.48        101      805   2719
## 3    COLD/WIND CHILL   19072.68        380      267   2196
## 4                FOG   17075.26         80     1076   1831
## 5            DROUGHT   37997.67          0        4   2488
## 6         DUST STORM    6651.00         22      440    427
## 7               HEAT    4450.26       3108     9089   2541
## 8  FLOOD/FLASH FLOOD 2749851.30       1512     8668  84286
## 9               HAIL 1268289.66         15     1361 288661
## 10        HEAVY RAIN   61964.94         98      251  11695
## 11        HEAVY SNOW  124417.71        127     1021  15708
## 12         HIGH SURF    3041.62        101      152    725
## 13         HIGH WIND  464010.08        386     1719  25311
## 14 HURRICANE/TYPHOON   31490.84        125     1321    262
## 15         ICE STORM   67689.62         89     1975   2006
## 16         LIGHTNING  606812.39        816     5230  15751
## 17       RIP CURRENT     163.00        572      529    774
## 18       STORM SURGE   19398.49         13       38    261
## 19 THUNDERSTORM WIND 2853781.94        701     9353 323346
## 20           TORNADO 3312276.68       5633    91346  60652
## 21    TROPICAL STORM   54322.80         58      340    690
## 22  WILD/FOREST FIRE  132358.03         87     1456   4218
## 23      WINTER STORM  134699.58        206     1321  11433
## 24    WINTER WEATHER   11989.90         33      398   7026

Results

We can graph the relevant statistics per event type to determine their relative effects

EVMELT <- melt(EVTYPE, id.vars ="EVTYPE", variable.name = "Effect", value.name = "Value")
ggplot(EVMELT, aes(EVTYPE, y = Value)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ Effect, scales = "free_y", ncol = 1) +
  theme(axis.text.x = element_text(angle = 90), axis.title.y = element_blank())

We find that the highest impact across all three of the effects under consideration is found for events in category ‘Tornado’, even though it’s only the 4th most frequent category of events among the sample.