library(dplyr)
library(ggplot2)

Synopsis

According to the NOAA Storm Database, more than 8700 people were killed and nearly 58000 were injured by weather-related disasters from 1996 to 2011. The total property damage was nearly $252 billion and the damage to crops was $35 billion. The most expensive property damage was caused by flooding (often as a consequence of hurricanes and storms), and most people were killed by heat/drought, tornadoes and similar weather phenomena. Heat/drought, flooding and hurricanes caused the highest costs for crop damage.

Data Processing

NOAA has tracked weather events since 1950, but in the beginning only tornadoes were reported. Since 1996 NOAA has tracked 48 different types of weather events. For that reason this study covers the period from 1996 to 2011, which is the latest year in the database.

The data is collected in a database that can be downloaded from here. The database is described in NOAA's documentation, which can be found on their web site.

Loading data

The data has 37 different variables. Only ten of them are relevant for this report. These ten columns are loaded into a data frame called “data”:

# Avoid scientific notation when printing large numbers
options(scipen = 999)

# Download the StormData file from the server unless a local copy already
# exists. A single path variable keeps the existence check, the download
# target and the read below pointing at the same file.
storm_file <- file.path("data", "repdata_data_StormData.csv.bz2")

if (!file.exists(storm_file)) {
  dir.create("data", showWarnings = FALSE)
  url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
  # The archive is binary, so request binary mode explicitly instead of
  # relying on platform/extension defaults.
  download.file(url, storm_file, mode = "wb")
}

# One class per CSV column, 37 in total. Columns marked "NULL" are skipped by
# read.table; the ten remaining columns (positions 2, 8, 23-28, 36 and 37)
# are loaded into the data frame 'data'.
colclass <- c(
  "NULL", "character",                 # 1-2
  rep("NULL", 5), "character",         # 3-8
  rep("NULL", 14),                     # 9-22
  rep("numeric", 3),                   # 23-25
  "character", "numeric", "character", # 26-28
  rep("NULL", 7),                      # 29-35
  "character", "numeric"               # 36-37
)

# read.table decompresses the .bz2 file transparently.
data <- read.table(storm_file, header = TRUE, sep = ",", colClasses = colclass)

The year of every event is extracted and the data is subsetted so that only events from years later than 1995 are included.

# Preparing data for analysis

# Lower-case the event type labels so that differently capitalised spellings
# of the same label compare equal.
data$EVTYPE <- tolower(data$EVTYPE)

# Parse BGN_DATE (month/day/year hour:minute:second) and take the calendar
# year; POSIXlt stores the year as an offset from 1900.
data$year <- 1900 +
  strptime(as.character(data$BGN_DATE), format = "%m/%d/%Y %H:%M:%S")$year

# Keep only the events that began after 1995.
data <- data %>% filter(year > 1995)

Calculating cost

The cost for property damage is described with a decimal figure in a column called PROPDMG. The unit of that figure is given in the column PROPDMGEXP, which can have three different values: “K” for thousands, “M” for millions and “B” for billions. A new column (propmultiply) is created where each letter is replaced with the factor that the figure in PROPDMG should be multiplied by to convert the cost to thousands of USD; records without one of these letters get the factor 0.001, i.e. the figure is treated as plain dollars. The result is saved in the column “prop1000USD”. The cost for crop damage is calculated in a similar way and saved in “crop1000USD”.

# Changing units to thousands of dollars

# Translate a vector of damage exponent codes into the factor that converts
# the accompanying damage figure to thousands of USD:
#   "K" -> 1, "M" -> 1000, "B" -> 1e+06, anything else -> 0.001
# (any other code, including "" and NA, is treated as plain dollars).
# Matching is case-insensitive so a lower-case "k"/"m"/"b" gets the same
# factor as its upper-case form, and %in% never yields NA, so missing codes
# cannot break the assignment.
#
# exponent: character vector of exponent codes.
# Returns a numeric vector of the same length as `exponent`.
exp_multiplier <- function(exponent) {
  multiplier <- rep(0.001, length(exponent))
  code <- toupper(exponent)
  multiplier[code %in% "K"] <- 1
  multiplier[code %in% "M"] <- 1000
  multiplier[code %in% "B"] <- 1e+06
  multiplier
}

data$propmultiply <- exp_multiplier(data$PROPDMGEXP)
data$prop1000USD <- data$PROPDMG * data$propmultiply

data$cropmultiply <- exp_multiplier(data$CROPDMGEXP)
data$crop1000USD <- data$CROPDMG * data$cropmultiply

Then the total cost for both crop and property damage is calculated as the sum of “prop1000USD” and “crop1000USD” and saved in the column “totalcosts1000USD”.

# Total cost per event = property damage + crop damage, both in thousands of
# USD. A missing value on either side is counted as zero so that one NA does
# not turn the whole sum into NA.
data$totalcosts1000USD <-
  replace(data$prop1000USD, is.na(data$prop1000USD), 0) +
  replace(data$crop1000USD, is.na(data$crop1000USD), 0)

Grouping event types

The data consists mostly of local reports of weather incidents. The reported event types sometimes do not follow NOAA's naming convention: the same event type is described by different abbreviations and is sometimes misspelled. Even though there are only 48 official event types, the database contains 438 unique event type labels (including different abbreviations, misspellings, etcetera). To solve this problem the data has been aggregated into 17 different groups (plus “unknown”, marked as NA).

# Aggregating EVTYPE (not perfect...)
#
# Each rule is a pair: regular expression, group label. The rules are applied
# top to bottom and every rule overwrites the label of all rows it matches,
# so a later rule wins over an earlier one. The order is therefore part of
# the logic; some patterns are repeated further down on purpose to reclaim
# rows taken by the rules in between. Rows matched by no rule keep NA.
#
# NOTE(review): the patterns are kept verbatim. A few look like typos
# ("hot.*." needs one more character after "hot"; the trailing "*" in
# "cool*", "snow*" and "microburst*" makes only the last letter optional).
# Confirm against the data before changing them, since the reported figures
# depend on the current matching.
group_rules <- list(
  c(".*wind.*|.*storm.*|.*dust.*|.*downburst.*|.*microburst.*|.*stm.*",
    "wind/storm"),
  c(".*tornado.*|.*funnel.*|.*torndao.*|.*landspout.*|.*dust\\sdevil.*|.*whirlwind.*|.*waterspout.*",
    "tornado/funnel storm"),
  c(".*heat.*|.*warm.*|.*record\\stemp.*|.*hot.*.|.*dry.*|.*drought.*|.*warmth.*",
    "heat/drought"),
  c(".*rain.*|.*shower.*|.*precipitation.*|.*wet.*",
    "rain"),
  c(".*flood.*|.*rising\\swater.*|.*surge.*|.*stream\\sfld.*|.*high\\swater.*|.*tide.*",
    "flood"),
  c(".*blizzard.*|.*snow.*|.*winter\\sweather.*|.*wintry.*",
    "snow/snow storm"),
  c(".*avalance.*|.*avalanche.*",
    "avalance"),
  c(".*hurricane.*|.*typhoon.*|.*tropical\\sdepression.*",
    "hurricane"),
  c(".*fire.*",
    "fire"),
  c(".*fog.*",
    "fog"),
  c(".*hail.*",
    "hail"),
  c(".*surf.*|.*current.*|.*seas.*|.*swell.*|.*waves.*|.*seiche.*|.*rogue\\swave.*",
    "high surf/rip/current/waves"),
  c(".*landslide.*|.*rock slide.*|.*landslump.*|.*mud.*|.*erosion.*",
    "landslide/erosion"),
  c(".*lightning.*|.*ligntning.*",
    "lightning"),
  c(".*wind.*|.*storm.*",
    "wind/storm"),
  c(".*tsunami.*",
    "tsunami"),
  c(".*cold.*|.*chill.*|.*cool*|.*freez.*|.*frost.*|.*icy.*|.*glaze.*|.*ice.*|.*low\\stemperature.*|.*hyperthermia.*|.*sleet.*|.*hypothermia.*",
    "cold"),
  c(".*tornado.*",
    "tornado/funnel storm"),
  c(".*season\\ssnow*",
    "snow/snow storm"),
  c(".*dry\\smicroburst*",
    "wind/storm"),
  c(".*volcanic.*",
    "volcanic activity")
)

data$Group <- NA
for (rule in group_rules) {
  data$Group[grepl(rule[[1]], data$EVTYPE)] <- rule[[2]]
}

The event groups are:

# Distinct group labels present in the data (NA = not matched by any rule)
data$Group %>% unique()
##  [1] "wind/storm"                  "tornado/funnel storm"       
##  [3] "hail"                        "rain"                       
##  [5] "flood"                       "cold"                       
##  [7] "heat/drought"                "lightning"                  
##  [9] "snow/snow storm"             "high surf/rip/current/waves"
## [11] NA                            "fire"                       
## [13] "fog"                         "avalance"                   
## [15] "landslide/erosion"           "hurricane"                  
## [17] "volcanic activity"           "tsunami"

Does the data contain unreasonable values? A look at the events that caused the highest property damage shows that one event stands out:

# The six single events with the highest property damage (thousands of USD)
data %>%
  select(EVTYPE, year, prop1000USD, REFNUM) %>%
  arrange(desc(prop1000USD)) %>%
  head()
##              EVTYPE year prop1000USD REFNUM
## 1             flood 2006   115000000 605943
## 2       storm surge 2005    31300000 577616
## 3 hurricane/typhoon 2005    16930000 577615
## 4       storm surge 2005    11260000 581535
## 5 hurricane/typhoon 2005    10000000 569288
## 6 hurricane/typhoon 2005     7350000 581533

This is how the event is described in the REMARKS field in the database:

# Free-text remarks of the outlier record (REFNUM 605943)
data$REMARKS[data$REFNUM == 605943]
## [1] "Major flooding continued into the early hours of January 1st, before the Napa River finally fell below flood stage and the water receeded. Flooding was severe in Downtown Napa from the Napa Creek and the City and Parks Department was hit with $6 million in damage alone. The City of Napa had 600 homes with moderate damage, 150 damaged businesses with costs of at least $70 million."

The event with the highest cost in the database is the flooding of the Napa River in 2006. In the event report the cost of the damage is reported as “at least $70 million”, far from the calculated $115 billion (over 1642 times higher). There is strong evidence that the $115 billion figure is incorrect: it would be more than three times the cost of Katrina in 2005. An article in the Napa Valley Register from 2006-01-05 mentions damages “in excess of $135 million”. A probable cause of the high cost in the database is a simple typo: the exponent has probably been marked by mistake as “B” (for billion) instead of “M” (for million). This is corrected in the data:

# Correct the outlier record (REFNUM 605943): change its property damage
# exponent from "B" to "M" and recompute every value derived from it.
# which() is used so that a missing REFNUM cannot end up in the row index.
napa_rows <- which(data$REFNUM == 605943)

data[napa_rows, "PROPDMGEXP"] <- "M"
data[napa_rows, "propmultiply"] <- 1000
data[napa_rows, "prop1000USD"] <-
  data[napa_rows, "PROPDMG"] * data[napa_rows, "propmultiply"]

# totalcosts1000USD was calculated from prop1000USD before this correction,
# so it has to be refreshed too (NA counted as zero, as when it was first
# calculated); otherwise it would still contain the $115 billion figure.
data[napa_rows, "totalcosts1000USD"] <-
  replace(data[napa_rows, "prop1000USD"],
          is.na(data[napa_rows, "prop1000USD"]), 0) +
  replace(data[napa_rows, "crop1000USD"],
          is.na(data[napa_rows, "crop1000USD"]), 0)

Results

Fatalities

The most common cause of death was heat/drought, which killed 2036 people from 1996 to 2011.

# Calculating deaths and injuries by cause.
# One row per event group with the summed FATALITIES and INJURIES and their
# combined total (totalsum); groups without any casualties are dropped.
# Explicit sums replace the deprecated summarise_each(funs(sum)). The former
# filter(year > 1995) step is omitted because 'data' was already restricted
# to years after 1995 when it was prepared.
aggFatInj <- data %>%
  group_by(Group) %>%
  summarise(FATALITIES = sum(FATALITIES), INJURIES = sum(INJURIES)) %>%
  mutate(totalsum = FATALITIES + INJURIES) %>%
  filter(totalsum != 0)

# Fatalities per group, largest first. Events that were not assigned to any
# group (Group is NA) are shown as "other".
deathsDF <- select(aggFatInj, Group, FATALITIES) %>% arrange(desc(FATALITIES))
deathsDF[is.na(deathsDF$Group), "Group"] <- "other"

# Bar chart with the groups ordered by descending number of fatalities;
# the x axis labels are rotated 45 degrees so the long group names fit.
# (The title is set once; the original added the same ggtitle() twice.)
ggplot(deathsDF, aes(x = reorder(Group, -FATALITIES), y = FATALITIES)) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)) +
  ggtitle("Fatalities by group, 1996-2011") +
  xlab("Group") +
  ylab("Fatalities")

plot of chunk unnamed-chunk-3

The individual event types that caused most fatalities were:

# The ten individual event types with the most fatalities
data %>%
  group_by(EVTYPE) %>%
  summarise(Fatalities = sum(FATALITIES)) %>%
  arrange(desc(Fatalities)) %>%
  head(10)
## Source: local data frame [10 x 2]
## 
##            EVTYPE Fatalities
## 1  excessive heat       1797
## 2         tornado       1511
## 3     flash flood        887
## 4       lightning        651
## 5           flood        414
## 6     rip current        340
## 7       tstm wind        241
## 8            heat        237
## 9       high wind        235
## 10      avalanche        223

Injuries

Most injuries were caused by tornado/funnel storms (20709), followed by wind/storms (8747) and flooding (8515).

# Injuries per group, largest first. Events that were not assigned to any
# group (Group is NA) are shown as "other".
injuriesDF <- aggFatInj %>%
  select(Group, INJURIES) %>%
  arrange(desc(INJURIES))
injuriesDF$Group[is.na(injuriesDF$Group)] <- "other"

# Bar chart with the groups ordered by descending number of injuries;
# the x axis labels are rotated 45 degrees so the long group names fit.
ggplot(data = injuriesDF,
       mapping = aes(x = reorder(Group, -INJURIES), y = INJURIES)) +
  geom_bar(stat = "identity") +
  labs(title = "Injuries by group, 1996-2011", x = "Group", y = "Injuries") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1))

plot of chunk graph injuries by group

The individual event types that caused most injuries were:

# The ten individual event types with the most injuries
data %>%
  group_by(EVTYPE) %>%
  summarise(Injuries = sum(INJURIES)) %>%
  arrange(desc(Injuries)) %>%
  head(10)
## Source: local data frame [10 x 2]
## 
##               EVTYPE Injuries
## 1            tornado    20667
## 2              flood     6758
## 3     excessive heat     6391
## 4          lightning     4141
## 5          tstm wind     3629
## 6        flash flood     1674
## 7  thunderstorm wind     1400
## 8       winter storm     1292
## 9  hurricane/typhoon     1275
## 10              heat     1222

Property damage

The total cost for property damage was $252 billion. Hurricanes caused the highest cost ($82 billion), followed by wind/storm ($70 billion) and flooding ($45 billion). The cost of property damage by group was:

# Property damage per event group, largest first. prop1000USD is in
# thousands of USD, so dividing by one million gives billions of USD.
data %>%
  group_by(Group) %>%
  summarise(
    Property_damages_billion_USD = sum(prop1000USD / 1e6, na.rm = TRUE)
  ) %>%
  arrange(desc(Property_damages_billion_USD))
## Source: local data frame [18 x 2]
## 
##                          Group Property_damages_billion_USD
## 1                    hurricane                    81.720626
## 2                   wind/storm                    70.368796
## 3                        flood                    44.839357
## 4         tornado/funnel storm                    24.623481
## 5                         hail                    14.595217
## 6                         fire                     7.760450
## 7                         cold                     3.696609
## 8              snow/snow storm                     1.234961
## 9                 heat/drought                     1.055345
## 10                   lightning                     0.743077
## 11                        rain                     0.585989
## 12           landslide/erosion                     0.378724
## 13                     tsunami                     0.144062
## 14 high surf/rip/current/waves                     0.110037
## 15                         fog                     0.020465
## 16                    avalance                     0.003712
## 17                          NA                     0.001208
## 18           volcanic activity                     0.000500
# Sum of the property damage over all events, in thousands of USD
# (missing values are ignored)
totalCostProp <- with(data, sum(prop1000USD, na.rm = TRUE))
totalCostProp
## [1] 251882615

The event types that caused the highest cost were:

# The ten individual event types with the highest property damage,
# in billions of USD (prop1000USD is in thousands of USD)
data %>%
  group_by(EVTYPE) %>%
  summarise(
    Property_damages_billion_USD = sum(prop1000USD / 1e6, na.rm = TRUE)
  ) %>%
  arrange(desc(Property_damages_billion_USD)) %>%
  head(10)
## Source: local data frame [10 x 2]
## 
##               EVTYPE Property_damages_billion_USD
## 1  hurricane/typhoon                       69.306
## 2        storm surge                       43.194
## 3              flood                       29.060
## 4            tornado                       24.617
## 5        flash flood                       15.222
## 6               hail                       14.595
## 7          hurricane                       11.813
## 8     tropical storm                        7.642
## 9          high wind                        5.248
## 10          wildfire                        4.759

Crop damage

Crop damage caused by heat/drought was the most costly in the studied period ($14 billion), followed by flooding ($6 billion) and hurricanes ($5 billion):

# Crop damage per event group, largest first. crop1000USD is in thousands
# of USD, so dividing by one million gives billions of USD.
data %>%
  group_by(Group) %>%
  summarise(
    Crop_damages_billion_USD = sum(crop1000USD / 1e6, na.rm = TRUE)
  ) %>%
  arrange(desc(Crop_damages_billion_USD))
## Source: local data frame [18 x 2]
## 
##                          Group Crop_damages_billion_USD
## 1                 heat/drought                13.860145
## 2                        flood                 6.348063
## 3                    hurricane                 5.350108
## 4                         cold                 2.741186
## 5                         hail                 2.496822
## 6                   wind/storm                 2.409642
## 7                         rain                 0.728420
## 8                         fire                 0.402255
## 9         tornado/funnel storm                 0.283425
## 10             snow/snow storm                 0.093182
## 11           landslide/erosion                 0.020017
## 12 high surf/rip/current/waves                 0.011510
## 13                   lightning                 0.006898
## 14                          NA                 0.001034
## 15                     tsunami                 0.000020
## 16                    avalance                 0.000000
## 17                         fog                 0.000000
## 18           volcanic activity                 0.000000
# Sum of the crop damage over all events, in thousands of USD
# (missing values are ignored)
totalCostCrop <- with(data, sum(crop1000USD, na.rm = TRUE))

The event types that caused the highest cost were:

# The ten individual event types with the highest crop damage,
# in billions of USD (crop1000USD is in thousands of USD)
data %>%
  group_by(EVTYPE) %>%
  summarise(
    Crop_damages_billion_USD = sum(crop1000USD / 1e6, na.rm = TRUE)
  ) %>%
  arrange(desc(Crop_damages_billion_USD)) %>%
  head(10)
## Source: local data frame [10 x 2]
## 
##               EVTYPE Crop_damages_billion_USD
## 1            drought                  13.3676
## 2              flood                   4.9748
## 3          hurricane                   2.7414
## 4  hurricane/typhoon                   2.6079
## 5               hail                   2.4760
## 6        flash flood                   1.3349
## 7       extreme cold                   1.3090
## 8       frost/freeze                   1.0942
## 9         heavy rain                   0.7282
## 10    tropical storm                   0.6777