library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.5.3
## -- Attaching packages --------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.1.0     v purrr   0.2.5
## v tibble  1.4.2     v dplyr   0.7.8
## v tidyr   0.8.1     v stringr 1.3.1
## v readr   1.1.1     v forcats 0.3.0
## Warning: package 'ggplot2' was built under R version 3.5.2
## Warning: package 'dplyr' was built under R version 3.5.2
## Warning: package 'forcats' was built under R version 3.5.2
## -- Conflicts ------------------------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
# Default chunk option for the report: echo the source of every code chunk.
knitr::opts_chunk$set(echo = TRUE)

Synopsis

Data on natural disasters is loaded to determine the most damaging events in terms of both loss of life and economic damage. The date of each disaster is reduced to the event year, and the data is then summarized per event type and year, taking into account only the last 20 years. Summarizing the data shows that excessive heat causes the most fatalities, with a peak in 2002. Tornadoes cause the second most fatalities but the most injuries. Flash floods cause the most economic damage, closely followed by tornadoes.

Data processing

# Load the raw storm data from the bzip2-compressed CSV and keep a tibble
# copy (`tbl`) for the dplyr steps that follow; `df` holds the plain
# data frame as read.
df <- read.csv(file = "repdata_data_StormData.csv.bz2")
tbl <- df %>% as_tibble()

The year of the event is taken from the event date using regular expressions: first everything up to and including the last / is removed, then everything from the first space onward is removed.

# Derive the event year (as text) from BGN_DATE: the inner sub() strips
# everything up to and including the last "/", the outer sub() strips
# everything from the first space onward.
tbl <- tbl %>%
    mutate(year = sub(" .*", "", sub("^.*/", "", BGN_DATE)))

Next, only the last 20 years of the data are kept, because the characteristics of events (such as weather events) as well as their consequences (e.g. due to increasing population) change over time. To keep the analysis relevant, only recent data is used.

# Keep only events from 1992 onwards. `year` is stored as text, so it is
# converted for the comparison; comparing a character column with a number
# would coerce the number to a string and compare by collation order.
# The column itself is left as character because the plots further down use
# a discrete x scale for it.
tbl <- filter(tbl, as.integer(year) > 1991)

Then the damage per event type over time is computed by grouping per event type and year and taking the sum of fatalities, the sum of injuries, and the sum of property damage plus crop damage.

# Total fatalities, injuries and economic damage (property + crop) for every
# combination of event type and year. Missing values are ignored in all sums.
# NOTE(review): PROPDMG and CROPDMG are summed as stored; no magnitude/unit
# column is applied here -- confirm that the raw values share one unit.
damage_per_event_year <- tbl %>%
    group_by(EVTYPE, year) %>%
    summarize(event_fatalities = sum(FATALITIES, na.rm = TRUE),
              event_injuries = sum(INJURIES, na.rm = TRUE),
              event_damage = sum(PROPDMG, na.rm = TRUE) +
                  sum(CROPDMG, na.rm = TRUE)) %>%
    # summarize() only drops the last grouping level; return a plain tibble.
    ungroup()

Results

The data is then grouped by event type and summarized for fatalities and injuries, showing that Excessive Heat is the event with the most fatalities, closely followed by tornadoes, with tornadoes causing many more injuries.

# Five event types with the most fatalities, with their injury totals,
# ordered from most to fewest fatalities.
# The ranking column is passed to top_n() explicitly: without `wt`, top_n()
# selects on the last column (event_injuries), which does not match the
# fatalities ordering applied just before it.
top_5_most_harmfull <- tbl %>%
    group_by(EVTYPE) %>%
    summarize(event_fatalities = sum(FATALITIES, na.rm = TRUE),
              event_injuries = sum(INJURIES, na.rm = TRUE)) %>%
    arrange(desc(event_fatalities)) %>%
    top_n(5, event_fatalities)
## Selecting by event_injuries
# Show the resulting table in the report.
top_5_most_harmfull
## # A tibble: 5 x 3
##   EVTYPE         event_fatalities event_injuries
##   <fct>                     <dbl>          <dbl>
## 1 EXCESSIVE HEAT             1903           6525
## 2 TORNADO                    1660          24633
## 3 LIGHTNING                   816           5230
## 4 FLOOD                       470           6789
## 5 TSTM WIND                   255           3954

To get a feeling for the events over time, a yearly time series is taken from 1992 to 2011. It can clearly be seen that there is large variation in the number of fatalities per year for Excessive Heat (with 500 fatalities in 2011). For tornadoes the time series is more constant, with one exception of close to 600 fatalities in 2002.

# Restrict the per-year totals to the selected event types: the inner join
# keeps only rows whose EVTYPE also appears in top_5_most_harmfull. Columns
# present in both tables get the ".x" (per-year) and ".y" (overall) suffixes.
top_5_most_harmfull_over_time <- damage_per_event_year %>%
    inner_join(top_5_most_harmfull, by = "EVTYPE")

# Yearly fatalities for each selected event type, one panel per type.
# event_fatalities.x is the per-year total; the ".x" suffix comes from the
# join with the per-event summary, which has a column of the same name.
# `year` is text, so a discrete x scale is used and only a few years are
# labelled to keep the axis readable.
ggplot(data = top_5_most_harmfull_over_time,
       aes(x = year, y = event_fatalities.x)) +
    geom_col() +
    scale_x_discrete(name = "Year",
                     breaks = c("1992", "1997", "2002", "2007", "2011")) +
    ylab("Number of fatalities") +
    facet_wrap(~EVTYPE)

The data is then grouped by event type and summarized for property damage plus crop damage (together named economic damage) and then ordered by economic damage. This shows that flash floods cause the most economic damage, closely followed by tornadoes.

 # Five event types with the highest economic damage (property + crop),
 # ordered from highest to lowest. The ranking column is passed to top_n()
 # explicitly rather than relying on its default of the last column.
 # NOTE(review): PROPDMG and CROPDMG are summed as stored; no magnitude/unit
 # column is applied here -- confirm that the raw values share one unit.
 top_5_most_economic_damage <- tbl %>%
    group_by(EVTYPE) %>%
    summarize(economic_damage = sum(PROPDMG, na.rm = TRUE) +
                  sum(CROPDMG, na.rm = TRUE)) %>%
    arrange(desc(economic_damage)) %>%
    top_n(5, economic_damage)
## Selecting by economic_damage
# Show the resulting table in the report.
top_5_most_economic_damage
## # A tibble: 5 x 2
##   EVTYPE      economic_damage
##   <fct>                 <dbl>
## 1 FLASH FLOOD        1599325.
## 2 TORNADO            1558302.
## 3 TSTM WIND          1445168.
## 4 HAIL               1268290.
## 5 FLOOD              1067976.

Plotting the economic damage per year over the last 20 years of data shows that the damage by flash floods is relatively constant over time, though it seems to be increasing, similar to the data for tornadoes.

# Restrict the per-year totals to the event types with the highest economic
# damage: the inner join keeps only rows whose EVTYPE also appears in
# top_5_most_economic_damage.
top_5_most_economic_damage_over_time <- damage_per_event_year %>%
    inner_join(top_5_most_economic_damage, by = "EVTYPE")

# Yearly economic damage (property + crop) for each selected event type, one
# panel per type. event_damage is the per-year total from
# damage_per_event_year.
# `year` is text, so a discrete x scale is used and only a few years are
# labelled to keep the axis readable.
ggplot(data = top_5_most_economic_damage_over_time,
       aes(x = year, y = event_damage)) +
    geom_col() +
    scale_x_discrete(name = "Year",
                     breaks = c("1992", "1997", "2002", "2007", "2011")) +
    ylab("Economic damage (property + crop)") +
    facet_wrap(~EVTYPE)