library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.5.3
## -- Attaching packages --------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.1.0 v purrr 0.2.5
## v tibble 1.4.2 v dplyr 0.7.8
## v tidyr 0.8.1 v stringr 1.3.1
## v readr 1.1.1 v forcats 0.3.0
## Warning: package 'ggplot2' was built under R version 3.5.2
## Warning: package 'dplyr' was built under R version 3.5.2
## Warning: package 'forcats' was built under R version 3.5.2
## -- Conflicts ------------------------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Set the chunk option `echo = TRUE` as the default for all chunks.
knitr::opts_chunk$set(echo = TRUE)
Data on natural disasters is loaded to determine the most damaging events in terms of both loss of life and economic damage. The date of each disaster is reduced to the event year, and the data is then summarized per event type and year, taking into account only the last 20 years. Summarizing the data shows that excessive heat causes the most fatalities, with a peak in 2002. Tornadoes cause the second most fatalities but the most injuries. Flash floods cause the most economic damage, closely followed by tornadoes.
# Load the raw storm events from the bzip2-compressed CSV. The plain data
# frame is kept as `df`; `tbl` is its tibble counterpart, which the rest of
# the analysis works on.
df <- read.csv(file = "repdata_data_StormData.csv.bz2")
tbl <- df %>% as_tibble()
The year of the event is extracted from the event date using regular expressions: first everything up to and including the last / is removed, then everything from the first space onward is removed.
# Derive the event year from BGN_DATE: the inner sub() strips everything up to
# and including the last "/", the outer sub() strips everything from the first
# space onward. sub() returns character, so `year` is a character column.
tbl <- tbl %>%
  mutate(year = sub(" .*", "", sub("^.*/", "", BGN_DATE)))
Next, only the last 20 years of data are kept, because the characteristics of events (such as weather events) as well as their consequences (e.g. due to increasing population) change over time. To keep the analysis relevant, only recent data is used.
# Keep only events from 1992 onward. `year` is a character column, so it is
# converted to integer for the comparison; comparing the raw strings against a
# number would silently fall back to string comparison. The column itself is
# left unchanged.
tbl <- filter(tbl, as.integer(year) > 1991L)
Then the damage per event type over time is computed by grouping per event type and year and taking the sum of fatalities, the sum of injuries, and the sum of property damage plus crop damage.
# Yearly totals per event type: fatalities, injuries, and the combined
# property + crop damage.
# NOTE(review): PROPDMG and CROPDMG are summed as raw numbers; if the dataset
# also carries magnitude columns for them (e.g. PROPDMGEXP / CROPDMGEXP),
# these totals mix units -- confirm before relying on the damage figures.
damage_per_event_year <- tbl %>%
  group_by(EVTYPE, year) %>%
  summarize(
    event_fatalities = sum(FATALITIES, na.rm = TRUE),
    event_injuries = sum(INJURIES, na.rm = TRUE),
    # na.rm = TRUE on both damage sums, consistent with the other totals, so a
    # single missing value cannot turn a whole group's damage into NA.
    event_damage = sum(PROPDMG, na.rm = TRUE) + sum(CROPDMG, na.rm = TRUE)
  ) %>%
  # summarize() only drops the last grouping level; ungroup explicitly so the
  # result does not stay grouped by EVTYPE.
  ungroup()
The data is then grouped by event type and summarized for fatalities and injuries, showing that excessive heat is the event with the most fatalities, closely followed by tornadoes, with tornadoes causing many more injuries.
# Total fatalities and injuries per event type, keeping the five event types
# with the most fatalities, largest first.
top_5_most_harmfull <- tbl %>%
  group_by(EVTYPE) %>%
  summarize(
    event_fatalities = sum(FATALITIES, na.rm = TRUE),
    event_injuries = sum(INJURIES, na.rm = TRUE)
  ) %>%
  # Rank explicitly by fatalities. Without `wt`, top_n() ranks by the last
  # column (event_injuries), so the five rows were picked by injuries instead.
  top_n(5, event_fatalities) %>%
  # top_n() does not sort its result, so order after selecting.
  arrange(desc(event_fatalities))
# NOTE: the table recorded below was captured with the earlier implicit
# selection by event_injuries; regenerate it after this change.
top_5_most_harmfull
## # A tibble: 5 x 3
## EVTYPE event_fatalities event_injuries
## <fct> <dbl> <dbl>
## 1 EXCESSIVE HEAT 1903 6525
## 2 TORNADO 1660 24633
## 3 LIGHTNING 816 5230
## 4 FLOOD 470 6789
## 5 TSTM WIND 255 3954
To get a feeling for the events over time, a time series is plotted per year from 1992 to 2011. It can clearly be seen that there is a large variation in the number of fatalities per year for excessive heat (with 500 fatalities in 2011). For tornadoes the time series is more constant, with one exception of close to 600 fatalities in 2002.
# Yearly figures for the selected event types. Both inputs carry
# event_fatalities / event_injuries, so the join suffixes them: `.x` is the
# per-year value, `.y` the overall total per event type.
top_5_most_harmfull_over_time <- inner_join(
  damage_per_event_year,
  top_5_most_harmfull,
  by = "EVTYPE"
)
# One panel per event type showing the number of fatalities per year. The axis
# titles previously read "Interval" / "Averiag number of steps", which did not
# describe the plotted data.
ggplot(
  data = top_5_most_harmfull_over_time,
  aes(x = year, y = event_fatalities.x)
) +
  geom_col() +
  # `year` is a character column, so the breaks are given as strings.
  scale_x_discrete(
    "Year",
    breaks = c("1992", "1997", "2002", "2007", "2011")
  ) +
  ylab("Number of fatalities") +
  facet_wrap(~EVTYPE)
The data is then grouped by event type and summarized for property damage plus crop damage (together named economic damage) and then ordered by economic damage. This shows that flash floods cause the most economic damage, closely followed by tornadoes.
# Combined property + crop damage per event type, largest first, limited to
# five event types. top_n() is called without `wt` and therefore ranks by the
# last column, which here is economic_damage (see the message below).
# NOTE(review): PROPDMG and CROPDMG are summed as raw numbers; if the dataset
# also carries magnitude columns for them (e.g. PROPDMGEXP / CROPDMGEXP),
# these totals mix units -- confirm before relying on the ranking.
top_5_most_economic_damage <- tbl %>%
  group_by(EVTYPE) %>%
  summarise(
    economic_damage = sum(PROPDMG, na.rm = TRUE) + sum(CROPDMG, na.rm = TRUE)
  ) %>%
  arrange(desc(economic_damage)) %>%
  top_n(n = 5)
## Selecting by economic_damage
top_5_most_economic_damage
## # A tibble: 5 x 2
## EVTYPE economic_damage
## <fct> <dbl>
## 1 FLASH FLOOD 1599325.
## 2 TORNADO 1558302.
## 3 TSTM WIND 1445168.
## 4 HAIL 1268290.
## 5 FLOOD 1067976.
Plotting the economic damage per year over the last 20 years of data shows that the damage caused by flash floods is relatively constant over time, though it seems to be increasing, similar to the damage caused by tornadoes.
# Yearly figures for the five event types with the largest total economic
# damage. The join adds the overall total (economic_damage) next to the
# per-year value (event_damage).
top_5_most_economic_damage_over_time <- inner_join(
  damage_per_event_year,
  top_5_most_economic_damage,
  by = "EVTYPE"
)
# One panel per event type showing property + crop damage per year. The axis
# titles previously read "Interval" / "Averiag number of steps", which did not
# describe the plotted data.
ggplot(
  data = top_5_most_economic_damage_over_time,
  aes(x = year, y = event_damage)
) +
  geom_col() +
  # `year` is a character column, so the breaks are given as strings.
  scale_x_discrete(
    "Year",
    breaks = c("1992", "1997", "2002", "2007", "2011")
  ) +
  ylab("Property + crop damage") +
  facet_wrap(~EVTYPE)