R packages used
library(tidyverse)
library(here)
library(ggplot2)
Research questions
This analysis aims at answering two research questions:
1. Across the United States, which types of events (as indicated in the EVTYPE variable) are most harmful with respect to population health?
2. Across the United States, which types of events have the greatest economic consequences?
Data Processing
For this analysis, the data is downloaded from https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2 and loaded into a data frame; the bz2-compressed CSV file is decompressed as it is read. The R code section below does this and also caches the loaded dataset to disk as an RDS file, so that later runs avoid both the repeated download and the repeated reading of the CSV file.
# Source archive and the two local files derived from it. Each path is built
# exactly once and reused, so the existence checks and the reads/writes below
# always refer to the same file.
url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
destfile <- file.path(here::here(), "stormdata.csv.bz2")
rdsfile <- file.path(here::here(), "stormdata.rds")

# downloading the file only if the file is not found on disk already
if (!file.exists(destfile)) {
  # Download under a temporary name and rename only on success: an interrupted
  # transfer must never leave a truncated archive at `destfile`, because the
  # file.exists() check above would then skip the download on every later run.
  partfile <- paste0(destfile, ".part")
  # mode = "wb": the archive is binary and has to be written byte-for-byte.
  status <- download.file(url, partfile, method = "auto", mode = "wb")
  if (status != 0 || !file.rename(partfile, destfile)) {
    unlink(partfile)
    stop("Download of ", url, " failed", call. = FALSE)
  }
}

# loading the data from the csv file and creating a disk-cached RDS file
# skipped if the RDS file exists already
if (!file.exists(rdsfile)) {
  # the compressed archive is passed to read_csv directly
  df <- readr::read_csv(file = destfile)
  # creating a disk-cache of the dataframe as-is after loading with
  # readr::read_csv defaults
  saveRDS(object = df, file = rdsfile)
} else {
  # loading data from a disk-cached RDS file
  df <- readRDS(file = rdsfile)
}
Description and justification of data transformations
The dataset is prepared / enriched by adding a column that contains the YEAR of the event, as reported in the BGN_DATE column. Also, all event type values (EVTYPE) are converted to lower case in order to make sure that events of the same type are treated as such, regardless of letter casing in the data.
# Derived / normalised columns, computed in one step:
#   YEAR   - the four-digit year captured from BGN_DATE values of the form
#            "m/d/yyyy 0:00:00", converted to integer (a value that does not
#            match the pattern is left unchanged by gsub and therefore
#            becomes NA in the integer conversion)
#   EVTYPE - lower-cased, so that event types differing only in letter case
#            fall into the same group
df <- dplyr::mutate(
  df,
  YEAR = as.integer(
    gsub("^\\d{1,2}/\\d{1,2}/(\\d{4})\\s0:00:00$", "\\1", BGN_DATE)
  ),
  EVTYPE = tolower(EVTYPE)
)

# STATE__, BGN_DATE and YEAR go first; every other column follows in its
# existing order.
df <- dplyr::select(df, STATE__, BGN_DATE, YEAR, everything())
Analysis
Question 1: For each event type (EVTYPE), the sum across all available years is calculated for FATALITIES and for INJURIES. For each of FATALITIES and INJURIES, the top 5 event types are selected and presented in the Results section. Then, the top event type for each of FATALITIES and INJURIES is selected and a time series dataset of its yearly totals is created, for use in the plot below.
# Totals of FATALITIES and INJURIES per event type, over every year on record.
df_1 <- dplyr::summarise(
  dplyr::group_by(df, EVTYPE),
  FATALITIES_sum = sum(FATALITIES),
  INJURIES_sum = sum(INJURIES)
)

# Five event types with the highest fatality totals, largest first.
df_1_fatal_top5 <- df_1 %>%
  dplyr::select(EVTYPE, FATALITIES_sum) %>%
  dplyr::arrange(dplyr::desc(FATALITIES_sum)) %>%
  head(5)

# Five event types with the highest injury totals, largest first.
df_1_injur_top5 <- df_1 %>%
  dplyr::select(EVTYPE, INJURIES_sum) %>%
  dplyr::arrange(dplyr::desc(INJURIES_sum)) %>%
  head(5)

# Name of the single worst event type for each measure. The top-5 tables are
# already sorted in descending order, so their first row is the overall top 1.
fatal_top_1 <- head(df_1_fatal_top5$EVTYPE, 1)
injur_top_1 <- head(df_1_injur_top5$EVTYPE, 1)

# Yearly fatality totals for the event type with the most fatalities.
fatal_label <- paste("FATALITIES by", fatal_top_1)
df_1_fatal_timeseries <- df %>%
  dplyr::filter(EVTYPE == fatal_top_1) %>%
  dplyr::group_by(YEAR) %>%
  dplyr::summarize(COUNT = sum(FATALITIES)) %>%
  dplyr::mutate(SERIES = fatal_label)

# Yearly injury totals for the event type with the most injuries.
injur_label <- paste("INJURIES by", injur_top_1)
df_1_injur_timeseries <- df %>%
  dplyr::filter(EVTYPE == injur_top_1) %>%
  dplyr::group_by(YEAR) %>%
  dplyr::summarize(COUNT = sum(INJURIES)) %>%
  dplyr::mutate(SERIES = injur_label)

# Both series stacked in long form (YEAR, COUNT, SERIES) for plotting.
df_1_timeseries <- dplyr::bind_rows(
  df_1_fatal_timeseries,
  df_1_injur_timeseries
)
Question 2: For each event type (EVTYPE), the sum across all available years is calculated for PROPDMG and for CROPDMG. For each of PROPDMG and CROPDMG, the top 5 event types are selected and presented in the Results section. Then, the top event type for each of PROPDMG and CROPDMG is selected and a time series dataset of its yearly totals is created, for use in the plot below.
# NOTE(review): in the NOAA storm database PROPDMG and CROPDMG hold only the
# leading digits of each damage estimate; the order of magnitude is recorded
# in a companion code column (PROPDMGEXP / CROPDMGEXP: "K" = thousands,
# "M" = millions, "B" = billions). Summing the raw columns adds thousands,
# millions and billions as if they were the same unit, so every amount is
# scaled to dollars before aggregation. Column names are taken from the
# dataset documentation - confirm against names(df).

# Translate a vector of magnitude codes into numeric multipliers.
# Codes are matched case-insensitively; anything other than K / M / B
# (blank, NA, digits, symbols) gets a multiplier of 1, i.e. the recorded
# amount is kept unscaled.
damage_multiplier <- function(exp_code) {
  scale <- c(K = 1e3, M = 1e6, B = 1e9)
  mult <- unname(scale[toupper(as.character(exp_code))])
  mult[is.na(mult)] <- 1
  mult
}

# If type guessing at load time turned a code column into a non-text column,
# the codes are gone and the scaling below silently degrades to a factor of 1;
# make that visible instead of reporting wrong totals without notice.
if (!is.character(df$PROPDMGEXP) || !is.character(df$CROPDMGEXP)) {
  warning(
    "PROPDMGEXP / CROPDMGEXP were not read as text; ",
    "magnitude codes may have been lost when the CSV was loaded",
    call. = FALSE
  )
}

# damage amounts expressed in (nominal) dollars
df_2_scaled <- df %>%
  dplyr::mutate(
    PROPDMG_USD = PROPDMG * damage_multiplier(PROPDMGEXP),
    CROPDMG_USD = CROPDMG * damage_multiplier(CROPDMGEXP)
  )

# calculating sum of property and crop damage for each EVTYPE across all years
df_2 <- df_2_scaled %>%
  dplyr::group_by(EVTYPE) %>%
  dplyr::summarize(
    PROPDMG_sum = sum(PROPDMG_USD),
    CROPDMG_sum = sum(CROPDMG_USD)
  )

# filtering out the top-5 event types with regards to property damages
df_2_prop_top5 <- df_2 %>%
  dplyr::select(EVTYPE, PROPDMG_sum) %>%
  dplyr::arrange(dplyr::desc(PROPDMG_sum)) %>%
  dplyr::slice(1:5)

# filtering out the top-5 event types with regards to crops damages
df_2_crop_top5 <- df_2 %>%
  dplyr::select(EVTYPE, CROPDMG_sum) %>%
  dplyr::arrange(dplyr::desc(CROPDMG_sum)) %>%
  dplyr::slice(1:5)

# getting the event type name of the top 1 event type
# with regards to property damage
prop_top_1 <- df_2 %>%
  dplyr::select(EVTYPE, PROPDMG_sum) %>%
  dplyr::arrange(dplyr::desc(PROPDMG_sum)) %>%
  dplyr::slice(1:1) %>%
  dplyr::pull(EVTYPE)

# getting the event type name of the top 1 event type
# with regards to crops damage
crop_top_1 <- df_2 %>%
  dplyr::select(EVTYPE, CROPDMG_sum) %>%
  dplyr::arrange(dplyr::desc(CROPDMG_sum)) %>%
  dplyr::slice(1:1) %>%
  dplyr::pull(EVTYPE)

# yearly timeseries data for the top 1 property damage event type
df_2_prop_timeseries <- df_2_scaled %>%
  dplyr::filter(EVTYPE == prop_top_1) %>%
  dplyr::group_by(YEAR) %>%
  dplyr::summarise(VALUE = sum(PROPDMG_USD)) %>%
  dplyr::mutate(SERIES = paste("PROPERTY DAMAGE by", prop_top_1))

# yearly timeseries data for the top 1 crops damage event type
df_2_crop_timeseries <- df_2_scaled %>%
  dplyr::filter(EVTYPE == crop_top_1) %>%
  dplyr::group_by(YEAR) %>%
  dplyr::summarise(VALUE = sum(CROPDMG_USD)) %>%
  dplyr::mutate(SERIES = paste("CROPS DAMAGE by", crop_top_1))

# combining timeseries data for property damage and crops damage
df_2_timeseries <- dplyr::bind_rows(df_2_prop_timeseries, df_2_crop_timeseries)
Results
Below is a display of the Top-5 event types by order of number of fatalities, number of injuries, value of property damage and value of crops damage. From this we can conclude that from the year 1950 to the year 2011:
- tornado is the weather event type the most harmful to population health in terms of lives lost,
with a registered total death toll of 5633 lives.
- tornado is the weather event type the most harmful to population health in terms of people injured,
with a registered total count of 91346 people injured.
- tornado is the weather event type with the greatest economic consequences for property,
with a registered total nominal value of property damage of 3212258.
- hail is the weather event type with the greatest economic consequences for crops,
with a registered total nominal value of crop damage of 579596.3.
# Render the fatalities ranking as a table with row numbers.
# TRUE is spelled out: the shorthand T is an ordinary variable that can be reassigned.
knitr::kable(df_1_fatal_top5, row.names = TRUE, caption = "Top 5 weather events in terms of total fatalities")
| | EVTYPE | FATALITIES_sum |
|---|---|---|
| 1 | tornado | 5633 |
| 2 | excessive heat | 1903 |
| 3 | flash flood | 978 |
| 4 | heat | 937 |
| 5 | lightning | 816 |
# Render the injuries ranking as a table with row numbers.
# TRUE is spelled out: the shorthand T is an ordinary variable that can be reassigned.
knitr::kable(df_1_injur_top5, row.names = TRUE, caption = "Top 5 weather events in terms of total injuries")
| | EVTYPE | INJURIES_sum |
|---|---|---|
| 1 | tornado | 91346 |
| 2 | tstm wind | 6957 |
| 3 | flood | 6789 |
| 4 | excessive heat | 6525 |
| 5 | lightning | 5230 |
# Render the property damage ranking as a table with row numbers.
# TRUE is spelled out: the shorthand T is an ordinary variable that can be reassigned.
knitr::kable(df_2_prop_top5, row.names = TRUE, caption = "Top 5 weather events in terms of property damages")
| | EVTYPE | PROPDMG_sum |
|---|---|---|
| 1 | tornado | 3212258.2 |
| 2 | flash flood | 1420174.6 |
| 3 | tstm wind | 1336103.6 |
| 4 | flood | 899938.5 |
| 5 | thunderstorm wind | 876844.2 |
# Render the crop damage ranking as a table with row numbers.
# TRUE is spelled out: the shorthand T is an ordinary variable that can be reassigned.
knitr::kable(df_2_crop_top5, row.names = TRUE, caption = "Top 5 weather events in terms of crops damages")
| | EVTYPE | CROPDMG_sum |
|---|---|---|
| 1 | hail | 579596.3 |
| 2 | flash flood | 179200.5 |
| 3 | flood | 168037.9 |
| 4 | tstm wind | 109202.6 |
| 5 | tornado | 100018.5 |
Plots
# Yearly COUNT per SERIES drawn as one coloured line each; the legend sits
# below the panel and the vertical major grid lines are removed.
p <- ggplot(df_1_timeseries, aes(x = YEAR)) +
  geom_line(aes(y = COUNT, colour = SERIES), size = 1) +
  labs(
    title = "Weather events most harmful to population health",
    subtitle = "Source: U.S. National Oceanic and Atmospheric Administration's (NOAA) storm database",
    caption = "",
    x = "",
    y = "Count of persons",
    colour = NULL
  ) +
  theme(
    panel.grid.major.x = element_blank(),
    legend.position = "bottom"
  )
p
# Yearly VALUE per SERIES drawn as one coloured line each; the legend sits
# below the panel and the vertical major grid lines are removed.
q <- ggplot(df_2_timeseries, aes(x = YEAR)) +
  geom_line(aes(y = VALUE, colour = SERIES), size = 1) +
  labs(
    title = "Weather events with the greatest economic consequences",
    subtitle = "Source: U.S. National Oceanic and Atmospheric Administration's (NOAA) storm database",
    caption = "",
    x = "",
    y = "Value of damages",
    colour = NULL
  ) +
  theme(
    panel.grid.major.x = element_blank(),
    legend.position = "bottom"
  )
q
(end)