# Download and read the CSV files into data dataframe
sourceFileUrl <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
download.file(sourceFileUrl, destfile = paste0(getwd(), './repdata_data_StormData.csv.bz2'))
data <- read.csv("./repdata_data_StormData.csv.bz2")
Based on the data available in NOOA Storm Database, it is possible to determine the which types of weather events are the most harmful with respect to population health and which ones have the biggest economics consequences.
As it can be seen later in the document, tornadoes are the most harmful weather events with respect to population health and flood, hurricane/typhoon, tornadoes and storm surge the ones with the highest number of damages in terms of USD.
The data for this assignment come in the form of a comma-separated-value file compressed via the bzip2 algorithm to reduce its size. You can download the file from the course web site: Storm Data [47Mb]
There is also some documentation of the database available. Here you will find how some of the variables are constructed/defined. - National Weather Service Storm Data Documentation - National Climatic Data Center Storm Events FAQ
The events in the database start in the year 1950 and end in November 2011. In the earlier years of the database there are generally fewer events recorded, most likely due to a lack of good records. More recent years should be considered more complete.
Only the relevant columns for the analysis will be kept and group_by function will be applied per each event type (EVTYPE columkn):
There is confusion on how to handle exponent value of PROPDMGEXP and CROPDMGEXP columns of the database. Due to lack of official information in the NOAA website. They need to be correctly parsed in order to determine the monetary value of the damages.
Following the advice found in this link, the values of PROPDMGEXP and CROPDMGEXP can be interpreted as follows:
library(dplyr)
events <- data %>%
select(EVTYPE, FATALITIES, INJURIES, PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP) %>%
mutate(EVTYPE = toupper(EVTYPE)) %>%
mutate(PROPDMG = PROPDMG / 1e6 * case_when(PROPDMGEXP %in% c("B", "b") ~ 1e9,
PROPDMGEXP %in% c("M", "m") ~ 1e6,
PROPDMGEXP %in% c("K", "k") ~ 1e3,
PROPDMGEXP %in% c("H", "h") ~ 1e2,
PROPDMGEXP %in% as.character(0:8) ~ 10,
PROPDMGEXP == "+" ~ 1,
TRUE ~ 0)) %>%
mutate(CROPDMG = CROPDMG / 1e6 * case_when(CROPDMGEXP %in% c("B", "b") ~ 1e9,
CROPDMGEXP %in% c("M", "m") ~ 1e6,
CROPDMGEXP %in% c("K", "k") ~ 1e3,
CROPDMGEXP %in% c("H", "h") ~ 1e2,
CROPDMGEXP %in% as.character(0:8) ~ 10,
CROPDMGEXP == "+" ~ 1,
TRUE ~ 0)) %>%
group_by(EVTYPE) %>%
summarise(Fatalities = sum(FATALITIES),
Injuries = sum(INJURIES),
Property_damage = sum(PROPDMG),
Crop_damage = sum(CROPDMG),
Total_personal_damage = Fatalities + Injuries,
Economic_impact = Crop_damage + Property_damage)
A first visual inspection can be done with a scatter plot and identify if there are specific weather events that are top contributors either to personal damages and/or big impacts on properties and crops.
As it can be seen later in the document, tornadoes are the most harmful weather events with respect to population health and flood, hurricane/typhoon, tornadoes and storm surge the ones with the highest number of damages in terms of USD.
library(dplyr)
with(events, plot(x=Total_personal_damage, y = Economic_impact, pch = 1, col = 'blue', xlab = "Sum of fatalities and injuries", ylab = "Damages (in millions USD)"))
threshold <- 9
outliers <- events %>%
mutate(contrib_pop_effects = 100*Total_personal_damage / sum(events$Total_personal_damage),
contrib_economic_impact = 100*Economic_impact / sum(events$Economic_impact)) %>%
filter(contrib_pop_effects >= threshold | contrib_economic_impact >= threshold)
par(xpd = NA)
with(outliers,
points(x = Total_personal_damage, y = Economic_impact, pch = 19, col = "blue"))
with(outliers, text(x = Total_personal_damage, y = Economic_impact, labels = EVTYPE, pos = 4, cex = 0.6))
A more detailed analysis can be done for the top 10 events for each variable.
The top 10 events with the highest effect on population health, based on the total number of fatalities and injuries, are presented in the following chart:
library(ggplot2)
library(tidyr)
library(gridExtra)
top_events_health <- events %>%
mutate(contribution_to_total = 100 * Total_personal_damage/sum(events$Total_personal_damage)) %>%
arrange(desc(Total_personal_damage)) %>%
head(10)
plot1 <- top_events_health %>%
select(EVTYPE, Fatalities, Injuries) %>%
gather(key = "Damage", value = "Measurement", Fatalities, Injuries) %>%
ggplot() + geom_col(aes(x = EVTYPE, y = Measurement, fill = Damage)) +
scale_x_discrete(limits = top_events_health$EVTYPE) + coord_flip() +
ylab("Count of fatalities and injuries") +
theme(legend.position = "top", legend.title = element_blank(), axis.title = element_blank())
plot2 <- top_events_health %>%
select(EVTYPE, Property_damage, Crop_damage) %>%
gather(key = "Damage", value = "Measurement", Property_damage, Crop_damage) %>%
ggplot() + geom_col(aes(x = EVTYPE, y = Measurement, fill = Damage)) +
scale_x_discrete(limits = top_events_health$EVTYPE) + coord_flip() +
ylab("Economic impact (in millions USD)") +
theme(legend.position = "top", legend.title = element_blank(), axis.title = element_blank())
grid.arrange(plot1, plot2, ncol = 2)
As a complement to the visual information in the chart, the associated tabular report can be seen below:
knitr::kable(top_events_health, caption = "Top N weather events based on the personal damages", digits = 2, format.args = list(decimal.mark = ".", big.mark = ","), col.names = c("Event type", "Fatalities", "Injuries", "Property damage (MUSD)", "Crop damage (MUSD)", "Total personal effects", "Total economic damage (MUSD)", "Compared to all events (%)"))
| Event type | Fatalities | Injuries | Property damage (MUSD) | Crop damage (MUSD) | Total personal effects | Total economic damage (MUSD) | Compared to all events (%) |
|---|---|---|---|---|---|---|---|
| TORNADO | 5,633 | 91,346 | 56,937.16 | 414.95 | 96,979 | 57,352.12 | 62.30 |
| EXCESSIVE HEAT | 1,903 | 6,525 | 7.75 | 492.40 | 8,428 | 500.16 | 5.41 |
| TSTM WIND | 504 | 6,957 | 4,484.96 | 554.01 | 7,461 | 5,038.97 | 4.79 |
| FLOOD | 470 | 6,789 | 144,657.71 | 5,661.97 | 7,259 | 150,319.68 | 4.66 |
| LIGHTNING | 816 | 5,230 | 928.66 | 12.09 | 6,046 | 940.75 | 3.88 |
| HEAT | 937 | 2,100 | 1.80 | 401.46 | 3,037 | 403.26 | 1.95 |
| FLASH FLOOD | 978 | 1,777 | 16,140.82 | 1,421.32 | 2,755 | 17,562.13 | 1.77 |
| ICE STORM | 89 | 1,975 | 3,944.93 | 5,022.11 | 2,064 | 8,967.04 | 1.33 |
| THUNDERSTORM WIND | 133 | 1,488 | 3,483.12 | 414.84 | 1,621 | 3,897.97 | 1.04 |
| WINTER STORM | 206 | 1,321 | 6,688.50 | 26.94 | 1,527 | 6,715.44 | 0.98 |
library(ggplot2)
library(tidyr)
library(gridExtra)
top_events_economic <- events %>%
arrange(desc(Economic_impact)) %>%
mutate(contribution_to_total = 100 * Economic_impact/sum(events$Economic_impact)) %>%
head(10)
plot1 <- top_events_economic %>%
select(EVTYPE, Property_damage, Crop_damage) %>%
gather(key = "Damage", value = "Measurement", Property_damage, Crop_damage) %>%
ggplot() + geom_col(aes(x = EVTYPE, y = Measurement, fill = Damage)) +
scale_x_discrete(limits = top_events_economic$EVTYPE) + coord_flip() +
ylab("Economic impact (in millions USD)") +
theme(legend.position = "top", legend.title = element_blank(), axis.title.x = element_blank())
plot2 <- top_events_economic %>%
select(EVTYPE, Fatalities, Injuries) %>%
gather(key = "Damage", value = "Measurement", Fatalities, Injuries) %>%
ggplot() + geom_col(aes(x = EVTYPE, y = Measurement, fill = Damage)) +
scale_x_discrete(limits = top_events_economic$EVTYPE) + coord_flip() +
ylab("Count of fatalities and injuries") +
theme(legend.position = "top", legend.title = element_blank(), axis.title.x = element_blank())
grid.arrange(plot1, plot2, ncol = 2)
As a complement to the visual information in the chart, the associated tabular report can be seen below:
knitr::kable(top_events_economic, caption = "Top N weather events based on economic damages", digits = 2, format.args = list(decimal.mark = ".", big.mark = ","), col.names = c("Event type", "Fatalities", "Injuries", "Property damage (MUSD)", "Crop damage (MUSD)", "Total personal effects", "Total economic damage (MUSD)", "Compared to all events (%)"))
| Event type | Fatalities | Injuries | Property damage (MUSD) | Crop damage (MUSD) | Total personal effects | Total economic damage (MUSD) | Compared to all events (%) |
|---|---|---|---|---|---|---|---|
| FLOOD | 470 | 6,789 | 144,657.71 | 5,661.97 | 7,259 | 150,319.68 | 31.55 |
| HURRICANE/TYPHOON | 64 | 1,275 | 69,305.84 | 2,607.87 | 1,339 | 71,913.71 | 15.09 |
| TORNADO | 5,633 | 91,346 | 56,937.16 | 414.95 | 96,979 | 57,352.12 | 12.04 |
| STORM SURGE | 13 | 38 | 43,323.54 | 0.00 | 51 | 43,323.54 | 9.09 |
| HAIL | 15 | 1,361 | 15,732.27 | 3,025.95 | 1,376 | 18,758.22 | 3.94 |
| FLASH FLOOD | 978 | 1,777 | 16,140.82 | 1,421.32 | 2,755 | 17,562.13 | 3.69 |
| DROUGHT | 0 | 4 | 1,046.11 | 13,972.57 | 4 | 15,018.67 | 3.15 |
| HURRICANE | 61 | 46 | 11,868.32 | 2,741.91 | 107 | 14,610.23 | 3.07 |
| RIVER FLOOD | 2 | 2 | 5,118.95 | 5,029.46 | 4 | 10,148.40 | 2.13 |
| ICE STORM | 89 | 1,975 | 3,944.93 | 5,022.11 | 2,064 | 8,967.04 | 1.88 |