Every year natural disasters produce a wide variaty of loses. Two of them are the subject to study in this report, injuries and fatalities and properties and crops damages. Using R libraries to process data (dplyr) and plot results (ggplot2) will help us to find out the most harmful event and the one with the greatest economics consequences. At the end, we get that tornados causes the most injuries and fatalities of all events and floods causes the greatest economics consequences.
The first step is to load the required libraries.
library(dplyr)
library(tidyr)
library(ggplot2)
library(RColorBrewer)
Given the download url, we get the data from the cloud, then unzip it and finally load it into R.
url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
file_name <- "repdata_data_StormData.csv.bz2"
if (!file.exists(file_name)) {
download.file(url, destfile = "repdata_data_StormData.csv.bz2")
}
data <- read.csv("repdata_data_StormData.csv.bz2", header = TRUE, sep = ",", stringsAsFactors = FALSE)
Looking at the data, it can be noticed that EVTYPE column values have many typos and synonyms for the same natural event. Using regular expressions and dplyr library, we will fix this issue. The procedure is as follows:
data <- data %>%
filter(EVTYPE != "?", !grepl("^SUMMARY", EVTYPE)) %>%
mutate(EVTYPE = toupper(EVTYPE),
EVTYPE = replace(EVTYPE, grepl("WIND|WND|TURBULENCE", EVTYPE), "HIGH WINDS"),
EVTYPE = replace(EVTYPE, grepl("FLO{0,3}D|URBAN", EVTYPE), "FLOODS"),
EVTYPE = replace(EVTYPE, grepl("AVAL", EVTYPE), "AVALANCHES"),
EVTYPE = replace(EVTYPE, grepl("LIG(HTI|HTN|NTN)", EVTYPE), "LIGHTNINGS"),
EVTYPE = replace(EVTYPE, grepl("COLD|CHILL|LOW TEMP|COOL|FROST|HYPOTHERMIA|FREEZE", EVTYPE), "COLD TEMPERATURES"),
EVTYPE = replace(EVTYPE, grepl("HIGH TEMP|HOT|HYPERTHERMIA|HEAT", EVTYPE), "HIGH TEMPERATURES"),
EVTYPE = replace(EVTYPE, grepl("HIGH (SURF|SEAS|TIDE|WATER|WAVES)|SURF|SWELLS", EVTYPE), "HIGH TIDES"),
EVTYPE = replace(EVTYPE, grepl("BLOW-OUT TIDE|SURGE|SEAS", EVTYPE), "HIGH TIDES"),
EVTYPE = replace(EVTYPE, grepl("SNOW|BLIZ|WINT|ICE( STORM)?|FREEZ.*(RAIN|SPRAY|FOG|DRIZZLE)|SLEET", EVTYPE), "BLIZZARDS"),
EVTYPE = replace(EVTYPE, grepl("TSTM( WND|W)?|THU", EVTYPE), "THUNDERSTORMS"),
EVTYPE = replace(EVTYPE, grepl("WILD", EVTYPE), "FOREST FIRES"),
EVTYPE = replace(EVTYPE, grepl("DUST", EVTYPE), "DUST STORMS"),
EVTYPE = replace(EVTYPE, grepl("RAIN|SHOWER|PRECIP", EVTYPE), "RAIN"),
EVTYPE = replace(EVTYPE, grepl("WAY?TER.*(SPOUT)?", EVTYPE), "WATERSPOUTS"),
EVTYPE = replace(EVTYPE, grepl("VOL", EVTYPE), "VOLCANIC ERUPTION"),
EVTYPE = replace(EVTYPE, grepl("LANDSPOUT|TORN", EVTYPE), "TORNADOS"),
EVTYPE = replace(EVTYPE, grepl("HAIL.*(STORM)?", EVTYPE), "HAIL STORMS"),
EVTYPE = replace(EVTYPE, grepl("COAST.*STORM", EVTYPE), "COASTAL STORMS"),
EVTYPE = replace(EVTYPE, grepl("TROPICAL", EVTYPE), "TROPICAL STORMS"),
EVTYPE = replace(EVTYPE, grepl("WET", EVTYPE), "WET WEATHER"),
EVTYPE = replace(EVTYPE, grepl("WARM", EVTYPE), "WARM WEATHER"),
EVTYPE = replace(EVTYPE, grepl("SLIDE", EVTYPE), "LANDSLIDE"),
EVTYPE = replace(EVTYPE, grepl("HURRICANE|TYPHOON", EVTYPE), "HURRICANE/TYPHOON"),
EVTYPE = replace(EVTYPE, grepl("CLOUD", EVTYPE), "CLOUDS"),
EVTYPE = replace(EVTYPE, grepl("RIP", EVTYPE), "RIP CURRENTS"),
EVTYPE = replace(EVTYPE, grepl("EROSI", EVTYPE), "COASTAL EROSION"),
EVTYPE = replace(EVTYPE, grepl("FIRE", EVTYPE), "FIRES"),
EVTYPE = replace(EVTYPE, grepl("FOG", EVTYPE), "FOG"),
EVTYPE = replace(EVTYPE, grepl("DAM (BREAK|FAILURE)", EVTYPE), "DAM FAILURES"),
EVTYPE = replace(EVTYPE, grepl("SMOKE", EVTYPE), "SMOKE"),
EVTYPE = replace(EVTYPE, grepl("DR(Y|IEST|OUGHT)", EVTYPE), "DROUGHT"))
To get a ranking of types of events with most harmful effects on population health we will consider only three columns (EVTYPE, INJURIES, FATALITIES). We will apply the following steps on the data:
most_harmful_events <- data %>%
group_by(EVTYPE) %>%
summarise(INJURIES = sum(INJURIES), FATALITIES = sum(FATALITIES)) %>%
mutate(TOTAL = INJURIES + FATALITIES) %>%
arrange(desc(TOTAL)) %>%
top_n(5) %>%
gather(key=DMGTYPE, value=QTY, -EVTYPE)
In order to get the data required to measure the event with the greatest economics consequences, we apply the following steps:
most_costly_events <- data %>%
select(EVTYPE, PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP) %>%
mutate(PROPDMGEXP = replace(PROPDMGEXP, !(tolower(PROPDMGEXP) %in% c("b", "m", "k")), "0"),
PROPDMGEXP = replace(PROPDMGEXP, tolower(PROPDMGEXP) == "k", "0.000001"),
PROPDMGEXP = replace(PROPDMGEXP, tolower(PROPDMGEXP) == "m", "0.001"),
PROPDMGEXP = replace(PROPDMGEXP, tolower(PROPDMGEXP) == "b", "1"),
CROPDMGEXP = replace(CROPDMGEXP, !(tolower(CROPDMGEXP) %in% c("b", "m", "k")), "0"),
CROPDMGEXP = replace(CROPDMGEXP, tolower(CROPDMGEXP) == "k", "0.000001"),
CROPDMGEXP = replace(CROPDMGEXP, tolower(CROPDMGEXP) == "m", "0.001"),
CROPDMGEXP = replace(CROPDMGEXP, tolower(CROPDMGEXP) == "b", "1"),
PROPDMGEXP = as.numeric(PROPDMGEXP),
CROPDMGEXP = as.numeric(CROPDMGEXP),
TOTAL_PROP = PROPDMG * PROPDMGEXP,
TOTAL_CROP = CROPDMG * CROPDMGEXP) %>%
select(EVTYPE, TOTAL_PROP, TOTAL_CROP) %>%
group_by(EVTYPE) %>%
summarise(PROP = sum(TOTAL_PROP), CROP = sum(TOTAL_CROP)) %>%
mutate(TOTAL = PROP + CROP) %>%
arrange(desc(TOTAL)) %>%
top_n(5) %>%
gather(key=DMGTYPE, value=AMOUNT, -EVTYPE)
Plotting our results will help us to get insight from the data. For the most harmful to population health events we have:
color_pallete <- brewer.pal(3, "Dark2")
line_color <- color_pallete[3]
legend_labels <- c("Fatalities", "Injuries", "Total")
most_harmful_events_without_total <- most_harmful_events %>% filter(DMGTYPE != "TOTAL")
most_harmful_events_totals <- most_harmful_events %>% filter(DMGTYPE == "TOTAL")
ggplot(most_harmful_events_without_total, aes(x=reorder(EVTYPE, -QTY), y=QTY, fill=factor(DMGTYPE))) +
geom_bar(stat = "identity", position = "dodge") +
scale_fill_manual(name="Damage type", values=color_pallete, labels=legend_labels) +
geom_line(data = most_harmful_events_totals, aes(x=EVTYPE, y=QTY, group=1), colour=line_color) +
xlab("Event") + ylab("Quantity") +
labs(title = "Top 5 most harmful to population health event types", caption = "Figure 1")
From Figure 1, we have the following insights:
The same reasoning can be applied for the events with the greatest economics consequences:
legend_labels <- c("Crops", "Properties", "Total")
most_costly_events_without_total <- most_costly_events %>% filter(DMGTYPE != "TOTAL")
most_costly_events_totals <- most_costly_events %>% filter(DMGTYPE == "TOTAL")
ggplot(most_costly_events_without_total, aes(x=reorder(EVTYPE, -AMOUNT), y=AMOUNT, fill=factor(DMGTYPE))) +
geom_bar(stat = "identity", position = "dodge") +
scale_fill_manual(name="Damage type", values=color_pallete, labels=legend_labels) +
geom_line(data = most_costly_events_totals, aes(x=EVTYPE, y=AMOUNT, group=1), colour=line_color) +
xlab("Event") + ylab("Amount in billions") +
labs(title = "Top 5 event types with the greatest economic consequences", caption = "Figure 2")
From Figure 2, we have the following insights:
Exploring the data, we can figure out some important insights.
bmk <- data %>% select(EVTYPE, PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP)
table(bmk$PROPDMGEXP)
##
## - ? + 0 1 2 3 4 5 6
## 465934 1 8 5 216 25 13 4 4 28 4
## 7 8 B h H K m M
## 5 1 40 1 6 424665 7 11330
table(bmk$PROPDMG[bmk$PROPDMGEXP == ""])
##
## 0 0,41 1 2 3 4 5 6 7 8 9
## 465858 1 4 7 16 9 11 6 3 2 3
## 10 20 35 75
## 8 4 1 1
table(bmk$PROPDMG[tolower(bmk$PROPDMGEXP) == "h"])
##
## 2 3 5
## 2 1 4
table(bmk$CROPDMGEXP)
##
## ? 0 2 B k K m M
## 618413 7 19 1 9 21 281832 1 1994
table(bmk$CROPDMG[bmk$CROPDMGEXP == ""])
##
## 0 3 4
## 618410 1 2
For PROPDMGEXP:
For CROPDMGEXP:
Conclusion: To avoid noise due to unknown data, we remove from the total calculations numeric values with EXP different from ‘b’, ‘m’ and ‘k’.