Synopsis

Every year natural disasters produce a wide variaty of loses. Two of them are the subject to study in this report, injuries and fatalities and properties and crops damages. Using R libraries to process data (dplyr) and plot results (ggplot2) will help us to find out the most harmful event and the one with the greatest economics consequences. At the end, we get that tornados causes the most injuries and fatalities of all events and floods causes the greatest economics consequences.

Data Processing

The first step is to load the required libraries.

library(dplyr)
library(tidyr)
library(ggplot2)
library(RColorBrewer)

Given the download url, we get the data from the cloud, then unzip it and finally load it into R.

url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
file_name <- "repdata_data_StormData.csv.bz2"

if (!file.exists(file_name)) {
    download.file(url, destfile = "repdata_data_StormData.csv.bz2")
}

data <- read.csv("repdata_data_StormData.csv.bz2", header = TRUE, sep = ",", stringsAsFactors = FALSE)

Looking at the data, it can be noticed that EVTYPE column values have many typos and synonyms for the same natural event. Using regular expressions and dplyr library, we will fix this issue. The procedure is as follows:

data <- data %>%
    filter(EVTYPE != "?", !grepl("^SUMMARY", EVTYPE)) %>%
    mutate(EVTYPE = toupper(EVTYPE),
           EVTYPE = replace(EVTYPE, grepl("WIND|WND|TURBULENCE", EVTYPE), "HIGH WINDS"),
           EVTYPE = replace(EVTYPE, grepl("FLO{0,3}D|URBAN", EVTYPE), "FLOODS"),
           EVTYPE = replace(EVTYPE, grepl("AVAL", EVTYPE), "AVALANCHES"),
           EVTYPE = replace(EVTYPE, grepl("LIG(HTI|HTN|NTN)", EVTYPE), "LIGHTNINGS"),
           EVTYPE = replace(EVTYPE, grepl("COLD|CHILL|LOW TEMP|COOL|FROST|HYPOTHERMIA|FREEZE", EVTYPE), "COLD TEMPERATURES"),
           EVTYPE = replace(EVTYPE, grepl("HIGH TEMP|HOT|HYPERTHERMIA|HEAT", EVTYPE), "HIGH TEMPERATURES"),
           EVTYPE = replace(EVTYPE, grepl("HIGH (SURF|SEAS|TIDE|WATER|WAVES)|SURF|SWELLS", EVTYPE), "HIGH TIDES"),
           EVTYPE = replace(EVTYPE, grepl("BLOW-OUT TIDE|SURGE|SEAS", EVTYPE), "HIGH TIDES"),
           EVTYPE = replace(EVTYPE, grepl("SNOW|BLIZ|WINT|ICE( STORM)?|FREEZ.*(RAIN|SPRAY|FOG|DRIZZLE)|SLEET", EVTYPE), "BLIZZARDS"),
           EVTYPE = replace(EVTYPE, grepl("TSTM( WND|W)?|THU", EVTYPE), "THUNDERSTORMS"),
           EVTYPE = replace(EVTYPE, grepl("WILD", EVTYPE), "FOREST FIRES"),
           EVTYPE = replace(EVTYPE, grepl("DUST", EVTYPE), "DUST STORMS"),
           EVTYPE = replace(EVTYPE, grepl("RAIN|SHOWER|PRECIP", EVTYPE), "RAIN"),
           EVTYPE = replace(EVTYPE, grepl("WAY?TER.*(SPOUT)?", EVTYPE), "WATERSPOUTS"),
           EVTYPE = replace(EVTYPE, grepl("VOL", EVTYPE), "VOLCANIC ERUPTION"),
           EVTYPE = replace(EVTYPE, grepl("LANDSPOUT|TORN", EVTYPE), "TORNADOS"),
           EVTYPE = replace(EVTYPE, grepl("HAIL.*(STORM)?", EVTYPE), "HAIL STORMS"),
           EVTYPE = replace(EVTYPE, grepl("COAST.*STORM", EVTYPE), "COASTAL STORMS"),
           EVTYPE = replace(EVTYPE, grepl("TROPICAL", EVTYPE), "TROPICAL STORMS"),
           EVTYPE = replace(EVTYPE, grepl("WET", EVTYPE), "WET WEATHER"),
           EVTYPE = replace(EVTYPE, grepl("WARM", EVTYPE), "WARM WEATHER"),
           EVTYPE = replace(EVTYPE, grepl("SLIDE", EVTYPE), "LANDSLIDE"),
           EVTYPE = replace(EVTYPE, grepl("HURRICANE|TYPHOON", EVTYPE), "HURRICANE/TYPHOON"),
           EVTYPE = replace(EVTYPE, grepl("CLOUD", EVTYPE), "CLOUDS"),
           EVTYPE = replace(EVTYPE, grepl("RIP", EVTYPE), "RIP CURRENTS"),
           EVTYPE = replace(EVTYPE, grepl("EROSI", EVTYPE), "COASTAL EROSION"),
           EVTYPE = replace(EVTYPE, grepl("FIRE", EVTYPE), "FIRES"),
           EVTYPE = replace(EVTYPE, grepl("FOG", EVTYPE), "FOG"),
           EVTYPE = replace(EVTYPE, grepl("DAM (BREAK|FAILURE)", EVTYPE), "DAM FAILURES"),
           EVTYPE = replace(EVTYPE, grepl("SMOKE", EVTYPE), "SMOKE"),
           EVTYPE = replace(EVTYPE, grepl("DR(Y|IEST|OUGHT)", EVTYPE), "DROUGHT"))

To get a ranking of types of events with most harmful effects on population health we will consider only three columns (EVTYPE, INJURIES, FATALITIES). We will apply the following steps on the data:

most_harmful_events <- data %>%
                            group_by(EVTYPE) %>%
                            summarise(INJURIES = sum(INJURIES), FATALITIES = sum(FATALITIES)) %>%
                            mutate(TOTAL = INJURIES + FATALITIES) %>%
                            arrange(desc(TOTAL)) %>%
                            top_n(5) %>%
                            gather(key=DMGTYPE, value=QTY, -EVTYPE)

In order to get the data required to measure the event with the greatest economics consequences, we apply the following steps:

most_costly_events <- data %>%
            select(EVTYPE, PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP) %>%
            mutate(PROPDMGEXP = replace(PROPDMGEXP, !(tolower(PROPDMGEXP) %in% c("b", "m", "k")), "0"),
                   PROPDMGEXP = replace(PROPDMGEXP, tolower(PROPDMGEXP) == "k", "0.000001"),
                   PROPDMGEXP = replace(PROPDMGEXP, tolower(PROPDMGEXP) == "m", "0.001"),
                   PROPDMGEXP = replace(PROPDMGEXP, tolower(PROPDMGEXP) == "b", "1"),
                   CROPDMGEXP = replace(CROPDMGEXP, !(tolower(CROPDMGEXP) %in% c("b", "m", "k")), "0"),
                   CROPDMGEXP = replace(CROPDMGEXP, tolower(CROPDMGEXP) == "k", "0.000001"),
                   CROPDMGEXP = replace(CROPDMGEXP, tolower(CROPDMGEXP) == "m", "0.001"),
                   CROPDMGEXP = replace(CROPDMGEXP, tolower(CROPDMGEXP) == "b", "1"),
                   PROPDMGEXP = as.numeric(PROPDMGEXP),
                   CROPDMGEXP = as.numeric(CROPDMGEXP),
                   TOTAL_PROP = PROPDMG * PROPDMGEXP,
                   TOTAL_CROP = CROPDMG * CROPDMGEXP) %>%
            select(EVTYPE, TOTAL_PROP, TOTAL_CROP) %>%
            group_by(EVTYPE) %>%
            summarise(PROP = sum(TOTAL_PROP), CROP = sum(TOTAL_CROP)) %>%
            mutate(TOTAL = PROP + CROP) %>%
            arrange(desc(TOTAL)) %>%
            top_n(5) %>%
            gather(key=DMGTYPE, value=AMOUNT, -EVTYPE)

Results

Plotting our results will help us to get insight from the data. For the most harmful to population health events we have:

color_pallete <- brewer.pal(3, "Dark2")
line_color <- color_pallete[3]
legend_labels <- c("Fatalities", "Injuries", "Total")
most_harmful_events_without_total <- most_harmful_events %>% filter(DMGTYPE != "TOTAL")
most_harmful_events_totals <- most_harmful_events %>% filter(DMGTYPE == "TOTAL")

ggplot(most_harmful_events_without_total, aes(x=reorder(EVTYPE, -QTY), y=QTY, fill=factor(DMGTYPE))) +
    geom_bar(stat = "identity", position = "dodge") +
    scale_fill_manual(name="Damage type", values=color_pallete, labels=legend_labels) +
    geom_line(data = most_harmful_events_totals, aes(x=EVTYPE, y=QTY, group=1), colour=line_color) +
    xlab("Event") + ylab("Quantity") +
    labs(title = "Top 5 most harmful to population health event types", caption = "Figure 1")

From Figure 1, we have the following insights:

The same reasoning can be applied for the events with the greatest economics consequences:

legend_labels <- c("Crops", "Properties", "Total")
most_costly_events_without_total <- most_costly_events %>% filter(DMGTYPE != "TOTAL")
most_costly_events_totals <- most_costly_events %>% filter(DMGTYPE == "TOTAL")

ggplot(most_costly_events_without_total, aes(x=reorder(EVTYPE, -AMOUNT), y=AMOUNT, fill=factor(DMGTYPE))) +
    geom_bar(stat = "identity", position = "dodge") +
    scale_fill_manual(name="Damage type", values=color_pallete, labels=legend_labels) +
    geom_line(data = most_costly_events_totals, aes(x=EVTYPE, y=AMOUNT, group=1), colour=line_color) +
    xlab("Event") + ylab("Amount in billions") +
    labs(title = "Top 5 event types with the greatest economic consequences", caption = "Figure 2")

From Figure 2, we have the following insights:

Appendix A

Why to discard observations with values different from “b”, “m” and “k”

Exploring the data, we can figure out some important insights.

bmk <- data %>% select(EVTYPE, PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP)
table(bmk$PROPDMGEXP)
## 
##             -      ?      +      0      1      2      3      4      5      6 
## 465934      1      8      5    216     25     13      4      4     28      4 
##      7      8      B      h      H      K      m      M 
##      5      1     40      1      6 424665      7  11330
table(bmk$PROPDMG[bmk$PROPDMGEXP == ""])
## 
##      0   0,41      1      2      3      4      5      6      7      8      9 
## 465858      1      4      7     16      9     11      6      3      2      3 
##     10     20     35     75 
##      8      4      1      1
table(bmk$PROPDMG[tolower(bmk$PROPDMGEXP) == "h"])
## 
## 2 3 5 
## 2 1 4
table(bmk$CROPDMGEXP)
## 
##             ?      0      2      B      k      K      m      M 
## 618413      7     19      1      9     21 281832      1   1994
table(bmk$CROPDMG[bmk$CROPDMGEXP == ""])
## 
##      0      3      4 
## 618410      1      2

For PROPDMGEXP:

  • There are 465934 missing values.
  • Corresponding PROPDMG value is 0 for 465858 observations and less than 75 for 76 observations (0.0084%).
  • In the documentation, no definition for the ‘h’ value was provided (only ‘k’, ‘m’ and ‘b’).
  • There are 7 ‘h’ and ‘H’ values with PROPDMG less than 5.
  • 314 observations (0.035%) have numeric or special characters as values.

For CROPDMGEXP:

  • For CROPDMGEXP there are 618413 missing values.
  • Corresponding CROPDMG value is 0 for 618410 observations and less than 4 for 3 observations (0.0003%).
  • 36 observations (0.004%) have numeric or special characters as values.

Conclusion: To avoid noise due to unknown data, we remove from the total calculations numeric values with EXP different from ‘b’, ‘m’ and ‘k’.