Synopsis

This analysis finds the most harmful weather events in the USA with respect to population health as well as events that have the greatest economic consequences. The analysis uses the NOAA Storm Database available at the following URL: https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2.
To determine the most harmful weather events with respect to population health, the analysis finds the 5 event types that caused the most fatalities and the 5 event types that caused the most injuries. Likewise, to determine the weather events that have the greatest economic consequences, the analysis finds the 5 event types that caused the most property damage and the 5 event types that caused the most crop damage.

Events that caused most health damage - data processing

# Attach every package the analysis relies on. A package that is not yet
# available is installed first; if it still cannot be attached afterwards
# the script stops with an error naming that package.
for (pkg in c("lubridate", "dplyr", "ggplot2")) {
        if (!require(pkg, character.only = TRUE)) {
                install.packages(pkg)
                if (!require(pkg, character.only = TRUE)) {
                        stop("Unable to load '", pkg, "' library")
                }
        }
}
rm(pkg)

# Read the bzip2-compressed CSV from the working directory, then turn the
# event start date from month/day/year text into Date values.
stormdata <- read.csv(bzfile("repdata-data-StormData.csv.bz2"))
stormdata[["BGN_DATE"]] <- as.Date(stormdata[["BGN_DATE"]],
                                   format = "%m/%d/%Y")

Let’s have a look at the number of distinct event types recorded in each year:

# Cross-tabulate event type (rows) against the year each event began
# (columns), then count per year how many event types have at least one record.
tbl <- table(stormdata[["EVTYPE"]], year(stormdata[["BGN_DATE"]]))
colSums(tbl > 0)
## 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 
##    1    1    1    1    1    3    3    3    3    3    3    3    3    3    3 
## 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 
##    3    3    3    3    3    3    3    3    3    3    3    3    3    3    3 
## 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 
##    3    3    3    3    3    3    3    3    3    3    3    3    3  160  267 
## 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 
##  387  228  170  126  121  112  122   99   51   38   46   50   46   46   46 
## 2010 2011 
##   46   46

Obviously, event records starting from 1993 are much more complete, so we will use the period of 1993 - 2011 for our analysis.

# Keep only records from 1993 onwards and drop the columns the analysis does
# not use. The year is compared as a number: comparing it with the string
# "1992" would coerce the year to text and compare the two as strings.
stormdata <- subset(stormdata, year(BGN_DATE) >= 1993,
                    select = c(BGN_DATE, EVTYPE, FATALITIES:CROPDMGEXP))

# Total fatalities and total injuries per event type
health_damage <- stormdata %>%
        group_by(EVTYPE) %>%
        summarize(fatalities_total = sum(FATALITIES),
                  injuries_total = sum(INJURIES))

# Five event types with the highest totals. Columns are picked by name and
# head() returns fewer rows (instead of NA-filled rows) when there are fewer
# than five event types.
top5_fatalities <- health_damage %>%
        arrange(desc(fatalities_total)) %>%
        select(EVTYPE, fatalities_total) %>%
        head(5)
top5_injuries <- health_damage %>%
        arrange(desc(injuries_total)) %>%
        select(EVTYPE, injuries_total) %>%
        head(5)

Events that caused most health damage - results

Top 5 event types that caused most fatalities:

# Display the five event types with the largest fatality totals
top5_fatalities
## Source: local data frame [5 x 2]
## 
##           EVTYPE fatalities_total
## 1 EXCESSIVE HEAT             1903
## 2        TORNADO             1621
## 3    FLASH FLOOD              978
## 4           HEAT              937
## 5      LIGHTNING              816

Top 5 event types that caused most injuries:

# Display the five event types with the largest injury totals
top5_injuries
## Source: local data frame [5 x 2]
## 
##           EVTYPE injuries_total
## 1        TORNADO          23310
## 2          FLOOD           6789
## 3 EXCESSIVE HEAT           6525
## 4      LIGHTNING           5230
## 5      TSTM WIND           3631

Therefore, event types that are most harmful with respect to population health are as follows:

# Merge both top-5 lists, keeping each event type once in first-seen order
unique(c(as.character(top5_fatalities$EVTYPE),
         as.character(top5_injuries$EVTYPE)))
## [1] "EXCESSIVE HEAT" "TORNADO"        "FLASH FLOOD"    "HEAT"          
## [5] "LIGHTNING"      "FLOOD"          "TSTM WIND"

Let’s see how the health damage caused by the most harmful event types changed over the years:

# Records belonging to the top-5 event types for fatalities and for injuries.
# as.character() is used instead of droplevels() so the filter works whether
# EVTYPE was read as a factor or as a character vector (droplevels() has no
# method for character input).
total_fatalities_df <- subset(stormdata,
                              EVTYPE %in% as.character(top5_fatalities$EVTYPE))
total_injuries_df <- subset(stormdata,
                            EVTYPE %in% as.character(top5_injuries$EVTYPE))

# Yearly totals per event type. Both tables share the columns
# year / EVTYPE / harm_total / harm_type so they can be stacked below.
# The year column is named in group_by() rather than renamed by position.
total_fatalities_df <- total_fatalities_df %>%
        group_by(year = year(BGN_DATE), EVTYPE) %>%
        summarize(harm_total = sum(FATALITIES)) %>%
        ungroup() %>%
        mutate(harm_type = "fatalities")

total_injuries_df <- total_injuries_df %>%
        group_by(year = year(BGN_DATE), EVTYPE) %>%
        summarize(harm_total = sum(INJURIES)) %>%
        ungroup() %>%
        mutate(harm_type = "injuries")

harm_totals_df <- bind_rows(total_fatalities_df, total_injuries_df)

# One panel per harm type, each with its own y scale. The y axis shows
# summed fatalities / injuries, not a count of events.
ggplot(data = harm_totals_df,
       aes(x = year, y = harm_total, color = EVTYPE)) +
        geom_line(size = 2, alpha = 0.7) +
        facet_grid(harm_type ~ ., scales = "free") +
        scale_colour_discrete(name = "Event Type") +
        ylab("Total fatalities / injuries") +
        ggtitle("Most harmful weather event types with respect to population health")

Events that have the greatest economic consequences - data processing

PROPDMGEXP and CROPDMGEXP variables contain the following values:

# List the distinct exponent codes recorded for property damage
unique(stormdata$PROPDMGEXP)
##  [1]   B K M m + 0 5 6 ? 4 2 3 h 7 H - 1 8
## Levels:  - ? + 0 1 2 3 4 5 6 7 8 B h H K m M
# List the distinct exponent codes recorded for crop damage
unique(stormdata$CROPDMGEXP)
## [1]   M K m B ? 0 k 2
## Levels:  ? 0 2 B k K m M

We will assume that those values are multipliers for PROPDMG and CROPDMG values and will perform the following conversions:
“0” -> 0 (left unchanged)
“1..8” -> 10
“-” -> 0
“” -> 1
“?” -> 0
“+” -> 1
“h” or “H” -> 100
“k” or “K” -> 1000
“m” or “M” -> 1000000
“b” or “B” -> 1000000000

# Translate damage-exponent codes into multiplier strings.
#
# exp_code: vector of exponent codes (factor or character).
# Returns a character vector of the same length in which every recognised
# code is replaced by its multiplier:
#   "" and "+" -> "1";  "-" and "?" -> "0";  "1".."8" -> "10";
#   h/H -> "100";  k/K -> "1000";  m/M -> "1000000";  b/B -> "1000000000".
# Codes are matched as whole values, so a code is never partially rewritten.
# Any other value (including "0" and NA) is passed through unchanged.
exp_to_multiplier <- function(exp_code) {
        exp_code <- as.character(exp_code)
        codes <- c("", "+", "-", "?",
                   "1", "2", "3", "4", "5", "6", "7", "8",
                   "h", "H", "k", "K", "m", "M", "b", "B")
        multipliers <- c("1", "1", "0", "0",
                         rep("10", 8),
                         "100", "100", "1000", "1000",
                         "1000000", "1000000", "1000000000", "1000000000")
        idx <- match(exp_code, codes)
        known <- !is.na(idx)
        exp_code[known] <- multipliers[idx[known]]
        exp_code
}

# Replace the exponent codes of both damage columns with their multipliers
stormdata$PROPDMGEXP <- exp_to_multiplier(stormdata$PROPDMGEXP)
stormdata$CROPDMGEXP <- exp_to_multiplier(stormdata$CROPDMGEXP)

# Scale the recorded damage figures by their multipliers; PROPDMG and CROPDMG
# hold the full damage amounts from here on.
stormdata <- mutate(stormdata,
                    PROPDMG = as.numeric(PROPDMG) * as.numeric(PROPDMGEXP),
                    CROPDMG = as.numeric(CROPDMG) * as.numeric(CROPDMGEXP))

# Total property damage and total crop damage per event type
economic_damage <- summarize(
        group_by(stormdata, EVTYPE),
        prop_dmg_total = sum(PROPDMG),
        crop_dmg_total = sum(CROPDMG))

# First five rows after sorting by each total in descending order, keeping
# the event type column together with the matching total column.
top5_prop_dmg <- economic_damage %>% arrange(desc(prop_dmg_total))
top5_prop_dmg <- top5_prop_dmg[1:5, c(1, 2)]
top5_crop_dmg <- economic_damage %>% arrange(desc(crop_dmg_total))
top5_crop_dmg <- top5_crop_dmg[1:5, c(1, 3)]

Events that have the greatest economic consequences - results

Top 5 event types that caused most property damage:

# Display the five event types with the largest property damage totals
top5_prop_dmg
## Source: local data frame [5 x 2]
## 
##              EVTYPE prop_dmg_total
## 1             FLOOD   144657709807
## 2 HURRICANE/TYPHOON    69305840000
## 3       STORM SURGE    43323536000
## 4           TORNADO    26338962995
## 5       FLASH FLOOD    16140812603

Top 5 event types that caused most crop damage:

# Display the five event types with the largest crop damage totals
top5_crop_dmg
## Source: local data frame [5 x 2]
## 
##        EVTYPE crop_dmg_total
## 1     DROUGHT    13972566000
## 2       FLOOD     5661968450
## 3 RIVER FLOOD     5029459000
## 4   ICE STORM     5022113500
## 5        HAIL     3025954453

Therefore, event types that caused greatest economic damage:

# Merge both top-5 lists, keeping each event type once in first-seen order
unique(c(as.character(top5_prop_dmg$EVTYPE),
         as.character(top5_crop_dmg$EVTYPE)))
## [1] "FLOOD"             "HURRICANE/TYPHOON" "STORM SURGE"      
## [4] "TORNADO"           "FLASH FLOOD"       "DROUGHT"          
## [7] "RIVER FLOOD"       "ICE STORM"         "HAIL"

Let’s see how the economic damage caused by the most harmful event types changed over the years:

# Records belonging to the top-5 event types for property and crop damage.
# as.character() is used instead of droplevels() so the filter works whether
# EVTYPE was read as a factor or as a character vector (droplevels() has no
# method for character input).
total_propdmg_df <- subset(stormdata,
                           EVTYPE %in% as.character(top5_prop_dmg$EVTYPE))
total_cropdmg_df <- subset(stormdata,
                           EVTYPE %in% as.character(top5_crop_dmg$EVTYPE))

# Yearly totals per event type. Both tables share the columns
# year / EVTYPE / harm_total / harm_type so they can be stacked below.
# The year column is named in group_by() rather than renamed by position.
total_propdmg_df <- total_propdmg_df %>%
        group_by(year = year(BGN_DATE), EVTYPE) %>%
        summarize(harm_total = sum(PROPDMG)) %>%
        ungroup() %>%
        mutate(harm_type = "property damage")

total_cropdmg_df <- total_cropdmg_df %>%
        group_by(year = year(BGN_DATE), EVTYPE) %>%
        summarize(harm_total = sum(CROPDMG)) %>%
        ungroup() %>%
        mutate(harm_type = "crop damage")

harm_totals_df <- bind_rows(total_propdmg_df, total_cropdmg_df)

# One panel per damage type, each with its own y scale. The y axis shows
# summed damage amounts, not a count of events.
ggplot(data = harm_totals_df,
       aes(x = year, y = harm_total, color = EVTYPE)) +
        geom_line(size = 2, alpha = 0.7) +
        facet_grid(harm_type ~ ., scales = "free") +
        scale_colour_discrete(name = "Event Type") +
        ylab("Total damage") +
        ggtitle("Weather event types with greatest economic consequences")