Synopsis

This project identifies the weather events in the U.S. that are most harmful with respect to population health and that have the greatest economic consequences. The analysis uses the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database, which tracks major storms and weather events in the U.S., including estimates of fatalities, injuries, and property damage. The analysis concludes that tornadoes are the most harmful event with respect to U.S. population health, and that floods have the greatest economic consequences (as measured by property and crop damage).

Data Processing

First, ensure your directories are set up properly. Then load the following libraries:

library(tidyverse)
library(dplyr)
library(ggplot2)

Next, read the data into R.

# Load the raw storm data. header = TRUE and sep = "," are read.csv()'s
# defaults, so they are left implicit.
data <- read.csv(file = "repdata_data_StormData.csv.bz2")

Take a look at the dataset.

# Print the column names, types and first few values of the raw data.
data |> str()

Remove the variables that are not required.

# Keep only the event type, the two casualty counts, and the damage amounts
# together with their exponent codes.
data <- select(
    data,
    EVTYPE,
    FATALITIES, INJURIES,
    PROPDMG, PROPDMGEXP,
    CROPDMG, CROPDMGEXP
)

There appear to be some strange characters in PROPDMGEXP and CROPDMGEXP. These variables hold the exponent (magnitude) codes for PROPDMG and CROPDMG. Clean up these variables and apply the exponents to the damage values.

First, list the unique characters.

# Distinct exponent codes used for property damage, then for crop damage.
unique(data[["PROPDMGEXP"]])
##  [1] "K" "M" ""  "B" "m" "+" "0" "5" "6" "?" "4" "2" "3" "h" "7" "H" "-" "1" "8"
unique(data[["CROPDMGEXP"]])
## [1] ""  "M" "K" "m" "B" "?" "0" "k" "2"

Next, examine their associated values.

# Property damage amounts recorded under each ambiguous exponent code.
data |> filter(PROPDMGEXP == "") |> pull(PROPDMG)
data |> filter(PROPDMGEXP == "+") |> pull(PROPDMG)
data |> filter(PROPDMGEXP == "-") |> pull(PROPDMG)
data |> filter(PROPDMGEXP == "0") |> pull(PROPDMG)
data |> filter(PROPDMGEXP == "?") |> pull(PROPDMG)

# Crop damage amounts recorded under each ambiguous exponent code.
data |> filter(CROPDMGEXP == "") |> pull(CROPDMG)
data |> filter(CROPDMGEXP == "?") |> pull(CROPDMG)
data |> filter(CROPDMGEXP == "0") |> pull(CROPDMG)

Now, convert the exponent value.

# Convert PROPDMG to dollars by applying its exponent code:
#   letters h/H, k/K, m/M, B -> hundreds, thousands, millions, billions
#   digits 1-8               -> that power of ten
#   "", "+", "-", "?", "0" and anything unlisted -> 0 (via .default)
# NOTE(review): "", "?" and "0" are zeroed here, whereas the CROPDMG conversion
# keeps the raw amount for those same codes - confirm which treatment is
# intended before relying on small-value totals.
data <- mutate(
    data,
    PROPDMG = case_when(
        PROPDMGEXP == "1"                ~ PROPDMG * 1e1,
        PROPDMGEXP %in% c("2", "h", "H") ~ PROPDMG * 1e2,
        PROPDMGEXP %in% c("3", "K")      ~ PROPDMG * 1e3,
        PROPDMGEXP == "4"                ~ PROPDMG * 1e4,
        PROPDMGEXP == "5"                ~ PROPDMG * 1e5,
        PROPDMGEXP %in% c("6", "m", "M") ~ PROPDMG * 1e6,
        PROPDMGEXP == "7"                ~ PROPDMG * 1e7,
        PROPDMGEXP == "8"                ~ PROPDMG * 1e8,
        PROPDMGEXP == "B"                ~ PROPDMG * 1e9,
        .default = 0
    )
)

# Convert CROPDMG to dollars by applying its exponent code:
#   "", "?", "0" -> amount kept as recorded
#   "2"          -> hundreds
#   k/K, m/M, B  -> thousands, millions, billions
# Any code not listed falls through to .default = 0 rather than NA, so a single
# unexpected code cannot turn an event type's summed damage into NA later on.
# This matches the fallback used for the PROPDMG conversion.
data <- data |>
    mutate(CROPDMG = case_when(
        CROPDMGEXP %in% c("", "?", "0") ~ CROPDMG,
        CROPDMGEXP == "2"               ~ CROPDMG * 100,
        CROPDMGEXP %in% c("K", "k")     ~ CROPDMG * 1000,
        CROPDMGEXP %in% c("M", "m")     ~ CROPDMG * 1000000,
        CROPDMGEXP == "B"               ~ CROPDMG * 1000000000,
        .default = 0
    ))

# The exponent codes have been applied, so drop the two code columns.
data <- select(data, !c(PROPDMGEXP, CROPDMGEXP))

Finally, subset the data into two new dataframes: health and economic, and sort each one to see the weather event that is most harmful with respect to the human population and most costly in terms of property and crop damage.

# Total casualties per event type, most harmful first.
# HEALTHIMP is fatalities plus injuries, summed over every record of the event.
health_data <- data |>
    group_by(EVTYPE) |>
    summarise(
        HEALTHIMP  = sum(FATALITIES + INJURIES),
        FATALITIES = sum(FATALITIES),
        INJURIES   = sum(INJURIES)
    ) |>
    arrange(desc(HEALTHIMP))

# Total damage per event type, most costly first.
# ECONOMICIMP is property plus crop damage, summed over every record of the event.
economic_data <- data |>
    group_by(EVTYPE) |>
    summarise(
        ECONOMICIMP = sum(PROPDMG + CROPDMG),
        PROPDMG     = sum(PROPDMG),
        CROPDMG     = sum(CROPDMG)
    ) |>
    arrange(desc(ECONOMICIMP))

There are now two neat dataframes to explore the questions of our research.

Results

The weather events most harmful to the US population and most costly in terms of damage are:

# Both tables are sorted in descending order, so the first row is the top event.
health_data |> head(n = 1)
## # A tibble: 1 × 4
##   EVTYPE  HEALTHIMP FATALITIES INJURIES
##   <chr>       <dbl>      <dbl>    <dbl>
## 1 TORNADO     96979       5633    91346
economic_data |> head(n = 1)
## # A tibble: 1 × 4
##   EVTYPE  ECONOMICIMP      PROPDMG    CROPDMG
##   <chr>         <dbl>        <dbl>      <dbl>
## 1 FLOOD  150319678250 144657709800 5661968450

To show the results in a figure, subset the data into “top 10”, since there are too many weather events for a single chart. Then pivot the data into long format for ease of analysis. Finally, create a bar chart of the top 10!

# Keep the 10 event types with the highest combined casualties and reshape to
# one row per (event, measure). slice_max() replaces the superseded top_n()
# and, like it, keeps ties.
health_top10 <- health_data |>
    slice_max(HEALTHIMP, n = 10) |>
    pivot_longer(
        cols = HEALTHIMP:INJURIES,
        names_to = "HEALTH",
        values_to = "PERSONS"
    )

# Dodged bar chart: events ordered by casualties (highest first), one bar per
# measure. The measure order, legend labels and colours are tied to the column
# names explicitly, so a label can never be attached to the wrong series if the
# data-driven ordering of the measures changes.
health_plot <- health_top10 |>
    ggplot(aes(
        x    = reorder(EVTYPE, -PERSONS),
        y    = PERSONS,
        fill = factor(HEALTH, levels = c("HEALTHIMP", "INJURIES", "FATALITIES"))
    )) +
    geom_col(position = "dodge") +
    labs(
        title = "US Health Impact of Top 10 Weather Events",
        x     = "Weather Event",
        y     = "Persons",
        fill  = "Health Measure"
    ) +
    scale_fill_manual(
        breaks = c("HEALTHIMP", "INJURIES", "FATALITIES"),
        labels = c("Total", "Injuries", "Fatalities"),
        values = c(HEALTHIMP = "violet", INJURIES = "yellow", FATALITIES = "orange")
    ) +
    # vjust = 0.5 centres each rotated label on its tick mark.
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

health_plot

Tornado is by far the most harmful weather event in terms of injuries and fatalities.

# Keep the 10 event types with the highest combined damage, reshape to one row
# per (event, measure), and express the amounts in billions of dollars.
# slice_max() replaces the superseded top_n() and, like it, keeps ties.
economic_top10 <- economic_data |>
    slice_max(ECONOMICIMP, n = 10) |>
    pivot_longer(
        cols = ECONOMICIMP:CROPDMG,
        names_to = "ECONOMIC",
        values_to = "VALUE"
    ) |>
    mutate(VALUE = VALUE / 1000000000)

# Dodged bar chart: events ordered by damage (highest first), one bar per
# measure. The measure order, legend labels and colours are tied to the column
# names explicitly, so a label can never be attached to the wrong series (for
# example if crop damage outweighed property damage in the selected events).
economic_plot <- economic_top10 |>
    ggplot(aes(
        x    = reorder(EVTYPE, -VALUE),
        y    = VALUE,
        fill = factor(ECONOMIC, levels = c("ECONOMICIMP", "PROPDMG", "CROPDMG"))
    )) +
    geom_col(position = "dodge") +
    labs(
        title = "US Economic Impact of Top 10 Weather Events",
        x     = "Weather Event",
        y     = "Dollars (Billions, USD)",
        fill  = "Economic Damage"
    ) +
    scale_fill_manual(
        breaks = c("ECONOMICIMP", "PROPDMG", "CROPDMG"),
        labels = c("Total", "Property Damage", "Crop Damage"),
        values = c(ECONOMICIMP = "violet", PROPDMG = "yellow", CROPDMG = "orange")
    ) +
    # vjust = 0.5 centres each rotated label on its tick mark.
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

economic_plot

Flooding is by far the most costly weather event in the US.