Synopsis

This project explores the National Oceanic and Atmospheric Administration's (NOAA) storm database. It is an exploratory data analysis that identifies which types of events are most harmful to population health and to the economy.

We analyze and address the following questions:

  1. Across the United States, which types of events are most harmful with respect to population health?
  2. Across the United States, which types of events have the greatest economic consequences?

We find that tornadoes are the most harmful events for population health, as indicated by the large number of injuries and fatalities they cause. Floods, however, are responsible for the greatest economic losses.

Data Processing

We first download the data and inspect its structure.

# Download the NOAA storm data archive (once) and load it into `stormData`.
# A single path is built here and reused for the directory check, the download
# and the read, so all three always refer to the same location: a
# "Courseradata" folder under the current working directory.
dataDir <- "Courseradata"
dataFile <- file.path(dataDir, "StormData.csv.bz2")
fileUrl <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
if (!dir.exists(dataDir)) {
    dir.create(dataDir, recursive = TRUE)
}
# Skip the download when the archive is already on disk.
if (!file.exists(dataFile)) {
    # mode = "wb" writes the bzip2 archive as binary so it is not corrupted.
    status <- download.file(fileUrl, destfile = dataFile, mode = "wb")
    if (status != 0) {
        # Remove any partial file so the next run retries the download.
        unlink(dataFile)
        stop("Download of ", fileUrl, " failed with status ", status, call. = FALSE)
    }
}
# bzfile() opens the compressed archive as a connection for read.csv().
stormData <- read.csv(bzfile(dataFile))
# str(stormData)

We then compute the total number of fatalities and injuries for each event type and sort the totals in descending order:

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Total fatalities per event type, largest total first.
fatalities <- stormData %>%
    group_by(EVTYPE) %>%
    summarize(sum = sum(FATALITIES)) %>%
    arrange(desc(sum))
head(fatalities)
## # A tibble: 6 x 2
##           EVTYPE   sum
##           <fctr> <dbl>
## 1        TORNADO  5633
## 2 EXCESSIVE HEAT  1903
## 3    FLASH FLOOD   978
## 4           HEAT   937
## 5      LIGHTNING   816
## 6      TSTM WIND   504
# Total injuries per event type, largest total first.
injuries <- stormData %>%
    group_by(EVTYPE) %>%
    summarize(sum = sum(INJURIES)) %>%
    arrange(desc(sum))
head(injuries)
## # A tibble: 6 x 2
##           EVTYPE   sum
##           <fctr> <dbl>
## 1        TORNADO 91346
## 2      TSTM WIND  6957
## 3          FLOOD  6789
## 4 EXCESSIVE HEAT  6525
## 5      LIGHTNING  5230
## 6           HEAT  2100

We see the unique values for PROPDMGEXP and CROPDMGEXP:

# Inspect the raw exponent codes attached to the property and crop damage.
unique(stormData[["PROPDMGEXP"]])
##  [1] K M   B m + 0 5 6 ? 4 2 3 h 7 H - 1 8
## Levels:  - ? + 0 1 2 3 4 5 6 7 8 B h H K m M
unique(stormData[["CROPDMGEXP"]])
## [1]   M K m B ? 0 k 2
## Levels:  ? 0 2 B k K m M
# Upper-case both columns so that e.g. "k" and "K" become one code.
# toupper() returns character vectors, so the columns are character afterwards.
expCols <- c("PROPDMGEXP", "CROPDMGEXP")
stormData[expCols] <- lapply(stormData[expCols], toupper)
unique(stormData[["PROPDMGEXP"]])
##  [1] "K" "M" ""  "B" "+" "0" "5" "6" "?" "4" "2" "3" "H" "7" "-" "1" "8"
unique(stormData[["CROPDMGEXP"]])
## [1] ""  "M" "K" "B" "?" "0" "2"

PROPDMGEXP and CROPDMGEXP are the exponent values for PROPDMG (Property Damage) and CROPDMG (Crop Damage), respectively. They contain both numeric and character values; the character values are defined as follows: H = hundreds (10^2), K = thousands (10^3), M = millions (10^6), B = billions (10^9).

We convert all of these codes to numeric exponents as follows (blank and symbol codes are set to 0):

# Recode every exponent code as the digit string of its power of ten.
# Blank and symbol codes are set to "0".
propBlank <- stormData$PROPDMGEXP %in% c("", "+", "-", "?")
stormData[propBlank, "PROPDMGEXP"] <- "0"
cropBlank <- stormData$CROPDMGEXP %in% c("", "?")
stormData[cropBlank, "CROPDMGEXP"] <- "0"

# Letter codes: names are the codes found in the data, values the exponents.
propCodes <- c(K = "3", M = "6", B = "9", H = "2")
for (code in names(propCodes)) {
    stormData[stormData$PROPDMGEXP == code, "PROPDMGEXP"] <- propCodes[[code]]
}
cropCodes <- c(M = "6", K = "3", B = "9")
for (code in names(cropCodes)) {
    stormData[stormData$CROPDMGEXP == code, "CROPDMGEXP"] <- cropCodes[[code]]
}

Check:

# Confirm the recoding: only digit strings should remain in either column.
unique(stormData$PROPDMGEXP)
##  [1] "3" "6" "0" "9" "5" "4" "2" "7" "1" "8"
unique(stormData$CROPDMGEXP)
## [1] "0" "6" "3" "9" "2"

We calculate the total damage for each record by adding its property damage and crop damage. We then sum the total damage for each event type and sort the results in descending order as follows:

library(dplyr)
# Convert the digit-string exponents to integers, then scale each damage
# amount by ten raised to its exponent and add property and crop damage.
stormData$PROPDMGEXP <- as.integer(stormData$PROPDMGEXP)
stormData$CROPDMGEXP <- as.integer(stormData$CROPDMGEXP)
totalDamage <- mutate(
    stormData,
    PROPDMGEXP2 = 10^PROPDMGEXP,
    CROPDMGEXP2 = 10^CROPDMGEXP,
    PROPDMG2 = PROPDMG * PROPDMGEXP2,
    CROPDMG2 = CROPDMG * CROPDMGEXP2,
    TOTALDMG = PROPDMG2 + CROPDMG2
)

# Total (property + crop) damage per event type, largest total first.
totalDamage2 <- totalDamage %>%
    group_by(EVTYPE) %>%
    summarize(sum = sum(TOTALDMG)) %>%
    arrange(desc(sum))
totalDamage2
## # A tibble: 985 x 2
##               EVTYPE          sum
##               <fctr>        <dbl>
##  1             FLOOD 150319678257
##  2 HURRICANE/TYPHOON  71913712800
##  3           TORNADO  57362333946
##  4       STORM SURGE  43323541000
##  5              HAIL  18761221986
##  6       FLASH FLOOD  18243991078
##  7           DROUGHT  15018672000
##  8         HURRICANE  14610229010
##  9       RIVER FLOOD  10148404500
## 10         ICE STORM   8967041360
## # ... with 975 more rows

Results

Across the United States, which types of events are most harmful with respect to population health?

library(ggplot2)
# Horizontal bar chart of the eight event types with the most fatalities.
# `fatalities` is already sorted in descending order, so head() takes the top
# eight (and, unlike [1:8, ], adds no NA rows if fewer than eight exist);
# reorder() then orders the bars by their totals.
# NOTE(review): alpha = 0.1 sits inside aes(), so it is mapped through the
# alpha scale rather than applied as a literal transparency; confirm intent.
# guides(<aesthetic> = FALSE) is deprecated in current ggplot2; "none" is the
# supported way to drop the fill and alpha legends.
ggplot(head(fatalities, 8), aes(x = reorder(EVTYPE, sum), y = sum, fill = EVTYPE, alpha = 0.1)) +
    geom_bar(stat = "identity", col = "black") +
    labs(
        x = "Event Type",
        y = "Number of Fatalities",
        title = "Top 8 Events with Highest Total Fatalities"
    ) +
    coord_flip() +
    guides(fill = "none", alpha = "none")

# Horizontal bar chart of the eight event types with the most injuries.
# `injuries` is already sorted in descending order, so head() takes the top
# eight (and, unlike [1:8, ], adds no NA rows if fewer than eight exist);
# reorder() then orders the bars by their totals.
# NOTE(review): alpha = 0.1 sits inside aes(), so it is mapped through the
# alpha scale rather than applied as a literal transparency; confirm intent.
# guides(<aesthetic> = FALSE) is deprecated in current ggplot2; "none" is the
# supported way to drop the fill and alpha legends.
ggplot(head(injuries, 8), aes(x = reorder(EVTYPE, sum), y = sum, fill = EVTYPE, alpha = 0.1)) +
    geom_bar(stat = "identity", col = "black") +
    labs(
        x = "Event Type",
        y = "Number of Injuries",
        title = "Top 8 Events with Highest Total Injuries"
    ) +
    coord_flip() +
    guides(fill = "none", alpha = "none")

Across the United States, which types of events have the greatest economic consequences?

library(ggplot2)
# Horizontal bar chart of the eight event types with the largest total damage.
# `totalDamage2` is already sorted in descending order, so head() takes the top
# eight (and, unlike [1:8, ], adds no NA rows if fewer than eight exist);
# reorder() then orders the bars by their totals.
# NOTE(review): alpha = 0.1 sits inside aes(), so it is mapped through the
# alpha scale rather than applied as a literal transparency; confirm intent.
# guides(<aesthetic> = FALSE) is deprecated in current ggplot2; "none" is the
# supported way to drop the fill and alpha legends.
ggplot(head(totalDamage2, 8), aes(x = reorder(EVTYPE, sum), y = sum, fill = EVTYPE, alpha = 0.1)) +
    geom_bar(stat = "identity", col = "black") +
    labs(
        x = "Event Type",
        y = "Total Damages in USD",
        title = "Top 8 Events with Highest Total Economic Impact"
    ) +
    coord_flip() +
    guides(fill = "none", alpha = "none")