Synopsis

This assignment explores the NOAA Storm Database and answers some basic questions about severe weather events and their impacts. I will use the database to answer the questions listed below, combining analysis with visualization of the data.

Data Processing and loading libraries

library(dplyr)
library(lubridate)
library(ggplot2)
# Fetch the compressed storm data set only when no local copy exists, so
# repeated runs of the report reuse the already-downloaded file.
if (!file.exists("StormData.csv.bz2")) {
  fileUrl <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"

  # Use R's default download method instead of forcing method = "curl", which
  # fails on machines without an external `curl` binary. mode = "wb" makes
  # sure the bzip2 archive is written byte-for-byte (binary) on every platform.
  download.file(fileUrl, destfile = "StormData.csv.bz2", mode = "wb")

  # Exit if the file is not available
  if (!file.exists("StormData.csv.bz2")) {
    stop("Can't locate file 'StormData.csv.bz2'!")
  }
}

# Read the raw storm records straight from the bz2-compressed CSV file
stormDataRaw <- read.csv(file = "StormData.csv.bz2")

# Print a compact overview (column names, types, first values) of the raw data
str(object = stormDataRaw)
## 'data.frame':    902297 obs. of  37 variables:
##  $ STATE__   : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ BGN_DATE  : chr  "4/18/1950 0:00:00" "4/18/1950 0:00:00" "2/20/1951 0:00:00" "6/8/1951 0:00:00" ...
##  $ BGN_TIME  : chr  "0130" "0145" "1600" "0900" ...
##  $ TIME_ZONE : chr  "CST" "CST" "CST" "CST" ...
##  $ COUNTY    : num  97 3 57 89 43 77 9 123 125 57 ...
##  $ COUNTYNAME: chr  "MOBILE" "BALDWIN" "FAYETTE" "MADISON" ...
##  $ STATE     : chr  "AL" "AL" "AL" "AL" ...
##  $ EVTYPE    : chr  "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
##  $ BGN_RANGE : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ BGN_AZI   : chr  "" "" "" "" ...
##  $ BGN_LOCATI: chr  "" "" "" "" ...
##  $ END_DATE  : chr  "" "" "" "" ...
##  $ END_TIME  : chr  "" "" "" "" ...
##  $ COUNTY_END: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ COUNTYENDN: logi  NA NA NA NA NA NA ...
##  $ END_RANGE : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ END_AZI   : chr  "" "" "" "" ...
##  $ END_LOCATI: chr  "" "" "" "" ...
##  $ LENGTH    : num  14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
##  $ WIDTH     : num  100 150 123 100 150 177 33 33 100 100 ...
##  $ F         : int  3 2 2 2 2 2 2 1 3 3 ...
##  $ MAG       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ FATALITIES: num  0 0 0 0 0 0 0 0 1 0 ...
##  $ INJURIES  : num  15 0 2 2 2 6 1 0 14 0 ...
##  $ PROPDMG   : num  25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
##  $ PROPDMGEXP: chr  "K" "K" "K" "K" ...
##  $ CROPDMG   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CROPDMGEXP: chr  "" "" "" "" ...
##  $ WFO       : chr  "" "" "" "" ...
##  $ STATEOFFIC: chr  "" "" "" "" ...
##  $ ZONENAMES : chr  "" "" "" "" ...
##  $ LATITUDE  : num  3040 3042 3340 3458 3412 ...
##  $ LONGITUDE : num  8812 8755 8742 8626 8642 ...
##  $ LATITUDE_E: num  3051 0 0 0 0 ...
##  $ LONGITUDE_: num  8806 0 0 0 0 ...
##  $ REMARKS   : chr  "" "" "" "" ...
##  $ REFNUM    : num  1 2 3 4 5 6 7 8 9 10 ...
# Build the working data set in a single pipeline:
#   1. keep only the columns used by the analysis,
#   2. parse BGN_DATE into a Date and derive the calendar YEAR from it,
#   3. keep events from 1996 onwards that caused some health impact or
#      economic damage.
#
# Why 1996: the recorded event types differ by period
#   - Tornado 1950 - 1954
#   - Tornado, Thunderstorm Wind, Hail 1955 - 1995
#   - 48 Events since 1996
stormData <- stormDataRaw %>%
  select(
    BGN_DATE, EVTYPE,
    PROPDMG, PROPDMGEXP,
    CROPDMG, CROPDMGEXP,
    FATALITIES, INJURIES
  ) %>%
  mutate(
    BGN_DATE = as.Date(BGN_DATE, "%m/%d/%Y"),
    YEAR = year(BGN_DATE)
  ) %>%
  filter(
    YEAR >= 1996,
    PROPDMG > 0 | CROPDMG > 0 | FATALITIES > 0 | INJURIES > 0
  )

# Frequency of each property-damage exponent code in the remaining rows
table(stormData[["PROPDMGEXP"]])
## 
##             B      K      M 
##   8448     32 185474   7364
# Frequency of each crop-damage exponent code in the remaining rows
table(stormData[["CROPDMGEXP"]])
## 
##             B      K      M 
## 102767      2  96787   1762
# Normalise the exponent codes to upper case so "k"/"K", "m"/"M", ... agree
stormData$PROPDMGEXP <- toupper(stormData$PROPDMGEXP)
stormData$CROPDMGEXP <- toupper(stormData$CROPDMGEXP)

# Translate exponent codes into numeric multipliers via a lookup:
# `knownCodes[i]` maps to 10^`powers[i]`. Codes that are not listed yield NA.
# match() is used rather than name-based indexing because the empty string ""
# is a valid code here.
expCodeToFactor <- function(expCodes, knownCodes, powers) {
  10^(powers[match(expCodes, knownCodes)])
}

# Crop damage: blank, "?" and "0" mean "no scaling"; "2" is hundreds;
# K / M / B are thousands / millions / billions.
stormData$CROPDMGFACTOR <- expCodeToFactor(
  stormData$CROPDMGEXP,
  knownCodes = c("", "?", "0", "2", "K", "M", "B"),
  powers     = c( 0,   0,   0,   2,   3,   6,   9)
)

# Property damage: blank and the symbols "-", "?", "+" mean "no scaling";
# digits are taken as the power of ten; H / K / M / B are hundreds /
# thousands / millions / billions.
stormData$PROPDMGFACTOR <- expCodeToFactor(
  stormData$PROPDMGEXP,
  knownCodes = c("", "-", "?", "+",
                 "0", "1", "2", "3", "4", "5", "6", "7", "8",
                 "H", "K", "M", "B"),
  powers     = c( 0,   0,   0,   0,
                  0,   1,   2,   3,   4,   5,   6,   7,   8,
                  2,   3,   6,   9)
)
# Derive the two impact measures and normalise event names in one step:
#   HEALTHIMP    - fatalities plus injuries per event
#   ECONOMICCOST - property plus crop damage, each scaled by its multiplier
#   EVTYPE       - upper-cased so differently-cased spellings collapse
stormData <- stormData %>%
  mutate(
    HEALTHIMP = FATALITIES + INJURIES,
    ECONOMICCOST = PROPDMG * PROPDMGFACTOR + CROPDMG * CROPDMGFACTOR,
    EVTYPE = toupper(EVTYPE)
  )

# Rows of the frequency table = number of distinct event types
dim(as.data.frame(table(stormData$EVTYPE)))
## [1] 186   2
# List the distinct event type names that contain "THUND", to see which
# spellings of thunderstorm events are present
evtypeUnique <- unique(stormData$EVTYPE)
grep("THUND", evtypeUnique, value = TRUE)
## [1] "THUNDERSTORM"             "THUNDERSTORM WIND (G40)" 
## [3] "THUNDERSTORM WIND"        "MARINE THUNDERSTORM WIND"

Results

# Sum fatalities + injuries per event type, then show only the event types
# whose total lies above the 95th percentile of all totals.
healthImpact <- with(stormData, aggregate(HEALTHIMP ~ EVTYPE, FUN = sum))
# `probs` is written out in full: the abbreviated `prob` only works through
# partial argument matching.
subset(healthImpact, HEALTHIMP > quantile(HEALTHIMP, probs = 0.95))
##                EVTYPE HEALTHIMP
## 39     EXCESSIVE HEAT      8188
## 46        FLASH FLOOD      2561
## 48              FLOOD      7172
## 69               HEAT      1459
## 88  HURRICANE/TYPHOON      1339
## 107         LIGHTNING      4792
## 146 THUNDERSTORM WIND      1530
## 149           TORNADO     22178
## 153         TSTM WIND      3870
## 182      WINTER STORM      1483
# Merge alternative spellings seen in the health-impact table into one
# event type name each.
stormData$EVTYPE[(stormData$EVTYPE == "TSTM WIND")] <- "THUNDERSTORM WIND"
stormData$EVTYPE[(stormData$EVTYPE == "HURRICANE/TYPHOON")] <- "HURRICANE (TYPHOON)"

# Sum the economic cost per event type, then show only the event types whose
# total lies above the 95th percentile of all totals.
economicCost <- with(stormData, aggregate(ECONOMICCOST ~ EVTYPE, FUN = sum))
# `probs` is written out in full: the abbreviated `prob` only works through
# partial argument matching.
subset(economicCost, ECONOMICCOST > quantile(ECONOMICCOST, probs = 0.95))
##                  EVTYPE ECONOMICCOST
## 32              DROUGHT  14413667000
## 46          FLASH FLOOD  16557105610
## 48                FLOOD 148919611950
## 66                 HAIL  17071172870
## 86            HURRICANE  14554229010
## 87  HURRICANE (TYPHOON)  71913712800
## 141         STORM SURGE  43193541000
## 146   THUNDERSTORM WIND   8812957230
## 149             TORNADO  24900370720
## 152      TROPICAL STORM   8320186550
# Fold the remaining alternative names seen in the economic-cost table into
# a single event type name each
stormData$EVTYPE <- replace(
  stormData$EVTYPE,
  stormData$EVTYPE %in% "HURRICANE",
  "HURRICANE (TYPHOON)"
)
stormData$EVTYPE <- replace(
  stormData$EVTYPE,
  stormData$EVTYPE %in% "STORM SURGE",
  "STORM SURGE/TIDE"
)

Questions

# Total fatalities + injuries per event type, largest first
healthImpact <- stormData %>%
  group_by(EVTYPE) %>%
  summarise(HEALTHIMP = sum(HEALTHIMP)) %>%
  arrange(desc(HEALTHIMP))

# Bar chart of the ten most harmful event types.
# head() is used instead of `[1:10, ]` so that fewer than ten event types
# never produces out-of-range row indices.
g1 <- ggplot(
  head(healthImpact, 10),
  aes(x = reorder(EVTYPE, -HEALTHIMP), y = HEALTHIMP, color = EVTYPE)
) +
  geom_bar(stat = "identity", fill = "white") +
  xlab("Event") + ylab("Number of fatalities and injuries") +
  ggtitle("Fatalities and injuries in the US caused by severe weather events") +
  # Rotate the long event names and drop the legend, which would only repeat
  # the x-axis labels
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    legend.position = "none"
  )
g1

  1. As shown in the bar chart above, tornadoes are the weather events most harmful to population health.
# Total economic cost (property + crop damage) per event type, largest first
economicCost <- stormData %>%
  group_by(EVTYPE) %>%
  summarise(ECONOMICCOST = sum(ECONOMICCOST)) %>%
  arrange(desc(ECONOMICCOST))

# Bar chart of the ten costliest event types.
# head() is used instead of `[1:10, ]` so that fewer than ten event types
# never produces out-of-range row indices.
g1 <- ggplot(
  head(economicCost, 10),
  aes(x = reorder(EVTYPE, -ECONOMICCOST), y = ECONOMICCOST, color = EVTYPE)
) +
  geom_bar(stat = "identity", fill = "white") +
  xlab("Event") + ylab("Economic cost in USD") +
  ggtitle("Economic cost in the US caused by severe weather events") +
  # Rotate the long event names and drop the legend, which would only repeat
  # the x-axis labels
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    legend.position = "none"
  )
g1

  1. As shown in the bar chart above, floods have the greatest economic consequences.