Synopsis

This project explores the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database. The data were downloaded directly from the source link using R, analyzed with the libraries “data.table”, “dplyr”, “tidyr”, and “mgsub”, and plotted with ggplot2. The plots show that tornadoes caused the most harm to population health, and floods caused the greatest economic loss in the US from 1950 to November 2011.

Data Processing

  1. Load libraries
knitr::opts_chunk$set(echo = TRUE)
library(data.table)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
  1. Download file and read file, check dim and head
# Fetch the compressed NOAA storm data once; later runs reuse the local copy.
destfile <- "storm.csv.bz2"
if (!file.exists(destfile)) {
  # No setInternet2() call: it was Windows-only and is defunct in current R.
  # No load() call either: load() restores save() images and fails on a
  # bzip2-compressed CSV. The archive is parsed separately with fread().
  # The default download method handles https without an external curl
  # binary; mode = "wb" keeps the archive byte-exact on Windows.
  download.file(
    "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2",
    destfile,
    mode = "wb"
  )
}


# Parse the compressed CSV into a data.table, then inspect its dimensions and
# its first two records.
storm <- fread(file = destfile)
dim(storm)
## [1] 902297     37
head(storm, n = 2)
##    STATE__          BGN_DATE BGN_TIME TIME_ZONE COUNTY COUNTYNAME STATE  EVTYPE
## 1:       1 4/18/1950 0:00:00     0130       CST     97     MOBILE    AL TORNADO
## 2:       1 4/18/1950 0:00:00     0145       CST      3    BALDWIN    AL TORNADO
##    BGN_RANGE BGN_AZI BGN_LOCATI END_DATE END_TIME COUNTY_END COUNTYENDN
## 1:         0                                               0         NA
## 2:         0                                               0         NA
##    END_RANGE END_AZI END_LOCATI LENGTH WIDTH F MAG FATALITIES INJURIES PROPDMG
## 1:         0                        14   100 3   0          0       15    25.0
## 2:         0                         2   150 2   0          0        0     2.5
##    PROPDMGEXP CROPDMG CROPDMGEXP WFO STATEOFFIC ZONENAMES LATITUDE LONGITUDE
## 1:          K       0                                         3040      8812
## 2:          K       0                                         3042      8755
##    LATITUDE_E LONGITUDE_ REMARKS REFNUM
## 1:       3051       8806              1
## 2:          0          0              2

Results

Analyze events that caused most population health damage

  1. Subset Health Damage data, and check NA values
# Keep only the event type and the two casualty counts, then confirm the
# table size and that neither count column contains missing values.
health_damage <- select(storm, EVTYPE, FATALITIES, INJURIES)
dim(health_damage)
## [1] 902297      3
sum(is.na(health_damage[["FATALITIES"]]))
## [1] 0
sum(is.na(health_damage[["INJURIES"]]))
## [1] 0
  1. Sum fatalities and injuries per event type, then sort in descending order to get the top 5 events
# Per event type: total fatalities, total injuries, and their combined count,
# ordered from the largest combined count to the smallest.
health_damage_sum <- health_damage %>%
  group_by(EVTYPE) %>%
  summarize(
    totalF = sum(FATALITIES),
    totalI = sum(INJURIES),
    total = sum(FATALITIES + INJURIES)
  ) %>%
  arrange(desc(total))
## `summarise()` ungrouping output (override with `.groups` argument)
dim(health_damage_sum)
## [1] 985   4
# The five event types with the highest combined casualty count.
health_damage_sum_top5 <- head(health_damage_sum, n = 5)
  1. Plot with ggplot bargraph
# Bar chart of the five event types with the most combined fatalities and
# injuries, bars ordered from the highest total to the lowest.
ggplot(
  data = health_damage_sum_top5,
  # Fill by event type: one colour per bar without building a crossed factor
  # of totals and event names.
  aes(x = reorder(EVTYPE, -total), y = total, fill = EVTYPE)
) +
  # geom_col() plots the supplied y values directly.
  geom_col() +
  labs(
    title = "Top 5 Severe Weather Events Harming Population Health in US",
    x = "Events",
    y = "No. of death and injuries"
  ) +
  # The x axis already names each event, so the fill legend is hidden; the
  # title is centred.
  theme(legend.position = "none", plot.title = element_text(hjust = 0.5))

Analyze events that caused the most economic damage

  1. Subset Economic Damage Data
# Pull out the event type plus the property and crop damage amounts with
# their exponent codes, then check each damage column for missing values.
economic_damage <- select(
  storm,
  EVTYPE, PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP
)
sum(is.na(economic_damage[["PROPDMG"]]))
## [1] 0
sum(is.na(economic_damage[["PROPDMGEXP"]]))
## [1] 0
sum(is.na(economic_damage[["CROPDMG"]]))
## [1] 0
sum(is.na(economic_damage[["CROPDMGEXP"]]))
## [1] 0
dim(economic_damage)
## [1] 902297      5
head(economic_damage)
##     EVTYPE PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP
## 1: TORNADO    25.0          K       0           
## 2: TORNADO     2.5          K       0           
## 3: TORNADO    25.0          K       0           
## 4: TORNADO     2.5          K       0           
## 5: TORNADO     2.5          K       0           
## 6: TORNADO     2.5          K       0
  1. Convert damage value to numeric for calculation
library(mgsub)
# Translate the exponent codes K/M/B into numeric multipliers. The codes are
# upper-cased first because the substitution is case-sensitive: a lower-case
# code (e.g. "k") would otherwise be left untouched, fail the numeric
# conversion below, and end up as NA.
economic_damage$PROPDMGEXP <- mgsub(
  toupper(economic_damage$PROPDMGEXP),
  c("K", "M", "B"),
  c("1000", "1e+06", "1e+09")
)
economic_damage$CROPDMGEXP <- mgsub(
  toupper(economic_damage$CROPDMGEXP),
  c("K", "M", "B"),
  c("1000", "1e+06", "1e+09")
)
# Values that are still not parseable as numbers (blank codes included)
# become NA here, which is what triggers the coercion warnings.
economic_damage$PROPDMGEXP <- as.numeric(economic_damage$PROPDMGEXP)
## Warning: NAs introduced by coercion
economic_damage$CROPDMGEXP <- as.numeric(economic_damage$CROPDMGEXP)
## Warning: NAs introduced by coercion
str(economic_damage)
## Classes 'data.table' and 'data.frame':   902297 obs. of  5 variables:
##  $ EVTYPE    : chr  "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
##  $ PROPDMG   : num  25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
##  $ PROPDMGEXP: num  1000 1000 1000 1000 1000 1000 1000 1000 1000 1000 ...
##  $ CROPDMG   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CROPDMGEXP: num  NA NA NA NA NA NA NA NA NA NA ...
##  - attr(*, ".internal.selfref")=<externalptr>
  1. Calculate economic damage value
library(tidyr)
# Replace NA multipliers with 0 and compute the combined damage value.
# The fill is limited to the two multiplier columns: PROPDMG and CROPDMG were
# checked to contain no NA, and applying replace_na(., 0) to every column
# would also hit the character EVTYPE column, which current tidyr rejects
# because 0 cannot be cast to character.
# NOTE(review): an NA multiplier (e.g. a blank exponent code) zeroes that
# record's damage; confirm this is the intended treatment.
economic_damage <- economic_damage %>%
  mutate(across(c(PROPDMGEXP, CROPDMGEXP), ~ replace_na(.x, 0))) %>%
  # Amount times multiplier, property plus crop.
  mutate(Damage_value = PROPDMG * PROPDMGEXP + CROPDMG * CROPDMGEXP)
head(economic_damage)
##     EVTYPE PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP Damage_value
## 1: TORNADO    25.0       1000       0          0        25000
## 2: TORNADO     2.5       1000       0          0         2500
## 3: TORNADO    25.0       1000       0          0        25000
## 4: TORNADO     2.5       1000       0          0         2500
## 5: TORNADO     2.5       1000       0          0         2500
## 6: TORNADO     2.5       1000       0          0         2500
# Sum the damage value per event type and keep the five largest totals, then
# inspect the structure and contents of the result.
economic_damage_top5 <- economic_damage %>%
  group_by(EVTYPE) %>%
  summarise(E_D_value = sum(Damage_value)) %>%
  arrange(desc(E_D_value)) %>%
  head(n = 5)
## `summarise()` ungrouping output (override with `.groups` argument)
str(economic_damage_top5)
## tibble [5 x 2] (S3: tbl_df/tbl/data.frame)
##  $ EVTYPE   : chr [1:5] "FLOOD" "HURRICANE/TYPHOON" "TORNADO" "STORM SURGE" ...
##  $ E_D_value: num [1:5] 1.50e+11 7.19e+10 5.73e+10 4.33e+10 1.88e+10
head(economic_damage_top5)
## # A tibble: 5 x 2
##   EVTYPE               E_D_value
##   <chr>                    <dbl>
## 1 FLOOD             150319678250
## 2 HURRICANE/TYPHOON  71913712800
## 3 TORNADO            57340614101
## 4 STORM SURGE        43323541000
## 5 HAIL               18752904320
  1. plot with ggplot bargraph
# Bar chart of the five event types with the largest summed damage value,
# bars ordered from the highest total to the lowest.
ggplot(
  data = economic_damage_top5,
  # Fill by event type: one colour per bar without building a crossed factor
  # of totals and event names.
  aes(x = reorder(EVTYPE, -E_D_value), y = E_D_value, fill = EVTYPE)
) +
  # geom_col() plots the supplied y values directly.
  geom_col() +
  labs(
    title = "Top 5 Severe Weather Events Harming Economics in US",
    x = "Events",
    y = "Economic Damage Value"
  ) +
  # The x axis already names each event, so the fill legend is hidden; the
  # title is centred.
  theme(legend.position = "none", plot.title = element_text(hjust = 0.5))