1. Synopsis

The basic goal of this assignment is to explore the NOAA Storm Database and answer some basic questions about severe weather events. The events in the database start in the year 1950 and end in November 2011.

The data analysis will address the following questions across the United States:

  1. Which types of events are most harmful with respect to population health (fatalities and injuries)?
  2. Which types of events have the greatest economic consequences (property and crop damage)?

Details on how the variables are constructed and defined can be found in the National Weather Service Storm Data Documentation.

2. Data Processing

2a. Load Data

# Make the project directory the working directory.
# setwd() returns the *previous* working directory, so the active directory
# is read back with getwd() instead of being taken from setwd()'s return value.
setwd('/Users/huaig/Desktop/Nick/Coding/Coursera/Johns Hopkins University/5. Reproducible Research/Project/RepData_PeerAssessment2')
path <- getwd()
path
## [1] "C:/Users/huaig/Desktop/Nick/Coding/Coursera/Johns Hopkins University/5. Reproducible Research/Project/RepData_PeerAssessment2"
library(data.table)
library(ggplot2)

# Download data file
file_url <- 'https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2'

# Single source of truth for the archive location, used for both the
# download and the read so the two can never point at different places
data_file <- file.path(path, '2FStormData.csv.bz2')

# Skip the download when the archive is already present; binary mode writes
# the compressed file byte-for-byte
if (!file.exists(data_file)) {
  download.file(file_url, destfile = data_file, mode = 'wb')
}

# Extract data into dataframe (read.csv decompresses the .bz2 file itself)
storm_df <- read.csv(data_file)

# Convert dataframe into datatable
storm_dt <- as.data.table(storm_df)

2b. Examine Column Names

# List every column available in the raw dataset
names(storm_dt)
##  [1] "STATE__"    "BGN_DATE"   "BGN_TIME"   "TIME_ZONE"  "COUNTY"    
##  [6] "COUNTYNAME" "STATE"      "EVTYPE"     "BGN_RANGE"  "BGN_AZI"   
## [11] "BGN_LOCATI" "END_DATE"   "END_TIME"   "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE"  "END_AZI"    "END_LOCATI" "LENGTH"     "WIDTH"     
## [21] "F"          "MAG"        "FATALITIES" "INJURIES"   "PROPDMG"   
## [26] "PROPDMGEXP" "CROPDMG"    "CROPDMGEXP" "WFO"        "STATEOFFIC"
## [31] "ZONENAMES"  "LATITUDE"   "LONGITUDE"  "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS"    "REFNUM"

2c. Subset Data

# Columns needed for the health and economic analyses
cols_of_interest <- c('EVTYPE', 'FATALITIES', 'INJURIES', 'PROPDMG', 'PROPDMGEXP', 'CROPDMG', 'CROPDMGEXP')

# Every other column is redundant
redundant_cols <- setdiff(colnames(storm_dt), cols_of_interest)

# Drop the redundant columns by reference
storm_dt[, (redundant_cols) := NULL]

# Keep only rows with a known event type and at least one non-zero
# fatality, injury, property damage or crop damage figure
storm_dt <- storm_dt[
  EVTYPE != '?' & (FATALITIES > 0 | INJURIES > 0 | PROPDMG > 0 | CROPDMG > 0),
  cols_of_interest,
  with = FALSE
]

2d. Convert Exponent Columns into Numeric

# Change data in exponent columns into lowercase so that e.g. 'K' and 'k'
# map to the same multiplier
exp_cols <- c('PROPDMGEXP', 'CROPDMGEXP')
storm_dt[, (exp_cols) := lapply(.SD, tolower), .SDcols = exp_cols]

# Key mapping damage exponent codes to numeric multipliers:
#   digits 0-9 -> 10^digit; 'h' -> 10^2; 'k' -> 10^3; 'm' -> 10^6; 'b' -> 10^9;
#   the symbols '-', '+' and '?' -> 1 (no scaling).
# Blank or otherwise unlisted codes are handled by the fallback in
# exp_to_multiplier() below (an empty string cannot be matched by name).
propdmgexp_key <- c(
  '0' = 10^0, '1' = 10^1, '2' = 10^2, '3' = 10^3, '4' = 10^4,
  '5' = 10^5, '6' = 10^6, '7' = 10^7, '8' = 10^8, '9' = 10^9,
  '-' = 10^0, '+' = 10^0, '?' = 10^0,
  'h' = 10^2, 'k' = 10^3, 'm' = 10^6, 'b' = 10^9
)

# Crop damage uses the same codes, so it shares the same key; this keeps a
# given code (e.g. '2' or 'h') from scaling differently in the two columns
cropdmgexp_key <- propdmgexp_key

# Translate a vector of exponent codes into numeric multipliers.
#   codes: vector of exponent codes (coerced to character)
#   key:   named numeric vector mapping code -> multiplier
# Returns an unnamed numeric vector the same length as `codes`; any code not
# present in `key` (including blanks and NA) yields a multiplier of 1.
exp_to_multiplier <- function(codes, key) {
  multiplier <- unname(key[as.character(codes)])
  multiplier[is.na(multiplier)] <- 10^0
  multiplier
}

# Convert exponent columns into numeric
storm_dt[, PROPDMGEXP := exp_to_multiplier(PROPDMGEXP, propdmgexp_key)]
storm_dt[, CROPDMGEXP := exp_to_multiplier(CROPDMGEXP, cropdmgexp_key)]

2e. Create Economic Cost Columns

# Damage cost = recorded damage figure x its exponent multiplier.
# Both cost columns are added to storm_dt by reference.
storm_dt[, `:=`(
  prop_dmg_cost = PROPDMG * PROPDMGEXP,
  crop_dmg_cost = CROPDMG * CROPDMGEXP
)]

# Place each cost column directly after the exponent column it derives from
setcolorder(storm_dt, c('EVTYPE', 'FATALITIES', 'INJURIES', 'PROPDMG', 'PROPDMGEXP', 'prop_dmg_cost', 'CROPDMG', 'CROPDMGEXP', 'crop_dmg_cost'))

2f. Derive Total Number of Fatalities and Injuries

# Sum fatalities and injuries per event type, plus their combined total.
# Inside j, FATALITIES / INJURIES always refer to the storm_dt columns, so the
# combined total is computed from the raw columns, not the new aggregates.
total_fatalinjur_dt <- storm_dt[
  ,
  .(
    FATALITIES = sum(FATALITIES),
    INJURIES = sum(INJURIES),
    total_fatalities_injuries = sum(FATALITIES) + sum(INJURIES)
  ),
  by = .(EVTYPE)
]

# Rank event types from most to least harmful
total_fatalinjur_dt <- total_fatalinjur_dt[order(-total_fatalities_injuries)]

# Keep the top 10; head() returns at most the rows that exist, whereas
# indexing with 1:10 would pad the table with NA rows if fewer were available
total_fatalinjur_dt <- head(total_fatalinjur_dt, n = 10)
head(total_fatalinjur_dt)
##            EVTYPE FATALITIES INJURIES total_fatalities_injuries
## 1:        TORNADO       5633    91346                     96979
## 2: EXCESSIVE HEAT       1903     6525                      8428
## 3:      TSTM WIND        504     6957                      7461
## 4:          FLOOD        470     6789                      7259
## 5:      LIGHTNING        816     5230                      6046
## 6:           HEAT        937     2100                      3037

2g. Derive Total Cost of Property and Crop Damages

# Sum property and crop damage cost per event type, plus their combined total.
# Inside j, prop_dmg_cost / crop_dmg_cost always refer to the storm_dt columns,
# so the combined total is computed from the raw columns, not the new aggregates.
total_dmgcost_dt <- storm_dt[
  ,
  .(
    prop_dmg_cost = sum(prop_dmg_cost),
    crop_dmg_cost = sum(crop_dmg_cost),
    total_dmg_cost = sum(prop_dmg_cost) + sum(crop_dmg_cost)
  ),
  by = .(EVTYPE)
]

# Rank event types from most to least costly
total_dmgcost_dt <- total_dmgcost_dt[order(-total_dmg_cost)]

# Keep the top 10; head() returns at most the rows that exist, whereas
# indexing with 1:10 would pad the table with NA rows if fewer were available
total_dmgcost_dt <- head(total_dmgcost_dt, n = 10)
head(total_dmgcost_dt)
##               EVTYPE prop_dmg_cost crop_dmg_cost total_dmg_cost
## 1:             FLOOD  144657709807    5661968450   150319678257
## 2: HURRICANE/TYPHOON   69305840000    2607872800    71913712800
## 3:           TORNADO   56947380677     414953270    57362333947
## 4:       STORM SURGE   43323536000          5000    43323541000
## 5:              HAIL   15735267513    3025954473    18761221986
## 6:       FLASH FLOOD   16822673979    1421317100    18243991079

3. Results

3a. Types of Events Most Harmful to Population Health

# Reshape to long format: one row per (event type, measure) pair
harmful_event <- melt(
  total_fatalinjur_dt,
  id.vars = 'EVTYPE',
  variable.name = 'harmful_result'
)
head(harmful_event)
##            EVTYPE harmful_result value
## 1:        TORNADO     FATALITIES  5633
## 2: EXCESSIVE HEAT     FATALITIES  1903
## 3:      TSTM WIND     FATALITIES   504
## 4:          FLOOD     FATALITIES   470
## 5:      LIGHTNING     FATALITIES   816
## 6:           HEAT     FATALITIES   937
# Grouped bar chart of the event types most harmful to population health;
# bars are ordered by descending value and dodged by measure
pop_health_chart <- ggplot(harmful_event, aes(x = reorder(EVTYPE, -value), y = value)) +
  geom_col(aes(fill = harmful_result), position = 'dodge') +
  labs(
    x = 'Event type',
    y = 'Frequency',
    title = 'US Top 10 Harmful Events to Population Health'
  ) +
  theme(
    axis.text.x = element_text(angle = 37, hjust = 1),
    plot.title = element_text(hjust = .5)
  )
pop_health_chart

3b. Types of Events Having Greatest Economic Consequences

# Melt datatable for easy graphing: one row per (event type, damage type) pair
econ_damage <- melt(total_dmgcost_dt, id.vars = 'EVTYPE', variable.name = 'damage_type')
head(econ_damage)
##               EVTYPE   damage_type        value
## 1:             FLOOD prop_dmg_cost 144657709807
## 2: HURRICANE/TYPHOON prop_dmg_cost  69305840000
## 3:           TORNADO prop_dmg_cost  56947380677
## 4:       STORM SURGE prop_dmg_cost  43323536000
## 5:              HAIL prop_dmg_cost  15735267513
## 6:       FLASH FLOOD prop_dmg_cost  16822673979
# Create bar chart on types of events having greatest economic consequences.
# Costs are divided by 10^9, so the y axis is in billions of USD (not trillions).
econ_consequences_chart <- ggplot(econ_damage, aes(x = reorder(EVTYPE, -value), y = value / 10^9)) +
  geom_bar(stat = 'identity', aes(fill = damage_type), position = 'dodge') +
  labs(
    x = 'Event type',
    y = 'Damage cost (USD billion)',
    title = 'US Top 10 Harmful Events by Economic Consequences'
  ) +
  theme(
    axis.text.x = element_text(angle = 37, hjust = 1),
    plot.title = element_text(hjust = .5)
  )
econ_consequences_chart