The basic goal of this assignment is to explore the NOAA Storm Database and answer some basic questions about severe weather events. The events in the database start in the year 1950 and end in November 2011.
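The data analysis addresses the following questions across the United States: (1) which types of events are most harmful to population health, and (2) which types of events have the greatest economic consequences?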
The National Weather Service Storm Data Documentation describes how some of the variables are constructed and defined.
# Switch to the project directory. setwd() returns the *previous* working
# directory, so the new location is read back with getwd() rather than taken
# from setwd()'s return value.
setwd('/Users/huaig/Desktop/Nick/Coding/Coursera/Johns Hopkins University/5. Reproducible Research/Project/RepData_PeerAssessment2')
path <- getwd()
path
## [1] "C:/Users/huaig/Desktop/Nick/Coding/Coursera/Johns Hopkins University/5. Reproducible Research/Project/RepData_PeerAssessment2"
library(data.table)
library(ggplot2)
# Download data file
file_url <- 'https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2'
# One path is used for both the download and the read, so the two steps can
# never point at different directories
data_file <- file.path(path, '2FStormData.csv.bz2')
# mode = 'wb' requests a binary transfer explicitly (the file is a bzip2 archive)
download.file(file_url, destfile = data_file, mode = 'wb')
# Extract data into dataframe (read.csv decompresses the .bz2 file itself)
storm_df <- read.csv(data_file)
# Convert dataframe into datatable
storm_dt <- as.data.table(storm_df)
colnames(storm_dt)
## [1] "STATE__" "BGN_DATE" "BGN_TIME" "TIME_ZONE" "COUNTY"
## [6] "COUNTYNAME" "STATE" "EVTYPE" "BGN_RANGE" "BGN_AZI"
## [11] "BGN_LOCATI" "END_DATE" "END_TIME" "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE" "END_AZI" "END_LOCATI" "LENGTH" "WIDTH"
## [21] "F" "MAG" "FATALITIES" "INJURIES" "PROPDMG"
## [26] "PROPDMGEXP" "CROPDMG" "CROPDMGEXP" "WFO" "STATEOFFIC"
## [31] "ZONENAMES" "LATITUDE" "LONGITUDE" "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS" "REFNUM"
# Columns needed for the population-health and economic-impact analysis
keep_cols <- c('EVTYPE', 'FATALITIES', 'INJURIES', 'PROPDMG', 'PROPDMGEXP', 'CROPDMG', 'CROPDMGEXP')
# Every other column is redundant for this analysis
redundant_cols <- setdiff(colnames(storm_dt), keep_cols)
# Drop the redundant columns by reference
storm_dt[, (redundant_cols) := NULL]
# Keep only rows with a known event type and at least one recorded
# fatality, injury, property-damage or crop-damage figure
storm_dt <- storm_dt[
  EVTYPE != '?' & (FATALITIES > 0 | INJURIES > 0 | PROPDMG > 0 | CROPDMG > 0),
  keep_cols,
  with = FALSE
]
# Normalise the exponent codes to lowercase so 'K' and 'k' map to the same key
exp_cols <- c('PROPDMGEXP', 'CROPDMGEXP')
storm_dt[, (exp_cols) := lapply(.SD, tolower), .SDcols = exp_cols]
# Lookup table: property damage exponent code -> numeric multiplier
# (the first vector holds the powers of ten, the second the matching codes)
propdmgexp_key <- setNames(
  10^c(1:9, 0, 0, 0, 0, 2, 3, 6, 9),
  c('1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '-', '+', '\'\'', 'h', 'k', 'm', 'b')
)
# Lookup table: crop damage exponent code -> numeric multiplier
cropdmgexp_key <- setNames(
  10^c(0, 0, 0, 3, 6, 9),
  c('0', '?', '\'\'', 'k', 'm', 'b')
)
# Replace each code with its multiplier; codes missing from the lookup
# table come back as NA and are then given a multiplier of 1
storm_dt[, PROPDMGEXP := propdmgexp_key[as.character(PROPDMGEXP)]]
storm_dt[is.na(PROPDMGEXP), PROPDMGEXP := 1]
storm_dt[, CROPDMGEXP := cropdmgexp_key[as.character(CROPDMGEXP)]]
storm_dt[is.na(CROPDMGEXP), CROPDMGEXP := 1]
# Add the damage costs (reported amount x multiplier) next to their inputs
storm_dt <- storm_dt[, .(
  EVTYPE,
  FATALITIES,
  INJURIES,
  PROPDMG,
  PROPDMGEXP,
  prop_dmg_cost = PROPDMG * PROPDMGEXP,
  CROPDMG,
  CROPDMGEXP,
  crop_dmg_cost = CROPDMG * CROPDMGEXP
)]
# Total fatalities and injuries per event type, plus their combined total
total_fatalinjur_dt <- storm_dt[, .(
  FATALITIES = sum(FATALITIES),
  INJURIES = sum(INJURIES),
  total_fatalities_injuries = sum(FATALITIES) + sum(INJURIES)
), by = .(EVTYPE)]
# Rank event types from most to least harmful
total_fatalinjur_dt <- total_fatalinjur_dt[order(-total_fatalities_injuries)]
# Keep the top 10. head() returns at most 10 rows, whereas indexing with 1:10
# would pad the table with all-NA rows if there were fewer than 10 event types
total_fatalinjur_dt <- head(total_fatalinjur_dt, 10L)
head(total_fatalinjur_dt)
## EVTYPE FATALITIES INJURIES total_fatalities_injuries
## 1: TORNADO 5633 91346 96979
## 2: EXCESSIVE HEAT 1903 6525 8428
## 3: TSTM WIND 504 6957 7461
## 4: FLOOD 470 6789 7259
## 5: LIGHTNING 816 5230 6046
## 6: HEAT 937 2100 3037
# Total property and crop damage cost per event type, plus their combined total
total_dmgcost_dt <- storm_dt[, .(
  prop_dmg_cost = sum(prop_dmg_cost),
  crop_dmg_cost = sum(crop_dmg_cost),
  total_dmg_cost = sum(prop_dmg_cost) + sum(crop_dmg_cost)
), by = .(EVTYPE)]
# Rank event types from most to least costly
total_dmgcost_dt <- total_dmgcost_dt[order(-total_dmg_cost)]
# Keep the top 10. head() returns at most 10 rows, whereas indexing with 1:10
# would pad the table with all-NA rows if there were fewer than 10 event types
total_dmgcost_dt <- head(total_dmgcost_dt, 10L)
head(total_dmgcost_dt)
## EVTYPE prop_dmg_cost crop_dmg_cost total_dmg_cost
## 1: FLOOD 144657709807 5661968450 150319678257
## 2: HURRICANE/TYPHOON 69305840000 2607872800 71913712800
## 3: TORNADO 56947380677 414953270 57362333947
## 4: STORM SURGE 43323536000 5000 43323541000
## 5: HAIL 15735267513 3025954473 18761221986
## 6: FLASH FLOOD 16822673979 1421317100 18243991079
# Reshape the health summary to long format (one row per event type and
# measure) so ggplot can map the measure to a fill colour
harmful_event <- melt(
  total_fatalinjur_dt,
  id.vars = 'EVTYPE',
  variable.name = 'harmful_result'
)
head(harmful_event)
## EVTYPE harmful_result value
## 1: TORNADO FATALITIES 5633
## 2: EXCESSIVE HEAT FATALITIES 1903
## 3: TSTM WIND FATALITIES 504
## 4: FLOOD FATALITIES 470
## 5: LIGHTNING FATALITIES 816
## 6: HEAT FATALITIES 937
# Grouped bar chart of the event types most harmful to population health,
# with event types ordered from largest to smallest value
pop_health_chart <- ggplot(
  data = harmful_event,
  mapping = aes(x = reorder(EVTYPE, -value), y = value)
) +
  geom_bar(
    mapping = aes(fill = harmful_result),
    stat = 'identity',
    position = 'dodge'
  ) +
  labs(
    title = 'US Top 10 Harmful Events to Population Health',
    x = 'Event type',
    y = 'Frequency'
  ) +
  theme(
    plot.title = element_text(hjust = .5),
    axis.text.x = element_text(angle = 37, hjust = 1)
  )
pop_health_chart
# Reshape the damage summary to long format (one row per event type and
# damage category) so ggplot can map the category to a fill colour
econ_damage <- melt(
  total_dmgcost_dt,
  id.vars = 'EVTYPE',
  variable.name = 'damage_type'
)
head(econ_damage)
## EVTYPE damage_type value
## 1: FLOOD prop_dmg_cost 144657709807
## 2: HURRICANE/TYPHOON prop_dmg_cost 69305840000
## 3: TORNADO prop_dmg_cost 56947380677
## 4: STORM SURGE prop_dmg_cost 43323536000
## 5: HAIL prop_dmg_cost 15735267513
## 6: FLASH FLOOD prop_dmg_cost 16822673979
# Create bar chart on types of events having greatest economic consequences.
# Costs are divided by 10^9, so the y axis is in billions of USD and is
# labelled accordingly.
econ_consequences_chart <- ggplot(econ_damage, aes(x = reorder(EVTYPE, -value), y = value / 10^9)) +
  geom_bar(stat = 'identity', aes(fill = damage_type), position = 'dodge') +
  labs(
    x = 'Event type',
    y = 'Damage cost (USD billion)',
    title = 'US Top 10 Harmful Events by Economic Consequences'
  ) +
  theme(
    axis.text.x = element_text(angle = 37, hjust = 1),
    plot.title = element_text(hjust = .5)
  )
econ_consequences_chart