Storm data collected by the U.S. National Oceanic and Atmospheric Administration’s (NOAA) describes the damage to both human life and economic prosperity across the United States. Analyzing the data for the most impactful event types revealed extreme heat and flood as responsible for the largest average deaths per year. Floods similarly accounted for economic loss through property and crop damage. However, hurricanes and tornadoes also cost the U.S. significantly each year.
Data downloaded from U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database covering 1950 through November 2011. Details for data collection and pre-processing available in the National Weather Service Storm Data Documentation and National Climatic Data Center Storm Events FAQ.
Cleanup required significant effort to condense nearly 1,000 different event types including misspelling, alternate names, and compound descriptions. The Storm Data Documentation includes descriptions for 48 ‘permitted’ 48 storm events. Though not fully reduced to those prime events, cleanup cut the number of event types in half to 471.
library(data.table)
library(ggplot2)
library(plyr)
library(reshape2)
# Store link to NOAA Storm Data
link <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
# Download data and save to ./data if not present
# Checks for timestamp downloading a new copy of the data if not present
if(!file.exists("./data")) dir.create("./data")
if(sum(file.exists(c("./data/StormData.csv.bz2", "./data/Timestamp.txt"))) < 2 ) {
download.file(link, destfile = "./data/StormData.csv.bz2", method = "curl")
write.table(date(), file = "./data/Timestamp.txt",
col.names = FALSE, row.names = FALSE)
print("Storm Data downloaded succesfully")
} else {
timestamp <- format(read.table("./data/Timestamp.txt"))
print(paste("Using Storm Data with timestamp", timestamp))
}
## [1] "Using Storm Data with timestamp Sat Jun 21 07:35:26 2014"
# List column classes including NULL to skip data not used in analysis
cols <- c("numeric", # STATE
"character", # BGN_DATE (will convert to Date)
"NULL", # BGN_TIME
"NULL", # BGN_TIME
"NULL", # COUNTY
"NULL", # COUNTYNAME
"factor", # STATE
"character", # EVTYPE (character class to simplify cleanup)
"NULL", # BGN_RANGE
"NULL", # BGN_AZI
"NULL", # BGN_LOCATION
"NULL", # END_DATE (too many blanks for useful analysis)
"NULL", # END_TIME
"NULL", # COUNTY_END
"NULL", # COUNTYENDN
"NULL", # END_RANGE
"NULL", # END_AZI
"NULL", # END_LOCATION
"NULL", # LENGTH
"NULL", # WIDTH
"NULL", # F
"NULL", # MAG
"numeric", # FATALITIES
"numeric", # INJURIES
"numeric", # PROPDMG
"factor", # PROPDMGEXP
"numeric", # CROPDMG
"factor", # CROPDMGEXP
"NULL", # WFO
"NULL", # STATEOFFICE
"NULL", # ZONENAMES
"NULL", # LATTITUDE
"NULL", # LONGITUDE
"NULL", # LATITUDE_E
"NULL", # LONGITUDE_
"NULL", # REMARKS
"numeric" # REFNUM
)
# Read Storm Data then convert to data.table to speed cleanup
# Calls read.csv() because fread produced an unknown error
dat <- data.table(read.csv("./data/StormData.csv.bz2", colClasses = cols))
# Print data summary
print(str(dat))
## Classes 'data.table' and 'data.frame': 902297 obs. of 11 variables:
## $ STATE__ : num 1 1 1 1 1 1 1 1 1 1 ...
## $ BGN_DATE : chr "4/18/1950 0:00:00" "4/18/1950 0:00:00" "2/20/1951 0:00:00" "6/8/1951 0:00:00" ...
## $ STATE : Factor w/ 72 levels "AK","AL","AM",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ EVTYPE : chr "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
## $ FATALITIES: num 0 0 0 0 0 0 0 0 1 0 ...
## $ INJURIES : num 15 0 2 2 2 6 1 0 14 0 ...
## $ PROPDMG : num 25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
## $ PROPDMGEXP: Factor w/ 19 levels "","-","?","+",..: 17 17 17 17 17 17 17 17 17 17 ...
## $ CROPDMG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CROPDMGEXP: Factor w/ 9 levels "","?","0","2",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ REFNUM : num 1 2 3 4 5 6 7 8 9 10 ...
## - attr(*, ".internal.selfref")=<externalptr>
## NULL
# Clear variables
rm(cols)
Change STATE__ column name to STATENUM
setnames(dat, "STATE__", "STATENUM")Convert begin date from character to Date format then add separate year column
dat$BGN_DATE <- dat[, as.Date(BGN_DATE, format = "%m/%d/%Y %T")]
dat <- dat[, YEAR := year(BGN_DATE)]Clean event types (multiple steps)
# Count event types before clenup
print(paste("Unique event types before cleanup:",
length(unique(dat$EVTYPE))))
## [1] "Unique event types before cleanup: 985"
# Create data.frame of unique EVTYPE values
# Copies columns to enable replacement merge
evtypes <- summarize(dat, EVTYPE = count(EVTYPE))[[1]]
names(evtypes) <- c("EVTYPE", "COUNT")
evtypes$CLEAN <- evtypes$EVTYPE
# Load permitted event types as described in the National Weather Service
# Storm Data Documentation. CSV saved by directly copying the report.
evtypesPermit <- read.csv("./data/evtypes.csv",
stringsAsFactors = FALSE)[[1]]
# Convert to upper case
evtypes$CLEAN <- toupper(evtypes$CLEAN)
evtypesPermit <- toupper(evtypesPermit)
# Replace non-alphanumeric with . and remove duplicates
evtypes$CLEAN <- gsub("[^0-9A-Z]", ".", evtypes$CLEAN)
evtypes$CLEAN <- gsub("\\.{2, }", ".", evtypes$CLEAN)
# For permitted events
evtypesPermit <- gsub("[^0-9A-Z]", ".", evtypesPermit)
evtypesPermit <- gsub("\\.{2, }", ".", evtypesPermit)
# Remove leading and ending .
evtypes$CLEAN <- gsub("^\\.+", "", evtypes$CLEAN)
evtypes$CLEAN <- gsub("\\.+$", "", evtypes$CLEAN)
# for permitted events
evtypesPermit <- gsub("^\\.+", "", evtypesPermit)
evtypesPermit <- gsub("\\.+$", "", evtypesPermit)
# Replace 'THUNDERSTORM' with 'TSTM'
evtypes$CLEAN <- gsub("THUNDERSTORM", "TSTM", evtypes$CLEAN)
evtypesPermit <- gsub("THUNDERSTORM", "TSTM", evtypesPermit)
# Simplify 'FLASH FLOOD'
evtypes$CLEAN <- gsub("FLASH FLOOD.*FLOOD", "FLASH FLOOD", evtypes$CLEAN)
# Simplify 'HEAVY RAIN' (includes 'HEAVY PRECIPITATION' and 'HEAVY SHOWERS')
evtypes$CLEAN <- gsub("HEAVY (PRECIPITATION|SHOWERS)", "HEAVY RAIN",
evtypes$CLEAN)
# Simplify 'HIGH WIND(S)' and 'STRONG WIND(S)'
evtypes$CLEAN <- gsub("(HIGH|STRONG) WIND", "HIGH WIND", evtypes$CLEAN)
# Simplify split 'HURRICANE.TYPHOON' in permitted event types
evtypesPermit[grepl("^HURRICANE", evtypesPermit)] <- "HURRICANE"
evtypesPermit <- c(evtypesPermit, "TYPHOON")
# Order permitted event types by fewest number of characters
evtypesPermit <- evtypesPermit[order(nchar(evtypesPermit))]
# Search and replace by permitted event types
for(ev in evtypesPermit) {
evtypes$CLEAN[grepl(ev, evtypes$CLEAN)] <- ev
}
rm(ev)
# Merge clean event types into data
evtypesMerge <- data.table(EVTYPE = evtypes$EVTYPE,
EVTYPECLEAN = evtypes$CLEAN)
dat <- merge(dat, evtypesMerge, by = "EVTYPE", all.x = TRUE)
# Count event types after clenup
print(paste("Unique event types after cleanup:",
length(unique(dat$EVTYPECLEAN))))
## [1] "Unique event types after cleanup: 471"Multiply property and crop damage by ‘EXP’ to calculate total
# Create function to lookup 'EXP' multiple
expMultiple <- function(exp) {
switch(tolower(exp), k = 1E+3, m = 1E+6, b = 1E+9, 1)
}
# Add total property and crop damage columns
dat <- dat[, PROPDMGTOT := {exp <- sapply(dat$PROPDMGEXP, expMultiple);
PROPDMG * exp}]
dat <- dat[, CROPDMGTOT := {exp <- sapply(dat$CROPDMGEXP, expMultiple);
CROPDMG * exp}]
# Add total damage column
dat <- dat[, DMGTOT := PROPDMGTOT + CROPDMGTOT]Print clean data summary
print(str(dat))
## Classes 'data.table' and 'data.frame': 902297 obs. of 16 variables:
## $ EVTYPE : Factor w/ 985 levels " HIGH SURF ADVISORY",..: 1 2 3 4 5 5 5 5 6 7 ...
## $ STATENUM : num 60 34 48 4 13 13 17 13 12 12 ...
## $ BGN_DATE : Date, format: "2001-12-03" "1996-12-13" ...
## $ STATE : Factor w/ 72 levels "AK","AL","AM",..: 6 44 63 7 14 14 20 14 13 13 ...
## $ FATALITIES : num 0 0 0 0 0 0 0 0 0 0 ...
## $ INJURIES : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PROPDMG : num 200 0 50 0 0 0 100 8 8 0 ...
## $ PROPDMGEXP : Factor w/ 19 levels "","-","?","+",..: 17 1 17 1 1 1 17 19 17 1 ...
## $ CROPDMG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CROPDMGEXP : Factor w/ 9 levels "","?","0","2",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ REFNUM : num 448421 265763 408333 311043 286025 ...
## $ YEAR : int 2001 1996 2000 1998 1997 1999 1999 2000 1998 2001 ...
## $ EVTYPECLEAN: chr "HIGH.SURF" "FLOOD" "FLOOD" "LIGHTNING" ...
## $ PROPDMGTOT : num 2e+05 0e+00 5e+04 0e+00 0e+00 0e+00 1e+05 8e+06 8e+03 0e+00 ...
## $ CROPDMGTOT : num 0 0 0 0 0 0 0 0 0 0 ...
## $ DMGTOT : num 2e+05 0e+00 5e+04 0e+00 0e+00 0e+00 1e+05 8e+06 8e+03 0e+00 ...
## - attr(*, ".internal.selfref")=<externalptr>
## - attr(*, "sorted")= chr "EVTYPE"
## NULLClear temporary variables
rm(evtypes, evtypesPermit, evtypesMerge, expMultiple)Results summarized annually to account for significant differences in number of events for each type. Though this provides a data set useful for comparison, it can “hide” damaging outliers. The abundance of one-time event types compounds the problem. Additional analysis for the most impactful single event agreed with the annualized assessment that floods and hurricanes/tornadoes cause the most damage.
datSumYearType <- dat[, list(sum(FATALITIES), sum(INJURIES), sum(DMGTOT)),
by = c("YEAR", "EVTYPECLEAN")]
setnames(datSumYearType,
c("YEAR", "EVTYPE", "FATALITIES", "INJURIES", "DMGTOT"))
datStatsFtl <- datSumYearType[, as.list(summary(FATALITIES)), by = EVTYPE]
datStatsFtl <- datStatsFtl[order(Mean, decreasing = TRUE)]
print(datStatsFtl[1:5])
## EVTYPE Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1: HEAT 6 39.0 83.0 165.0 166.0 1050
## 2: TORNADO 15 38.2 56.5 90.9 92.8 587
## 3: FLOOD 33 54.5 79.0 80.3 97.5 132
## 4: LIGHTNING 22 33.5 44.0 43.0 48.5 79
## 5: RIP.CURRENT 8 25.2 30.5 32.1 42.5 49
datStatsInj <- datSumYearType[, as.list(summary(INJURIES)), by = EVTYPE]
datStatsInj <- datStatsInj[order(Mean, decreasing = TRUE)]
print(datStatsInj[1:5])
## EVTYPE Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1: TORNADO 396 738.0 1060 1470 1630 6820
## 2: HEAT 5 122.0 378 485 600 1510
## 3: FLOOD 23 41.5 64 453 200 6440
## 4: LIGHTNING 114 204.0 256 275 312 492
## 5: TYPHOON 2 5.0 118 256 316 839
datStatsDmg <- datSumYearType[, as.list(summary(DMGTOT)), by = EVTYPE]
datStatsDmg <- datStatsDmg[order(Mean, decreasing = TRUE)]
print(datStatsDmg[1:5])
## EVTYPE Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1: TYPHOON 1.75e+08 6.01e+08 1.02e+09 1.45e+10 1.89e+10 5.18e+10
## 2: FLOOD 7.35e+08 1.55e+09 2.39e+09 9.47e+09 4.94e+09 1.19e+11
## 3: STORM.SURGE 1.60e+05 1.80e+06 8.16e+06 3.61e+09 4.64e+07 4.31e+10
## 4: HURRICANE 0.00e+00 1.37e+07 5.60e+07 1.22e+09 2.16e+09 5.54e+09
## 5: TORNADO 3.45e+07 2.04e+08 6.31e+08 9.26e+08 1.25e+09 9.85e+09
Both health and economic impact plotted as log10 for the top five most damaging event types. Logarithmic scale simplified visual comparison.
datPlot <- datSumYearType[EVTYPE %in% datStatsFtl[1:5]$EVTYPE]
datPlot <- melt(datPlot, id = c("YEAR", "EVTYPE"),
measure = c("FATALITIES", "INJURIES"))
qplot(factor(EVTYPE), log10(value), data = datPlot, geom = "boxplot",
facets = variable ~ .)
rm(datPlot)
datPlot <- datSumYearType[EVTYPE %in% datStatsDmg[1:5]$EVTYPE]
datPlot <- melt(datPlot, id = c("YEAR", "EVTYPE"),
measure = "DMGTOT")
qplot(factor(EVTYPE), log10(value), data = datPlot, geom = "boxplot",
facets = variable ~ .)
## Warning: Removed 1 rows containing non-finite values (stat_boxplot).
rm(datPlot)
Comparison of the 10 most impactful events in the NOAA storm database plotted for economic loss and deviation from the mean. Individual descriptions vary slightly from the annual analysis above. However, broader categories of flood and hurricane/tornadoes show similar damage patterns when summed or considered individually.
datMaxType <- dat[, list(max(DMGTOT), (max(DMGTOT) - mean(DMGTOT)) / sd(DMGTOT)),
"EVTYPECLEAN"]
setnames(datMaxType, c("EVTYPE", "DMGTOT", "SD"))
datMaxType <- datMaxType[order(DMGTOT, decreasing = TRUE)]
qplot(DMGTOT, SD, data = datMaxType[1:10], color = EVTYPE)
Severe U.S. weather accounts for many deaths and economic loss each year. Though any event carries risk, weather causing extreme heat and flood account for the greatest loss of life. Hurricanes and tornadoes do not harm as many citizens. They - along with floods - cause the most economic damage.