Title: An Analysis of the NOAA Storm Database for High Impact Events
Synopsis: The basic goal of this assignment is to explore the NOAA Storm Database and answer the following basic questions about severe weather events.
Across the United States, which types of events (as indicated in the EVTYPE variable) are most harmful with respect to population health?
Across the United States, which types of events have the greatest economic consequences?
Data Processing: After downloading, the zip file is read by R command and stored in a data table named “storm_data”.
library(data.table)
library(ggplot2)
storm_data <- read.csv("repdata-data-StormData.csv.bz2", stringsAsFactors = FALSE)
storm_data <- as.data.table(storm_data)
setkey(storm_data, REFNUM)
dim(storm_data)
## [1] 902297 37
head(storm_data, 2)
## STATE__ BGN_DATE BGN_TIME TIME_ZONE COUNTY COUNTYNAME STATE
## 1: 1 4/18/1950 0:00:00 0130 CST 97 MOBILE AL
## 2: 1 4/18/1950 0:00:00 0145 CST 3 BALDWIN AL
## EVTYPE BGN_RANGE BGN_AZI BGN_LOCATI END_DATE END_TIME COUNTY_END
## 1: TORNADO 0 0
## 2: TORNADO 0 0
## COUNTYENDN END_RANGE END_AZI END_LOCATI LENGTH WIDTH F MAG FATALITIES
## 1: NA 0 14 100 3 0 0
## 2: NA 0 2 150 2 0 0
## INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP WFO STATEOFFIC ZONENAMES
## 1: 15 25.0 K 0
## 2: 0 2.5 K 0
## LATITUDE LONGITUDE LATITUDE_E LONGITUDE_ REMARKS REFNUM
## 1: 3040 8812 3051 8806 1
## 2: 3042 8755 0 0 2
Some of the data in the data table is not relevant to the requirements of the assignment. As the assignment deals with the total consequences of weather events with respect to human health and economic impact, information pertaining to the following are selected:
origin, type and date of weather events
fatalities and injuries
property damage
the NOAA reference number (for convenience)
relevant_data <- c("STATE__", "BGN_DATE", "STATE", "EVTYPE", "FATALITIES",
"INJURIES", "PROPDMG", "PROPDMGEXP", "CROPDMG",
"CROPDMGEXP", "REFNUM")
storm_data <- subset(storm_data, select = relevant_data)
rm(relevant_data)
storm_data <- storm_data[, BGN_DATE := as.Date(BGN_DATE, "%m/%d/%Y")] # reclass begin_date
modern <- as.POSIXlt("1995/12/31") # set a threshold date
modern_index <- storm_data[, which(as.POSIXlt(BGN_DATE) > modern)] # identify events after threshold
storm_data <- storm_data[modern_index] # subset for events after threshold
rm(modern, modern_index)
event_types <- storm_data[, unique(EVTYPE)]
event_types <- sort(event_types)
Since we are only interested in the human health and economic impacts resulting from storm events, we could eliminate any events that have none of these consequences or more succinctly keep only those events that have an impact of any kind.
Differentiate between zero impact and non-zero impact events with a new factor variable
storm_data <- storm_data[, zero_impact := as.factor(ifelse(FATALITIES == 0 &
INJURIES == 0 &
PROPDMG == 0 &
CROPDMG == 0, "T", "F"))]
# create an index of non-zero impact events
impact_index <- storm_data[, which(zero_impact == "F")]
# subset on the index
storm_data <- storm_data[impact_index]
# remove the zero_impact column since it is no longer needed
storm_data <- storm_data[, zero_impact := NULL]
storm_data_check <- nrow(storm_data)
rm(impact_index)
# have another look at the unique storm events in the data table
event_types <- storm_data[, unique(EVTYPE)]
event_types <- sort(event_types)
After subsettting for storm events with impacts, there are 222 unique event types in the “storm_data” data table.
To further subset the data we examine the frequency of the types of storm events that have health and economic impacts.
Creating a new data table with occurrences by event type
event_type_freq <- storm_data[, as.data.table(table(EVTYPE))]
setnames(event_type_freq, c("event_type", "occurrences"))
# add a column of frequencies representing the % occurrence of all event types
event_type_freq <- event_type_freq[, frequency :=
round((occurrences/nrow(storm_data)*100),1)]
setnames(event_type_freq, c("event_type", "occurrences", "frequency"))
setkey(event_type_freq, occurrences)
# discard any event with a frequecy < 0.1%
cutoff <- event_type_freq[, match(0.1, frequency)]
event_type_freq <- event_type_freq[cutoff:nrow(event_type_freq)]
head(event_type_freq[order(-occurrences)], nrow(event_type_freq))
## event_type occurrences frequency
## 1: TSTM WIND 61775 30.7
## 2: THUNDERSTORM WIND 43097 21.4
## 3: HAIL 22679 11.3
## 4: FLASH FLOOD 19011 9.4
## 5: TORNADO 12366 6.1
## 6: LIGHTNING 11152 5.5
## 7: FLOOD 9513 4.7
## 8: HIGH WIND 5402 2.7
## 9: STRONG WIND 3367 1.7
## 10: WINTER STORM 1460 0.7
## 11: HEAVY RAIN 1047 0.5
## 12: HEAVY SNOW 1029 0.5
## 13: WILDFIRE 847 0.4
## 14: URBAN/SML STREAM FLD 702 0.3
## 15: EXCESSIVE HEAT 685 0.3
## 16: ICE STORM 631 0.3
## 17: TSTM WIND/HAIL 441 0.2
## 18: TROPICAL STORM 410 0.2
## 19: WINTER WEATHER 405 0.2
## 20: WILD/FOREST FIRE 381 0.2
## 21: RIP CURRENT 364 0.2
## 22: AVALANCHE 264 0.1
## 23: DROUGHT 258 0.1
## 24: RIP CURRENTS 239 0.1
## 25: BLIZZARD 228 0.1
## 26: LAKE-EFFECT SNOW 194 0.1
## 27: LANDSLIDE 190 0.1
## 28: STORM SURGE 169 0.1
## 29: EXTREME COLD 164 0.1
## 30: HEAT 164 0.1
## 31: COASTAL FLOOD 149 0.1
## 32: WINTER WEATHER/MIX 139 0.1
## 33: HURRICANE 126 0.1
## 34: HIGH SURF 122 0.1
## 35: LIGHT SNOW 119 0.1
## 36: FROST/FREEZE 116 0.1
## 37: EXTREME COLD/WIND CHILL 111 0.1
## 38: MARINE TSTM WIND 109 0.1
## 39: FOG 101 0.1
## event_type occurrences frequency
Therefore, we found 39 most frequent event types (where frequency of occurence >= 0.1%) represent 99.3% of the total number of storm events. This is a manageable number of events.
rm(cutoff)
# calculate property and crop loss
storm_data <- storm_data[, property_loss :=
ifelse(PROPDMGEXP == "K", 1000 * PROPDMG,
ifelse(PROPDMGEXP == "M", 1000000 * PROPDMG,
ifelse(PROPDMGEXP == "B", 1000000000 * PROPDMG, 0)))]
storm_data <- storm_data[, crop_loss :=
ifelse(CROPDMGEXP == "K", 1000 * CROPDMG,
ifelse(CROPDMGEXP == "M", 1000000 * CROPDMG,
ifelse(CROPDMGEXP == "B", 1000000000 * CROPDMG, 0)))]
# key the data set on the NOAA reference number
str(storm_data)
## Classes 'data.table' and 'data.frame': 201319 obs. of 13 variables:
## $ STATE__ : num 6 1 1 1 1 1 1 1 1 1 ...
## $ BGN_DATE : Date, format: "1995-12-31" "1996-01-06" ...
## $ STATE : chr "CA" "AL" "AL" "AL" ...
## $ EVTYPE : chr "HIGH WINDS" "WINTER STORM" "TORNADO" "TSTM WIND" ...
## $ FATALITIES : num 0 0 0 0 0 0 0 0 0 0 ...
## $ INJURIES : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PROPDMG : num 5.5 380 100 3 5 2 400 12 8 12 ...
## $ PROPDMGEXP : chr "M" "K" "K" "K" ...
## $ CROPDMG : num 7 38 0 0 0 0 0 0 0 0 ...
## $ CROPDMGEXP : chr "M" "K" "" "" ...
## $ REFNUM : num 192312 248768 248769 248770 248771 ...
## $ property_loss: num 5500000 380000 100000 3000 5000 2000 400000 12000 8000 12000 ...
## $ crop_loss : num 7000000 38000 0 0 0 0 0 0 0 0 ...
## - attr(*, "sorted")= chr "REFNUM"
## - attr(*, ".internal.selfref")=<externalptr>
# weather events with maximum property loss
pl_sort <- order(storm_data$property_loss, decreasing = TRUE)
storm_data[head(pl_sort), list(BGN_DATE, STATE, EVTYPE, property_loss)]
## BGN_DATE STATE EVTYPE property_loss
## 1: 2006-01-01 CA FLOOD 1.150e+11
## 2: 2005-08-29 LA STORM SURGE 3.130e+10
## 3: 2005-08-28 LA HURRICANE/TYPHOON 1.693e+10
## 4: 2005-08-29 MS STORM SURGE 1.126e+10
## 5: 2005-10-24 FL HURRICANE/TYPHOON 1.000e+10
## 6: 2005-08-28 MS HURRICANE/TYPHOON 7.350e+09
# weather events with maximum crop loss
cl_sort <- order(storm_data$crop_loss, decreasing = TRUE)
storm_data[head(cl_sort), list(BGN_DATE, STATE, EVTYPE, crop_loss)]
## BGN_DATE STATE EVTYPE crop_loss
## 1: 2005-08-29 MS HURRICANE/TYPHOON 1.510e+09
## 2: 2006-01-01 TX DROUGHT 1.000e+09
## 3: 1998-12-20 CA EXTREME COLD 5.960e+08
## 4: 2001-08-01 IA DROUGHT 5.788e+08
## 5: 2000-11-01 TX DROUGHT 5.150e+08
## 6: 1998-07-06 OK DROUGHT 5.000e+08
The highest property loss of $1.15e+11 is recorded for a flood in California in 2006. And the highest crop loss of $1.51e+09 is recorded for a Hurricane / Typhoon in MS in 2005.
rm(pl_sort, cl_sort)
Our goal is to review the following questions:-
Across the United States, which types of events (as indicated in the EVTYPE variable) are most harmful with respect to population health?
Across the United States, which types of events have the greatest economic consequences?
We address these questions with bubble plots. To draw the plots we need to prepare a summary data table from my “storm_data” data table that totals the human health impacts and totals the economic impacts by weather event. We use simple sums of fatalties and injuires to represent human health impacts. Similarly, we use simple sums of property loss and crop loss to represent economic impacts. Lastly, we add a column that totals the types of weather events.
bubble_plot <- storm_data[, list(sum(FATALITIES), sum(INJURIES),
sum(property_loss),
sum(crop_loss)),
by = EVTYPE]
setnames(bubble_plot, c("EVTYPE", "FATALITIES", "INJURIES", "property_loss",
"crop_loss"))
setkey(bubble_plot, EVTYPE)
event_count <- storm_data[, c(table(EVTYPE))]
bubble_plot <- bubble_plot[, events := event_count]
bubble_plot <- bubble_plot[, human_health := (FATALITIES + INJURIES)]
bubble_plot <- bubble_plot[, economic_impact := (property_loss + crop_loss)]
Since the questions are comparative and the totals for human health and economic impacts between particular event types can differ wildly, we decided to normalize the data. We set the events with maximum human health and economic impacts equal to 100. All other events, have human health and economic impacts at some proportion of these maximums. We added columns to the dataset accrodingly.
bubble_plot <- bubble_plot[, human_health_nml :=
round(human_health/max(human_health)*100,1)]
bubble_plot <- bubble_plot[, economic_impact_nml :=
round(economic_impact/max(economic_impact)*100,1)]
# re-order the columns using column numbers
setcolorder(bubble_plot, c(1, 6, 2, 3, 4, 5, 7, 8, 9, 10))
#Here is what structure of the "bubble_plot" data table looks like.
str(bubble_plot)
## Classes 'data.table' and 'data.frame': 222 obs. of 10 variables:
## $ EVTYPE : chr " HIGH SURF ADVISORY" " FLASH FLOOD" " TSTM WIND" " TSTM WIND (G45)" ...
## $ events : int 1 1 2 1 3 8 2 264 1 1 ...
## $ FATALITIES : num 0 0 0 0 0 0 0 223 1 70 ...
## $ INJURIES : num 0 0 0 0 0 0 0 156 24 385 ...
## $ property_loss : num 200000 50000 8100000 8000 0 ...
## $ crop_loss : num 0 0 0 0 28820000 ...
## $ human_health : num 0 0 0 0 0 0 0 379 25 455 ...
## $ economic_impact : num 200000 50000 8100000 8000 28820000 ...
## $ human_health_nml : num 0 0 0 0 0 0 0 1.7 0.1 2.1 ...
## $ economic_impact_nml: num 0 0 0 0 0 0 0 0 0 0.4 ...
## - attr(*, ".internal.selfref")=<externalptr>
## - attr(*, "sorted")= chr "EVTYPE"
We used quantiles to provide a data based rationalization of which weather events to classify as low, medium or high impact events. We chose 3 probabilities representing high, medium and low impacts.
Quantiles give a data based rationalization for low, medium and high impact events.
hh_quant <- quantile(bubble_plot$human_health_nml, c(0.33, 0.66, 1))
ei_quant <- quantile(bubble_plot$economic_impact_nml, c(0.33, 0.66, 1))
# look at the data
hh_quant
## 33% 66% 100%
## 0 0 100
ei_quant
## 33% 66% 100%
## 0 0 100
Results
Ploting data for weather events with relatively high human health impacts
p1 <- ggplot(bubble_plot[human_health_nml >= hh_quant[2]],
aes(human_health_nml, economic_impact_nml, size = events,
label = EVTYPE))
p1 <- p1 + geom_jitter(colour="wheat1", fill = "wheat3", shape = 21,
alpha = 0.3) + scale_size_area(max_size = 25) +
geom_text(size=3)
p1 <- p1 + xlim(c(0, 100)) + ylim(c(0, 100))
p1 <- p1 + labs(title = "Relative Comparison of the Cumulative Human Health
Impacts of Severe Weather Events, \nUSA 1996-2011")
p1 <- p1 + xlab("Human Health Impact \n(Normalized: Tornado = 100 )") +
ylab("Economic Impact \n(Normalized: Storm Surge = 100 )")
g <- guide_legend("Number of Events")
p1 <- p1 + guides(size = g)
p1 <- p1 + theme(legend.position = "bottom")
p1 <- p1 + theme_bw()
print(p1)
## Warning: Removed 136 rows containing missing values (geom_point).
Ploting data for weather events with relatively high economic impacts
p2 <- ggplot(bubble_plot[economic_impact_nml >= ei_quant[2]],
aes(human_health_nml, economic_impact_nml, size = events,
label = EVTYPE))
p2 <- p2 + geom_jitter(colour="blue1", fill = "blue3", shape = 21,
alpha = 0.3) + scale_size_area(max_size = 25) +
geom_text(size = 3)
p1 <- p2 + xlim(c(0, 100)) + ylim(c(0, 100))
p2 <- p2 + labs(title= "Relative Comparison of the Cumulative Economic
Impacts of Severe Weather Events, \nUSA 1996-2011")
p2 <- p2 + xlab("Human Health Impact \n(Normalized: Tornado = 100 )") +
ylab("Economic Impact \n(Normalized: Storm Surge = 100 )")
g <- guide_legend("Number of Events")
p2 <- p2 + guides(size = g)
p2 <- p2 + theme(legend.position = "bottom")
p2 <- p2 + theme_bw()
print(p2)