The overall aim of this analysis is to identify the types of severe weather events that have the greatest consequences for population health (e.g. injuries and fatalities) as well as for the economy (e.g. property damage and crop damage) in the United States. To investigate this, I obtained a dataset of severe weather events in the United States between the years 1950 and 2011 from the U.S. National Oceanic and Atmospheric Administration's (NOAA) storm database. The data show that, when looking at the decades with the most recorded events (after 1970), tornadoes are the events responsible for the most fatalities and injuries. Besides tornadoes, heat (including excessive heat), floods and thunderstorm winds are the primary events causing fatalities and injuries. With respect to events that have consequences for the economy, I find that tornadoes, floods and thunderstorm winds are the events causing the most property damage, while hail, floods and thunderstorm winds are the events causing the most crop damage.
The dataset consists of weather events taken from the U.S. National Oceanic and Atmospheric Administration's (NOAA) storm database (the full dataset can be found here).
First, I read in the bzipped comma-separated file. The event types are lightly normalised (surrounding whitespace is removed, all event types are converted to upper case, etc.). I consider weather events from 1971 onwards only (roughly the last 40 years), as the period between 1950 and 1970 has few events recorded. Additionally, I add a year column for each severe weather event (extracted from the BGN_DATE field, which includes both date and time).
## import dplyr & stringr (data processing) and ggplot2 (plotting)
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(stringr))
## read csv.bz2 file
weather_data <- read.csv(bzfile("repdata-data-StormData.csv.bz2"), header = TRUE,
  na.strings = "NA")
## add year as a column; BGN_DATE holds date and time, and as.Date() only
## consumes the part described by `format`, ignoring the trailing time
weather_data$year <- as.integer(format(as.Date(as.character(weather_data$BGN_DATE),
  format = "%m/%d/%Y"), format = "%Y"))
## format event type (TSTM as THUNDERSTORM and WIND as WINDS)
## Trim and upper-case FIRST so that mixed-case spellings are normalised by the
## replacements below as well.
event_type <- toupper(str_trim(as.character(weather_data$EVTYPE)))
event_type <- gsub("TSTM", "THUNDERSTORM", event_type, fixed = TRUE)
## Only pluralise the stand-alone word WIND. An unanchored "WIND" pattern also
## matches the already-plural "WINDS" and turns it into "WINDSS", which splits
## one event type into two categories.
event_type <- gsub("\\bWIND\\b", "WINDS", event_type, perl = TRUE)
weather_data$EVTYPE <- as.factor(event_type)
## Ignore special event types (specific dates and records) and keep events
## from 1971 onwards. Rows whose year could not be parsed are dropped
## explicitly; indexing with an NA condition would otherwise create all-NA rows.
is_special_event <- str_detect(as.character(weather_data$EVTYPE), "^(SUMMARY|RECORD)")
is_recent_event <- !is.na(weather_data$year) & weather_data$year >= 1971
weather_data_refined <- weather_data[!is_special_event & is_recent_event, ]
## drop the factor levels that no longer occur after filtering
weather_data_refined$EVTYPE <- droplevels(weather_data_refined$EVTYPE)
## NOTE: the result tables recorded further down in this document were produced
## before the WIND/WINDS correction (they still list "THUNDERSTORM WINDSS") and
## need to be regenerated.
I consider the number of injuries and fatalities (column names: INJURIES & FATALITIES) registered with each event as quantities that reflect how harmful the event has been with respect to population health.
## group weather data by year and event type, then sum up the number of
## fatalities and injuries per event type per year
event_year_grouping <- weather_data_refined %>%
  group_by(year, EVTYPE)
fatalities_injuries_by_year_event <- event_year_grouping %>%
  summarise(num_fatalities = sum(FATALITIES), num_injuries = sum(INJURIES)) %>%
  as.data.frame()
## collapse the yearly figures into overall totals of injuries and fatalities
## per event type
global_event_grouping <- fatalities_injuries_by_year_event %>%
  group_by(EVTYPE)
fatalities_injuries_global_type <- global_event_grouping %>%
  summarise(all_injuries = sum(num_injuries), all_fatalities = sum(num_fatalities)) %>%
  as.data.frame()
## Ten event types with the most fatalities, 1971-2011 (columns 1 and 3 are
## EVTYPE and all_fatalities); original row names are kept
head(
  fatalities_injuries_global_type[
    order(-fatalities_injuries_global_type$all_fatalities), c(1, 3)],
  n = 10)
## EVTYPE all_fatalities
## 675 TORNADO 3199
## 108 EXCESSIVE HEAT 1903
## 130 FLASH FLOOD 978
## 237 HEAT 937
## 413 LIGHTNING 816
## 588 THUNDERSTORM WINDS 637
## 146 FLOOD 470
## 490 RIP CURRENT 368
## 315 HIGH WINDS 248
## 11 AVALANCHE 224
## Ten event types with the most injuries, 1971-2011 (columns 1 and 2 are
## EVTYPE and all_injuries); original row names are kept
head(
  fatalities_injuries_global_type[
    order(-fatalities_injuries_global_type$all_injuries), c(1, 2)],
  n = 10)
## EVTYPE all_injuries
## 675 TORNADO 58256
## 588 THUNDERSTORM WINDS 8445
## 146 FLOOD 6789
## 108 EXCESSIVE HEAT 6525
## 413 LIGHTNING 5230
## 237 HEAT 2100
## 382 ICE STORM 1975
## 130 FLASH FLOOD 1777
## 206 HAIL 1361
## 786 WINTER STORM 1321
## total injuries and fatalities per year, over all event types
global_year_grouping <- fatalities_injuries_by_year_event %>%
  group_by(year)
fatalities_injuries_year <- global_year_grouping %>%
  summarise(total_injuries = sum(num_injuries), total_fatalities = sum(num_fatalities)) %>%
  as.data.frame()
## attach the yearly totals to every (year, event type) row and express each
## event type's casualties as a fraction of that year's total (2 decimals)
global_stats <- fatalities_injuries_by_year_event %>%
  inner_join(fatalities_injuries_year, by = "year") %>%
  mutate(
    frac_injuries = round(num_injuries / total_injuries, 2),
    frac_fatalities = round(num_fatalities / total_fatalities, 2)
  )
## unused scratch code, kept for reference:
# tmp <- group_by(global_stats, EVTYPE, year)
# tmp1 <- summarise(tmp, n = max(frac_fatalities), n2 = max(frac_injuries))
I consider the amount of dollars in property damage and crop damage (column names: PROPDMG & CROPDMG) registered with each event as quantities that reflect how harmful the event has been with respect to the economy.
## Translate a damage-magnitude code into a numeric multiplier.
##   exponent: vector of magnitude codes (character or factor)
##   returns:  numeric vector of the same length
## K (thousands), M (millions) and B (billions) are matched case-insensitively.
## NOTE(review): H is assumed to mean hundreds, and every other code (blank,
## digits, symbols) is left at a multiplier of 1 -- confirm against the NOAA
## storm data documentation.
damage_multiplier <- function(exponent) {
  code <- toupper(str_trim(as.character(exponent)))
  multiplier <- rep(1, length(code))
  multiplier[code %in% "H"] <- 1e2
  multiplier[code %in% "K"] <- 1e3
  multiplier[code %in% "M"] <- 1e6
  multiplier[code %in% "B"] <- 1e9
  multiplier
}
## group weather data by year and event type, then sum up the amount of dollars
## in property damage and crop damage per event type per year.
## PROPDMG / CROPDMG only hold the mantissa of the damage estimate; they must be
## scaled by PROPDMGEXP / CROPDMGEXP before summing, otherwise amounts recorded
## in thousands, millions and billions are added as if they had the same unit.
event_year_grouping <- weather_data_refined %>%
  mutate(
    prop_dollars = PROPDMG * damage_multiplier(PROPDMGEXP),
    crop_dollars = CROPDMG * damage_multiplier(CROPDMGEXP)
  ) %>%
  group_by(year, EVTYPE)
prop_crop_damage_by_year_event <- event_year_grouping %>%
  summarise(prop_damage = sum(prop_dollars), crop_damage = sum(crop_dollars)) %>%
  as.data.frame()
## get the total amount of dollars in property damage and crop damage per
## event type
global_event_grouping <- group_by(prop_crop_damage_by_year_event, EVTYPE)
prop_crop_global_type <- global_event_grouping %>%
  summarise(all_prop = sum(prop_damage), all_crop = sum(crop_damage)) %>%
  as.data.frame()
## NOTE: the damage tables recorded further down in this document were produced
## from the unscaled PROPDMG / CROPDMG values and need to be regenerated.
## Ten event types with the highest summed property damage, 1971-2011
## (columns 1 and 2 are EVTYPE and all_prop); original row names are kept
head(
  prop_crop_global_type[order(-prop_crop_global_type$all_prop), c(1, 2)],
  n = 10)
## EVTYPE all_prop
## 675 TORNADO 2615298
## 588 THUNDERSTORM WINDS 2212918
## 130 FLASH FLOOD 1420175
## 146 FLOOD 899938
## 206 HAIL 688693
## 413 LIGHTNING 603352
## 627 THUNDERSTORM WINDSS 446352
## 315 HIGH WINDS 324732
## 786 WINTER STORM 132721
## 268 HEAVY SNOW 122252
## Ten event types with the highest summed crop damage, 1971-2011
## (columns 1 and 3 are EVTYPE and all_crop); original row names are kept
head(
  prop_crop_global_type[order(-prop_crop_global_type$all_crop), c(1, 3)],
  n = 10)
## EVTYPE all_crop
## 206 HAIL 579596
## 130 FLASH FLOOD 179200
## 588 THUNDERSTORM WINDS 175994
## 146 FLOOD 168038
## 675 TORNADO 100019
## 76 DROUGHT 33899
## 627 THUNDERSTORM WINDSS 18685
## 315 HIGH WINDS 17283
## 248 HEAVY RAIN 11123
## 179 FROST/FREEZE 7134