Storms and other severe weather events can cause both public health and economic problems. The U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database gives us an opportunity to track characteristics of major weather events in the United States as well as their consequences on people and properties.
In this project we explore the NOAA database to understand which weather events lead to the greatest public health and economic consequences. We found that tornadoes and heat caused the most fatalities and injuries, while floods and hurricanes had the largest negative impact on property and crops.
The dataset was downloaded and imported directly from the course website. Documentation on the dataset, including how the variables were constructed is available at:
library(dplyr)
library(ggplot2)
library(knitr)
library(maps)
# Download the raw NOAA storm data into ./data and load it.
# The archive is written to and read from the same path (the original wrote it
# to one location and read it from another), so the script runs from any
# working directory.
data_dir <- "data"
if (!dir.exists(data_dir)) {
  dir.create(data_dir)
}
fileUrl <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
data_file <- file.path(data_dir, "stormdata.csv.bz2")
download.file(fileUrl, destfile = data_file, method = "curl")
# "?" entries are read as NA. stringsAsFactors = TRUE keeps text columns as
# factors, which the later levels()/droplevels() steps rely on (this was the
# read.csv default before R 4.0).
dat <- read.csv(data_file, na.strings = "?", stringsAsFactors = TRUE)
The dataset has 902297 observations and 37 variables. First, we fix the event type variable: identical events were registered under different names (e.g. Beach Erosion and BEACH EROSION). We convert all event types to uppercase, remove unnecessary spaces, merge plurals and synonyms, and correct spelling mistakes.
# clean event type
# normalise the raw labels: strip leading whitespace, upper-case everything,
# then store the result as a factor
evtype_trimmed <- trimws(dat$EVTYPE, which = "left")
dat$EVTYPE <- as.factor(toupper(evtype_trimmed))
# fold "ABNORMAL WARMTH" and "EXTREME HEAT" into the "EXCESSIVE HEAT" category
is_heat_synonym <- dat$EVTYPE %in% c("ABNORMAL WARMTH", "EXTREME HEAT")
dat$EVTYPE[is_heat_synonym] <- "EXCESSIVE HEAT"
# fix spelling mistakes and unnecessary plurals
# exact-match correction first
dat$EVTYPE[dat$EVTYPE %in% "AVALANCE"] <- "AVALANCHE"
# then regex-based corrections, applied in order: every label matching the
# first element is replaced by the second element
spelling_fixes <- list(
  c("BEACH ER", "BEACH EROSION"),
  c("BITTER WIND CHILL", "BITTER WIND CHILL TEMPERATURES"),
  c("BLOW-OUT TIDES", "BLOW-OUT TIDE"),
  c("DUST DEVEL", "DUST DEVIL"),
  c("BRUSH FIRES", "BRUSH FIRE"),
  c("VOLCANIC ASH+", "VOLCANIC ASH"),
  c("COASTALSTORM", "COASTAL STORM"),
  c("WATER SPOUT", "WATERSPOUT"),
  c("WAYTERSPOUT", "WATERSPOUT"),
  c("RIP CURRENT+", "RIP CURRENTS"),
  c("DUSTSTORM", "DUST STORM"),
  c("EXCESSIVE WETNESS", "EXTREMELY WET"),
  c("HYPOTHERMIA+", "HYPOTHERMIA")
)
for (fix in spelling_fixes) {
  dat$EVTYPE[grep(fix[1], dat$EVTYPE)] <- fix[2]
}
# put together same events (based on event description)
# NOTE: these rules are order-dependent. Every grep() is a substring/regex match
# against the *current* labels, so a later, broader rule (e.g. "SNOW+", "WIND+")
# also relabels rows that an earlier, narrower rule already produced
# (e.g. "HEAVY SNOW" and "BLOWING SNOW" end up as "SNOW").
dat$EVTYPE[grep("BLIZZARD+", dat$EVTYPE)] <- "BLIZZARD"
dat$EVTYPE[grep("HAIL+", dat$EVTYPE)] <- "HAIL"
dat$EVTYPE[grep("BLOWING SNOW+", dat$EVTYPE)] <- "BLOWING SNOW"
dat$EVTYPE[grep("COLD+", dat$EVTYPE)] <- "COLD"
dat$EVTYPE[grep("DRY+", dat$EVTYPE)] <- "DRY"
dat$EVTYPE[grep("FROST+", dat$EVTYPE)] <- "FROST"
dat$EVTYPE[grep("FUNNEL+", dat$EVTYPE)] <- "FUNNEL"
dat$EVTYPE[c(grep("FLASH FLOOD+", dat$EVTYPE),grep("FLOOD+", dat$EVTYPE))] <- "FLOOD"
dat$EVTYPE[c(grep("FREEZING DRIZZLE+", dat$EVTYPE),grep("FREEZING RAIN+", dat$EVTYPE))] <- "FREEZING RAIN"
dat$EVTYPE[c(grep("FROST", dat$EVTYPE),grep("FREEZE", dat$EVTYPE))] <- "FROST/FREEZE"
dat$EVTYPE[c(grep("LIGHTN+", dat$EVTYPE), grep("LIGHTNING", dat$EVTYPE),
grep("LIGNTNING", dat$EVTYPE),grep("LIGHTING", dat$EVTYPE))]<- "LIGHTNING"
dat$EVTYPE[grep("HEAVY SNOW+", dat$EVTYPE)] <- "HEAVY SNOW"
# "RAIN (HEAVY)" is matched with fixed = TRUE: as a regular expression the
# parentheses form a group, so the pattern would look for "RAIN HEAVY" and
# never match the literal label. This can change the number of distinct event
# types quoted in the narrative; re-knit to refresh it.
dat$EVTYPE[c(grep("HEAVY RAIN+", dat$EVTYPE), grep("URBAN+", dat$EVTYPE),
grep("HEAVY PRECIP+", dat$EVTYPE),grep("HVY RAIN", dat$EVTYPE),
grep("TORRENTIAL RAIN+", dat$EVTYPE),grep("EXCESSIVE RAIN+", dat$EVTYPE),
grep("EXCESSIVE PRECIPITATION", dat$EVTYPE),
grep("RAIN (HEAVY)",dat$EVTYPE, fixed = TRUE),grep("HEAVY SHOW+",dat$EVTYPE),
grep("SMALL STREAM AND", dat$EVTYPE),grep("SML STREAM FLD", dat$EVTYPE))] <- "HEAVY RAIN"
dat$EVTYPE[grep("HEAT+", dat$EVTYPE)] <- "EXCESSIVE HEAT"
dat$EVTYPE[grep("SLEET+", dat$EVTYPE)] <- "SLEET"
dat$EVTYPE[grep("SNOW+", dat$EVTYPE)] <- "SNOW"
dat$EVTYPE[grep("LOW TEMPERATURE+", dat$EVTYPE)] <- "LOW TEMPERATURE"
# the thunderstorm rule also absorbs the common misspellings and the TSTM abbreviation
dat$EVTYPE[c(grep("THUNDERSTORM+", dat$EVTYPE),grep("THUDERSTORM+", dat$EVTYPE),
grep("THUNDERESTORM+",dat$EVTYPE),grep("THUNDEERSTORM+",dat$EVTYPE),
grep("TSTM+",dat$EVTYPE),grep("THUNDERSTROM+",dat$EVTYPE),
grep("THUNDERTORM+",dat$EVTYPE), grep("THUNDERTSORM+",dat$EVTYPE),
grep("THUNDESTORM+",dat$EVTYPE), grep("THUNERSTORM+",dat$EVTYPE))] <- "THUNDERSTORM"
# NOTE(review): the pattern ends with a space, and every label containing "SNOW"
# has already been collapsed to "SNOW" above, so this rule looks unreachable.
# It is kept unchanged so the grouping itself is not altered.
dat$EVTYPE[grep("THUNDERSNOW+ ", dat$EVTYPE)] <- "THUNDERSNOW"
dat$EVTYPE[c(grep("TORNADO+", dat$EVTYPE), grep("TORNAO", dat$EVTYPE),
grep("TORNDAO", dat$EVTYPE))] <- "TORNADO"
dat$EVTYPE[grep("TROPICAL STORM+", dat$EVTYPE)]<- "TROPICAL STORM"
dat$EVTYPE[grep("WATERSPOUT+", dat$EVTYPE)] <- "WATERSPOUT"
dat$EVTYPE[c(grep("WILD+", dat$EVTYPE), grep("FOREST FIRES",dat$EVTYPE))]<- "WILDFIRES"
dat$EVTYPE[c(grep("WIND+", dat$EVTYPE), grep("WND", dat$EVTYPE))] <- "WIND"
dat$EVTYPE[c(grep("WINTER+", dat$EVTYPE),grep("WINTRY MIX",dat$EVTYPE))] <- "WINTER WEATHER"
dat$EVTYPE[c(grep("UNSEASONABLY HOT+", dat$EVTYPE),grep("UNUSUAL HOTTH", dat$EVTYPE),
grep("UNUSUALLY HOT", dat$EVTYPE),
grep("UNUSUAL/RECORD HOTTH", dat$EVTYPE))]<- "UNSEASONABLY HOT"
dat$EVTYPE[grep("UNSEASONABLY COOL+", dat$EVTYPE)]<- "UNSEASONABLY COOL"
dat$EVTYPE[c(grep("MUD SLIDES", dat$EVTYPE),grep("MUD/ROCK SLIDE", dat$EVTYPE),
grep("MUDSLIDE+", dat$EVTYPE))]<- "MUD SLIDE"
dat$EVTYPE[grep("GUSTNADO+", dat$EVTYPE)] <- "GUSTNADO"
dat$EVTYPE[grep("HURRICANE+", dat$EVTYPE)] <- "HURRICANE"
dat$EVTYPE[grep("GLAZE+", dat$EVTYPE)]<- "GLAZE ICE"
# delete summary events
# A logical grepl() mask is used instead of negative grep() indices:
# dat[-integer(0), ] selects zero rows, so the original form would have emptied
# the data set if no label contained "SUMMARY".
dat <- dat[!grepl("SUMMARY", dat$EVTYPE), ]
# delete type = "EXCESSIVE", "HIGH" and "SOUTHEAST" (no information on whether
# it is excessive cold or heat), and drop "NONE" and "NO SEVERE WEATHER"
uninformative <- c("EXCESSIVE", "HIGH", "SOUTHEAST", "NONE", "NO SEVERE WEATHER")
# rows whose EVTYPE is NA are discarded as well, exactly as the chained
# subset(dat, EVTYPE != ...) filters did (subset() drops rows where the
# condition evaluates to NA)
dat <- subset(dat, !is.na(EVTYPE) & !(EVTYPE %in% uninformative))
# change remaining synonyms
# gsub() works on (and returns) character data, so the factor is rebuilt at the end
evtype_chr <- gsub("WARM", "HOT", dat$EVTYPE)
evtype_chr <- gsub("HIGH ", "HEAVY ", evtype_chr)
dat$EVTYPE <- as.factor(evtype_chr)
# final pass on plurals and duplicated events
dat$EVTYPE[grepl("HEAVY SWELLS", dat$EVTYPE)] <- "HEAVY SWELLS"
dat$EVTYPE[grepl("HEAVY SURF+", dat$EVTYPE)] <- "HEAVY SURF"
# every label matching any of these patterns becomes "UNUSUALLY HOT"
hot_patterns <- c("RECORD HEAVY TEMPERATURE+", "RECORD HIGH", "RECORD HOT+",
                  "RECORD HOTTH", "UNSEASONABLY HOT+", "UNUSUAL HOTTH",
                  "VERY HOT")
hot_rows <- unlist(lapply(hot_patterns, grep, x = dat$EVTYPE))
dat$EVTYPE[hot_rows] <- "UNUSUALLY HOT"
# hurricanes and typhoons share one category, since NOAA classifies them as the same event
is_cyclone <- grepl("HURRICANE", dat$EVTYPE) | grepl("TYPHOON", dat$EVTYPE)
dat$EVTYPE[is_cyclone] <- "HURRICANE"
dat$EVTYPE[grepl("LANDSLIDES", dat$EVTYPE)] <- "LANDSLIDE"
dat$EVTYPE[grepl("WET MIC+", dat$EVTYPE)] <- "WET MICROBURST"
dat$EVTYPE[grepl("STORM SURGE+", dat$EVTYPE)] <- "STORM SURGE"
# discard the factor levels that no longer occur after the merges
dat$EVTYPE <- droplevels(dat$EVTYPE)
After cleaning and regrouping events, we have 145 storm and other severe events (still higher than the events reported in the documentation). The most common events recorded are:
# count the records per event type, sorted from rarest to most frequent
event_counts <- dat %>%
  group_by(EVTYPE) %>%
  summarise(n_obs = n()) %>%
  arrange(n_obs)
# tail() keeps the last six rows, i.e. the six most frequent event types
kable(tail(event_counts))
| EVTYPE | n_obs |
|---|---|
| WINTER WEATHER | 19690 |
| WIND | 26554 |
| TORNADO | 60699 |
| FLOOD | 82729 |
| HAIL | 290401 |
| THUNDERSTORM | 335679 |
The variables FATALITIES and INJURIES report the number of fatalities and injuries respectively. Both are coded as numeric variables, and we can briefly look at summary statistics to check whether anything looks problematic:
# check the storage type of the two health-impact columns
class(dat$FATALITIES)
[1] “numeric”
class(dat$INJURIES)
[1] “numeric”
# five-number summary (plus mean) of each column
summary(dat$FATALITIES)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0000 0.0000 0.0000 0.0168 0.0000 583.0000
summary(dat$INJURIES)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0000 0.0000 0.0000 0.1558 0.0000 1700.0000
# event type of the single record with the most injuries
# (which.max() returns the position of the first maximum)
dat$EVTYPE[which.max(dat$INJURIES)]
[1] TORNADO 145 Levels: ABNORMALLY WET APACHE COUNTY … WINTER WEATHER
# event type of the single record with the most fatalities
dat$EVTYPE[which.max(dat$FATALITIES)]
[1] EXCESSIVE HEAT 145 Levels: ABNORMALLY WET APACHE COUNTY … WINTER WEATHER
The variables PROPDMG and PROPDMGEXP, and CROPDMG and CROPDMGEXP, summarise damage to property and crops respectively. PROPDMGEXP and CROPDMGEXP encode the power of 10 needed to convert PROPDMG and CROPDMG to their real values (for instance, H is hundreds, B is billions, and so on). However, those variables are stored as factors; hence, we first need to transform them to numeric:
# inspect the property-damage exponent column before recoding it:
# its storage class, the distinct codes it contains, and how often each occurs
class(dat$PROPDMGEXP)
## [1] "factor"
levels(dat$PROPDMGEXP)
## [1] "" "-" "+" "0" "1" "2" "3" "4" "5" "6" "7" "8" "B" "h" "H" "K" "m"
## [18] "M"
table(dat$PROPDMGEXP)
##
## - + 0 1 2 3 4 5 6
## 465853 1 5 216 25 13 4 4 28 4
## 7 8 B h H K m M
## 5 1 40 1 6 424664 7 11330
# rename levels of a factor
# Each exponent code is translated to its power of ten by *value* rather than by
# position, so the result does not depend on how the levels happen to be sorted
# (the sort order of "h"/"H"/"K"/"m"/"M" is locale-dependent).
prop_codes <- toupper(levels(dat$PROPDMGEXP))
# default: "", "-", "+" and "0" carry no multiplier
prop_power <- rep(0, length(prop_codes))
# a single digit is used directly as the exponent
prop_digit <- grepl("^[0-9]$", prop_codes)
prop_power[prop_digit] <- as.numeric(prop_codes[prop_digit])
prop_power[prop_codes == "H"] <- 2  # hundreds
prop_power[prop_codes == "K"] <- 3  # thousands
prop_power[prop_codes == "M"] <- 6  # millions
prop_power[prop_codes == "B"] <- 9  # billions
# assigning repeated values merges the corresponding levels
levels(dat$PROPDMGEXP) <- prop_power
table(dat$PROPDMGEXP)
##
## 0 1 2 3 4 5 6 7 8 9
## 466075 25 20 424668 4 28 11341 5 1 40
# transform in numeric paying attention on the transformation:
# go through as.character() so the level labels (not the internal integer
# codes of the factor) become the numbers
dat$PROPDMGEXP <- (as.numeric(as.character(dat$PROPDMGEXP)))
table(dat$PROPDMGEXP)
##
## 0 1 2 3 4 5 6 7 8 9
## 466075 25 20 424668 4 28 11341 5 1 40
# same inspection for the crop-damage exponent column:
# storage class, distinct codes, and their frequencies
class(dat$CROPDMGEXP)
## [1] "factor"
levels(dat$CROPDMGEXP)
## [1] "" "0" "2" "B" "k" "K" "m" "M"
table(dat$CROPDMGEXP)
##
## 0 2 B k K m M
## 618331 19 1 9 21 281832 1 1994
# Translate each crop exponent code to its power of ten by *value* rather than
# by position, so the result does not depend on how the levels are sorted
# (the sort order of "k"/"K"/"m"/"M" is locale-dependent).
crop_codes <- toupper(levels(dat$CROPDMGEXP))
# default: "" and "0" carry no multiplier
crop_power <- rep(0, length(crop_codes))
# a single digit is used directly as the exponent
crop_digit <- grepl("^[0-9]$", crop_codes)
crop_power[crop_digit] <- as.numeric(crop_codes[crop_digit])
crop_power[crop_codes == "H"] <- 2  # hundreds
crop_power[crop_codes == "K"] <- 3  # thousands
crop_power[crop_codes == "M"] <- 6  # millions
crop_power[crop_codes == "B"] <- 9  # billions
# assigning repeated values merges the corresponding levels
levels(dat$CROPDMGEXP) <- crop_power
table(dat$CROPDMGEXP)
##
## 0 2 9 3 6
## 618350 1 9 281853 1995
# transform in numeric paying attention on the transformation:
# go through as.character() so the level labels (not the internal integer
# codes of the factor) become the numbers
dat$CROPDMGEXP <- (as.numeric(as.character(dat$CROPDMGEXP)))
table(dat$CROPDMGEXP)
##
## 0 2 3 6 9
## 618350 1 281853 1995 9
Then, we can use them as powers of 10 to compute the total damage:
# scale the reported amounts by 10 raised to their (now numeric) exponents
prop_multiplier <- 10^dat$PROPDMGEXP
crop_multiplier <- 10^dat$CROPDMGEXP
dat$PROPDMG <- dat$PROPDMG * prop_multiplier
dat$CROPDMG <- dat$CROPDMG * crop_multiplier
To understand the impact of severe weather events on population health, we look at the number of injuries and fatalities each event causes. The tables below report the top 10 events for fatalities and injuries respectively:
# total fatalities and injuries per event type, keeping only the event types
# that caused at least one of either
df_injury <- dat %>%
  group_by(EVTYPE) %>%
  summarise(n_fatalities = sum(FATALITIES), n_injuries = sum(INJURIES)) %>%
  filter(n_fatalities > 0 | n_injuries > 0)
# ten event types with the most fatalities: sort ascending and keep the last ten rows
by_fatalities <- df_injury %>% arrange(n_fatalities)
fatalities10 <- tail(by_fatalities, 10)[, c("EVTYPE", "n_fatalities")]
# present them from the most to the least deadly
fatalities10 <- arrange(fatalities10, desc(n_fatalities))
kable(fatalities10)
| EVTYPE | n_fatalities |
|---|---|
| TORNADO | 5636 |
| EXCESSIVE HEAT | 3138 |
| FLOOD | 1525 |
| LIGHTNING | 817 |
| THUNDERSTORM | 725 |
| RIP CURRENTS | 577 |
| WIND | 471 |
| COLD | 451 |
| WINTER WEATHER | 278 |
| AVALANCHE | 225 |
# same ranking for injuries: sort ascending, keep the last ten rows,
# then present them from the most to the least harmful
by_injuries <- df_injury %>% arrange(n_injuries)
injuries10 <- tail(by_injuries, 10)[, c("EVTYPE", "n_injuries")]
injuries10 <- arrange(injuries10, desc(n_injuries))
kable(injuries10)
| EVTYPE | n_injuries |
|---|---|
| TORNADO | 91407 |
| THUNDERSTORM | 9447 |
| EXCESSIVE HEAT | 9224 |
| FLOOD | 8604 |
| LIGHTNING | 5232 |
| ICE STORM | 1975 |
| WINTER WEATHER | 1953 |
| WIND | 1910 |
| WILDFIRES | 1606 |
| HAIL | 1467 |
# put fatalities and injuries together to create plot
# both tables get the same column names so they can be stacked with rbind()
fatalities10 <- data.frame(fatalities10, consequence = "FATALITIES")
colnames(fatalities10) <- c("event", "n", "consequences")
injuries10 <- data.frame(injuries10, consequence = "INJURIES")
colnames(injuries10) <- c("event", "n", "consequences")
data_plot <- rbind(fatalities10, injuries10)
# reorder event type by number of events regardless of whether they are injuries or fatalities
# An event type can appear in both top-10 lists, so the ordered labels are
# de-duplicated first: factor() raises an error on duplicated levels since R 3.4.
event_order <- unique(as.character(data_plot$event[order(data_plot$n)]))
data_plot$event <- factor(data_plot$event, levels = event_order)
# one panel per consequence, each with its own axes ("scales" spelled out in
# full instead of relying on partial argument matching)
ggplot(data_plot , aes(x = event, y = n)) + geom_bar(stat = "identity") +
  facet_wrap(~ consequences, scales = "free") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1),
        panel.background = element_rect(fill = "white"),
        axis.line.x = element_line(colour = 'black', size=0.5, linetype='solid'),
        axis.line.y = element_line(colour = 'black', size=0.5, linetype='solid'))
Tornadoes cause the most fatalities and injuries. Excessive heat and floods follow for fatalities, while thunderstorms, excessive heat and floods follow for injuries.
The economic consequences of severe weather events were studied by looking at damage to property and crops. Specifically, for each event type, we look at the total damage (summing all the reported values) and list here the ten most costly events. For property damage we have:
# total property damage per event type, keeping only types with some damage
df_property <- dat %>%
  group_by(EVTYPE) %>%
  summarise(tot_damage = sum(PROPDMG, na.rm = TRUE)) %>%
  filter(tot_damage > 0)
# ten costliest event types: sort ascending and keep the last ten rows
property_sorted <- df_property %>% arrange(tot_damage)
property10 <- tail(property_sorted, 10)
# present them from the most to the least damaging
property10 <- arrange(property10, desc(tot_damage))
kable(property10)
| EVTYPE | tot_damage |
|---|---|
| FLOOD | 168212215835 |
| HURRICANE | 85256410010 |
| TORNADO | 57003318376 |
| STORM SURGE | 47964724000 |
| HAIL | 17622991537 |
| THUNDERSTORM | 11139179176 |
| WILDFIRES | 8496563500 |
| TROPICAL STORM | 7714390550 |
| WINTER WEATHER | 6716307751 |
| WIND | 6236995123 |
Floods and hurricanes caused the highest damage in terms of property. For crop damage, droughts and floods were the events that caused the most expensive damage:
# total crop damage per event type, keeping only types with some damage
df_crop <- dat %>%
  group_by(EVTYPE) %>%
  summarise(tot_damage = sum(CROPDMG, na.rm = TRUE)) %>%
  filter(tot_damage > 0)
# ten event types with the highest crop damage: sort ascending, keep the last ten rows
crop_sorted <- df_crop %>% arrange(tot_damage)
crop10 <- tail(crop_sorted, 10)
# present them from the most to the least damaging
crop10 <- arrange(crop10, desc(tot_damage))
kable(crop10)
| EVTYPE | tot_damage |
|---|---|
| DROUGHT | 13972566000 |
| FLOOD | 12380109100 |
| HURRICANE | 5506117800 |
| ICE STORM | 5022113500 |
| HAIL | 3114212873 |
| FROST/FREEZE | 1997061000 |
| COLD | 1416765550 |
| THUNDERSTORM | 1206853738 |
| EXCESSIVE HEAT | 904469280 |
| HEAVY RAIN | 803900900 |
Finally, to help localise the areas where severe weather events caused the most economic damage, we sum the damage to crops and to property by State and event type, and highlight on the US map the States that experienced the highest economic damage.
# find events that caused economic damage, per state and event type
df_tot <- dat %>%
  group_by(STATE, EVTYPE) %>%
  summarise(tot_crop = sum(CROPDMG, na.rm = TRUE),
            tot_prop = sum(PROPDMG, na.rm = TRUE)) %>%
  ungroup() %>%
  # total economic damage = crop + property damage; the previous version added
  # tot_crop twice and ignored property damage, so the States named in the
  # narrative must be refreshed by re-knitting
  mutate(tot_damage = tot_crop + tot_prop) %>%
  filter(tot_damage > 0)
# 10 most costly state/event combinations (ascending sort, last ten rows)
df_10 <- tail(df_tot[order(df_tot$tot_damage), ], 10)
# Translate the two-letter codes actually present in df_10 into the lower-case
# full names used by the maps "state" database, instead of hard-coding a list.
# state.abb/state.name (base R datasets) cover the 50 States; DC is added by
# hand. Codes without a match become NA and are simply not highlighted.
abb_lookup <- c(state.abb, "DC")
name_lookup <- tolower(c(state.name, "District of Columbia"))
df_10$STATE <- factor(name_lookup[match(as.character(df_10$STATE), abb_lookup)])
top_states <- levels(df_10$STATE)
# polygon names of the state map, in drawing order (fill = TRUE so they line up
# with the filled polygons drawn below)
all_states <- map(database = "state", fill = TRUE, namesonly = TRUE, plot = FALSE)
# polygon names may carry a ":part" suffix (e.g. "north carolina:main");
# strip it so every part of a highlighted State is coloured
is_top <- sub(":.*", "", all_states) %in% top_states
# map the 10 events with highest economic consequences
map(database = "state", col = c("white", "red")[1 + is_top], fill = TRUE)
Oklahoma, Louisiana, North Carolina, Nebraska, California, Florida, Iowa, Illinois, Mississippi and Texas were the States that paid the highest price in terms of economic damage in the last 60 years.
Tornadoes, heat and floods caused the most fatalities and injuries. Floods also had a negative impact on property and crops. Hurricanes did not seem to have any major effect on population health; however, much of the economic damage (to either crops or property) was caused by hurricanes.
The events that caused most of the economic damage happened for the large part (7 out of 10 States) in the South and in coastal areas, where hurricanes are more common.
The dataset was downloaded on:
# print the current system date and time as a timestamp for this run
date()
## [1] "Thu Jun 30 19:59:41 2016"