This report explores the NOAA Storm Database, which records severe weather events and their effects in the United States from 1950 to 2011. It analyzes the impact of these events on human health, measured by the fatalities and injuries caused, and on the economy, measured by the property and crop losses caused.
Downloading and reading the original data set.
# Fetch the compressed NOAA storm data into the "repdata" directory (only when
# it is not already there) and read it into StormData.
data_dir <- "repdata"
if (!dir.exists(data_dir)) {
  dir.create(data_dir)
}
fileUrl <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
data_file <- file.path(data_dir, "StormData.csv.bz2")
# Skip the download when a local copy exists so re-running the report does not
# fetch the archive again.
if (!file.exists(data_file)) {
  download.file(fileUrl, destfile = data_file, method = "curl")
}
# read.csv reads the bzip2-compressed file directly; text columns stay character.
StormData <- read.csv(data_file, stringsAsFactors = FALSE)
Loading the packages to be used.
library(lubridate, warn.conflicts = FALSE)
library(dplyr, warn.conflicts = FALSE)
library(ggplot2)
library(gridExtra, warn.conflicts = FALSE)
The data set includes over 900000 records with information for 37 variables.
# Size of the raw table as (rows, columns), followed by its column names.
c(nrow(StormData), ncol(StormData))
## [1] 902297 37
names(StormData)
## [1] "STATE__" "BGN_DATE" "BGN_TIME" "TIME_ZONE" "COUNTY"
## [6] "COUNTYNAME" "STATE" "EVTYPE" "BGN_RANGE" "BGN_AZI"
## [11] "BGN_LOCATI" "END_DATE" "END_TIME" "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE" "END_AZI" "END_LOCATI" "LENGTH" "WIDTH"
## [21] "F" "MAG" "FATALITIES" "INJURIES" "PROPDMG"
## [26] "PROPDMGEXP" "CROPDMG" "CROPDMGEXP" "WFO" "STATEOFFIC"
## [31] "ZONENAMES" "LATITUDE" "LONGITUDE" "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS" "REFNUM"
The event date, event type, deaths caused, injuries caused, and the property and crop damage values with their multiplier codes are selected for further analysis.
# Keep only the columns used in the analysis: event date and type, the two
# health counts, and the damage values with their multiplier codes.
Storms <- StormData[c("BGN_DATE", "EVTYPE", "FATALITIES", "INJURIES",
                      "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP")]
Setting the event date field as date.
# Parse the "month/day/year hour:minute:second" text into a Date, dropping the
# time of day.
Storms$BGN_DATE <- as.Date(Storms$BGN_DATE, format = "%m/%d/%Y %H:%M:%S")
Adding property damage and crop damage variables in USD, computed from the corresponding values and multiplier codes.
# Convert a damage figure and its magnitude code into US dollars.
#   value    numeric vector of damage amounts
#   exp_code character vector of magnitude codes, same length as value
# Codes are compared case-insensitively so that "k"/"m"/"b" scale the same as
# "K" (thousands), "M" (millions) and "B" (billions). Any other code leaves
# the value unscaled.
damage_usd <- function(value, exp_code) {
  code <- toupper(exp_code)
  ifelse(code == "K", value * 10^3,
         ifelse(code == "M", value * 10^6,
                ifelse(code == "B", value * 10^9, value)))
}
# Add the dollar-valued property and crop damage columns.
Storms <- mutate(Storms,
                 PROPDMGUSD = damage_usd(PROPDMG, PROPDMGEXP),
                 CROPDMGUSD = damage_usd(CROPDMG, CROPDMGEXP))
There are 985 labels for the event type field.
# Number of distinct event type labels in the raw data.
n_distinct(Storms$EVTYPE)
## [1] 985
Adding a new event code variable to consolidate event labels from 985 to 30 designated plus other event codes.
# Consolidate the free-text EVTYPE labels into a small set of event codes.
# Each rule pairs an event code (name) with a case-insensitive regular
# expression (value). Rules are applied in the order listed and a label that
# matches several rules receives the code of the LAST matching rule, so the
# order below is significant. Labels matching no rule keep the code "Other".
# NOTE(review): "TIDE|WAVE" matches any label containing "WAVE"; a label such
#   as "HEAT WAVE" would be recoded from "Heat" to "Tide" - confirm intended.
# NOTE(review): "FUNNEL" in the Hurricane rule is always overwritten by the
#   later Tornado rule, and "TYPHON" does not match "TYPHOON" - confirm intended.
event_rules <- c(
  "Avalanche"           = "AVALANCHE|AVALANCE",
  "Blizzard"            = "BLIZZARD",
  "Cold"                = "COLD|COOL|WINTER|FREEZING",
  "Dry Weather"         = "DRY|DROUGHT|DRIEST",
  "Dust"                = "DUST|DUSTSTORM",
  "Fire"                = "FIRE|FIRES",
  "Flood"               = "FLOOD|FLD|FLOODING",
  "Fog"                 = "FOG",
  "Frost"               = "FROST|FREEZE",
  "Hail"                = "HAIL",
  "Heat"                = "HEAT|WARM|TEMPERATURE|HOT",
  "Hurricane"           = "HURRICANE|FUNNEL",
  "Ice"                 = "ICE|ICY|GLAZE",
  "Lightning"           = "LIGHT|LIGNTNING",
  "Rain"                = "RAIN|SHOWERS|PRECIPITATION",
  "Rip Currents"        = "RIP",
  "Landslide"           = "SLIDE",
  "Seiche"              = "SEICHE",
  "Dense Smoke"         = "SMOKE",
  "Snow"                = "SNOW",
  "High Surf"           = "SURF|SWELLS",
  "Tide"                = "TIDE|WAVE",
  "Tornado"             = "TORNADO|FUNNEL|FUNNELS",
  "Thunderstorm"        = "THUNDER|TSTM|TUNDERSTORM",
  "Tropical Depression" = "TROPICAL",
  "Tsunami"             = "TSUNAMI",
  "Volcanic Ash"        = "VOLCANIC",
  "Waterspout"          = "WATERSPOUT|WAYTERSPOUT",
  "Strong Wind"         = "WIND|WND|TYPHON",
  "Wet Weather"         = "WET"
)
Storms <- mutate(Storms, EVENTCODE = "Other")
for (event_label in names(event_rules)) {
  # Every rule is matched against Storms$EVTYPE, the table being updated.
  is_match <- grepl(event_rules[[event_label]], Storms$EVTYPE, ignore.case = TRUE)
  Storms$EVENTCODE[is_match] <- event_label
}
# Store both the raw label and the consolidated code as factors.
Storms$EVTYPE <- as.factor(Storms$EVTYPE)
Storms$EVENTCODE <- as.factor(Storms$EVENTCODE)
Summary of totals for health impact (fatalities and injuries) and economic damage (property and crop damage in USD).
# Grand totals of the four impact measures. Columns are selected by name so
# the summary does not depend on column order, and vapply guarantees a numeric
# vector with one total per column.
vapply(Storms[c("FATALITIES", "INJURIES", "PROPDMGUSD", "CROPDMGUSD")], sum, numeric(1))
## FATALITIES INJURIES PROPDMGUSD CROPDMGUSD
## 15145 140528 427279750338 49093756627
Severe weather events cause about 9 times more injuries than deaths, and more than 8 times more property damage than crop damage.
Looking at registered events by year.
# Histogram of registered events per calendar year (one bin per year), built
# with ggplot() rather than the deprecated qplot() shortcut.
g_frecuency <- ggplot(data.frame(YEAR = year(Storms$BGN_DATE)), aes(x = YEAR)) +
  geom_histogram(binwidth = 1) +
  xlab("Year") +
  ylab("Counts")
Looking at events by designated code.
# Count the records in each event code (every row with a non-missing EVTYPE
# counts once) and draw the counts as a bar chart with slanted axis labels.
events <- setNames(
  aggregate(!is.na(EVTYPE) ~ EVENTCODE, data = Storms, FUN = sum),
  c("EVENTCODE", "NUMBER")
)
g_events <- ggplot(events, aes(x = EVENTCODE, y = NUMBER)) +
  geom_col() +
  labs(x = "Event Code", y = "Counts") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
Figure showing registered event distribution by year and designated code.
# Stack the yearly histogram above the per-code bar chart.
grid.arrange(grobs = list(g_frecuency, g_events), nrow = 2)
There are far more registered events in the 1995-2011 period. Strong winds and hail are the most abundant followed by floods and tornados.
It is worth noting that the decision and method chosen to consolidate the event labels affects the health and economic impact assessment that follows but there is no easy way around it.
Aggregating fatalities by event type.
# Total fatalities per event code, sorted from highest to lowest, with each
# code's share of all fatalities rounded to three decimals.
fatalities <- aggregate(FATALITIES ~ EVENTCODE, data = Storms, FUN = sum) %>%
  arrange(desc(FATALITIES)) %>%
  mutate(FATALITIESPERCENT = round(FATALITIES / sum(FATALITIES), 3))
20% of the event codes account for more than 85% of the fatalities caused.
# Share of all fatalities attributable to the six deadliest event codes.
with(fatalities, sum(FATALITIES[1:6]) / sum(FATALITIES))
## [1] 0.860482
# Ten deadliest event codes.
fatalities[seq_len(10), ]
## EVENTCODE FATALITIES FATALITIESPERCENT
## 1 Tornado 5633 0.372
## 2 Heat 3007 0.199
## 3 Flood 1552 0.102
## 4 Strong Wind 1451 0.096
## 5 Lightning 817 0.054
## 6 Rip Currents 572 0.038
## 7 Cold 489 0.032
## 8 Avalanche 225 0.015
## 9 Tide 196 0.013
## 10 High Surf 164 0.011
Aggregating injuries by event type.
# Total injuries per event code, sorted from highest to lowest, with each
# code's share of all injuries rounded to three decimals.
injuries <- aggregate(INJURIES ~ EVENTCODE, data = Storms, FUN = sum) %>%
  arrange(desc(INJURIES)) %>%
  mutate(INJURIESPERCENT = round(INJURIES / sum(INJURIES), 3))
20% of the event codes account for more than 90% of the injuries caused.
# Share of all injuries attributable to the six most injurious event codes.
with(injuries, sum(INJURIES[1:6]) / sum(INJURIES))
## [1] 0.9110498
# Ten most injurious event codes.
injuries[seq_len(10), ]
## EVENTCODE INJURIES INJURIESPERCENT
## 1 Tornado 91367 0.650
## 2 Strong Wind 11498 0.082
## 3 Heat 8850 0.063
## 4 Flood 8681 0.062
## 5 Lightning 5231 0.037
## 6 Ice 2401 0.017
## 7 Cold 2170 0.015
## 8 Fire 1608 0.011
## 9 Hail 1371 0.010
## 10 Hurricane 1326 0.009
Figure showing the health impact by top 10 most harmful severe weather events.
# Horizontal bar charts of the first ten rows of the fatalities and injuries
# tables, with bars ordered by the plotted total, shown side by side.
g_fatalities <- ggplot(fatalities[1:10, ],
                       aes(x = reorder(EVENTCODE, FATALITIES), y = FATALITIES)) +
  geom_col() +
  coord_flip() +
  labs(x = "Weather Event", y = "Fatalities")
g_injuries <- ggplot(injuries[1:10, ],
                     aes(x = reorder(EVENTCODE, INJURIES), y = INJURIES)) +
  geom_col() +
  coord_flip() +
  labs(x = "Weather Event", y = "Injuries")
grid.arrange(grobs = list(g_fatalities, g_injuries), ncol = 2)
Tornados, heat, floods, strong winds and lightning were the most harmful severe weather events in terms of fatalities and injuries caused, accounting for more than 80% of the total in both cases. Tornados caused about 8 times more injuries than any other event.
Aggregating property damage by event type.
# Total property damage (USD) per event code, sorted from highest to lowest,
# with each code's share of all property damage rounded to three decimals.
prop_damage <- aggregate(PROPDMGUSD ~ EVENTCODE, data = Storms, FUN = sum) %>%
  arrange(desc(PROPDMGUSD)) %>%
  mutate(PROPDMGPERCENT = round(PROPDMGUSD / sum(PROPDMGUSD), 3))
10% of the event codes account for more than 70% of the property damage caused.
# Share of all property damage attributable to the three costliest event codes.
with(prop_damage, sum(PROPDMGUSD[1:3]) / sum(PROPDMGUSD))
## [1] 0.7234636
# Ten costliest event codes for property.
prop_damage[seq_len(10), ]
## EVENTCODE PROPDMGUSD PROPDMGPERCENT
## 1 Flood 167554630579 0.392
## 2 Hurricane 84636105030 0.198
## 3 Tornado 56930626640 0.133
## 4 Other 43928959600 0.103
## 5 Strong Wind 17740081054 0.042
## 6 Hail 15969143053 0.037
## 7 Fire 8496628500 0.020
## 8 Tropical Depression 7716127550 0.018
## 9 Cold 6841331651 0.016
## 10 Tide 4661593200 0.011
Aggregating crop damage by event type.
# Total crop damage (USD) per event code, sorted from highest to lowest, with
# each code's share of all crop damage rounded to three decimals.
crop_damage <- aggregate(CROPDMGUSD ~ EVENTCODE, data = Storms, FUN = sum) %>%
  arrange(desc(CROPDMGUSD)) %>%
  mutate(CROPDMGPERCENT = round(CROPDMGUSD / sum(CROPDMGUSD), 3))
20% of the event codes account for more than 80% of the crop damage caused.
# Share of all crop damage attributable to the six costliest event codes.
with(crop_damage, sum(CROPDMGUSD[1:6]) / sum(CROPDMGUSD))
## [1] 0.854917
# Ten costliest event codes for crops.
crop_damage[seq_len(10), ]
## EVENTCODE CROPDMGUSD CROPDMGPERCENT
## 1 Dry Weather 13972581000 0.285
## 2 Flood 12270384210 0.250
## 3 Hurricane 5495292810 0.112
## 4 Ice 5027114300 0.102
## 5 Hail 3046420890 0.062
## 6 Strong Wind 2159295547 0.044
## 7 Frost 1997061000 0.041
## 8 Cold 1385559500 0.028
## 9 Heat 898879280 0.018
## 10 Rain 805005800 0.016
Figure showing economic impact by top 10 most damaging severe weather events.
# Horizontal bar charts of the first ten rows of the property and crop damage
# tables, with bars ordered by the plotted total, shown side by side.
g_prop <- ggplot(prop_damage[1:10, ],
                 aes(x = reorder(EVENTCODE, PROPDMGUSD), y = PROPDMGUSD)) +
  geom_col() +
  coord_flip() +
  labs(x = "Event", y = "Property Damage in USD")
g_crop <- ggplot(crop_damage[1:10, ],
                 aes(x = reorder(EVENTCODE, CROPDMGUSD), y = CROPDMGUSD)) +
  geom_col() +
  coord_flip() +
  labs(x = "Event", y = "Crop Damage in USD")
grid.arrange(grobs = list(g_prop, g_crop), ncol = 2)
Floods, with nearly twice the losses of any other event, followed by hurricanes and tornados, were the 3 main causes of property losses. Dry weather and floods were the top causes of crop losses, each more than doubling any other, followed by hurricanes and ice.