Storms and other severe weather events can cause both public health and economic problems for communities and municipalities. Many severe events can result in loss of life and considerable economic damage, and preventing such outcomes to the extent possible is a key concern. In this study, the storm database of the U.S. National Oceanic and Atmospheric Administration (NOAA) is used to estimate the fatalities, injuries, and property damage caused by major storms and weather events in the United States. Throughout the analysis, the NOAA data is collected, cleaned and summarized to present the ten most harmful event types for the country.
The Storm Data dataset obtained from the National Oceanic and Atmospheric Administration comes in the form of a CSV file compressed with the bzip2 algorithm. The file “stormdata.csv.bz2” is decompressed and the data is loaded into a data table for analysis. The date variable is also converted to a proper date-time type.
library(data.table)
library(stringr)
# One-off download of the raw data, kept for reference (disabled so that
# re-running the analysis does not hit the network):
# fileUrl <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2";
# download.file(fileUrl, destfile="stormdata.csv.bz2", method="wget")

# Load the data: read.csv() decompresses the bzip2 file on the fly through
# the bzfile() connection. Strings are kept as character vectors (not factors)
# because several columns are cleaned with regular expressions afterwards.
stormraw <- read.csv(bzfile("stormdata.csv.bz2"),
                     stringsAsFactors = FALSE, header = TRUE)
# Output captured when this report was originally rendered:
## Warning in scan(file, what, nmax, sep, dec, quote, skip, nlines,
## na.strings, : EOF within quoted string
# NOTE(review): this warning means a quoted field was still open at end of
# file, so the local copy may be truncated or mis-parsed -- verify nrow(stormraw)
# against a fresh download before trusting the totals.
stormraw <- data.table(stormraw)

# Convert the event start date to POSIXct. `:=` updates the column by
# reference instead of copying the whole table.
stormraw[, BGN_DATE := as.POSIXct(BGN_DATE, format = "%m/%d/%Y %H:%M:%S")]
The provided dataset was collected from various sources and does not consistently follow the format recommended by the NOAA documentation. A cleaning step is therefore necessary before further analysis. Also, for the purpose of the study, only the following relevant fields were retained: BGN_DATE, STATE, EVTYPE, FATALITIES, INJURIES, PROPDMG, PROPDMGEXP, CROPDMG and CROPDMGEXP.
According to the documentation, the crop and property damages are represented by the variables CROPDMG and PROPDMG, respectively, each with a corresponding magnitude field, CROPDMGEXP and PROPDMGEXP. The latter represent the exponent of a power of ten. Some values encode hundred, thousand, million and billion with the letters H, K, M and B, so we have to convert those to their equivalent numerical exponents (2, 3, 6 and 9). Also, many entries are missing or contain an irregular symbol such as “?”, “+” or “-”; in such cases we fall back to the default exponent 3, as suggested in appendix B1 of the documentation. The reformatting process is described by the following R script.
# Keep only the columns needed for the casualty and damage analysis.
relevantvar <- c(
  "BGN_DATE", "STATE", "EVTYPE", "FATALITIES", "INJURIES",
  "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP"
)
storm <- stormraw[, relevantvar, with = FALSE]

# Turn the damage multiplier columns into numeric powers of ten.
# The rewrite rules are applied in order: punctuation symbols fall back to "K"
# first, then the letters H, K, M and B become the exponents 2, 3, 6 and 9.
# Entries that are already digits are left untouched by these rules.
exp_rules <- c("[[:punct:]]" = "K", "H" = "2", "K" = "3", "M" = "6", "B" = "9")
for (exp_col in c("PROPDMGEXP", "CROPDMGEXP")) {
  codes <- toupper(storm[[exp_col]])
  codes[codes == ""] <- "K" # blank multipliers default to 10^3
  for (pattern in names(exp_rules)) {
    codes <- gsub(pattern, exp_rules[[pattern]], codes)
  }
  set(storm, j = exp_col, value = as.numeric(codes))
}
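The next important field that needs to be reformatted is the event type field, EVTYPE. This variable has many entries that do not comply with the 48 event types recommended by the documentation (see table 1 of the documentation). To reformat this variable, we apply the following rules to reclassify the irregular entries: first the spelling is normalized (upper case, trimmed spaces, expanded abbreviations, corrected misspellings, plural “S” removed), then every entry containing a characteristic keyword is mapped to the corresponding permitted event type: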
This is implemented as follows:
# Clean up the type of events (map the dataset's free-text EVTYPE entries onto
# a reduced set of permitted labels).
#
# clean_evtype() takes a character vector of raw event types and returns a
# character vector of the same length. It works on a plain vector so the
# table column is written only once, instead of copying the data.table for
# every single substitution.
clean_evtype <- function(x) {
  # Syntax corrections -------------------------------------------------------
  # Upper-case everything and strip leading/trailing whitespace.
  x <- str_trim(toupper(x))

  # Ordered (pattern, replacement) pairs applied with gsub(): expand
  # abbreviations, fix misspellings, and finally drop a trailing 'S' from
  # every word so that plural and singular spellings collapse together.
  spelling_fixes <- list(
    c("TSTM", "THUNDERSTORM"),
    c("WND", "WIND"),
    c("CSTL", "COASTAL"),
    c("DEVEL", "DEVIL"),
    c("ASHFALL", "ASH"),
    c("FLOODING", "FLOOD"),
    c("LIGHTING", "LIGHTNING"),
    c(
      paste(
        c(
          "TSTM", "TUNDERSTORM", "THUNERSTORM", "THUNDERTORM", "THUNDESTORM",
          "THUNDERESTORM", "THUNDERTSORM", "THUNDERSTROM", "THUNDEERSTORM",
          "THUDERSTORM"
        ),
        collapse = "|"
      ),
      "THUNDERSTORM"
    ),
    c("S\\b", "")
  )
  for (fix in spelling_fixes) {
    x <- gsub(fix[1], fix[2], x)
  }

  # Re-classification to permitted event types -------------------------------
  # Each rule relabels every entry matching its pattern. Rules run in order
  # and later rules see the labels written by earlier ones, so the order is
  # significant (e.g. anything already relabelled "FLOOD" can no longer be
  # caught by the "WINTER" rule).
  # NOTE(review): "DRY" runs before the MICROBURST rule, so an entry such as
  # "DRY MICROBURST" would be filed under DROUGHT -- confirm this is intended.
  reclass_rules <- list(
    list("ASTRONOMICAL", "ASTRONOMICAL LOW TIDE"),
    list("DRY", "DROUGHT"),
    list("HURRICANE|^TYPHOON", "HURRICANE (TYPHOON)"),
    list("TORNADO|TORNDAO", "TORNADO"),
    list("THUNDERSTORM(.*)W", "THUNDERSTORM WIND"),
    list("LIGHTNING", "LIGHTNING"),
    list("TROPICAL STORM", "TROPICAL STORM"),
    list("WATERSPOUT", "WATERSPOUT"),
    list("FUNNEL CLOUD", "FUNNEL CLOUD"),
    list("FLOOD", "FLOOD"), # groups flash, coastal and lakeshore flood
    list("WINTER|COOL", "WINTER WEATHER"), # groups winter storm with winter weather
    list("ICE(.*)STORM", "ICE STORM"),
    list("BLIZZARD", "BLIZZARD"),
    list("COLD|CHILL", "COLD/WIND CHILL"), # groups cold and excessive cold
    list("HEAT", "HEAT"), # groups heat and excessive heat
    list("SNOW", "HEAVY SNOW"), # groups snow and lake-effect snow
    list("HIGH(.*)WIND|MICROBURST|DOWNBURST", "HIGH WIND"),
    list("WIND DAMAGE", "HIGH WIND"),
    list("SMOKE", "DENSE SMOKE"),
    list("ICE FOG", "FREEZING FOG"),
    list("FROST", "FROST/FREEZE"),
    list("^FOG|^PATCHY", "DENSE FOG"),
    list("HAIL", "HAIL"),
    list("SLEET", "SLEET"),
    list("DUST DEVIL", "DUST DEVIL"),
    # "DUST " followed by anything other than the word DEVIL. The previous
    # pattern "DUST [^DEVIL]" was a bracket character class: it only rejected
    # a next character in {D, E, V, I, L}, so it missed any dust event whose
    # second word starts with one of those letters. A negative lookahead
    # expresses the intent and needs the Perl regex engine.
    list("DUST (?!DEVIL)", "DUST STORM", perl = TRUE),
    list("RAIN|PRECIPITATION", "HEAVY RAIN"),
    list("SURF", "HIGH SURF"),
    list("FIRE", "WILDFIRE"),
    list("VOLCANIC ASH", "VOLCANIC ASH")
  )
  for (rule in reclass_rules) {
    hits <- grepl(rule[[1]], x, perl = isTRUE(rule$perl))
    x[hits] <- rule[[2]]
  }

  x
}

storm[, EVTYPE := clean_evtype(EVTYPE)]
The tidy dataset that results from the previous preparation process serves as the basis of our analysis. For each type of impact (fatalities, injuries or damage cost), the summary is obtained by summing the corresponding quantity for each event type. The following figures show the ten most damaging storm event types in terms of human lives and monetary cost.
# Monetary damage per record: the reported amount scaled by its power-of-ten
# exponent, for crops and property separately, then combined in DMGCASH.
storm[, `:=`(
  CROPDMGCASH = CROPDMG * 10^CROPDMGEXP,
  PROPDMGCASH = PROPDMG * 10^PROPDMGEXP
)]
storm[, DMGCASH := CROPDMGCASH + PROPDMGCASH]

# Totals per event type. DMGCOST is the summed damage divided by 10^6 and
# rounded to a whole number (i.e. expressed in millions).
# All three aggregates ignore missing values: previously only the casualty
# sums did, so a single NA damage record turned an event type's whole
# DMGCOST into NA.
stormSummary <- storm[, .(
  FATALITIES = sum(FATALITIES, na.rm = TRUE),
  INJURIES = sum(INJURIES, na.rm = TRUE),
  DMGCOST = round(sum(DMGCASH, na.rm = TRUE) / 10^6, 0)
), by = EVTYPE]
library(ggplot2)
# Bar charts of the ten event types with the highest fatalities, injuries and
# damage cost.
#
# For each metric:
#   * setorder() sorts stormSummary in place, descending; na.last = TRUE keeps
#     event types with a missing total out of the top slots.
#   * head() takes at most ten rows, so no NA rows are padded in when fewer
#     than ten event types exist (which `[1:10]` would do).
#   * EVTYPE becomes an ordered factor whose levels follow the ranking, so the
#     bars are drawn in rank order rather than alphabetically.
#   * The chart is built with ggplot() + geom_bar(stat = "identity"), since
#     the `stat` and `position` arguments of qplot() are deprecated in
#     current ggplot2.
# Each ggplot expression is left at top level so that it is auto-printed.

# Fatalities
top10 <- head(setorder(stormSummary, -FATALITIES, na.last = TRUE), n = 10)
top10$EVTYPE <- factor(top10$EVTYPE, levels = top10$EVTYPE, ordered = TRUE)
ggplot(top10, aes(x = EVTYPE, y = FATALITIES)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(x = "Event types", y = "Fatalities",
       title = "Top 10 Natural Disasters With Impact On Human Lifes") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Injuries
top10 <- head(setorder(stormSummary, -INJURIES, na.last = TRUE), n = 10)
top10$EVTYPE <- factor(top10$EVTYPE, levels = top10$EVTYPE, ordered = TRUE)
ggplot(top10, aes(x = EVTYPE, y = INJURIES)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(x = "Event types", y = "Injuries",
       title = "Top 10 Natural Disasters With Impact On Human Health") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Damage cost
top10 <- head(setorder(stormSummary, -DMGCOST, na.last = TRUE), n = 10)
top10$EVTYPE <- factor(top10$EVTYPE, levels = top10$EVTYPE, ordered = TRUE)
ggplot(top10, aes(x = EVTYPE, y = DMGCOST)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(x = "Event types", y = "Damage cost (Million USD)",
       title = "Top 10 Natural Disasters With Impact On the Economic") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))