The U.S. National Oceanic and Atmospheric Administration (NOAA) records and publishes data on characteristics of major storms and weather events in the United States. NOAA recording commenced in 1950 and is ongoing, with the data set for this project ending in 2011.
We explore which storm types have been the most harmful in terms of:
- population health - the number of individuals suffering fatalities and injuries;
- economic impact - the damage caused in monetary ($) terms to property and crops.
## Global chunk option: echo = TRUE shows the code chunks in the output.
knitr::opts_chunk$set(echo = TRUE)
## Packages used below: ggplot2 for the graphs, reshape2 for melt(), which
## reshapes the data frames before they are plotted.
library(ggplot2)
library(reshape2)
## tinytex is only needed for pdf output; it stays disabled because html output
## was generated and published to RPubs.
#library(tinytex)
## Location of the bzip2-compressed storm data and the local copy kept in the
## working directory.
strURL <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
strZIP <- file.path(getwd(), "repdata%2Fdata%2FStormData.csv.bz2")
## Download only when no local copy exists. mode = "wb" writes the archive as
## binary explicitly instead of relying on download.file()'s platform default.
if (!file.exists(strZIP)) {
  download.file(url = strURL, destfile = strZIP, mode = "wb")
}
## read.csv() is given the compressed file directly.
dfALL <- read.csv(strZIP)
dim(dfALL)
## [1] 902297 37
## Keep only the columns used in the analysis: event type, casualties, and the
## property/crop damage amounts with their exponent codes.
dfSUB <- dfALL[, c("EVTYPE", "FATALITIES", "INJURIES", "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP")]
dim(dfSUB)
## [1] 902297 7
#write.csv(dfSUB, "stormdata_subset.csv")
Property:
## Lower-case the property-damage exponent codes so that upper- and lower-case
## variants of the same code are treated alike, then list the distinct codes.
dfSUB$PROPDMGEXP <- tolower(dfSUB$PROPDMGEXP)
unique(dfSUB$PROPDMGEXP)
## [1] "k" "m" "" "b" "+" "0" "5" "6" "?" "4" "2" "3" "h" "7" "-" "1" "8"
## Exclude rows whose exponent code is invalid ("?", "-", "+").
## NOTE(review): this drops the whole record, so its FATALITIES, INJURIES and
## CROPDMG values are excluded from later totals as well - confirm intended.
propExpInvalid <- dfSUB$PROPDMGEXP %in% c("?", "-", "+")
dfSUB <- dfSUB[!propExpInvalid, ]
## Translate the non-digit codes into powers of ten in a single lookup:
## blank -> 0, h -> 2, k -> 3, m -> 6, b -> 9. Digit codes are left untouched.
propExpFrom <- c("", "h", "k", "m", "b")
propExpTo <- c("0", "2", "3", "6", "9")
propExpPos <- match(dfSUB$PROPDMGEXP, propExpFrom)
propExpHit <- !is.na(propExpPos)
dfSUB$PROPDMGEXP[propExpHit] <- propExpTo[propExpPos[propExpHit]]
unique(dfSUB$PROPDMGEXP)
## [1] "3" "6" "0" "9" "5" "4" "2" "7" "1" "8"
## Property damage in dollars = reported amount x 10^exponent.
dfSUB$PROPDMGEXP <- as.numeric(dfSUB$PROPDMGEXP)
dfSUB$PROP <- dfSUB$PROPDMG * 10^dfSUB$PROPDMGEXP
Crop:
## Lower-case the crop-damage exponent codes so that upper- and lower-case
## variants of the same code are treated alike, then list the distinct codes.
dfSUB$CROPDMGEXP <- tolower(dfSUB$CROPDMGEXP)
unique(dfSUB$CROPDMGEXP)
## [1] "" "m" "k" "b" "?" "0" "2"
## Exclude rows whose exponent code is invalid ("?").
## NOTE(review): this drops the whole record, so its FATALITIES, INJURIES and
## PROP values are excluded from later totals as well - confirm intended.
cropExpIsQuery <- dfSUB$CROPDMGEXP == "?"
dfSUB <- dfSUB[!cropExpIsQuery, ]
## Translate the non-digit codes into powers of ten in a single lookup:
## blank -> 0, k -> 3, m -> 6, b -> 9. Digit codes are left untouched.
cropExpFrom <- c("", "k", "m", "b")
cropExpTo <- c("0", "3", "6", "9")
cropExpPos <- match(dfSUB$CROPDMGEXP, cropExpFrom)
cropExpHit <- !is.na(cropExpPos)
dfSUB$CROPDMGEXP[cropExpHit] <- cropExpTo[cropExpPos[cropExpHit]]
unique(dfSUB$CROPDMGEXP)
## [1] "0" "6" "3" "9" "2"
## Crop damage in dollars = reported amount x 10^exponent.
dfSUB$CROPDMGEXP <- as.numeric(dfSUB$CROPDMGEXP)
dfSUB$CROP <- dfSUB$CROPDMG * 10^dfSUB$CROPDMGEXP
Event types in the original data set include spelling mistakes, terminology inconsistencies, and event type sub-groups e.g. HURRICANE EMILY and HURRICANE FELIX as well as HURRICANE. These were cleaned up in this section:
## Sum casualties and damage per raw event type, drop event types that caused
## no harm at all, and upper-case the labels so that the clean-up rules below
## do not depend on letter case.
dfSUB <- aggregate(cbind(FATALITIES, INJURIES, PROP, CROP) ~ EVTYPE, dfSUB, sum)
dfSUB <- dfSUB[dfSUB$FATALITIES > 0 | dfSUB$INJURIES > 0 | dfSUB$PROP > 0 | dfSUB$CROP > 0, ]
dfSUB$EVTYPE <- toupper(dfSUB$EVTYPE)
#write.csv(unique(dfSUB$EVTYPE), "stormdata_evtype_dirty.csv")

## Step 1 - exact relabelling (names = label as recorded, values = clean label).
## No clean label is itself a recorded label in this table, so one lookup is
## equivalent to applying the rules one after another.
evtypeExact <- c(
  "AVALANCE"             = "AVALANCHE",
  "COLD TEMPERATURE"     = "COLD WEATHER",
  "COLD"                 = "COLD WEATHER",
  "LOW TEMPERATURE"      = "COLD WEATHER",
  "COASTALSTORM"         = "COASTAL STORM",
  "GUSTY WINDS"          = "GUSTY WIND",
  "HEAVY RAIN"           = "HEAVY RAINS",
  "HEAVY SNOW SHOWER"    = "HEAVY SNOW",
  "HIGH WIND"            = "HIGH WINDS",
  "LANDSLIDES"           = "LANDSLIDE",
  "LIGHTNING."           = "LIGHTNING",
  "MUDSLIDE"             = "MUD SLIDE",
  "MUDSLIDES"            = "MUD SLIDE",
  "RIP CURRENT"          = "RIP CURRENTS",
  "SNOW SQUALLS"         = "SNOW SQUALL",
  "SNOW/HIGH WINDS"      = "HIGH WINDS/SNOW",
  "STRONG WINDS"         = "STRONG WIND",
  "WILD FIRES"           = "WILDFIRE",
  "WINTER STORM"         = "WINTER STORMS",
  "WINTER WEATHER/MIX"   = "WINTER WEATHER MIX",
  "WINTRY MIX"           = "WINTER WEATHER MIX",
  ## Non-thunderstorm wind must be relabelled before the "TSTM " pattern in
  ## step 2, which would otherwise count it as thunderstorm wind. The
  ## hyphenated spelling is covered too so that both variants end up as WIND.
  "NON TSTM WIND"        = "WIND",
  "NON-TSTM WIND"        = "WIND",
  "URBAN/SML STREAM FLD" = "FLOOD"
)
evtypeHit <- dfSUB$EVTYPE %in% names(evtypeExact)
dfSUB$EVTYPE[evtypeHit] <- unname(evtypeExact[dfSUB$EVTYPE[evtypeHit]])

## Step 2 - pattern-based grouping (names = regular expression, values = group
## label). Order matters: each rule is applied to the labels left by the
## previous rules, so a label containing several keywords goes to the first
## matching group.
evtypeGroups <- c(
  "THUNDERSTORM|THUNDERTORM|TSTM " = "THUNDERSTORM WIND",
  "FLOOD"                          = "FLOOD",
  "HURRICANE"                      = "HURRICANE",
  "TORNADO"                        = "TORNADO",
  "HEAT"                           = "HEAT",
  "COLD"                           = "COLD"
)
for (evtypePattern in names(evtypeGroups)) {
  dfSUB$EVTYPE[grepl(evtypePattern, dfSUB$EVTYPE)] <- evtypeGroups[[evtypePattern]]
}

## Re-aggregate so that every cleaned event type is a single row again.
dfSUB <- aggregate(cbind(FATALITIES, INJURIES, PROP, CROP) ~ EVTYPE, dfSUB, sum)
#write.csv(unique(dfSUB$EVTYPE), "stormdata_evtype_clean.csv")
## subset df for 10 event types that are most harmful w.r.t. population health
dfHEALTH <- dfSUB[dfSUB$FATALITIES > 0 | dfSUB$INJURIES > 0, c("EVTYPE", "FATALITIES", "INJURIES")]
dfHEALTH$TOTAL <- dfHEALTH$FATALITIES + dfHEALTH$INJURIES
## head() instead of [1:10]: with fewer than 10 event types, [1:10] would index
## past the end and add all-NA rows.
dfHEALTH_top10 <- dfHEALTH[head(order(dfHEALTH$TOTAL, decreasing = TRUE), 10), ]
#dfHEALTH_top10
## reshape df to have 20 rows = 10 event types X 2 variables: (i) FATALITIES and (ii) INJURIES,
## with each of 20 combinations of event type and variable having an associated value:
dfHEALTH_top10X2 <- melt(dfHEALTH_top10[, c("EVTYPE", "FATALITIES", "INJURIES")], id.vars = "EVTYPE")
#dfHEALTH_top10X2
## convert event types in reshaped df to a factor whose levels follow the ranking,
## so the bars are drawn from most to least harmful rather than alphabetically:
dfHEALTH_top10X2$EVTYPE <- factor(
  dfHEALTH_top10X2$EVTYPE,
  levels = unique(dfHEALTH_top10X2$EVTYPE)
)
#dfHEALTH_top10X2$EVTYPE
## plot each event type as a stacked bar (FATALITIES and INJURIES), with totals
## divided by the 62 years from 1950 to 2011 to give per-annum figures.
## geom_col() draws bars from pre-computed values; it replaces
## geom_histogram(stat = "identity"), which warned about unknown parameters.
ggplot(dfHEALTH_top10X2, aes(x = EVTYPE, y = value / (2011 - 1950 + 1), fill = variable)) +
  geom_col() +
  ggtitle("Storm impact on population health per annum over period from 1950 to 2011") +
  labs(x = "", y = "Number of individuals affected per annum") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_y_continuous(
    minor_breaks = seq(0, 2000, 20), ## gridlines
    breaks = seq(0, 2000, 100)       ## ticks
  )
## subset df for 10 event types that are most harmful w.r.t. economic impact
dfECONOMIC <- dfSUB[dfSUB$PROP > 0 | dfSUB$CROP > 0, c("EVTYPE", "PROP", "CROP")]
dfECONOMIC$TOTAL <- dfECONOMIC$PROP + dfECONOMIC$CROP
## head() instead of [1:10]: with fewer than 10 event types, [1:10] would index
## past the end and add all-NA rows.
dfECONOMIC_top10 <- dfECONOMIC[head(order(dfECONOMIC$TOTAL, decreasing = TRUE), 10), ]
#dfECONOMIC_top10
## reshape df to have 20 rows = 10 event types X 2 variables: (i) PROP and (ii) CROP,
## with each of 20 combinations of event type and variable having an associated value:
dfECONOMIC_top10X2 <- melt(dfECONOMIC_top10[, c("EVTYPE", "PROP", "CROP")], id.vars = "EVTYPE")
#dfECONOMIC_top10X2
## convert event types in reshaped df to a factor whose levels follow the ranking,
## so the bars are drawn from most to least harmful rather than alphabetically:
dfECONOMIC_top10X2$EVTYPE <- factor(
  dfECONOMIC_top10X2$EVTYPE,
  levels = unique(dfECONOMIC_top10X2$EVTYPE)
)
#dfECONOMIC_top10X2$EVTYPE
## plot each event type as a stacked bar (PROP and CROP), with totals divided by
## the 62 years from 1950 to 2011 and by 1,000,000 to give $ million per annum.
## geom_col() draws bars from pre-computed values; it replaces
## geom_histogram(stat = "identity"), which warned about unknown parameters.
ggplot(dfECONOMIC_top10X2, aes(x = EVTYPE, y = value / (2011 - 1950 + 1) / 1000000, fill = variable)) +
  geom_col() +
  ggtitle("Storm impact on economy per annum over period from 1950 to 2011") +
  labs(x = "", y = "Damage ($ Million) per annum") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_y_continuous(
    minor_breaks = seq(0, 3000, 100), ## gridlines
    breaks = seq(0, 3000, 200)        ## ticks
  )
The graphs above show which event types have had the most harmful impact on population health and the economy over the period from 1950 to 2011. In particular:
Tornadoes are the most harmful to population health, with about 1500 individuals injured and 100 fatalities per annum.
Floods are the most harmful in economic terms, with about USD 2.7 billion of property damage and USD 200 million of crop damage per annum.
Note: Given that data in the earlier years of the stated recording period was less complete than more recently recorded data, the impact on both population health and the economy may well be under-estimated.