This is an analysis of data from the NOAA Storm Database.
After some necessary data cleansing, the analysis focuses on the last fifteen years of data (1997-2011) to provide results. First we find the top 5 event types by average annual Injuries, Fatalities, Crop Damage, and Property Damage. Then, we show the same top 5 event types for each category but with the annual effects plotted as a time series over the fifteen year period.
The purpose of the analysis is to answer the questions:
# Download the data:
download.file("https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2","Storm.data.csv.bz2")
# Unzip and read the file:
stormdata<- read.csv(bzfile("Storm.data.csv.bz2"), stringsAsFactors = FALSE)
str(stormdata)
## 'data.frame': 902297 obs. of 37 variables:
## $ STATE__ : num 1 1 1 1 1 1 1 1 1 1 ...
## $ BGN_DATE : chr "4/18/1950 0:00:00" "4/18/1950 0:00:00" "2/20/1951 0:00:00" "6/8/1951 0:00:00" ...
## $ BGN_TIME : chr "0130" "0145" "1600" "0900" ...
## $ TIME_ZONE : chr "CST" "CST" "CST" "CST" ...
## $ COUNTY : num 97 3 57 89 43 77 9 123 125 57 ...
## $ COUNTYNAME: chr "MOBILE" "BALDWIN" "FAYETTE" "MADISON" ...
## $ STATE : chr "AL" "AL" "AL" "AL" ...
## $ EVTYPE : chr "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
## $ BGN_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ BGN_AZI : chr "" "" "" "" ...
## $ BGN_LOCATI: chr "" "" "" "" ...
## $ END_DATE : chr "" "" "" "" ...
## $ END_TIME : chr "" "" "" "" ...
## $ COUNTY_END: num 0 0 0 0 0 0 0 0 0 0 ...
## $ COUNTYENDN: logi NA NA NA NA NA NA ...
## $ END_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ END_AZI : chr "" "" "" "" ...
## $ END_LOCATI: chr "" "" "" "" ...
## $ LENGTH : num 14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
## $ WIDTH : num 100 150 123 100 150 177 33 33 100 100 ...
## $ F : int 3 2 2 2 2 2 2 1 3 3 ...
## $ MAG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ FATALITIES: num 0 0 0 0 0 0 0 0 1 0 ...
## $ INJURIES : num 15 0 2 2 2 6 1 0 14 0 ...
## $ PROPDMG : num 25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
## $ PROPDMGEXP: chr "K" "K" "K" "K" ...
## $ CROPDMG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CROPDMGEXP: chr "" "" "" "" ...
## $ WFO : chr "" "" "" "" ...
## $ STATEOFFIC: chr "" "" "" "" ...
## $ ZONENAMES : chr "" "" "" "" ...
## $ LATITUDE : num 3040 3042 3340 3458 3412 ...
## $ LONGITUDE : num 8812 8755 8742 8626 8642 ...
## $ LATITUDE_E: num 3051 0 0 0 0 ...
## $ LONGITUDE_: num 8806 0 0 0 0 ...
## $ REMARKS : chr "" "" "" "" ...
## $ REFNUM : num 1 2 3 4 5 6 7 8 9 10 ...
library(dplyr)
library(lubridate)
library(stringdist)
# Make a copy of stormdata to start cleaning it up
stormdata_clean <- stormdata
# If there is no damage and no health effect, remove it from the analysis
# Set all uppercase. Trim whitespace.
stormdata_clean$PROPDMGEXP <- toupper(trimws(stormdata_clean$PROPDMGEXP))
stormdata_clean$PROPDMGEXPnew <- case_when(stormdata_clean$PROPDMGEXP == "B" ~ 1e9,
stormdata_clean$PROPDMGEXP == "M" ~ 1e6,
stormdata_clean$PROPDMGEXP == "K" ~ 1e3,
stormdata_clean$PROPDMGEXP == "H" ~ 1e2,
stormdata_clean$PROPDMGEXP == "1" ~ 1e1,
stormdata_clean$PROPDMGEXP == "2" ~ 1e2,
stormdata_clean$PROPDMGEXP == "3" ~ 1e3,
stormdata_clean$PROPDMGEXP == "4" ~ 1e4,
stormdata_clean$PROPDMGEXP == "5" ~ 1e5,
stormdata_clean$PROPDMGEXP == "6" ~ 1e6,
stormdata_clean$PROPDMGEXP == "7" ~ 1e7,
stormdata_clean$PROPDMGEXP == "8" ~ 1e8,
stormdata_clean$PROPDMGEXP == "9" ~ 1e9,
TRUE ~ 1)
stormdata_clean$PROPDMGTOT <- stormdata_clean$PROPDMG * stormdata_clean$PROPDMGEXPnew
stormdata_clean$CROPDMGEXP <- toupper(trimws(stormdata_clean$CROPDMGEXP))
stormdata_clean$CROPDMGEXPnew <- case_when(stormdata_clean$CROPDMGEXP == "B" ~ 1e9,
stormdata_clean$CROPDMGEXP == "M" ~ 1e6,
stormdata_clean$CROPDMGEXP == "K" ~ 1e3,
stormdata_clean$CROPDMGEXP == "H" ~ 1e2,
stormdata_clean$CROPDMGEXP == "1" ~ 1e1,
stormdata_clean$CROPDMGEXP == "2" ~ 1e2,
stormdata_clean$CROPDMGEXP == "3" ~ 1e3,
stormdata_clean$CROPDMGEXP == "4" ~ 1e4,
stormdata_clean$CROPDMGEXP == "5" ~ 1e5,
stormdata_clean$CROPDMGEXP == "6" ~ 1e6,
stormdata_clean$CROPDMGEXP == "7" ~ 1e7,
stormdata_clean$CROPDMGEXP == "8" ~ 1e8,
stormdata_clean$CROPDMGEXP == "9" ~ 1e9,
TRUE ~ 1)
stormdata_clean$CROPDMGTOT <- stormdata_clean$CROPDMG * stormdata_clean$CROPDMGEXPnew
stormdata_clean$TOTDMG <- stormdata_clean$PROPDMGTOT + stormdata_clean$CROPDMGTOT
stormdata_clean$HEALTHEFFECTS <- stormdata_clean$INJURIES + stormdata_clean$FATALITIES
stormdata_clean$ALLEFFECTS <- stormdata_clean$TOTDMG + stormdata_clean$HEALTHEFFECTS
stormdata_clean <- filter(stormdata_clean, ALLEFFECTS > 0)
# Get the year as a standalone column
stormdata_clean <- mutate(stormdata_clean, year = year(mdy_hms(gsub("/","-",BGN_DATE,fixed = TRUE))))
# Not many different EVTYPE were recorded before about 1995, so remove data before that.
stormdata_clean <- filter(stormdata_clean, year>1994)
# Get definitive list of evtypes (extracted from NWSI 10-1605 AUGUST 17, 2007 Table of Contents)
evtypes_def_list <- toupper(trimws(read.table("evtypes_full_list.txt",sep = ",",colClasses = "character")$V1))
# Begin cleanup on EVTYPE column
# Set all uppercase. Trim whitespace.
stormdata_clean$EVTYPE <- toupper(trimws(stormdata_clean$EVTYPE))
# Build a map of old EVTYPE to the definitive list
evtypes_map <- tibble(orig = unique (stormdata_clean$EVTYPE), map_col = unique (stormdata_clean$EVTYPE))
evtypes_map$new <- ""
#Define pattern and replacement strings to use with sub(pattern, replacement, ... fixed = TRUE)
sub_strings_fixed <- matrix(byrow = TRUE, ncol = 2,
data = c("SEVERE ", "",
"DRY ","",
"RIVER ","",
"BREAKUP ","",
"SHOWER","RAIN",
"SWELLS","SURF",
"TSTM", "THUNDERSTORM",
"THUDERSTORM", "THUNDERSTORM",
"NON-THUNDERSTORM", "STRONG",
"NON THUNDERSTORM", "STRONG",
"MICROBURST", "THUNDERSTORM",
"MIRCOBURST", "THUNDERSTORM",
"DOWNBURST", "THUNDERSTORM",
"GUSTNADO", "THUNDERSTORM WIND",
"THUNDERSNOW", "HEAVY SNOW",
"LAKE FLOOD", "LAKESHORE FLOOD",
"HEAVY LAKE SNOW", "LAKE-EFFECT SNOW",
"HEAVY MIX", "HEAVY SNOW",
"WIND/HAIL", "HAIL",
"WIND AND WAVE", "MARINE HIGH WIND",
"RAIN (HEAVY)", "HEAVY RAIN",
"RURAL FLOOD", "FLOOD",
"XYZ","XYZ"
)
)
# Apply those substitutions
for (i in 1:nrow(sub_strings_fixed)){
evtypes_map$map_col <- sub(sub_strings_fixed[i,1],sub_strings_fixed[i,2],evtypes_map$map_col,fixed=TRUE)
}
#Define pattern and replacement strings to use with sub(pattern, replacement, ... fixed = FALSE)
sub_strings <- matrix(byrow = TRUE, ncol = 2,
data = c("^WIND", "HIGH WIND",
"^COLD$", "COLD/WIND/CHILL",
"^FLOOD.*", "FLOOD",
"^HEAT.*", "HEAT",
".* HEAT", "EXCESSIVE HEAT",
"SEAS$", "SURF",
".* SURF", "HIGH SURF",
".*HIGH TIDE", "STORM TIDE",
"MARINE MISHAP", NA,
"^FREEZE$", "FROST/FREEZE",
".*HURRICANE.*", "HURRICANE/TYPHOON",
".*SLIDE.*","DEBRIS FLOW",
".*H FLOOD.*","FLASH FLOOD",
".*L FLOOD.*","COASTAL FLOOD",
"^SNOW.*","HEAVY SNOW",
".*MIXED PRECIP.*","HEAVY SNOW",
"HEAVY PRECIP.*","HEAVY RAIN",
".*[ABDEFGHIJKLMNOPQRSTUZWXYZ]T SNOW.*","HEAVY SNOW",
".*[ABCDEFGHIJKLMNOPQRSUZWXYZ] SNOW.*","HEAVY SNOW",
".*[ABCDFGIJKMNOPQRSTUVWXYZ] FLOOD.*","FLOOD",
".*HAIL.*","HAIL",
"XYZ","XYZ"
)
)
# Apply those substitutions
for (i in 1:nrow(sub_strings)){
evtypes_map$map_col <- sub(sub_strings[i,1],sub_strings[i,2],evtypes_map$map_col,fixed=FALSE)
}
numtypes <- length(evtypes_map$map_col)
for (itype in 1:numtypes){
matchword <- trimws(evtypes_map$map_col[itype])
#Remove certain words that are not part of the definitive list to improve amatch performance
#matchword <- sub("SEVERE","",matchword,fixed=TRUE)
#matchword <- sub("DRY","",matchword,fixed=TRUE)
#matchword <- sub("RIVER ","",matchword,fixed=TRUE)
#matchword <- sub("BREAKUP ","",matchword,fixed=TRUE)
if (is.na(matchword)){
evtypes_map$new[itype] <- NA
next
}
matchmeth<-"soundex"
matchdist<-10
matchresult <- evtypes_def_list[amatch(matchword,evtypes_def_list,maxDist=matchdist, method=matchmeth)]
# ASTRONOMICAL LOW TIDE only occurs once, but the amatch defaults to it whenever it gets stumped.
# Try again unless orig was ASTRONOMICAL LOW TIDE
if ((evtypes_map$orig[itype] != "ASTRONOMICAL LOW TIDE")&&(matchresult == "ASTRONOMICAL LOW TIDE")){
matchresult<-NA
}
#if (is.na(matchresult)){ # If amatch didn't work, try matching against just the first half of the old evtype string.
# matchresult <- amatch(substr(matchword,1,ceiling(length(matchword)/2)),evtypes_def_list,maxDist=matchdist, method=matchmeth)
# cat ("NANA ")
#}
# Use the result to put the evtype from the definitive list in the new column
evtypes_map$new[itype] <- matchresult
}
cat(length(unique(evtypes_map$orig)),"original evtypes (after filtering out rows with no economic/health effects).\n")
## 330 original evtypes (after filtering out rows with no economic/health effects).
cat(length(unique(evtypes_map$map_col)),"map_col evtypes.\n")
## 208 map_col evtypes.
cat(length(unique(evtypes_map$new)),"new evtypes.\n")
## 44 new evtypes.
cat(length(unique(evtypes_def_list)),"definitive evtypes.\n")
## 48 definitive evtypes.
cat(sum(is.na(evtypes_map$new)),"unmatched original evtypes.\n")
## 53 unmatched original evtypes.
cat(sum(grepl("ASTRONOMICAL LOW TIDE", evtypes_map$new)),"new ASTRONOMICAL LOW TIDEs.\n")
## 1 new ASTRONOMICAL LOW TIDEs.
cat(sum(grepl("LOW TIDE", evtypes_map$orig)),"orig LOW TIDEs.\n")
## 1 orig LOW TIDEs.
# Join the map to the original dataset
stormdata_clean <- left_join(stormdata_clean,evtypes_map, by = c("EVTYPE" = "orig"))
# Rename the columns so we are going to work with the new EVTYPE instead of the original
stormdata_clean <- rename(stormdata_clean, EVTYPE.orig = EVTYPE, EVTYPE = new )
We need to summarize the health and environmental effects by EVTYPE
library(dplyr)
library(lubridate)
library(ggplot2)
# Make another copy of the data to use in this code chunk
stormdata_prep <- stormdata_clean
# Group by year and EVTYPE, only use last fifteen years of data for this step
max_year <- max(stormdata_prep$year)
by_year_evtype <- group_by(filter(stormdata_prep,year>(max_year-15)), year, EVTYPE)
# Summarise all effects
eff_per_year <- summarise(by_year_evtype,
HEALTHEFFECTS = sum(HEALTHEFFECTS),
FATALITIES = sum(FATALITIES),
INJURIES = sum(INJURIES),
CROPDMGTOT = sum(CROPDMGTOT),
PROPDMGTOT = sum(PROPDMGTOT),
TOTDMG = sum(TOTDMG))
# Group by EVTYPE
by_evtype <- group_by(eff_per_year, EVTYPE)
# Summarise all effects
mean_eff_per_evtype <- summarise(by_evtype,
HEALTHEFFECTS = mean(HEALTHEFFECTS),
FATALITIES = mean(FATALITIES),
INJURIES = round(mean(INJURIES)),
CROPDMGTOT = mean(CROPDMGTOT),
PROPDMGTOT = mean(PROPDMGTOT),
TOTDMG = mean(TOTDMG))
sum_eff_per_evtype <- summarise(by_evtype,
HEALTHEFFECTS = sum(HEALTHEFFECTS),
FATALITIES = sum(FATALITIES),
INJURIES = sum(INJURIES),
CROPDMGTOT = sum(CROPDMGTOT),
PROPDMGTOT = sum(PROPDMGTOT),
TOTDMG = sum(TOTDMG))
#Filter only those EVTYPE which make up the top 5 by yearly average over the last fifteen years.
inj_evtypes <- head(arrange(mean_eff_per_evtype, desc(INJURIES)),5)$EVTYPE
fat_evtypes <- head(arrange(mean_eff_per_evtype, desc(FATALITIES)),5)$EVTYPE
cd_evtypes <- head(arrange(mean_eff_per_evtype, desc(CROPDMGTOT)),5)$EVTYPE
pd_evtypes <- head(arrange(mean_eff_per_evtype, desc(PROPDMGTOT)),5)$EVTYPE
rate_evtypes <- unique(c(inj_evtypes,fat_evtypes,cd_evtypes,pd_evtypes))
# Create colors for these event types
evtypes_colors <- hcl.colors(length(rate_evtypes))
color_map <- tibble(evtype = rate_evtypes, color = evtypes_colors)
mean_eff_per_evtype <- left_join(mean_eff_per_evtype,color_map, by = c("EVTYPE" = "evtype"))
eff_per_year <- left_join(eff_per_year,color_map, by = c("EVTYPE" = "evtype"))
#inj_per_year <- filter(eff_per_year, EVTYPE %in% inj_evtypes)
#fat_per_year <- filter(eff_per_year, EVTYPE %in% fat_evtypes)
#cd_per_year <- filter(eff_per_year, EVTYPE %in% cd_evtypes)
#pd_per_year <- filter(eff_per_year, EVTYPE %in% pd_evtypes)
Plot the results of the analysis.
Show the top 5 in each category based on greatest average annual effects over the fifteen year period.
library(dplyr)
library(lubridate)
library(ggplot2)
#Plot the Top 5 Event Types by Yearly Average <Effect> as a 2X2 grid of column charts
par(mfrow=c(2,2),pin=c(4.5,3))# ,mai = c(.2,.2,.2,.2),omi = c(.2,.2,.2,.2))
# Make a table to loop through for the four charts needed with all variations
plot_index <- tibble(column = c("INJURIES","FATALITIES","CROPDMGTOT","PROPDMGTOT"),
friendly = c("Injuries","Fatalities","Crop Damage ($M)","Property Damage ($B)"),
scale = c(1,1,1e6,1e9),
digits = c(-2,-1,-2,0),
ylimadj = c(200,20,200,2),
textadj = c(50,5,50,0.5),
textrd = c(0,0,2,2),
text1 = c("","","$","$"),
text2 = c("","","M","B"))
for (i in 1:nrow(plot_index)){
# Make a copy of the data for each plot type
top5<-head(arrange(mean_eff_per_evtype, desc(!!as.name(plot_index$column[i]))),5)
# Make the Bar Plot
top5bar<- barplot(names.arg = top5$EVTYPE,
height = top5[[plot_index$column[i]]]/plot_index$scale[i],
beside = TRUE,
main = paste0("Top 5 Event Types by Average Annual ",
plot_index$friendly[i],"\n(",max_year-14,"-",max_year,")"),
#ylab = paste0("Average Annual ",plot_index$friendly[i]),
xlab = "Event Type",
ylim = c(0, round(max(top5[[plot_index$column[i]]]/plot_index$scale[i]),
digits = plot_index$digits[i]) + plot_index$ylimadj[i]),
col = top5$color,
cex.names = 0.7,
xpd = FALSE
)
# Put the data labels at the top of each bar
text(top5bar, top5[[plot_index$column[i]]]/plot_index$scale[i]+plot_index$textadj[i],
paste0(plot_index$text1[i],round(top5[[plot_index$column[i]]]/plot_index$scale[i],
digits = plot_index$textrd[i]),plot_index$text2[i]),cex=1)
# Include a legend since some of the bar names at the bottom are too long and will get cut off
legend("topright", legend = top5$EVTYPE, fill = top5$color)
}
Figure 1 - Top Five Event Types by Average Annual Health and Economic Effects
On average, Tornadoes cause the most annual injuries (~1331 per year), but Excessive Heat causes the most deaths (~118 per year).
Drought causes the most Crop Damage (~$857.6M per year), while Flood causes the most property damage (~$9.53B per year).
Show the annual effects of the Top 5 event types in each category over the last fifteen years as a time series. For the Crop Damage and Property Damage, use a logarithmic scale to make it easier to see differences at the lower end of the range.
library(dplyr)
library(lubridate)
library(ggplot2)
#Plot the Top 5 Event Types by Yearly Average <Effect> as a 2X2 grid of column charts
par(mfrow=c(4,1),pin=c(8,1.7),ylog=TRUE)# ,mai = c(.2,.2,.2,.2),omi = c(.2,.2,.2,.2))
# Make a table to loop through for the four charts needed with all variations
plot_index <- tibble(column = c("INJURIES","FATALITIES","CROPDMGTOT","PROPDMGTOT"),
friendly = c("Injuries","Fatalities","Crop Damage ($M)","Property Damage ($B)"),
scale = c(1,1,1e6,1e9),
digits = c(-2,-1,-2,0),
ylimadj = c(200,20,200,2),
textadj = c(50,5,50,0.5),
textrd = c(0,0,2,2),
text1 = c("","","$","$"),
text2 = c("","","M","B"))
for (i in 1:nrow(plot_index)){
# Make a copy of the data for each plot type
top5evtypes<-head(arrange(mean_eff_per_evtype, desc(!!as.name(plot_index$column[i]))),5)$EVTYPE
top5<-filter(eff_per_year, EVTYPE %in% top5evtypes)
top5leg <- top5[!duplicated(top5[ , c("EVTYPE")]), ]
firstline = TRUE
for (evt in top5evtypes){
top5f <- filter(top5, EVTYPE == evt)
# Make the Line Plot
if (firstline) {
top5line<- plot(x = top5f$year,
y = top5f[[plot_index$column[i]]]/plot_index$scale[i],
main = paste0("Event Type ",plot_index$friendly[i]," each Year\n(",max_year-14,"-",max_year,")"),
ylab = plot_index$friendly[i],
xlab = "Year",
type = "l",
#ylim = c(0, round(max(top5[[plot_index$column[i]]]/plot_index$scale[i]),
# digits = plot_index$digits[i]) + plot_index$ylimadj[i]),
col = top5f$color,
lwd = 2,
log = "y",
#cex.names = 0.7,
xpd = FALSE
)
firstline <- FALSE
} else {
lines(x = top5f$year,
y = top5f[[plot_index$column[i]]]/plot_index$scale[i],
col = top5f$color,
lwd = 2,
#log = "y"
)
}
# Put the data labels at each point
text(top5f$year,
top5f[[plot_index$column[i]]]/plot_index$scale[i],
top5f[[plot_index$column[i]]]/plot_index$scale[i]
)
#text(top5line, top5f[[plot_index$column[i]]]/plot_index$scale[i]+plot_index$textadj[i],
# paste0(plot_index$text1[i],round(top5f[[plot_index$column[i]]]/plot_index$scale[i],
# digits = plot_index$textrd[i]),plot_index$text2[i]),cex=1)
}
# Include a legend since some of the bar names at the bottom are too long and will get cut off
legend("topright", legend = top5leg$EVTYPE, fill = top5leg$color)
}
Figure 2 - Effects Over Time for the Top 5 Event Types
The highest average annual effect doesn’t necessarily predict the most damaging event type in any single year. Over the last fifteen years, various event types have taken turns as the most damaging to health (measured by injuries or fatalities) or economics (measured by crop damage or property damage.) For example, Hurricanes caused significant property damage in 2004 and 2005.