Synopsis

The NOAA storm data set is fascinating and illuminating. The data are messy, however, and do require a fair amount of processing before it can be used. In the analysis described below, I describe and document each step I took in preparing the data for analysis. The most difficult and problematic column is EVTYPE. The data in this column are very messy. In order for analysts to make better use of the storm data, NOAA needs to put some resources into making sure all of the events adhere to their own controlled list. I am not an expert in matching miscoded storm events to NOAA’s controled list. For that reason, I matched the one I could and lumped the others into an UNKNOWEVTYPE category. This is the biggest limitation to this analysis.

Some observations:

Which types of events (as indicated in the EVTYPE variable) are most harmful with respect to population health?

Which types of events have the greatest economic consequences

Data processing

Load libraries

library(stringr)
library(dplyr)
library(ggplot2)
library(tidyr)

Load storm data

Given the very large size of the dataset, I loaded the full file only once. While in memory I cut out unnecessary columns, keeping only the ones that I might possibly need. I saved the curated file as df.RDS. Now I check for the presence of that file first. If it is found, I load that and work from there.

# setwd("./project2/")

if (file.exists("df.RDS")) {
  print("load the RDS")
  df <- readRDS("df.RDS")
  } else {
  print("load the csv.bz2")
  df <- read.csv("repdata_data_StormData.csv.bz2", stringsAsFactors=TRUE)
  df <- df[,-c(1,3:4,9:20,29:37)]
  saveRDS(dftest, file="df.RDS")
}
## [1] "load the RDS"

Create and load file of event types

The list of event types I use in my analysis is derived from Table 2.1.1, “Storm Data Event Types,” in “repdata_peer2_doc_pd01016005curr.pdf.” I copied the table into a text editor and saved it as a delimited file. There are 50 event types in the table.

eventtypes <- read.delim("eventtypes.txt", stringsAsFactors=FALSE)
eventtypes$CLEANEVTYPE <- toupper(eventtypes$CLEANEVTYPE)

Clean up data

The raw event column in the NOAA data set is very messy and contains some 890 different types. To bring some rigor to the analysis, I attempted to match each event to the types listed in table 2.1.1 (see description above). Event types that did not match the controlled list were changed to UNKNOWNEVTYPE. A large number of events fall into the category, as evidenced by the plots below.

# df$EVTYPE
# trim whitespace and convert all to uppercase
df$EVTYPE <- str_trim(df$EVTYPE)
df$EVTYPE <- toupper(df$EVTYPE)

# create temporary object to hold list of possible evtypes in df
u.df <- as.data.frame(sort(unique(df$EVTYPE)))
colnames(u.df) <- "evtype" 
u.df$evtype <- as.character(u.df$evtype)
u.df$cleanevtype <- NA

# create new cleantype column in 
df$cleanevtype <- NA
df$cleanevtype <- as.character(df$cleanevtype)


for (i in 1:nrow(u.df)) {
 if (u.df$evtype[i] %in% eventtypes$CLEANEVTYPE) {
   u.df$cleanevtype[i] <- u.df$evtype[i] 
 } else {
   u.df$cleanevtype[i] <- NA
 }
}

u.df$cleanevtype[is.na(u.df$cleanevtype)] <- "UNKNOWNEVTYPE"


for (i in 1:nrow(u.df)) {
  match <- df$EVTYPE == u.df$evtype[i]
  df$cleanevtype[match] <- u.df$cleanevtype[i]
}

rm(match)
rm(i)
df$cleanevtype <- as.factor(df$cleanevtype)

# Clean df$PROPDMGEXP

df$PROPDMGEXP <- toupper(df$PROPDMGEXP)
table(df$PROPDMGEXP)
## 
##             -      ?      +      0      1      2      3      4      5 
## 465934      1      8      5    216     25     13      4      4     28 
##      6      7      8      B      H      K      M 
##      4      5      1     40      7 424665  11337
match <- df$PROPDMGEXP == "K" | df$PROPDMGEXP == "M" | df$PROPDMGEXP == "B" | df$PROPDMGEXP == ""
table(df$PROPDMGEXP[!match])
## 
##   -   ?   +   0   1   2   3   4   5   6   7   8   H 
##   1   8   5 216  25  13   4   4  28   4   5   1   7
df$PROPDMGEXP[!match] <- "NA"
table(df$PROPDMGEXP)
## 
##             B      K      M     NA 
## 465934     40 424665  11337    321
# Clean df$CROPDMGEXP

df$CROPDMGEXP <- toupper(df$CROPDMGEXP)
table(df$CROPDMGEXP)
## 
##             ?      0      2      B      K      M 
## 618413      7     19      1      9 281853   1995
match <- df$CROPDMGEXP == "K" | df$CROPDMGEXP == "M" | df$CROPDMGEXP == "B" | df$CROPDMGEXP == ""
table(df$CROPDMGEXP[!match])
## 
##  ?  0  2 
##  7 19  1
df$CROPDMGEXP[!match] <- "NA"
table(df$CROPDMGEXP)
## 
##             B      K      M     NA 
## 618413      9 281853   1995     27

Add new columns for property and crop damage

The multiplier codes in the PROPDMGEXP and CROPDMGEXP columns was messsy. I cleaned up what I could and changed the non-matching instances to NA.

# multiply property damage * magnitude
df$propdmgexp_num <- df$PROPDMGEXP
df$propdmgexp_num <- as.character(df$propdmgexp_num)
mablank <- df$propdmgexp_num == ""
mak <- df$propdmgexp_num == "K"
mam <- df$propdmgexp_num == "M"
mab <- df$propdmgexp_num == "B"
df$propdmgexp_num[mablank] <- as.numeric(0)
df$propdmgexp_num[mak] <- as.numeric(1000)
df$propdmgexp_num[mam] <- as.numeric(1000000)
df$propdmgexp_num[mab] <- as.numeric(1000000000)
rm(mablank)
rm(mak)
rm(mam)
rm(mab)
df$propdmgexp_num <- as.numeric(df$propdmgexp_num)
## Warning: NAs introduced by coercion
df <- df %>% mutate(totalpropdmg = PROPDMG * propdmgexp_num)

# multiply crop damage * magnitude
df$cropdmgexp_num <- df$CROPDMGEXP
df$cropdmgexp_num <- as.character(df$cropdmgexp_num)
mablank <- df$cropdmgexp_num == ""
mak <- df$cropdmgexp_num == "K"
mam <- df$cropdmgexp_num == "M"
mab <- df$cropdmgexp_num == "B"
df$cropdmgexp_num[mablank] <- as.numeric(0)
df$cropdmgexp_num[mak] <- as.numeric(1000)
df$cropdmgexp_num[mam] <- as.numeric(1000000)
df$cropdmgexp_num[mab] <- as.numeric(1000000000)
rm(mablank)
rm(mak)
rm(mam)
rm(mab)
df$cropdmgexp_num <- as.numeric(df$cropdmgexp_num)
## Warning: NAs introduced by coercion
df <- df %>% mutate(totalcropdmg = CROPDMG * cropdmgexp_num)

Results

Compute sums

dfsums <- df %>%  group_by(cleanevtype) %>% filter( !is.na(totalpropdmg) & !is.na(totalcropdmg) ) %>% summarise( sumpropdmg = sum(totalpropdmg), sumcropdmg = sum(totalcropdmg), sumfatalities = sum(FATALITIES), suminjuries = sum(INJURIES) )

economicimpact <- gather(dfsums, "damagetype", "count", 2:3) %>% select(cleanevtype,damagetype, count)
healthimpact <- gather(dfsums, "damagetype", "count", 4:5) %>% select(cleanevtype,damagetype, count)

Figure 1: plot of property and crop damage

ggplot( economicimpact, aes(x=cleanevtype, y=count, fill=damagetype) ) + geom_bar(stat="identity", position="dodge", width=0.5) + xlab("Event type") + ylab("Property damage in dollars") + theme(axis.text.x = element_text(angle = 45, hjust = 1))

Figure 2: plot of injuries and fatalities

ggplot( healthimpact, aes(x=cleanevtype, y=count, fill=damagetype) ) + geom_bar(stat="identity", position="dodge", width=0.5) + xlab("Event type") + ylab("Health damage in number of injuries and fatalities") + theme(axis.text.x = element_text(angle = 45, hjust = 1))