Synopsis

The goal for this study is to explore the weather disaster on the economic and human of the US between 1994 and 2011. The data for the study is sourced from National Weather Service, and the data specifically measures the fatalities, injuries, property damage and crop damage caused by each weather related disaster.

The events in the database start in the year 1950 and end in November 2011. In the earlier years of the database there are generally fewer events recorded, most likely due to a lack of good records. More recent years should be considered more complete. We will investigate only weather event after 2000.

Questions

Goal: answer the following questions 1. Across the United States, which types of events (as indicated in the EVTYPE variable) are most harmful with respect to population health? 2. Across the United States, which types of events have the greatest economic consequences?

Load Data

Load common library

library(ggplot2)

Download the data file from Here Also note we will read the csv data with strip.white=TRUE, this ensure we get rid of pesky white spaces

df <- read.csv(bzfile("repdata-data-StormData.csv.bz2"), sep=",", header=TRUE, strip.white=TRUE, stringsAsFactors=FALSE)

Remove data prior to year 2000 by adding year column, and filter, this reduces set from 1.2M record to 520k record

df$year <- as.numeric(format(as.Date(df$BGN_DATE,"%m/%d/%Y"), "%Y"))
df <- df[df$year >= 2000,]

Explore Data

List out all the unique event

head(sort(unique(df$EVTYPE)), 25)
##  [1] "   HIGH SURF ADVISORY"        " FLASH FLOOD"                
##  [3] " TSTM WIND"                   " WATERSPOUT"                 
##  [5] "ABNORMALLY DRY"               "ABNORMALLY WET"              
##  [7] "ACCUMULATED SNOWFALL"         "ASTRONOMICAL HIGH TIDE"      
##  [9] "ASTRONOMICAL LOW TIDE"        "AVALANCHE"                   
## [11] "BEACH EROSION"                "BLACK ICE"                   
## [13] "BLIZZARD"                     "BLOWING DUST"                
## [15] "BRUSH FIRE"                   "COASTAL FLOOD"               
## [17] "COASTAL FLOODING"             "COLD"                        
## [19] "COLD WEATHER"                 "COLD WIND CHILL TEMPERATURES"
## [21] "COLD/WIND CHILL"              "CSTL FLOODING/EROSION"       
## [23] "DAM BREAK"                    "DENSE FOG"                   
## [25] "DENSE SMOKE"

A lot of duplicate, mis-spell event type (AVALANCE, AVALANCHE, BEACH EROSIN, BEACH EROSION). We will try to clean up event type by adding a new column of category (we will be making assumption here)

cleanET <- function(es) {
  es <- tolower(es)
  # fix spell error
  es <- sub("^[[:space:]]+|[[:space:]]+$", "", es)
  es <- sub("[[:space:]]+", " ", es)
  es <- sub("hvy","heavy", es)
  es <- sub("avalance","avalanche", es)
  es <- sub("erosin", "erosion", es)
  es <- sub("tornadao", "tornado", es)
  es <- sub("thuderstorm", "thunderstorm", es)
  es <- sub("thudeerstorm", "thunderstorm", es)
  # regroup
  es <- sub("beach flood", "costal flood", es)
  es <- sub("^blizzard.*", "blizzard", es)
  es <- sub("^bitter.*", "bitter wind chill", es)
  es <- sub("^cstl", "coastal flood", es)
  es <- sub("^coastal.*flood.*", "coastal flood", es)
  es <- sub("^(cold|cool|freezing|frost).*", "cold weather", es)
  es <- sub("^(dam\\s).*", "dam failure", es)
  es <- sub("^(dry|drought).*", "drought", es)
  es <- sub("^excessive.*", "excessive weather", es)
  es <- sub("^flash.*", "flash flood", es)
  es <- sub("^flood.*", "flood", es)
  es <- sub("^gusty.*", "gusty wind", es)
  es <- sub("^hail.*", "hail", es)
  es <- sub("^heat.*", "heat wave", es)
  es <- sub("^heavy\\s(rain|shower|preci).*", "heavy rain", es)
  es <- sub("^heavy\\s(snow).*", "heavy snow", es)
  es <- sub("^heavy\\s(surf).*", "heavy surf", es)
  es <- sub("^high\\s(wind).*", "high wind", es)
  es <- sub("^hurricane.*", "hurricane", es)
  es <- sub("^hyp(er|o)thermia.*", "hypothermia", es)
  es <- sub("^ice.*", "hypothermia", es)
  es <- sub("^lig(h|n)tning.*", "hypothermia", es)
  es <- sub("^summary.*", "summary", es)
  es <- sub("^snow.*", "snow", es)
  es <- sub("^rain.*", "rain", es)
  es <- sub("^record.*", "record temperature", es)
  es <- sub("^thund.*", "thunderstorm", es)
  es <- sub("^tstm.*", "tstm", es)
  es <- sub("^tornado.*", "tornado", es)
  es <- sub("^unseasonably.*", "unseason weather", es)
  es <- sub("^urban.*", "urban flood", es)
  es <- sub("fires", "fire", es)
  es <- sub("tides", "tide", es)
}

df$event_type <- cleanET(df$EVTYPE)

Analyze Data

sort dmg by event type: group the data by event type and aggregate the sum of dmg columns, so we may see the event with worst damage

translateDmg <- function(y) {
  y <- tolower(y)
  return (if (y == "k") 1 else if (y == "m") 1000 else if (y == "b") 1000000 else 1)
}
df$propDmgV <- (df$PROPDMG * sapply(df$PROPDMGEXP, translateDmg))
df$cropDmgV <- (df$CROPDMG * sapply(df$CROPDMGEXP, translateDmg))

Prepare Economic Damage Data

Work on property and crop damage separately, then combine the result Group the weather event by type, and rename columns, add discriminator column, then rbind both

prop_dmg_by_etype <- aggregate(cbind(propDmgV) ~ event_type, data=df, FUN=sum)
names(prop_dmg_by_etype)[1] <- "Event"
names(prop_dmg_by_etype)[2] <- "Damage"
prop_dmg_by_etype$DamageType <- "Property"

crop_dmg_by_etype <- aggregate(cbind(cropDmgV) ~ event_type, data=df, FUN=sum)
names(crop_dmg_by_etype)[1] <- "Event"
names(crop_dmg_by_etype)[2] <- "Damage"
crop_dmg_by_etype$DamageType <- "Crop"

total_damage <- rbind(prop_dmg_by_etype[order(prop_dmg_by_etype$Damage, decreasing = TRUE)[1:10],], 
                      crop_dmg_by_etype[order(crop_dmg_by_etype$Damage, decreasing = TRUE)[1:10],])

Prepare Human Damage Data

fatalities_by_etype <- aggregate(cbind(FATALITIES) ~ event_type, data=df, FUN=sum)
names(fatalities_by_etype)[1] <- "Event"
names(fatalities_by_etype)[2] <- "Count"
fatalities_by_etype$CountType <- "Fatalities"

injuries_by_etype <- aggregate(cbind(INJURIES) ~ event_type, data=df, FUN=sum)
names(injuries_by_etype)[1] <- "Event"
names(injuries_by_etype)[2] <- "Count"
injuries_by_etype$CountType <- "Injuries"

total_human <- rbind(fatalities_by_etype[order(fatalities_by_etype$Count, decreasing = TRUE)[1:10],], 
                      injuries_by_etype[order(injuries_by_etype$Count, decreasing = TRUE)[1:10],])

Create another set, group by year, so we may see the damage by year. Group again, this time by year + event_type. Then we will filter on only the top event_type qualified above (non-year aggregation)

human_by_etype_year <- aggregate(x=df[,c("FATALITIES","INJURIES")], by=df[,c("year","event_type")], FUN=sum)
econo_by_etype_year <- aggregate(x=df[,c("propDmgV","cropDmgV")], by=df[,c("year","event_type")], FUN=sum)

top_human_by_etype_year <- human_by_etype_year[human_by_etype_year$event_type %in% total_human$Event,]
top_human_by_etype_year$total <- top_human_by_etype_year$FATALITIES + top_human_by_etype_year$INJURIES
top_econ_by_etype_year <- econo_by_etype_year[econo_by_etype_year$event_type %in% total_damage$Event,]
top_econ_by_etype_year$total <- top_econ_by_etype_year$propDmgV + top_econ_by_etype_year$cropDmgV

Result

Examine the top damage amount

qplot(Event, Damage/1000000, data=total_damage) + 
  geom_bar(aes(fill=DamageType), stat="identity") + 
  theme(axis.text.x  = element_text(angle=-25, vjust=0.5)) + 
  labs(x="Weather Event", y="Damage in Billion (USD)", title="Economic Damage")

plot of chunk unnamed-chunk-10

qplot(year, total/1000000, data=top_econ_by_etype_year) +
  geom_line(aes(col=event_type)) +
  labs(x="Year between 2000 to 2011", y="Property and Crop Damage in Billion (USD)")

plot of chunk unnamed-chunk-10 Flood and Hurricane have much higher economic impact than others. Amount crop and property, property damage takes on most of the toll. year 2005 we see a surge in hurricane damage (Katrina), and in 2006 a surge in floods around the countries.

qplot(Event, Count, data=total_human) + 
  geom_bar(aes(fill=CountType), stat="identity") + 
  theme(axis.text.x  = element_text(angle=-25, vjust=0.5)) + 
  labs(x="Weather Event", y="Population Count", title="Human Damage") +
  scale_fill_discrete(name="Type")

plot of chunk unnamed-chunk-11

qplot(year, total, data=top_human_by_etype_year) +
  geom_line(aes(col=event_type)) +
  labs(x="Years from 2000 to 2011", y="Total Count (Fatalities + Injuries)")

plot of chunk unnamed-chunk-11 Tornado has the significantly higher population impact than all other weather event. Most of them are from tornado outbreak in 2011.