Synopsis

This analysis reviews the data of the U.S. National Oceanic and Athmosphere Administration’s (NOAA) storm database. This database contains information of major storms and wheather events in the US. We research which wheather event types causes the biggest damage to population and economy.

Source can be downloaded from the following URL:
Source Data: https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2
Documentation: https://d396qusza40orc.cloudfront.net/repdata%2Fpeer2_doc%2Fpd01016005curr.pdf
FAQ: https://d396qusza40orc.cloudfront.net/repdata%2Fpeer2_doc%2FNCDC%20Storm%20Events-FAQ%20Page.pdf

Results

Most damage to population health is caused by tornados. Most damage to economy is caused by Flood.

Data Processing

Load dependent libraries.

library(stringr)

Load the source data files.

# load the source data file
fileURL <- "repdata_data_StormData.csv"
df.data <- read.csv(fileURL,stringsAsFactors = F)

Perform basic clean up of the data by converting the date variables, removing aggregated information and converting the event information to upper case.

df.data$BGN_DATE <- as.Date(substr(df.data$BGN_DATE,1,10), format="%m/%d/%Y")
# we convert to upper case to simply the data cleaning process
df.data$EVTYPE <- toupper(df.data$EVTYPE)
df.data$PROPDMGEXP <- toupper(df.data$PROPDMGEXP)
df.data$CROPDMGEXP <- toupper(df.data$CROPDMGEXP)
# remove summary event observations as it is aggregated data
df.data <- df.data[!str_detect(df.data$EVTYPE,"SUMMARY"),]

Clean up exponent of crop and property damage data. Here we change the exponent value and change the data type to numeric.

# crop damage exponent
df.data$CROPDMGEXP[df.data$CROPDMGEXP=="0"] <- 1
df.data$CROPDMGEXP[df.data$CROPDMGEXP=="?"] <- 1
df.data$CROPDMGEXP[df.data$CROPDMGEXP==""] <- 1
df.data$CROPDMGEXP[df.data$CROPDMGEXP=="2"] <- 100
df.data$CROPDMGEXP[df.data$CROPDMGEXP=="K"] <- 1000
df.data$CROPDMGEXP[df.data$CROPDMGEXP=="M"] <- 1000000
df.data$CROPDMGEXP[df.data$CROPDMGEXP=="B"] <- 1000000000
df.data$CROPDMGEXP <- as.numeric(df.data$CROPDMGEXP)
# property damage exponent
df.data$PROPDMGEXP[df.data$PROPDMGEXP=="0"] <- 1
df.data$PROPDMGEXP[df.data$PROPDMGEXP=="1"] <- 1
df.data$PROPDMGEXP[df.data$PROPDMGEXP==""] <- 1
df.data$PROPDMGEXP[df.data$PROPDMGEXP=="+"] <- 1
df.data$PROPDMGEXP[df.data$PROPDMGEXP=="-"] <- 1
df.data$PROPDMGEXP[df.data$PROPDMGEXP=="?"] <- 1
df.data$PROPDMGEXP[df.data$PROPDMGEXP=="2"] <- 10^2
df.data$PROPDMGEXP[df.data$PROPDMGEXP=="3"] <- 10^3
df.data$PROPDMGEXP[df.data$PROPDMGEXP=="4"] <- 10^4
df.data$PROPDMGEXP[df.data$PROPDMGEXP=="5"] <- 10^5
df.data$PROPDMGEXP[df.data$PROPDMGEXP=="6"] <- 10^6
df.data$PROPDMGEXP[df.data$PROPDMGEXP=="7"] <- 10^7
df.data$PROPDMGEXP[df.data$PROPDMGEXP=="8"] <- 10^8
df.data$PROPDMGEXP[df.data$PROPDMGEXP=="H"] <- 100
df.data$PROPDMGEXP[df.data$PROPDMGEXP=="K"] <- 1000
df.data$PROPDMGEXP[df.data$PROPDMGEXP=="M"] <- 1000000
df.data$PROPDMGEXP[df.data$PROPDMGEXP=="B"] <- 1000000000
df.data$PROPDMGEXP <- as.numeric(df.data$PROPDMGEXP)

Clean up the event data.

# construct list of valid events as listed in referenced documentation
ev.types <- c("Astronomical Low Tide", "Avalanche", "Blizzard", "Coastal Flood", "Cold/Wind Chill", "Debris Flow",
              "Dense Fog", "Dense Smoke", "Drought", "Dust Devil", "Dust Storm", "Excessive Heat", "Extreme Cold/Wind Chill",
              "Flash Flood", "Flood", "Frost/Freeze", "Funnel Cloud", "Freezing Fog", "Hail", "Heat",
              "Heavy Rain", "Heavy Snow", "High Surf", "High Wind", "Hurricane (Typhoon)", "Ice Storm",
              "Lake-Effect Snow", "Lakeshore Flood", "Lightning", "Marine Hail",
              "Marine High Wind", "Marine Strong Wind", "Marine Thunderstorm Wind", "Rip Current", "Seiche", "Sleet",
              "Storm Surge/Tide", "Strong Wind", "Thunderstorm Wind", "Tornado", "Tropical Depression",
              "Tropical Storm", "Tsunami", "Volcanic Ash", "Waterspout", "Wildfire",
              "Winter Storm", "Winter Weather")
ev.types <- toupper(ev.types)

# special characters and strings
df.data$EVTYPE <- gsub("\\\\"," ",df.data$EVTYPE)
df.data$EVTYPE <- gsub("/"," ",df.data$EVTYPE)
df.data$EVTYPE <- gsub("-"," ",df.data$EVTYPE)
df.data$EVTYPE <- gsub(" & "," ",df.data$EVTYPE)
df.data$EVTYPE <- gsub(" AND "," ",df.data$EVTYPE)
df.data$EVTYPE <- gsub("  "," ",df.data$EVTYPE)
df.data$EVTYPE <- gsub("SML","SMALL",df.data$EVTYPE)
df.data$EVTYPE <- str_trim(df.data$EVTYPE, side="both")

# size
df.data$EVTYPE <- gsub("ABNORMALLY","HIGH",df.data$EVTYPE)
df.data$EVTYPE <- gsub("ABNORMAL","HIGH",df.data$EVTYPE)
df.data$EVTYPE <- gsub("EXTREME","HIGH",df.data$EVTYPE)
df.data$EVTYPE <- gsub("EXTREMELY","HIGH",df.data$EVTYPE)
df.data$EVTYPE <- gsub("FIRST","HIGH",df.data$EVTYPE)
df.data$EVTYPE <- gsub("EXTENDED","HIGH",df.data$EVTYPE)
df.data$EVTYPE <- gsub("HEAVY","HIGH",df.data$EVTYPE)
df.data$EVTYPE <- gsub("HOT","HIGH",df.data$EVTYPE)
df.data$EVTYPE <- gsub("HARD","HIGH",df.data$EVTYPE)
df.data$EVTYPE <- gsub("EXCESSIVE","HIGH",df.data$EVTYPE)
df.data$EVTYPE <- gsub("HEAVY","HIGH",df.data$EVTYPE)
df.data$EVTYPE <- gsub("TROPICAL","HIGH",df.data$EVTYPE)
df.data$EVTYPE <- gsub("RECORD","HIGH",df.data$EVTYPE)
df.data$EVTYPE <- gsub("UNSEASONABLY","HIGH",df.data$EVTYPE)
df.data$EVTYPE <- gsub("UNSEASONAL","HIGH",df.data$EVTYPE)
df.data$EVTYPE <- gsub("UNUSUALLY","HIGH",df.data$EVTYPE)
df.data$EVTYPE <- gsub("MONTHLY","HIGH",df.data$EVTYPE)

# wording
df.data$EVTYPE <- gsub("TORNDAO","TORNADO",df.data$EVTYPE)
df.data$EVTYPE <- gsub("WND","WIND",df.data$EVTYPE)
df.data$EVTYPE <- gsub("WINDS","WIND",df.data$EVTYPE)
df.data$EVTYPE <- gsub("STORMS","STORM",df.data$EVTYPE)
df.data$EVTYPE <- gsub("FLOODING","FLOOD",df.data$EVTYPE)
df.data$EVTYPE <- gsub("FLOODS","FLOOD",df.data$EVTYPE)
df.data$EVTYPE <- gsub("SNOWFALL","SNOW",df.data$EVTYPE)
df.data$EVTYPE <- gsub("ICY","ICE",df.data$EVTYPE)
df.data$EVTYPE <- gsub("WATERSPOUTS","WATERSPOUT",df.data$EVTYPE)
df.data$EVTYPE[str_detect(df.data$EVTYPE,"TORNADO")]<-"TORNADO"
df.data$EVTYPE[str_detect(df.data$EVTYPE,"HAIL")]<-"HAIL"
df.data$EVTYPE[str_detect(df.data$EVTYPE,"HIGH WIND")]<-"HIGH WIND"
df.data$EVTYPE[str_detect(df.data$EVTYPE,"LIGHTING")]<-"LIGHTNING"
df.data$EVTYPE[str_detect(df.data$EVTYPE,"HURRICANE")]<-"HURRICANE (TYPHOON)"
df.data$EVTYPE[str_detect(df.data$EVTYPE,"WINDS")]<-"WIND"
df.data$EVTYPE[str_detect(df.data$EVTYPE,"SNOW")]<-"HEAVY SNOW"
df.data$EVTYPE[str_detect(df.data$EVTYPE,"HIGH WARM")]<-"EXCESSIVE HEAT"
df.data$EVTYPE[str_detect(df.data$EVTYPE,"HIGH HEAT")]<-"EXCESSIVE HEAT"
df.data$EVTYPE[str_detect(df.data$EVTYPE,"HIGH RAIN")]<-"HEAVY RAIN"

# typos
df.data$EVTYPE[str_detect(df.data$EVTYPE,"THUNDERSTORM")]<-"THUNDERSTORM WIND"
df.data$EVTYPE[str_detect(df.data$EVTYPE,"THUNDERSTROM")]<-"THUNDERSTORM WIND"
df.data$EVTYPE[str_detect(df.data$EVTYPE,"THUNDERSNOW")]<-"THUNDERSTORM WIND"
df.data$EVTYPE[str_detect(df.data$EVTYPE,"THUNDERTORM")]<-"THUNDERSTORM WIND"
df.data$EVTYPE[str_detect(df.data$EVTYPE,"THUNERSTORM")]<-"THUNDERSTORM WIND"
df.data$EVTYPE[str_detect(df.data$EVTYPE,"THUNDERTSORM")]<-"THUNDERSTORM WIND"
df.data$EVTYPE[str_detect(df.data$EVTYPE,"THUNDERTSORM")]<-"THUNDERSTORM WIND"
df.data$EVTYPE[str_detect(df.data$EVTYPE,"THUNDERSTORM")]<-"THUNDERSTORM WIND"

# constant values
df.data$EVTYPE[df.data$EVTYPE=="VOG"] <- "FOG"
df.data$EVTYPE[df.data$EVTYPE=="RAIN DAMAGE"] <- "HEAVY RAIN"
df.data$EVTYPE[df.data$EVTYPE=="HIGH SHOWERS"] <- "HEAVY RAIN"
df.data$EVTYPE[df.data$EVTYPE=="HIGH SHOWER"] <- "HEAVY RAIN"
df.data$EVTYPE[df.data$EVTYPE=="RAIN (HEAVY)"] <- "HEAVY RAIN"
df.data$EVTYPE[df.data$EVTYPE=="RAINSTORM"] <- "HEAVY RAIN"
df.data$EVTYPE[df.data$EVTYPE=="HEAVY RAINFALL"] <- "HEAVY RAIN"
df.data$EVTYPE[df.data$EVTYPE=="HIGHLY WET"] <- "HEAVY RAIN"
df.data$EVTYPE[df.data$EVTYPE=="HIGH HOT"] <- "EXCESSIVE HEAT"
df.data$EVTYPE[df.data$EVTYPE=="HIGHLY DRY"] <- "EXCESSIVE HEAT"
df.data$EVTYPE[df.data$EVTYPE=="UNUSUAL WARMTH"] <- "EXCESSIVE HEAT"
df.data$EVTYPE[df.data$EVTYPE=="HEAVY TEMPERATURES"] <- "EXCESSIVE HEAT"
df.data$EVTYPE[df.data$EVTYPE=="TEMPERATURE HIGH"] <- "EXCESSIVE HEAT"
df.data$EVTYPE[df.data$EVTYPE=="DRY HOT WEATHER"] <- "EXCESSIVE HEAT"
df.data$EVTYPE[df.data$EVTYPE=="VOLCANIC ERUPTION"] <- "VOLCANIC ASH"

for (t in ev.types ) {
    df.data$EVTYPE[str_detect(df.data$EVTYPE,t)] <- t
}

After the cleanup of the source data we have 288 different wheather event types.

length(unique(sort(df.data$EVTYPE)))
## [1] 288

Calculate the population damage information

# damage to population health
df.data$POPULATION <- df.data$FATALITIES + df.data$INJURIES
df.agg.Population <- aggregate(POPULATION~EVTYPE, data=df.data, FUN=sum)
df.agg.Population <- df.agg.Population[order(-df.agg.Population$POPULATION),]
df.dmg.Data <- head(df.agg.Population)

The following table shows the event types with the biggest impact on population damage.

df.dmg.Data
##                EVTYPE POPULATION
## 228           TORNADO      97068
## 98               HEAT      12400
## 71              FLOOD      10128
## 233         TSTM WIND       7461
## 164         LIGHTNING       6048
## 226 THUNDERSTORM WIND       2685

Most damage to population is caused by Tornado and Heat.

Calculate the economy information

df.data$CROP_DMG <- df.data$CROPDMG * df.data$CROPDMGEXP
df.data$PROP_DMG <- df.data$PROPDMG * df.data$PROPDMGEXP
df.data$TOTAL_DMG <- df.data$CROP_DMG + df.data$PROP_DMG
df.agg.TOTAL_DMG <- aggregate(TOTAL_DMG~EVTYPE, data=df.data, FUN=sum)
df.agg.TOTAL_DMG <- df.agg.TOTAL_DMG[order(-df.agg.TOTAL_DMG$TOTAL_DMG),]
df.agg.Data <- head(df.agg.TOTAL_DMG)

The following table shows the event types with the biggest impact on economy damage.

df.agg.Data
##                  EVTYPE TOTAL_DMG
## 71                FLOOD 1.806e+11
## 144 HURRICANE (TYPHOON) 9.016e+10
## 228             TORNADO 5.902e+10
## 220         STORM SURGE 4.332e+10
## 96                 HAIL 1.913e+10
## 46              DROUGHT 1.502e+10

Most damage to economy is caused by Flood and Hurricane.

Plot health information:

barplot(as.numeric(df.dmg.Data$POPULATION), 
        names.arg=as.vector(df.dmg.Data$EVTYPE),
        ylab="Injured People",
        main="Damage To Population Health")

plot of chunk unnamed-chunk-11

Plot economy information:

barplot(as.numeric(df.agg.Data$TOTAL_DMG) / 1000000, 
        names.arg=as.vector(df.agg.Data$EVTYPE),
        ylab="Damage To Economy (Millionen)",
        main="Damage To Economy")

plot of chunk unnamed-chunk-12

Environment

sessionInfo()
## R version 3.1.0 (2014-04-10)
## Platform: x86_64-pc-linux-gnu (64-bit)
## 
## locale:
##  [1] LC_CTYPE=de_DE.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=de_DE.UTF-8        LC_COLLATE=de_DE.UTF-8    
##  [5] LC_MONETARY=de_DE.UTF-8    LC_MESSAGES=de_DE.UTF-8   
##  [7] LC_PAPER=de_DE.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=de_DE.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] stringr_0.6.2
## 
## loaded via a namespace (and not attached):
## [1] codetools_0.2-8  digest_0.6.4     evaluate_0.5.5   formatR_0.10    
## [5] htmltools_0.2.4  knitr_1.6        rmarkdown_0.2.46 tools_3.1.0     
## [9] yaml_2.1.13