This analysis reviews the data of the U.S. National Oceanic and Athmosphere Administration’s (NOAA) storm database. This database contains information of major storms and wheather events in the US. We research which wheather event types causes the biggest damage to population and economy.
Source can be downloaded from the following URL:
Source Data: https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2
Documentation: https://d396qusza40orc.cloudfront.net/repdata%2Fpeer2_doc%2Fpd01016005curr.pdf
FAQ: https://d396qusza40orc.cloudfront.net/repdata%2Fpeer2_doc%2FNCDC%20Storm%20Events-FAQ%20Page.pdf
Most damage to population health is caused by tornados. Most damage to economy is caused by Flood.
Load dependent libraries.
library(stringr)
Load the source data files.
# load the source data file
fileURL <- "repdata_data_StormData.csv"
df.data <- read.csv(fileURL,stringsAsFactors = F)
Perform basic clean up of the data by converting the date variables, removing aggregated information and converting the event information to upper case.
df.data$BGN_DATE <- as.Date(substr(df.data$BGN_DATE,1,10), format="%m/%d/%Y")
# we convert to upper case to simply the data cleaning process
df.data$EVTYPE <- toupper(df.data$EVTYPE)
df.data$PROPDMGEXP <- toupper(df.data$PROPDMGEXP)
df.data$CROPDMGEXP <- toupper(df.data$CROPDMGEXP)
# remove summary event observations as it is aggregated data
df.data <- df.data[!str_detect(df.data$EVTYPE,"SUMMARY"),]
Clean up exponent of crop and property damage data. Here we change the exponent value and change the data type to numeric.
# crop damage exponent
df.data$CROPDMGEXP[df.data$CROPDMGEXP=="0"] <- 1
df.data$CROPDMGEXP[df.data$CROPDMGEXP=="?"] <- 1
df.data$CROPDMGEXP[df.data$CROPDMGEXP==""] <- 1
df.data$CROPDMGEXP[df.data$CROPDMGEXP=="2"] <- 100
df.data$CROPDMGEXP[df.data$CROPDMGEXP=="K"] <- 1000
df.data$CROPDMGEXP[df.data$CROPDMGEXP=="M"] <- 1000000
df.data$CROPDMGEXP[df.data$CROPDMGEXP=="B"] <- 1000000000
df.data$CROPDMGEXP <- as.numeric(df.data$CROPDMGEXP)
# property damage exponent
df.data$PROPDMGEXP[df.data$PROPDMGEXP=="0"] <- 1
df.data$PROPDMGEXP[df.data$PROPDMGEXP=="1"] <- 1
df.data$PROPDMGEXP[df.data$PROPDMGEXP==""] <- 1
df.data$PROPDMGEXP[df.data$PROPDMGEXP=="+"] <- 1
df.data$PROPDMGEXP[df.data$PROPDMGEXP=="-"] <- 1
df.data$PROPDMGEXP[df.data$PROPDMGEXP=="?"] <- 1
df.data$PROPDMGEXP[df.data$PROPDMGEXP=="2"] <- 10^2
df.data$PROPDMGEXP[df.data$PROPDMGEXP=="3"] <- 10^3
df.data$PROPDMGEXP[df.data$PROPDMGEXP=="4"] <- 10^4
df.data$PROPDMGEXP[df.data$PROPDMGEXP=="5"] <- 10^5
df.data$PROPDMGEXP[df.data$PROPDMGEXP=="6"] <- 10^6
df.data$PROPDMGEXP[df.data$PROPDMGEXP=="7"] <- 10^7
df.data$PROPDMGEXP[df.data$PROPDMGEXP=="8"] <- 10^8
df.data$PROPDMGEXP[df.data$PROPDMGEXP=="H"] <- 100
df.data$PROPDMGEXP[df.data$PROPDMGEXP=="K"] <- 1000
df.data$PROPDMGEXP[df.data$PROPDMGEXP=="M"] <- 1000000
df.data$PROPDMGEXP[df.data$PROPDMGEXP=="B"] <- 1000000000
df.data$PROPDMGEXP <- as.numeric(df.data$PROPDMGEXP)
Clean up the event data.
# construct list of valid events as listed in referenced documentation
ev.types <- c("Astronomical Low Tide", "Avalanche", "Blizzard", "Coastal Flood", "Cold/Wind Chill", "Debris Flow",
"Dense Fog", "Dense Smoke", "Drought", "Dust Devil", "Dust Storm", "Excessive Heat", "Extreme Cold/Wind Chill",
"Flash Flood", "Flood", "Frost/Freeze", "Funnel Cloud", "Freezing Fog", "Hail", "Heat",
"Heavy Rain", "Heavy Snow", "High Surf", "High Wind", "Hurricane (Typhoon)", "Ice Storm",
"Lake-Effect Snow", "Lakeshore Flood", "Lightning", "Marine Hail",
"Marine High Wind", "Marine Strong Wind", "Marine Thunderstorm Wind", "Rip Current", "Seiche", "Sleet",
"Storm Surge/Tide", "Strong Wind", "Thunderstorm Wind", "Tornado", "Tropical Depression",
"Tropical Storm", "Tsunami", "Volcanic Ash", "Waterspout", "Wildfire",
"Winter Storm", "Winter Weather")
ev.types <- toupper(ev.types)
# special characters and strings
df.data$EVTYPE <- gsub("\\\\"," ",df.data$EVTYPE)
df.data$EVTYPE <- gsub("/"," ",df.data$EVTYPE)
df.data$EVTYPE <- gsub("-"," ",df.data$EVTYPE)
df.data$EVTYPE <- gsub(" & "," ",df.data$EVTYPE)
df.data$EVTYPE <- gsub(" AND "," ",df.data$EVTYPE)
df.data$EVTYPE <- gsub(" "," ",df.data$EVTYPE)
df.data$EVTYPE <- gsub("SML","SMALL",df.data$EVTYPE)
df.data$EVTYPE <- str_trim(df.data$EVTYPE, side="both")
# size
df.data$EVTYPE <- gsub("ABNORMALLY","HIGH",df.data$EVTYPE)
df.data$EVTYPE <- gsub("ABNORMAL","HIGH",df.data$EVTYPE)
df.data$EVTYPE <- gsub("EXTREME","HIGH",df.data$EVTYPE)
df.data$EVTYPE <- gsub("EXTREMELY","HIGH",df.data$EVTYPE)
df.data$EVTYPE <- gsub("FIRST","HIGH",df.data$EVTYPE)
df.data$EVTYPE <- gsub("EXTENDED","HIGH",df.data$EVTYPE)
df.data$EVTYPE <- gsub("HEAVY","HIGH",df.data$EVTYPE)
df.data$EVTYPE <- gsub("HOT","HIGH",df.data$EVTYPE)
df.data$EVTYPE <- gsub("HARD","HIGH",df.data$EVTYPE)
df.data$EVTYPE <- gsub("EXCESSIVE","HIGH",df.data$EVTYPE)
df.data$EVTYPE <- gsub("HEAVY","HIGH",df.data$EVTYPE)
df.data$EVTYPE <- gsub("TROPICAL","HIGH",df.data$EVTYPE)
df.data$EVTYPE <- gsub("RECORD","HIGH",df.data$EVTYPE)
df.data$EVTYPE <- gsub("UNSEASONABLY","HIGH",df.data$EVTYPE)
df.data$EVTYPE <- gsub("UNSEASONAL","HIGH",df.data$EVTYPE)
df.data$EVTYPE <- gsub("UNUSUALLY","HIGH",df.data$EVTYPE)
df.data$EVTYPE <- gsub("MONTHLY","HIGH",df.data$EVTYPE)
# wording
df.data$EVTYPE <- gsub("TORNDAO","TORNADO",df.data$EVTYPE)
df.data$EVTYPE <- gsub("WND","WIND",df.data$EVTYPE)
df.data$EVTYPE <- gsub("WINDS","WIND",df.data$EVTYPE)
df.data$EVTYPE <- gsub("STORMS","STORM",df.data$EVTYPE)
df.data$EVTYPE <- gsub("FLOODING","FLOOD",df.data$EVTYPE)
df.data$EVTYPE <- gsub("FLOODS","FLOOD",df.data$EVTYPE)
df.data$EVTYPE <- gsub("SNOWFALL","SNOW",df.data$EVTYPE)
df.data$EVTYPE <- gsub("ICY","ICE",df.data$EVTYPE)
df.data$EVTYPE <- gsub("WATERSPOUTS","WATERSPOUT",df.data$EVTYPE)
df.data$EVTYPE[str_detect(df.data$EVTYPE,"TORNADO")]<-"TORNADO"
df.data$EVTYPE[str_detect(df.data$EVTYPE,"HAIL")]<-"HAIL"
df.data$EVTYPE[str_detect(df.data$EVTYPE,"HIGH WIND")]<-"HIGH WIND"
df.data$EVTYPE[str_detect(df.data$EVTYPE,"LIGHTING")]<-"LIGHTNING"
df.data$EVTYPE[str_detect(df.data$EVTYPE,"HURRICANE")]<-"HURRICANE (TYPHOON)"
df.data$EVTYPE[str_detect(df.data$EVTYPE,"WINDS")]<-"WIND"
df.data$EVTYPE[str_detect(df.data$EVTYPE,"SNOW")]<-"HEAVY SNOW"
df.data$EVTYPE[str_detect(df.data$EVTYPE,"HIGH WARM")]<-"EXCESSIVE HEAT"
df.data$EVTYPE[str_detect(df.data$EVTYPE,"HIGH HEAT")]<-"EXCESSIVE HEAT"
df.data$EVTYPE[str_detect(df.data$EVTYPE,"HIGH RAIN")]<-"HEAVY RAIN"
# typos
df.data$EVTYPE[str_detect(df.data$EVTYPE,"THUNDERSTORM")]<-"THUNDERSTORM WIND"
df.data$EVTYPE[str_detect(df.data$EVTYPE,"THUNDERSTROM")]<-"THUNDERSTORM WIND"
df.data$EVTYPE[str_detect(df.data$EVTYPE,"THUNDERSNOW")]<-"THUNDERSTORM WIND"
df.data$EVTYPE[str_detect(df.data$EVTYPE,"THUNDERTORM")]<-"THUNDERSTORM WIND"
df.data$EVTYPE[str_detect(df.data$EVTYPE,"THUNERSTORM")]<-"THUNDERSTORM WIND"
df.data$EVTYPE[str_detect(df.data$EVTYPE,"THUNDERTSORM")]<-"THUNDERSTORM WIND"
df.data$EVTYPE[str_detect(df.data$EVTYPE,"THUNDERTSORM")]<-"THUNDERSTORM WIND"
df.data$EVTYPE[str_detect(df.data$EVTYPE,"THUNDERSTORM")]<-"THUNDERSTORM WIND"
# constant values
df.data$EVTYPE[df.data$EVTYPE=="VOG"] <- "FOG"
df.data$EVTYPE[df.data$EVTYPE=="RAIN DAMAGE"] <- "HEAVY RAIN"
df.data$EVTYPE[df.data$EVTYPE=="HIGH SHOWERS"] <- "HEAVY RAIN"
df.data$EVTYPE[df.data$EVTYPE=="HIGH SHOWER"] <- "HEAVY RAIN"
df.data$EVTYPE[df.data$EVTYPE=="RAIN (HEAVY)"] <- "HEAVY RAIN"
df.data$EVTYPE[df.data$EVTYPE=="RAINSTORM"] <- "HEAVY RAIN"
df.data$EVTYPE[df.data$EVTYPE=="HEAVY RAINFALL"] <- "HEAVY RAIN"
df.data$EVTYPE[df.data$EVTYPE=="HIGHLY WET"] <- "HEAVY RAIN"
df.data$EVTYPE[df.data$EVTYPE=="HIGH HOT"] <- "EXCESSIVE HEAT"
df.data$EVTYPE[df.data$EVTYPE=="HIGHLY DRY"] <- "EXCESSIVE HEAT"
df.data$EVTYPE[df.data$EVTYPE=="UNUSUAL WARMTH"] <- "EXCESSIVE HEAT"
df.data$EVTYPE[df.data$EVTYPE=="HEAVY TEMPERATURES"] <- "EXCESSIVE HEAT"
df.data$EVTYPE[df.data$EVTYPE=="TEMPERATURE HIGH"] <- "EXCESSIVE HEAT"
df.data$EVTYPE[df.data$EVTYPE=="DRY HOT WEATHER"] <- "EXCESSIVE HEAT"
df.data$EVTYPE[df.data$EVTYPE=="VOLCANIC ERUPTION"] <- "VOLCANIC ASH"
for (t in ev.types ) {
df.data$EVTYPE[str_detect(df.data$EVTYPE,t)] <- t
}
After the cleanup of the source data we have 288 different wheather event types.
length(unique(sort(df.data$EVTYPE)))
## [1] 288
Calculate the population damage information
# damage to population health
df.data$POPULATION <- df.data$FATALITIES + df.data$INJURIES
df.agg.Population <- aggregate(POPULATION~EVTYPE, data=df.data, FUN=sum)
df.agg.Population <- df.agg.Population[order(-df.agg.Population$POPULATION),]
df.dmg.Data <- head(df.agg.Population)
The following table shows the event types with the biggest impact on population damage.
df.dmg.Data
## EVTYPE POPULATION
## 228 TORNADO 97068
## 98 HEAT 12400
## 71 FLOOD 10128
## 233 TSTM WIND 7461
## 164 LIGHTNING 6048
## 226 THUNDERSTORM WIND 2685
Most damage to population is caused by Tornado and Heat.
Calculate the economy information
df.data$CROP_DMG <- df.data$CROPDMG * df.data$CROPDMGEXP
df.data$PROP_DMG <- df.data$PROPDMG * df.data$PROPDMGEXP
df.data$TOTAL_DMG <- df.data$CROP_DMG + df.data$PROP_DMG
df.agg.TOTAL_DMG <- aggregate(TOTAL_DMG~EVTYPE, data=df.data, FUN=sum)
df.agg.TOTAL_DMG <- df.agg.TOTAL_DMG[order(-df.agg.TOTAL_DMG$TOTAL_DMG),]
df.agg.Data <- head(df.agg.TOTAL_DMG)
The following table shows the event types with the biggest impact on economy damage.
df.agg.Data
## EVTYPE TOTAL_DMG
## 71 FLOOD 1.806e+11
## 144 HURRICANE (TYPHOON) 9.016e+10
## 228 TORNADO 5.902e+10
## 220 STORM SURGE 4.332e+10
## 96 HAIL 1.913e+10
## 46 DROUGHT 1.502e+10
Most damage to economy is caused by Flood and Hurricane.
Plot health information:
barplot(as.numeric(df.dmg.Data$POPULATION),
names.arg=as.vector(df.dmg.Data$EVTYPE),
ylab="Injured People",
main="Damage To Population Health")
Plot economy information:
barplot(as.numeric(df.agg.Data$TOTAL_DMG) / 1000000,
names.arg=as.vector(df.agg.Data$EVTYPE),
ylab="Damage To Economy (Millionen)",
main="Damage To Economy")
sessionInfo()
## R version 3.1.0 (2014-04-10)
## Platform: x86_64-pc-linux-gnu (64-bit)
##
## locale:
## [1] LC_CTYPE=de_DE.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=de_DE.UTF-8 LC_COLLATE=de_DE.UTF-8
## [5] LC_MONETARY=de_DE.UTF-8 LC_MESSAGES=de_DE.UTF-8
## [7] LC_PAPER=de_DE.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=de_DE.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] stringr_0.6.2
##
## loaded via a namespace (and not attached):
## [1] codetools_0.2-8 digest_0.6.4 evaluate_0.5.5 formatR_0.10
## [5] htmltools_0.2.4 knitr_1.6 rmarkdown_0.2.46 tools_3.1.0
## [9] yaml_2.1.13