This is a brief analysis of the impact of different weather events, as collected in U.S. National Oceanic and Atmospheric Administration's (NOAA) storm database, over US population health and US economy. This NOAA storm database contains data gathered on a period spanned from 1950 to 2011. My analysis addresses the following 2 simple questions:
As NOAA storm database has many inconsistencies due to the long time period and different collecting ways and sources my analysis didn't manage to resolve all these mismatches.
The input file for this brief analysis is NOAA storm database zipped file repdata_data_StormData.csv.bz2 downloaded from Coursera Reproducible Research course web site at June 22, 2014: Storm Data. This database structure is described in 2 associated files, downloadable also from Coursera Reproducible Research course web site :
Unzipping NOAA storm database zipped file produces repdata_data_StormData.csv which will be read into a data frame, df:
library("lubridate")
df <- read.csv('repdata_data_StormData.csv')
For each of the 2 required questions we create a suitable subset of the input dataframe containg only the needed columns:
dfFatInj <- df[,c("EVTYPE", "BGN_DATE", "FATALITIES", "INJURIES")]
dfDmg <- df[,c("EVTYPE", "BGN_DATE", "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP")]
Construct useful DateTime columns:
dfFatInj$year <- year(as.Date(as.character(dfFatInj$BGN_DATE), "%m/%d/%Y %H:%M"))
dfFatInj$month <- month(as.Date(as.character(dfFatInj$BGN_DATE), "%m/%d/%Y %H:%M"))
dfDmg$year <- year(as.Date(as.character(dfDmg$BGN_DATE), "%m/%d/%Y %H:%M"))
dfDmg$month <- month(as.Date(as.character(dfDmg$BGN_DATE), "%m/%d/%Y %H:%M"))
Factor month column for a monthly analysis:
monthMap <- c("January", "February", "March",
"April", "May", "June",
"July", "August", "September",
"October", "November", "December")
dfFatInj$month = as.factor(monthMap[dfFatInj$month])
dfDmg$month = as.factor(monthMap[dfDmg$month])
Prepare other columns:
dfFatInj$EVTYPE <- toupper(dfFatInj$EVTYPE)
dfDmg$EVTYPE <- toupper(dfDmg$EVTYPE)
dfDmg$PROPDMGEXP <- toupper(dfDmg$PROPDMGEXP)
dfDmg$CROPDMGEXP <- toupper(dfDmg$CROPDMGEXP)
dfDmg$PROPDMGEXP <- as.factor(dfDmg$PROPDMGEXP)
dfDmg$CROPDMGEXP <- as.factor(dfDmg$CROPDMGEXP)
levels(dfDmg$CROPDMGEXP)
## [1] "" "?" "0" "2" "B" "K" "M"
levels(dfDmg$PROPDMGEXP)
## [1] "" "-" "?" "+" "0" "1" "2" "3" "4" "5" "6" "7" "8" "B" "H" "K" "M"
Aggregate property and crop damages from 2 columns (mantissa + exponent) into only 1 numeric column: PROPDMG_WO_EXP, CROPDMG_WO_EXP:
applyExponent <- function(mantissa, exponent) {
num <- mantissa *
ifelse( exponent %in% c('0','1','2','3','4','5','6','7','8'),
10^as.numeric(exponent),
ifelse( exponent == 'H',
10^2,
ifelse( exponent == 'K',
10^3,
ifelse( exponent == 'M',
10^6,
ifelse( exponent == 'B',
10^9, 1)))))
num
}
dfDmg$PROPDMG_WO_EXP <- applyExponent(dfDmg$PROPDMG, dfDmg$PROPDMGEXP)
dfDmg$CROPDMG_WO_EXP <- applyExponent(dfDmg$CROPDMG, dfDmg$CROPDMGEXP)
Find first 12 weather causes for human fatalities:
fatalities <- aggregate(dfFatInj$FATALITIES , by=list(dfFatInj$EVTYPE), FUN=sum, na.rm=TRUE)
str(fatalities)
## 'data.frame': 898 obs. of 2 variables:
## $ Group.1: chr " HIGH SURF ADVISORY" " COASTAL FLOOD" " FLASH FLOOD" " LIGHTNING" ...
## $ x : num 0 0 0 0 0 0 0 0 0 0 ...
colnames(fatalities) <- c("EVTYPE", "FATALITIES")
fatalities <- fatalities[ order(-fatalities$FATALITIES), ]
fatalities <- fatalities[1:12,]
fatalities
## EVTYPE FATALITIES
## 758 TORNADO 5633
## 116 EXCESSIVE HEAT 1903
## 138 FLASH FLOOD 978
## 243 HEAT 937
## 418 LIGHTNING 816
## 779 TSTM WIND 504
## 154 FLOOD 470
## 524 RIP CURRENT 368
## 320 HIGH WIND 248
## 19 AVALANCHE 224
## 888 WINTER STORM 206
## 525 RIP CURRENTS 204
Find first 12 weather causes for human injuries:
injuries <- aggregate(dfFatInj$INJURIES , by=list(dfFatInj$EVTYPE), FUN=sum, na.rm=TRUE)
str(injuries)
## 'data.frame': 898 obs. of 2 variables:
## $ Group.1: chr " HIGH SURF ADVISORY" " COASTAL FLOOD" " FLASH FLOOD" " LIGHTNING" ...
## $ x : num 0 0 0 0 0 0 0 0 0 0 ...
colnames(injuries) <- c("EVTYPE", "INJURIES")
injuries <- injuries[ order(-injuries$INJURIES), ]
injuries <- injuries[1:12,]
injuries
## EVTYPE INJURIES
## 758 TORNADO 91346
## 779 TSTM WIND 6957
## 154 FLOOD 6789
## 116 EXCESSIVE HEAT 6525
## 418 LIGHTNING 5230
## 243 HEAT 2100
## 387 ICE STORM 1975
## 138 FLASH FLOOD 1777
## 685 THUNDERSTORM WIND 1488
## 212 HAIL 1361
## 888 WINTER STORM 1321
## 372 HURRICANE/TYPHOON 1275
Find first 12 weather causes for economic damages, the sum of property and crop damages:
dfDmg$DMG <- dfDmg$PROPDMG_WO_EXP + dfDmg$CROPDMG_WO_EXP
damages <- aggregate(dfDmg$DMG , by=list(dfDmg$EVTYPE), FUN=sum, na.rm=TRUE)
str(damages)
## 'data.frame': 898 obs. of 2 variables:
## $ Group.1: chr " HIGH SURF ADVISORY" " COASTAL FLOOD" " FLASH FLOOD" " LIGHTNING" ...
## $ x : num 200000 0 50000 0 8100000 8000 0 0 5000 0 ...
colnames(damages) <- c("EVTYPE", "DAMAGES")
damages <- damages[ order(-damages$DAMAGES), ]
damages <- damages[1:12,]
damages
## EVTYPE DAMAGES
## 138 FLASH FLOOD 6.820e+13
## 711 THUNDERSTORM WINDS 2.087e+13
## 758 TORNADO 1.079e+12
## 212 HAIL 3.188e+11
## 418 LIGHTNING 1.730e+11
## 154 FLOOD 1.503e+11
## 372 HURRICANE/TYPHOON 7.191e+10
## 168 FLOODING 5.922e+10
## 599 STORM SURGE 4.332e+10
## 274 HEAVY SNOW 1.807e+10
## 84 DROUGHT 1.502e+10
## 363 HURRICANE 1.461e+10
For the first question: Across the United States, which types of weather events are most harmful with respect to population health? the answer is represented by the following graphics:
library("ggplot2")
library(grid)
library(gridExtra)
pFatalities <- ggplot(fatalities, aes(x = reorder(EVTYPE, FATALITIES), y = FATALITIES, fill = EVTYPE)) +
geom_bar(colour="black", position = "dodge", stat="identity", fill = "red", width=.7) +
labs(title=("Fig.1 Top 12 events for highest fatalities")) +
ylab("Fatalities count") + xlab("Events") + coord_flip()
pFatalities
pInjuries <- ggplot(injuries, aes(x = reorder(EVTYPE, INJURIES), y = INJURIES, fill = EVTYPE)) +
geom_bar(colour="black", position = "dodge", stat="identity", fill = "yellow", width=.7) +
labs(title=("Fig.2 Top 12 events for highest injuries")) +
ylab("Injuries count") + xlab("Events") + coord_flip()
pInjuries
For the second question: Across the United States, which types of events have the greatest economic consequences? the answer is represented by the following graphics:
pDamages <- ggplot(damages, aes(x = reorder(EVTYPE, DAMAGES), y = DAMAGES / 10^9, fill = EVTYPE)) +
geom_bar(colour="black", position = "dodge", stat="identity", fill = "grey", width=.7) +
labs(title=("Fig.3 Impact of 12 worst weather events on US economy")) +
ylab("Damages in US $ billions") + xlab("Events") + coord_flip()
pDamages