The document is an author’s analytical attempt to explore the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database. This database tracks characteristics of major storms and weather events in the United States, including when and where they occur, as well as estimates of any fatalities, injuries, and property damage.
Storms and other severe weather events can cause both public health and economic problems for communities and municipalities. Many severe events can result in fatalities, injuries, and property damage, and preventing such outcomes to the extent possible is a key concern.
The objective here is to answer the question of the impact of these weather events, focusing mainly on injuries, fatalities, and crop and property damage.
# Names of the bzip2-compressed download and of the CSV extracted from it.
stormDataZipFile <- "repdata-data-StormData.csv.bz2"
unZippedFile <- "StormData.csv"
# Resolve both files against the current working directory.
projDir <- getwd()
zipFilePath <- file.path(projDir, stormDataZipFile)
unzippedDataPath <- file.path(projDir, unZippedFile)
# Decompress only when the CSV is not on disk yet. `remove = FALSE` keeps the
# archive; `skip = TRUE` leaves an already existing destination untouched.
# `header = TRUE` is a read.csv() option (and its default), not a
# decompression option, so it is no longer passed to bunzip2().
if (!file.exists(unzippedDataPath)) {
  bunzip2(zipFilePath, unzippedDataPath, remove = FALSE, skip = TRUE)
}
# Read from the same path that was just checked/created.
stormRawData <- read.csv(unzippedDataPath)
# Columns needed for the health and economic impact analysis.
events <- c(
  "EVTYPE",
  "FATALITIES", "INJURIES",
  "PROPDMG", "PROPDMGEXP",
  "CROPDMG", "CROPDMGEXP"
)
# Keep only those columns of the raw storm data.
filteredEventData <- stormRawData[events]
# For memory concerns, caching the read may be necessary.
# NOTE(review): this looks like a knitr chunk option that ended up in the
# code; as written it only creates a variable named `cache` -- confirm intent.
cache <- TRUE
head(filteredEventData)
## EVTYPE FATALITIES INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP
## 1 TORNADO 0 15 25.0 K 0
## 2 TORNADO 0 0 2.5 K 0
## 3 TORNADO 0 2 25.0 K 0
## 4 TORNADO 0 2 2.5 K 0
## 5 TORNADO 0 2 2.5 K 0
## 6 TORNADO 0 6 2.5 K 0
# Sum the injuries recorded for each event type (missing counts are ignored).
injuries <- ddply(
  .data = filteredEventData,
  .variables = .(EVTYPE),
  .fun = summarize,
  totalInjuries = sum(INJURIES, na.rm = TRUE)
)
# Rank event types from most to fewest injuries and show the top ten.
injuries <- injuries[order(injuries[["totalInjuries"]], decreasing = TRUE), ]
head(injuries, n = 10)
## EVTYPE totalInjuries
## 834 TORNADO 91346
## 856 TSTM WIND 6957
## 170 FLOOD 6789
## 130 EXCESSIVE HEAT 6525
## 464 LIGHTNING 5230
## 275 HEAT 2100
## 427 ICE STORM 1975
## 153 FLASH FLOOD 1777
## 760 THUNDERSTORM WIND 1488
## 244 HAIL 1361
dim(injuries)
## [1] 985 2
# Horizontal bar chart of the ten event types with the most injuries,
# bars ordered by total; the legend is suppressed.
injuriesPlot <- ggplot(
  data = head(injuries, n = 10),
  mapping = aes(
    x = reorder(EVTYPE, totalInjuries),
    y = totalInjuries,
    fill = EVTYPE,
    alpha = 0.5
  )
) +
  geom_bar(stat = "identity", fill = "darkblue") +
  coord_flip() +
  labs(x = "Events", y = "Totals Injuries") +
  theme(legend.position = "none")
# Sum the fatalities recorded for each event type (missing counts are ignored).
fatalitiesCase <- ddply(
  filteredEventData, .(EVTYPE), summarize,
  totalFatalities = sum(FATALITIES, na.rm = TRUE)
)
# Rank event types from most to fewest fatalities.
fatalitiesCase <- fatalitiesCase[
  order(fatalitiesCase$totalFatalities, decreasing = TRUE),
]
dim(fatalitiesCase)
## [1] 985 2
# Show the ten deadliest event types. The row count is passed directly as 10
# rather than through `tail(10)`, which merely evaluates to 10.
head(fatalitiesCase, 10)
## EVTYPE totalFatalities
## 834 TORNADO 5633
## 130 EXCESSIVE HEAT 1903
## 153 FLASH FLOOD 978
## 275 HEAT 937
## 464 LIGHTNING 816
## 856 TSTM WIND 504
## 170 FLOOD 470
## 585 RIP CURRENT 368
## 359 HIGH WIND 248
## 19 AVALANCHE 224
# Horizontal bar chart of the ten event types with the most fatalities,
# bars ordered by total; carries the shared title for the health figure.
fatalitiesPlot <- ggplot(
  data = head(fatalitiesCase, n = 10),
  mapping = aes(
    x = reorder(EVTYPE, totalFatalities),
    y = totalFatalities,
    fill = EVTYPE,
    alpha = 0.3
  )
) +
  geom_bar(stat = "identity", fill = "red") +
  coord_flip() +
  labs(
    x = "Events",
    y = "Totals Fatalities",
    title = "Top 10 Weather Events Health Injuries and/or Fatalities impacts in US"
  ) +
  theme(legend.position = "none")
# Inspect the raw magnitude codes that accompany each damage figure.
unique(filteredEventData[["PROPDMGEXP"]])
## [1] K M B m + 0 5 6 ? 4 2 3 h 7 H - 1 8
## Levels: - ? + 0 1 2 3 4 5 6 7 8 B h H K m M
unique(filteredEventData[["CROPDMGEXP"]])
## [1] M K m B ? 0 k 2
## Levels: ? 0 2 B k K m M
# Normalise both code columns the same way: upper-case every code, then
# replace the symbols "?", "-", "+" and the empty string with "0".
filteredEventData[c("PROPDMGEXP", "CROPDMGEXP")] <- lapply(
  filteredEventData[c("PROPDMGEXP", "CROPDMGEXP")],
  function(code) {
    code <- toupper(code)
    code[code %in% c("?", "-", "+", "")] <- "0"
    code
  }
)
unique(filteredEventData$PROPDMGEXP)
## [1] "K" "M" "0" "B" "5" "6" "4" "2" "3" "H" "7" "1" "8"
unique(filteredEventData$CROPDMGEXP)
## [1] "0" "M" "K" "B" "2"
# Translate the letter codes into the power of ten they are used as below
# (B -> 9, M -> 6, K -> 3, H -> 2); codes that are already digits are kept.
# One lookup applied to both columns replaces the per-letter, per-column
# assignments.
filteredEventData[c("PROPDMGEXP", "CROPDMGEXP")] <- lapply(
  filteredEventData[c("PROPDMGEXP", "CROPDMGEXP")],
  function(code) {
    exponentFor <- c(B = "9", M = "6", K = "3", H = "2")
    code <- as.character(code)
    isLetter <- code %in% names(exponentFor)
    code[isLetter] <- unname(exponentFor[code[isLetter]])
    code
  }
)
# Damage amount = reported figure * 10^exponent.
filteredEventData$propertyDamage <-
  filteredEventData$PROPDMG * 10^as.numeric(filteredEventData$PROPDMGEXP)
filteredEventData$cropDamage <-
  filteredEventData$CROPDMG * 10^as.numeric(filteredEventData$CROPDMGEXP)
# Show the first ten rows; the count is passed directly instead of `tail(10)`.
head(filteredEventData, 10)
## EVTYPE FATALITIES INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP
## 1 TORNADO 0 15 25.0 3 0 0
## 2 TORNADO 0 0 2.5 3 0 0
## 3 TORNADO 0 2 25.0 3 0 0
## 4 TORNADO 0 2 2.5 3 0 0
## 5 TORNADO 0 2 2.5 3 0 0
## 6 TORNADO 0 6 2.5 3 0 0
## 7 TORNADO 0 1 2.5 3 0 0
## 8 TORNADO 0 0 2.5 3 0 0
## 9 TORNADO 1 14 25.0 3 0 0
## 10 TORNADO 0 0 25.0 3 0 0
## propertyDamage cropDamage
## 1 25000 0
## 2 2500 0
## 3 25000 0
## 4 2500 0
## 5 2500 0
## 6 2500 0
## 7 2500 0
## 8 2500 0
## 9 25000 0
## 10 25000 0
# Total property and crop damage per event type. `na.rm = TRUE` matches the
# injury and fatality totals and keeps one missing damage value from turning
# a whole event type's total into NA.
financialDamage <- ddply(
  filteredEventData, .(EVTYPE), summarize,
  totalPropertyDamage = sum(propertyDamage, na.rm = TRUE),
  totalCropDamage = sum(cropDamage, na.rm = TRUE)
)
# Keep only event types that caused some crop or property damage.
financialDamage <- financialDamage[
  financialDamage$totalCropDamage > 0 |
    financialDamage$totalPropertyDamage > 0,
]
head(financialDamage)
## EVTYPE totalPropertyDamage totalCropDamage
## 1 HIGH SURF ADVISORY 200000 0
## 3 FLASH FLOOD 50000 0
## 5 TSTM WIND 8100000 0
## 6 TSTM WIND (G45) 8000 0
## 9 ? 5000 0
## 14 AGRICULTURAL FREEZE 0 28820000
# Sort by crop damage first, then re-sort by property damage. The second
# sort determines the final order, so the table ends up ranked by property
# damage (the crop ordering presumably survives only among ties -- verify).
financialDamage <- financialDamage[
  with(financialDamage, order(totalCropDamage, decreasing = TRUE)),
]
financialDamage <- financialDamage[
  with(financialDamage, order(totalPropertyDamage, decreasing = TRUE)),
]
# Horizontal bar chart of property damage for the first ten rows of
# financialDamage (ranked by property damage by the preceding sort).
# Values are divided by 1e9 so the axis really is in billions of dollars,
# as its label states; previously log10 of the dollar amount was plotted
# under that label.
propertyPlot <- ggplot(
  data = head(financialDamage, 10),
  aes(
    x = reorder(EVTYPE, totalPropertyDamage),
    y = totalPropertyDamage / 1e9,
    fill = totalPropertyDamage,
    alpha = .3
  )
) +
  geom_bar(stat = "identity", fill = "darkblue") +
  xlab("Events") +
  ylab("Property Damages [Billions $]") +
  ggtitle("Economical impact of Top 10 Weather Events in the USA") +
  coord_flip() +
  theme(legend.position = "none")
# Horizontal bar chart of the ten event types with the highest crop damage.
# The rows are re-ranked by crop damage here: taking the first ten rows of
# financialDamage as-is would pick the top ten by *property* damage instead.
# Values are divided by 1e9 so the axis matches its "[Billions $]" label
# (this also avoids log10(0) = -Inf for event types without crop damage).
cropPlot <- ggplot(
  data = head(
    financialDamage[
      order(financialDamage$totalCropDamage, decreasing = TRUE),
    ],
    10
  ),
  aes(
    x = reorder(EVTYPE, totalCropDamage),
    y = totalCropDamage / 1e9,
    fill = totalCropDamage,
    alpha = .3
  )
) +
  geom_bar(stat = "identity", fill = "darkgreen") +
  xlab("Events") +
  ylab("Crop Damages [Billions $]") +
  coord_flip() +
  theme(legend.position = "none")
# Figure 1: health impact -- fatalities above injuries.
grid.arrange(
  grobs = list(fatalitiesPlot, injuriesPlot),
  nrow = 2, ncol = 1
)
# Figure 2: economic impact -- property damage above crop damage.
grid.arrange(
  grobs = list(propertyPlot, cropPlot),
  nrow = 2, ncol = 1
)
Tornadoes have the greatest impact on public health, while floods cause the most damage to crops and property among the top 10 weather events, as shown in the plots above.
Reproducible Research Project 2
a Johns Hopkins University Coursera Data Science Specialization course