The assignment is to use the NOAA Storm Database to explore the effects of severe weather events on both population and economy. The database includes the time period between 1950 and November 2011.
The analysis investigates what types of severe weather events are most harmful on:
The data for this assignment can be downloaded from the course web site:
library("data.table")
library("ggplot2")
# Downloading file
fileUrl <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
download.file(fileUrl, destfile="/home/dugi/coursera/Course5_week4/repdata_data_StormData.csv.bz2")
# Reading in csv file into a data.frame
stormFile <- read.csv("/home/dugi/coursera/Course5_week4/repdata_data_StormData.csv.bz2")
# Converting data.table
stormData <- as.data.table(stormFile)
colnames(stormData)
## [1] "STATE__" "BGN_DATE" "BGN_TIME" "TIME_ZONE" "COUNTY"
## [6] "COUNTYNAME" "STATE" "EVTYPE" "BGN_RANGE" "BGN_AZI"
## [11] "BGN_LOCATI" "END_DATE" "END_TIME" "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE" "END_AZI" "END_LOCATI" "LENGTH" "WIDTH"
## [21] "F" "MAG" "FATALITIES" "INJURIES" "PROPDMG"
## [26] "PROPDMGEXP" "CROPDMG" "CROPDMGEXP" "WFO" "STATEOFFIC"
## [31] "ZONENAMES" "LATITUDE" "LONGITUDE" "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS" "REFNUM"
head(stormData)
## STATE__ BGN_DATE BGN_TIME TIME_ZONE COUNTY COUNTYNAME STATE
## 1: 1 4/18/1950 0:00:00 0130 CST 97 MOBILE AL
## 2: 1 4/18/1950 0:00:00 0145 CST 3 BALDWIN AL
## 3: 1 2/20/1951 0:00:00 1600 CST 57 FAYETTE AL
## 4: 1 6/8/1951 0:00:00 0900 CST 89 MADISON AL
## 5: 1 11/15/1951 0:00:00 1500 CST 43 CULLMAN AL
## 6: 1 11/15/1951 0:00:00 2000 CST 77 LAUDERDALE AL
## EVTYPE BGN_RANGE BGN_AZI BGN_LOCATI END_DATE END_TIME COUNTY_END
## 1: TORNADO 0 0
## 2: TORNADO 0 0
## 3: TORNADO 0 0
## 4: TORNADO 0 0
## 5: TORNADO 0 0
## 6: TORNADO 0 0
## COUNTYENDN END_RANGE END_AZI END_LOCATI LENGTH WIDTH F MAG FATALITIES
## 1: NA 0 14.0 100 3 0 0
## 2: NA 0 2.0 150 2 0 0
## 3: NA 0 0.1 123 2 0 0
## 4: NA 0 0.0 100 2 0 0
## 5: NA 0 0.0 150 2 0 0
## 6: NA 0 1.5 177 2 0 0
## INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP WFO STATEOFFIC ZONENAMES
## 1: 15 25.0 K 0
## 2: 0 2.5 K 0
## 3: 2 25.0 K 0
## 4: 2 2.5 K 0
## 5: 2 2.5 K 0
## 6: 6 2.5 K 0
## LATITUDE LONGITUDE LATITUDE_E LONGITUDE_ REMARKS REFNUM
## 1: 3040 8812 3051 8806 1
## 2: 3042 8755 0 0 2
## 3: 3340 8742 0 0 3
## 4: 3458 8626 0 0 4
## 5: 3412 8642 0 0 5
## 6: 3450 8748 0 0 6
Subset dataset for parameters we want and we remove the columns we do not need.
# Setting vector colsRemove to columns to remove
colsRemove <- colnames(stormData[, !c("EVTYPE", "FATALITIES", "INJURIES", "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP")])
# Removing columns
stormData[, c(colsRemove) := NULL]
# Need only data where fatalities, injuries, property damage or crop damage are > 0
stormData <- stormData[(EVTYPE != "?" &
(FATALITIES > 0 | INJURIES > 0 | PROPDMG > 0 | CROPDMG > 0)),
c("EVTYPE", "FATALITIES", "INJURIES", "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP")]
Need to fix PROPDMGEXP & CROPDMGEXP columns so they can be used to calculate in dollar amounts.
# Change all damage exponents to uppercase.
cols <- c("PROPDMGEXP", "CROPDMGEXP")
stormData[, (cols) := c(lapply(.SD, toupper)), .SDcols = cols]
# Verifying values in PROPDMGEXP and CROPDMGEXP
table(stormData$PROPDMGEXP)
##
## - + 0 2 3 4 5 6 7
## 11585 1 5 210 1 1 4 18 3 3
## B H K M
## 40 7 231427 11327
table(stormData$CROPDMGEXP)
##
## ? 0 B K M
## 152663 6 17 7 99953 1986
# Map property damage alphanumeric exponents to numeric values.
propDmgKey <- c("\"\"" = 10^0,
"-" = 10^0,
"+" = 10^0,
"0" = 10^0,
"1" = 10^1,
"2" = 10^2,
"3" = 10^3,
"4" = 10^4,
"5" = 10^5,
"6" = 10^6,
"7" = 10^7,
"8" = 10^8,
"9" = 10^9,
"H" = 10^2,
"K" = 10^3,
"M" = 10^6,
"B" = 10^9)
# Map crop damage alphanumeric exponents to numeric values
cropDmgKey <- c("\"\"" = 10^0,
"?" = 10^0,
"0" = 10^0,
"K" = 10^3,
"M" = 10^6,
"B" = 10^9)
stormData[, PROPDMGEXP := propDmgKey[as.character(stormData[,PROPDMGEXP])]]
stormData[is.na(PROPDMGEXP), PROPDMGEXP := 10^0 ]
stormData[, CROPDMGEXP := cropDmgKey[as.character(stormData[,CROPDMGEXP])] ]
stormData[is.na(CROPDMGEXP), CROPDMGEXP := 10^0 ]
#stormData$PROPCOST <- stormData$PROPDMG %*% stormData$PROPDMGEXP
stormData <- stormData[, .(EVTYPE, FATALITIES, INJURIES, PROPDMG, PROPDMGEXP, propCost = PROPDMG * PROPDMGEXP, CROPDMG, CROPDMGEXP, cropCost = CROPDMG * CROPDMGEXP)]
totalCostData <- stormData[, .(propCost=sum(propCost), cropCost=sum(cropCost), totalCost=sum(propCost) + sum(cropCost)), by = .(EVTYPE)]
totalCostData <- totalCostData[order(-totalCost), ]
totalCostData <- totalCostData[1:10, ]
totalCostData
## EVTYPE propCost cropCost totalCost
## 1: FLOOD 144657709807 5661968450 150319678257
## 2: HURRICANE/TYPHOON 69305840000 2607872800 71913712800
## 3: TORNADO 56947380676 414953270 57362333946
## 4: STORM SURGE 43323536000 5000 43323541000
## 5: HAIL 15735267513 3025954473 18761221986
## 6: FLASH FLOOD 16822673978 1421317100 18243991078
## 7: DROUGHT 1046106000 13972566000 15018672000
## 8: HURRICANE 11868319010 2741910000 14610229010
## 9: RIVER FLOOD 5118945500 5029459000 10148404500
## 10: ICE STORM 3944927860 5022113500 8967041360
totalInjuryData <- stormData[, .(FATALITIES=sum(FATALITIES), INJURIES=sum(INJURIES), totalInjury=sum(FATALITIES) + sum(INJURIES)), by = .(EVTYPE)]
totalInjuryData <- totalInjuryData[order(-FATALITIES), ]
totalInjuryData <- totalInjuryData[1:10, ]
totalInjuryData
## EVTYPE FATALITIES INJURIES totalInjury
## 1: TORNADO 5633 91346 96979
## 2: EXCESSIVE HEAT 1903 6525 8428
## 3: FLASH FLOOD 978 1777 2755
## 4: HEAT 937 2100 3037
## 5: LIGHTNING 816 5230 6046
## 6: TSTM WIND 504 6957 7461
## 7: FLOOD 470 6789 7259
## 8: RIP CURRENT 368 232 600
## 9: HIGH WIND 248 1137 1385
## 10: AVALANCHE 224 170 394
# Transpose data.table to put into histogram
harmPopData <- melt(totalInjuryData, id.vars="EVTYPE", variable.name = "badOutcome")
head(harmPopData)
## EVTYPE badOutcome value
## 1: TORNADO FATALITIES 5633
## 2: EXCESSIVE HEAT FATALITIES 1903
## 3: FLASH FLOOD FATALITIES 978
## 4: HEAT FATALITIES 937
## 5: LIGHTNING FATALITIES 816
## 6: TSTM WIND FATALITIES 504
# Create Histogram
harmPopHist <- ggplot(harmPopData, aes(x=reorder(EVTYPE, -value), y=value)) +
geom_bar(stat="identity", aes(fill=badOutcome), position="dodge") +
ylab("Frequency") + xlab("Event") + ggtitle("Top 10 Harmful Storm Events in U.S.") +
theme(axis.text.x = element_text(angle=45, hjust=1)) +
theme(plot.title = element_text(hjust = 0.5))
harmPopHist
# Transpose data.table to put into histogram
badEconomyData <- melt(totalCostData, id.vars="EVTYPE", variable.name = "damageType")
head(badEconomyData)
## EVTYPE damageType value
## 1: FLOOD propCost 144657709807
## 2: HURRICANE/TYPHOON propCost 69305840000
## 3: TORNADO propCost 56947380676
## 4: STORM SURGE propCost 43323536000
## 5: HAIL propCost 15735267513
## 6: FLASH FLOOD propCost 16822673978
# Create Histogram
harmEconHist <- ggplot(badEconomyData, aes(x=reorder(EVTYPE, -value), y=value)) +
geom_bar(stat="identity", aes(fill=damageType), position="dodge") +
ylab("Cost (dollars)") + xlab("Event") + ggtitle("Top 10 Harmful Economic Storm Events in U.S.") +
theme(axis.text.x = element_text(angle=45, hjust=1)) +
theme(plot.title = element_text(hjust = 0.5))
harmEconHist