In this analysis we will explore the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database and try to determine which types of events are most harmful to population health and which types of events have the greatest economic consequences.
We will download the data from the NOAA database and load it as a data table, since data tables are easier to manipulate and generally faster to work with. The file contains data from 1950 through November 2011.
library(data.table)
library(ggplot2)
# Download the NOAA storm data archive (once) and load it as a data.table.
url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
# No leading "/" on the file name: file.path() already inserts the separator.
f <- file.path(getwd(), "repdata%2Fdata%2FStormData.csv.bz2")
# Fetch the archive only when it is not already on disk, so repeated runs do
# not download it again. mode = "wb" writes the bzip2 archive as binary.
if (!file.exists(f)) {
  download.file(url = url, destfile = f, mode = "wb")
}
# read.csv() reads the bzip2-compressed file directly.
data0 <- read.csv(f)
# Convert to a data.table by reference instead of copying the whole table.
setDT(data0)
Trimming the data table to only contain the information that we need in order to get the results.
# Columns needed for the health and economic summaries.
keepcols <- c("EVTYPE", "FATALITIES", "INJURIES", "PROPDMG",
              "PROPDMGEXP", "CROPDMG", "CROPDMGEXP")
# Everything else is dropped from the table by reference.
colsremove <- setdiff(colnames(data0), keepcols)
data0[, (colsremove) := NULL]
# Keep only rows whose event type is not "?" and that report at least one
# injury, fatality, property-damage or crop-damage amount above zero.
data0 <- data0[
  EVTYPE != "?" &
    (INJURIES > 0 | FATALITIES > 0 | PROPDMG > 0 | CROPDMG > 0),
  keepcols,
  with = FALSE
]
Cleaning the PROPDMGEXP and CROPDMGEXP columns so that they can be used to calculate the property and crop costs: the alphanumeric damage exponents are mapped to numeric multipliers.
# Normalise the damage-exponent columns and turn them into numeric multipliers.
cols <- c("PROPDMGEXP", "CROPDMGEXP")
# Upper-case first so that e.g. "k" and "K" share one lookup entry.
data0[, (cols) := lapply(.SD, toupper), .SDcols = cols]

# Exponent code -> multiplier. Digits are read as powers of ten; H, K, M and B
# as hundred, thousand, million and billion; "-", "+" and "?" as 1.
# An empty exponent cannot be a lookup name (indexing a named vector by ""
# never matches), so it is handled by the fallback in expToMultiplier()
# together with every other unlisted code.
propDmgKey <- c("-" = 10^0,
                "+" = 10^0,
                "?" = 10^0,
                "0" = 10^0,
                "1" = 10^1,
                "2" = 10^2,
                "3" = 10^3,
                "4" = 10^4,
                "5" = 10^5,
                "6" = 10^6,
                "7" = 10^7,
                "8" = 10^8,
                "9" = 10^9,
                "H" = 10^2,
                "K" = 10^3,
                "M" = 10^6,
                "B" = 10^9)
# The crop exponent is decoded with the same table so that a given code means
# the same multiplier in both damage columns.
cropDmgKey <- propDmgKey

# Translate a vector of exponent codes into multipliers using `key`.
# Codes missing from `key` (including "" and NA) fall back to a multiplier
# of 1, i.e. the damage amount is taken at face value.
expToMultiplier <- function(codes, key) {
  multiplier <- unname(key[as.character(codes)])
  multiplier[is.na(multiplier)] <- 10^0
  multiplier
}

data0[, PROPDMGEXP := expToMultiplier(PROPDMGEXP, propDmgKey)]
data0[, CROPDMGEXP := expToMultiplier(CROPDMGEXP, cropDmgKey)]
Adding economic cost columns and calculating the total property and crop costs, as well as the total fatalities and injuries.
# Per-record costs: each reported damage amount times its numeric multiplier.
data0 <- data0[, .(
  EVTYPE, FATALITIES, INJURIES,
  PROPDMG, PROPDMGEXP, propCost = PROPDMG * PROPDMGEXP,
  CROPDMG, CROPDMGEXP, cropCost = CROPDMG * CROPDMGEXP
)]
# Property, crop and combined cost summed per event type.
totalCostDT <- data0[, .(
  propCost = sum(propCost),
  cropCost = sum(cropCost),
  Total_Cost = sum(propCost) + sum(cropCost)
), by = .(EVTYPE)]
# Keep the ten event types with the highest combined cost. head() is used
# instead of indexing with 1:10 so that a table with fewer than ten event
# types is not padded with NA rows.
totalCostDT <- head(totalCostDT[order(-Total_Cost)], 10)
head(totalCostDT, 5)
## EVTYPE propCost cropCost Total_Cost
## 1: FLOOD 144657709807 5661968450 150319678257
## 2: HURRICANE/TYPHOON 69305840000 2607872800 71913712800
## 3: TORNADO 56947380677 414953270 57362333947
## 4: STORM SURGE 43323536000 5000 43323541000
## 5: HAIL 15735267513 3025954473 18761221986
# Fatalities, injuries and their sum per event type.
totalInjuriesDT <- data0[, .(
  FATALITIES = sum(FATALITIES),
  INJURIES = sum(INJURIES),
  total = sum(FATALITIES) + sum(INJURIES)
), by = .(EVTYPE)]
# Rank by fatalities and keep the top ten. head() is used instead of
# indexing with 1:10 so that a table with fewer than ten event types is not
# padded with NA rows.
totalInjuriesDT <- head(totalInjuriesDT[order(-FATALITIES)], 10)
head(totalInjuriesDT, 5)
## EVTYPE FATALITIES INJURIES total
## 1: TORNADO 5633 91346 96979
## 2: EXCESSIVE HEAT 1903 6525 8428
## 3: FLASH FLOOD 978 1777 2755
## 4: HEAT 937 2100 3037
## 5: LIGHTNING 816 5230 6046
Melting the data tables so that they are easier to plot as bar charts, and then plotting the resulting data tables.
# Reshape to long format: one row per event type and health measure, with the
# measure name stored in `bad_thing`.
bad_stuff <- melt(
  totalInjuriesDT,
  id.vars = "EVTYPE",
  measure.vars = setdiff(names(totalInjuriesDT), "EVTYPE"),
  variable.name = "bad_thing"
)
head(bad_stuff, 5)
## EVTYPE bad_thing value
## 1: TORNADO FATALITIES 5633
## 2: EXCESSIVE HEAT FATALITIES 1903
## 3: FLASH FLOOD FATALITIES 978
## 4: HEAT FATALITIES 937
## 5: LIGHTNING FATALITIES 816
# Dodged bar chart of the health measures per event type, with the x axis
# ordered by descending value, rotated axis labels and a centred title.
healthChart <- ggplot(
  bad_stuff,
  aes(x = reorder(EVTYPE, -value), y = value)
) +
  geom_bar(aes(fill = bad_thing), stat = "identity", position = "dodge") +
  labs(x = "Event Type", y = "Frequency Count", title = "Top 10 US Killers") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(hjust = 0.5)
  )
healthChart
# Reshape to long format: one row per event type and cost measure, with the
# measure name stored in `Damage_Type`.
econ_consequences <- melt(
  totalCostDT,
  id.vars = "EVTYPE",
  measure.vars = setdiff(names(totalCostDT), "EVTYPE"),
  variable.name = "Damage_Type"
)
head(econ_consequences, 5)
## EVTYPE Damage_Type value
## 1: FLOOD propCost 144657709807
## 2: HURRICANE/TYPHOON propCost 69305840000
## 3: TORNADO propCost 56947380677
## 4: STORM SURGE propCost 43323536000
## 5: HAIL propCost 15735267513
# Dodged bar chart of the cost measures per event type, with the x axis
# ordered by descending value, rotated axis labels and a centred title.
econChart <- ggplot(
  econ_consequences,
  aes(x = reorder(EVTYPE, -value), y = value)
) +
  geom_bar(aes(fill = Damage_Type), stat = "identity", position = "dodge") +
  labs(
    x = "Event Type",
    y = "Cost (dollars)",
    title = "Top 10 US Storm Events causing Economic Consequences"
  ) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(hjust = 0.5)
  )
econChart
Thus, from our analysis we can observe that tornadoes are the most harmful event type in terms of fatalities and injuries, and that floods have the greatest economic consequences.