The basic goal of this assignment is to explore the NOAA Storm Database and answer some basic questions about severe weather events and their effect on both population and economy.
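The following analysis investigates which types of severe weather events across the United States are most harmful with respect to population health and which have the greatest economic consequences.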
For this purpose the top 10 most impactful storm events (with respect to the above metrics) are examined.
The database covers the period between 1950 and 2011. The data can be downloaded here and is documented here.
Download the file and load it into a data table (better performance than a data frame). `fread` can read bz2 files directly (using `fread` with uncompressed .csv files yields better performance).
# libs
library("data.table")
library("ggplot2")
# data
# Source archive and the local file name it is stored under.
fileUrl <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
destFile <- "repdata%2Fdata%2FStormData.csv.bz2"
# Download only when the archive is not on disk yet, so re-running the analysis
# does not fetch it again. mode = "wb" requests a binary transfer explicitly,
# so the compressed archive is never written as text.
if (!file.exists(destFile)) {
  download.file(fileUrl, destfile = destFile, mode = "wb")
}
# Read the compressed archive straight into a data.table.
stormData <- fread(destFile)
# stormData <- fread("repdata%2Fdata%2FStormData.csv")
# Per-column summary of the raw table.
summary(stormData)
## STATE__ BGN_DATE BGN_TIME TIME_ZONE
## Min. : 1.0 Length:902297 Length:902297 Length:902297
## 1st Qu.:19.0 Class :character Class :character Class :character
## Median :30.0 Mode :character Mode :character Mode :character
## Mean :31.2
## 3rd Qu.:45.0
## Max. :95.0
##
## COUNTY COUNTYNAME STATE EVTYPE
## Min. : 0.0 Length:902297 Length:902297 Length:902297
## 1st Qu.: 31.0 Class :character Class :character Class :character
## Median : 75.0 Mode :character Mode :character Mode :character
## Mean :100.6
## 3rd Qu.:131.0
## Max. :873.0
##
## BGN_RANGE BGN_AZI BGN_LOCATI END_DATE
## Min. : 0.000 Length:902297 Length:902297 Length:902297
## 1st Qu.: 0.000 Class :character Class :character Class :character
## Median : 0.000 Mode :character Mode :character Mode :character
## Mean : 1.484
## 3rd Qu.: 1.000
## Max. :3749.000
##
## END_TIME COUNTY_END COUNTYENDN END_RANGE
## Length:902297 Min. :0 Mode:logical Min. : 0.0000
## Class :character 1st Qu.:0 NA's:902297 1st Qu.: 0.0000
## Mode :character Median :0 Median : 0.0000
## Mean :0 Mean : 0.9862
## 3rd Qu.:0 3rd Qu.: 0.0000
## Max. :0 Max. :925.0000
##
## END_AZI END_LOCATI LENGTH WIDTH
## Length:902297 Length:902297 Min. : 0.0000 Min. : 0.000
## Class :character Class :character 1st Qu.: 0.0000 1st Qu.: 0.000
## Mode :character Mode :character Median : 0.0000 Median : 0.000
## Mean : 0.2301 Mean : 7.503
## 3rd Qu.: 0.0000 3rd Qu.: 0.000
## Max. :2315.0000 Max. :4400.000
##
## F MAG FATALITIES INJURIES
## Min. :0.0 Min. : 0.0 Min. : 0.0000 Min. : 0.0000
## 1st Qu.:0.0 1st Qu.: 0.0 1st Qu.: 0.0000 1st Qu.: 0.0000
## Median :1.0 Median : 50.0 Median : 0.0000 Median : 0.0000
## Mean :0.9 Mean : 46.9 Mean : 0.0168 Mean : 0.1557
## 3rd Qu.:1.0 3rd Qu.: 75.0 3rd Qu.: 0.0000 3rd Qu.: 0.0000
## Max. :5.0 Max. :22000.0 Max. :583.0000 Max. :1700.0000
## NA's :843563
## PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP
## Min. : 0.00 Length:902297 Min. : 0.000 Length:902297
## 1st Qu.: 0.00 Class :character 1st Qu.: 0.000 Class :character
## Median : 0.00 Mode :character Median : 0.000 Mode :character
## Mean : 12.06 Mean : 1.527
## 3rd Qu.: 0.50 3rd Qu.: 0.000
## Max. :5000.00 Max. :990.000
##
## WFO STATEOFFIC ZONENAMES LATITUDE
## Length:902297 Length:902297 Length:902297 Min. : 0
## Class :character Class :character Class :character 1st Qu.:2802
## Mode :character Mode :character Mode :character Median :3540
## Mean :2875
## 3rd Qu.:4019
## Max. :9706
## NA's :47
## LONGITUDE LATITUDE_E LONGITUDE_ REMARKS
## Min. :-14451 Min. : 0 Min. :-14455 Length:902297
## 1st Qu.: 7247 1st Qu.: 0 1st Qu.: 0 Class :character
## Median : 8707 Median : 0 Median : 0 Mode :character
## Mean : 6940 Mean :1452 Mean : 3509
## 3rd Qu.: 9605 3rd Qu.:3549 3rd Qu.: 8735
## Max. : 17124 Max. :9706 Max. :106220
## NA's :40
## REFNUM
## Min. : 1
## 1st Qu.:225575
## Median :451149
## Mean :451149
## 3rd Qu.:676723
## Max. :902297
##
# Compact structural overview of the loaded table (class and leading values per column).
str(object = stormData)
## Classes 'data.table' and 'data.frame': 902297 obs. of 37 variables:
## $ STATE__ : num 1 1 1 1 1 1 1 1 1 1 ...
## $ BGN_DATE : chr "4/18/1950 0:00:00" "4/18/1950 0:00:00" "2/20/1951 0:00:00" "6/8/1951 0:00:00" ...
## $ BGN_TIME : chr "0130" "0145" "1600" "0900" ...
## $ TIME_ZONE : chr "CST" "CST" "CST" "CST" ...
## $ COUNTY : num 97 3 57 89 43 77 9 123 125 57 ...
## $ COUNTYNAME: chr "MOBILE" "BALDWIN" "FAYETTE" "MADISON" ...
## $ STATE : chr "AL" "AL" "AL" "AL" ...
## $ EVTYPE : chr "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
## $ BGN_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ BGN_AZI : chr "" "" "" "" ...
## $ BGN_LOCATI: chr "" "" "" "" ...
## $ END_DATE : chr "" "" "" "" ...
## $ END_TIME : chr "" "" "" "" ...
## $ COUNTY_END: num 0 0 0 0 0 0 0 0 0 0 ...
## $ COUNTYENDN: logi NA NA NA NA NA NA ...
## $ END_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ END_AZI : chr "" "" "" "" ...
## $ END_LOCATI: chr "" "" "" "" ...
## $ LENGTH : num 14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
## $ WIDTH : num 100 150 123 100 150 177 33 33 100 100 ...
## $ F : int 3 2 2 2 2 2 2 1 3 3 ...
## $ MAG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ FATALITIES: num 0 0 0 0 0 0 0 0 1 0 ...
## $ INJURIES : num 15 0 2 2 2 6 1 0 14 0 ...
## $ PROPDMG : num 25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
## $ PROPDMGEXP: chr "K" "K" "K" "K" ...
## $ CROPDMG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CROPDMGEXP: chr "" "" "" "" ...
## $ WFO : chr "" "" "" "" ...
## $ STATEOFFIC: chr "" "" "" "" ...
## $ ZONENAMES : chr "" "" "" "" ...
## $ LATITUDE : num 3040 3042 3340 3458 3412 ...
## $ LONGITUDE : num 8812 8755 8742 8626 8642 ...
## $ LATITUDE_E: num 3051 0 0 0 0 ...
## $ LONGITUDE_: num 8806 0 0 0 0 ...
## $ REMARKS : chr "" "" "" "" ...
## $ REFNUM : num 1 2 3 4 5 6 7 8 9 10 ...
## - attr(*, ".internal.selfref")=<externalptr>
Remove the columns that are not needed, for a better overview and easier handling.
# columns the analysis actually uses
neededCols <- c(
  "EVTYPE", "FATALITIES", "INJURIES",
  "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP"
)
# every other column is dropped from the table by reference
notNeededCols <- setdiff(colnames(stormData), neededCols)
stormData[, (notNeededCols) := NULL]
# keep only events that caused harm (damage, injuries or fatalities)
# and skip the "?" event type
stormData <- stormData[
  (PROPDMG > 0 | CROPDMG > 0 | INJURIES > 0 | FATALITIES > 0) & EVTYPE != "?",
  neededCols,
  with = FALSE
]
Cleaning PROPDMGEXP and CROPDMGEXP columns to make later calculations easier.
# change damage exponents to uppercase
cols <- c("PROPDMGEXP", "CROPDMGEXP")
stormData[, (cols) := lapply(.SD, toupper), .SDcols = cols]
# map alphanumeric exponents to numeric multipliers
# Symbols without an entry (the empty string, or "?" for property damage) get
# the multiplier 1 in the is.na() step below, so no entry is needed for them.
expKey1 <- c(
  "-" = 10^0, "+" = 10^0,
  "0" = 10^0, "1" = 10^1, "2" = 10^2, "3" = 10^3, "4" = 10^4,
  "5" = 10^5, "6" = 10^6, "7" = 10^7, "8" = 10^8, "9" = 10^9,
  "H" = 10^2, "K" = 10^3, "M" = 10^6, "B" = 10^9
)
expKey2 <- c("?" = 10^0, "0" = 10^0, "K" = 10^3, "M" = 10^6, "B" = 10^9)
# Replace each exponent symbol by its multiplier; the column turns numeric.
# Inside j the column is referenced directly, no nested stormData[...] needed.
stormData[, PROPDMGEXP := unname(expKey1[PROPDMGEXP])]
stormData[is.na(PROPDMGEXP), PROPDMGEXP := 10^0]
stormData[, CROPDMGEXP := unname(expKey2[CROPDMGEXP])]
stormData[is.na(CROPDMGEXP), CROPDMGEXP := 10^0]
# cost per record = damage figure * multiplier
stormData <- stormData[, .(
  EVTYPE, FATALITIES, INJURIES,
  PROPDMG, PROPDMGEXP, propCost = PROPDMG * PROPDMGEXP,
  CROPDMG, CROPDMGEXP, cropCost = CROPDMG * CROPDMGEXP
)]
# property, crop and combined cost per event type
totalCost <- stormData[, .(propCost = sum(propCost), cropCost = sum(cropCost), totalCost = sum(propCost) + sum(cropCost)), by = .(EVTYPE)]
# most expensive event types first (order() sees the totalCost column here)
totalCost <- totalCost[order(-totalCost), ]
# head() returns at most ten rows; [1:10, ] would pad with all-NA rows
# if fewer than ten event types were present.
totalCost <- head(totalCost, 10)
head(totalCost)
## EVTYPE propCost cropCost totalCost
## 1: FLOOD 144657709807 5661968450 150319678257
## 2: HURRICANE/TYPHOON 69305840000 2607872800 71913712800
## 3: TORNADO 56947380676 414953270 57362333946
## 4: STORM SURGE 43323536000 5000 43323541000
## 5: HAIL 15735267513 3025954473 18761221986
## 6: FLASH FLOOD 16822673978 1421317100 18243991078
# Fatalities and injuries summed per event type; TOTAL is the sum of both.
totalInjuries <- stormData[, .(FATALITIES = sum(FATALITIES), INJURIES = sum(INJURIES), TOTAL = sum(FATALITIES) + sum(INJURIES)), by = .(EVTYPE)]
# Rank event types by fatalities, highest first.
totalInjuries <- totalInjuries[order(-FATALITIES), ]
# head() returns at most ten rows; [1:10, ] would pad with all-NA rows
# if fewer than ten event types were present.
totalInjuries <- head(totalInjuries, 10)
head(totalInjuries)
## EVTYPE FATALITIES INJURIES TOTAL
## 1: TORNADO 5633 91346 96979
## 2: EXCESSIVE HEAT 1903 6525 8428
## 3: FLASH FLOOD 978 1777 2755
## 4: HEAT 937 2100 3037
## 5: LIGHTNING 816 5230 6046
## 6: TSTM WIND 504 6957 7461
Transform data table for easier plotting.
# Long format: one row per event type and measure (FATALITIES, INJURIES, TOTAL).
consequence <- melt(
  totalInjuries,
  id.vars = "EVTYPE",
  measure.vars = c("FATALITIES", "INJURIES", "TOTAL"),
  variable.name = "Consequence",
  value.name = "value"
)
head(consequence)
## EVTYPE Consequence value
## 1: TORNADO FATALITIES 5633
## 2: EXCESSIVE HEAT FATALITIES 1903
## 3: FLASH FLOOD FATALITIES 978
## 4: HEAT FATALITIES 937
## 5: LIGHTNING FATALITIES 816
## 6: TSTM WIND FATALITIES 504
# make plot
# Dodged bars per event type, one bar per Consequence level; the x axis is
# ordered with reorder() on the negated value.
healthChart <- ggplot(
  data = consequence,
  mapping = aes(x = reorder(EVTYPE, -value), y = value)
) +
  geom_col(mapping = aes(fill = Consequence), position = "dodge") +
  labs(
    x = "Event",
    y = "Count",
    title = "Top 10 US Storm events causing health consequences"
  ) +
  theme_bw() +
  # slanted tick labels so long event names stay readable
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
healthChart
Transform table for easier plotting.
# Long format: one row per event type and cost measure (propCost, cropCost, totalCost).
damages <- melt(
  totalCost,
  id.vars = "EVTYPE",
  measure.vars = c("propCost", "cropCost", "totalCost"),
  variable.name = "Damage",
  value.name = "value"
)
head(damages)
## EVTYPE Damage value
## 1: FLOOD propCost 144657709807
## 2: HURRICANE/TYPHOON propCost 69305840000
## 3: TORNADO propCost 56947380676
## 4: STORM SURGE propCost 43323536000
## 5: HAIL propCost 15735267513
## 6: FLASH FLOOD propCost 16822673978
# make plot
# Dodged bars per event type, one bar per Damage level; the x axis is
# ordered with reorder() on the negated value.
econChart <- ggplot(
  data = damages,
  mapping = aes(x = reorder(EVTYPE, -value), y = value)
) +
  geom_col(mapping = aes(fill = Damage), position = "dodge") +
  labs(
    x = "Event",
    y = "Cost ($)",
    title = "Top 10 US Storm events causing economic consequences"
  ) +
  theme_bw() +
  # slanted tick labels so long event names stay readable
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
econChart