We aim to identify which weather event types were most harmful to population health and had the greatest economic consequences in the United States from 1950 to 2011. This study analyses the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database, which can be downloaded here.
The analysis observes that Tornado was the most harmful weather event type in terms of population health while Flood was the weather event type which caused the worst economic impact.
Load the required packages.
library(dplyr)
library(lattice)
library(ggplot2)
Assuming the dataset is already downloaded locally, load the csv file and study its components.
# Read the raw storm CSV from the working directory; text columns are kept as
# character vectors instead of being converted to factors.
data <- read.csv(
  file = "repdata_data_StormData.csv",
  stringsAsFactors = FALSE,
  header = TRUE
)
# Count the missing cells across the whole table.
sum(is.na(data))
## [1] 1745947
# Show every column's type together with its first few values.
str(data)
## 'data.frame': 902297 obs. of 37 variables:
## $ STATE__ : num 1 1 1 1 1 1 1 1 1 1 ...
## $ BGN_DATE : chr "4/18/1950 0:00:00" "4/18/1950 0:00:00" "2/20/1951 0:00:00" "6/8/1951 0:00:00" ...
## $ BGN_TIME : chr "0130" "0145" "1600" "0900" ...
## $ TIME_ZONE : chr "CST" "CST" "CST" "CST" ...
## $ COUNTY : num 97 3 57 89 43 77 9 123 125 57 ...
## $ COUNTYNAME: chr "MOBILE" "BALDWIN" "FAYETTE" "MADISON" ...
## $ STATE : chr "AL" "AL" "AL" "AL" ...
## $ EVTYPE : chr "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
## $ BGN_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ BGN_AZI : chr "" "" "" "" ...
## $ BGN_LOCATI: chr "" "" "" "" ...
## $ END_DATE : chr "" "" "" "" ...
## $ END_TIME : chr "" "" "" "" ...
## $ COUNTY_END: num 0 0 0 0 0 0 0 0 0 0 ...
## $ COUNTYENDN: logi NA NA NA NA NA NA ...
## $ END_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ END_AZI : chr "" "" "" "" ...
## $ END_LOCATI: chr "" "" "" "" ...
## $ LENGTH : num 14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
## $ WIDTH : num 100 150 123 100 150 177 33 33 100 100 ...
## $ F : int 3 2 2 2 2 2 2 1 3 3 ...
## $ MAG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ FATALITIES: num 0 0 0 0 0 0 0 0 1 0 ...
## $ INJURIES : num 15 0 2 2 2 6 1 0 14 0 ...
## $ PROPDMG : num 25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
## $ PROPDMGEXP: chr "K" "K" "K" "K" ...
## $ CROPDMG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CROPDMGEXP: chr "" "" "" "" ...
## $ WFO : chr "" "" "" "" ...
## $ STATEOFFIC: chr "" "" "" "" ...
## $ ZONENAMES : chr "" "" "" "" ...
## $ LATITUDE : num 3040 3042 3340 3458 3412 ...
## $ LONGITUDE : num 8812 8755 8742 8626 8642 ...
## $ LATITUDE_E: num 3051 0 0 0 0 ...
## $ LONGITUDE_: num 8806 0 0 0 0 ...
## $ REMARKS : chr "" "" "" "" ...
## $ REFNUM : num 1 2 3 4 5 6 7 8 9 10 ...
Only the variables relevant to this analysis are retained: the event type (“EVTYPE”), population health impacts (“FATALITIES” & “INJURIES”), and economic consequences (“PROPDMG”, “PROPDMGEXP”, “CROPDMG” & “CROPDMGEXP”).
# Keep only the event type plus the health and damage columns used below.
data <- data[c(
  "EVTYPE",
  "FATALITIES", "INJURIES",
  "PROPDMG", "PROPDMGEXP",
  "CROPDMG", "CROPDMGEXP"
)]
# Confirm that the retained columns contain no missing values.
sum(is.na(data))
## [1] 0
Check the structure of Event Type.
# Number of distinct raw event-type labels.
length(unique(data[["EVTYPE"]]))
## [1] 985
# Preview the first 20 distinct labels.
head(unique(data[["EVTYPE"]]), n = 20)
## [1] "TORNADO" "TSTM WIND"
## [3] "HAIL" "FREEZING RAIN"
## [5] "SNOW" "ICE STORM/FLASH FLOOD"
## [7] "SNOW/ICE" "WINTER STORM"
## [9] "HURRICANE OPAL/HIGH WINDS" "THUNDERSTORM WINDS"
## [11] "RECORD COLD" "HURRICANE ERIN"
## [13] "HURRICANE OPAL" "HEAVY RAIN"
## [15] "LIGHTNING" "THUNDERSTORM WIND"
## [17] "DENSE FOG" "RIP CURRENT"
## [19] "THUNDERSTORM WINS" "FLASH FLOOD"
Normalize and combine similar event types.
# Collapse the free-text event labels into a small set of categories.
#
# Each rule replaces every label that CONTAINS the keyword (a regular
# expression) with the category on the right. Rules run in the order listed and
# each one sees the output of the previous ones, so the order is significant:
# e.g. "ICE STORM/FLASH FLOOD" becomes "STORM" because STORM is tested first.
#
# Fixes relative to the earlier version of this step:
#   * "HIGH.*TEMP" now maps to "HEAT"; it used to produce a separate
#     "EXTREME HEAT" category after the HEAT rule had already run.
#   * "LOW TEMPERATURE RECORD" was misspelled ("TEMPREATURE").
#   * "LO.*TEMP" was written as '.*LO.*TEMP*' (no trailing '.*'), which
#     replaced only the start of the label and left the rest behind.
data$EVTYPE <- toupper(data$EVTYPE)

evtype_rules <- c(
  "STORM" = "STORM",
  "FLOOD" = "FLOOD",
  "WIND" = "WIND",
  "TORNADO" = "TORNADO",
  "HURRICANE" = "HURRICANE",
  "HEAT" = "HEAT",
  "WARM" = "HEAT",
  "FIRE" = "FIRE",
  "DRY" = "DRY",
  "DUST" = "DUST",
  "VOLCANIC" = "VOLCANIC",
  "HIGH.*TEMP" = "HEAT",
  "CLOUD" = "CLOUD",
  "MICROBURST" = "MICROBURST",
  "BLIZZARD" = "BLIZZARD",
  "COLD" = "COLD",
  "SNOW" = "COLD",
  "FREEZ" = "COLD",
  "LOW TEMPERATURE RECORD" = "COLD",
  "ICE" = "COLD",
  "FROST" = "COLD",
  "LO.*TEMP" = "COLD",
  "HAIL" = "HAIL",
  "RAIN" = "RAIN",
  "LIGHTNING" = "LIGHTNING",
  "WET" = "WET",
  "FOG" = "FOG",
  "SURF" = "SURF",
  "CURRENT" = "CURRENT",
  "SUMMARY" = "SUMMARY"
)

for (keyword in names(evtype_rules)) {
  # '.*' on both sides makes the match span the whole label, so the entire
  # label is replaced by the category name.
  data$EVTYPE <- gsub(
    paste0(".*", keyword, ".*"),
    evtype_rules[[keyword]],
    data$EVTYPE
  )
}

# NOTE: the count printed below was produced before the rule fixes above and
# may differ once the report is re-knitted.
length(unique(data$EVTYPE))
## [1] 152
Check the structure of Property Damage and Crop Damage.
# PROPDMG and CROPDMG come with multiple scales: the magnitude of each value is
# expressed in a separate variable (PROPDMGEXP and CROPDMGEXP, respectively).
# We normalize them to the same scale, otherwise it would not be
# straightforward to compare damage values across events and dates.
unique(data$PROPDMGEXP)
## [1] "K" "M" "" "B" "m" "+" "0" "5" "6" "?" "4" "2" "3" "h" "7" "H" "-"
## [18] "1" "8"

# Translate exponent codes into powers of ten.
#
# Codes are matched case-insensitively: digits stand for themselves, H/K/M/B
# mean 2/3/6/9, and blank, "+", "?" and "-" carry no scale (power 0, i.e. a
# multiplier of 1). Any code outside this table yields NA.
#
# The same table is used for property and crop damage. Previously blank, "?"
# and "0" crop codes were mapped to 1 instead of 0, which multiplied those crop
# values by 10 while the equivalent property values were left unscaled.
exponent_power <- function(code) {
  code <- toupper(code)
  power_of <- c(
    "0" = 0, "1" = 1, "2" = 2, "3" = 3, "4" = 4,
    "5" = 5, "6" = 6, "7" = 7, "8" = 8,
    "H" = 2, "K" = 3, "M" = 6, "B" = 9,
    "+" = 0, "?" = 0, "-" = 0
  )
  power <- unname(power_of[match(code, names(power_of))])
  # A blank code cannot be used as a lookup name, so handle it explicitly.
  power[code %in% ""] <- 0
  power
}

data$PROPDMGEXP <- exponent_power(data$PROPDMGEXP)
unique(data$CROPDMGEXP)
## [1] "" "M" "K" "m" "B" "?" "0" "k" "2"
data$CROPDMGEXP <- exponent_power(data$CROPDMGEXP)
# Calculate the cost of damages: value * 10^exponent, stored back in place.
data$PROPDMG <- data$PROPDMG * 10^data$PROPDMGEXP
data$CROPDMG <- data$CROPDMG * 10^data$CROPDMGEXP
Combine each damage value and its exponent multiplier into a single loss variable.
# PROPDMG and CROPDMG were already multiplied by 10^exponent in the previous
# step, so they are the final loss figures. Multiplying by the exponent again
# (as this step used to do) tripled "K" values, multiplied "M" values by 6 and
# "B" values by 9, and zeroed every record whose exponent is 0.
# NOTE: tables printed later in this report predate this fix; re-knit to
# refresh them.
data$proploss <- data$PROPDMG
data$croploss <- data$CROPDMG
Calculate the total amount of property loss and crop loss per event type.
# Sum property and crop losses within each event type, then add a column
# holding the combined loss.
eData <- aggregate(
  cbind(proploss, croploss) ~ EVTYPE,
  data = data,
  FUN = sum
)
eData[["total"]] <- with(eData, proploss + croploss)
Remove rows with zero economic impact and sort the data in descending order.
# Drop event types with no recorded loss, then rank by total loss, largest
# first.
eData <- eData[eData$total > 0, ]
eData <- eData[order(eData$total, decreasing = TRUE), ]
# Create dataframe of highest economy-impacting event types.
# head() returns only the rows that exist, whereas eData[1:15, ] pads the
# result with all-NA rows when fewer than 15 event types remain.
eDataTop <- head(eData, 15)
eDataTop
## EVTYPE proploss croploss total
## 36 FLOOD 1.388253e+12 88208127300 1.476461e+12
## 62 HURRICANE 7.307971e+11 37541318400 7.683384e+11
## 114 STORM 6.086671e+11 53147427280 6.618146e+11
## 118 TORNADO 3.484089e+11 2191115680 3.506000e+11
## 43 HAIL 9.778681e+10 16544242580 1.143311e+11
## 27 DROUGHT 6.267468e+09 88271058000 9.453853e+10
## 145 WIND 7.831574e+10 8028567450 8.634430e+10
## 34 FIRE 5.827164e+10 2392174890 6.066381e+10
## 93 RAIN 2.672377e+10 4801958400 3.152572e+10
## 17 COLD 6.599026e+09 21800731200 2.839976e+10
## 44 HEAT 1.123073e+08 6623937840 6.736245e+09
## 11 BLIZZARD 3.911682e+09 672180000 4.583862e+09
## 72 LIGHTNING 3.772068e+09 61836270 3.833904e+09
## 125 TYPHOON 3.598890e+09 2475000 3.601365e+09
## 67 LANDSLIDE 1.891608e+09 120051000 2.011659e+09
Calculate the total fatalities and injuries per event type, remove rows with zero health impact, and sort the health data in descending order.
# Sum fatalities and injuries within each event type and add a column holding
# the combined casualty count.
hData <- aggregate(cbind(FATALITIES, INJURIES) ~ EVTYPE, data = data, FUN = sum)
hData$total <- hData$FATALITIES + hData$INJURIES
# Drop event types with no casualties, then rank by total, largest first.
hData <- hData[hData$total > 0, ]
hData <- hData[order(hData$total, decreasing = TRUE), ]
# Create dataframe of highest health-impacting event types.
# head() returns only the rows that exist, whereas hData[1:15, ] pads the
# result with all-NA rows when fewer than 15 event types remain.
hDataTop <- head(hData, 15)
hDataTop
## EVTYPE FATALITIES INJURIES total
## 118 TORNADO 5636 91407 97043
## 44 HEAT 3178 9243 12421
## 145 WIND 1240 9044 10284
## 36 FLOOD 1525 8602 10127
## 114 STORM 633 6692 7325
## 72 LIGHTNING 817 5231 6048
## 17 COLD 399 1605 2004
## 34 FIRE 90 1608 1698
## 62 HURRICANE 133 1328 1461
## 43 HAIL 15 1371 1386
## 37 FOG 80 1076 1156
## 21 CURRENT 572 529 1101
## 11 BLIZZARD 101 806 907
## 147 WINTER WEATHER 33 398 431
## 116 SURF 163 246 409
Question 1: Across the United States, which types of events are most harmful with respect to population health?
Tornado was the most harmful weather event type in terms of population health, resulting in both deaths and injuries.
# Horizontal bar chart of combined casualties for the top event types.
# Bars are ordered by value (largest at the top after the flip) instead of
# alphabetically, so the chart reads as a ranking. hDataTop already holds the
# top rows, so no further head() is needed.
ggplot(
  data = hDataTop,
  aes(
    x = reorder(EVTYPE, FATALITIES + INJURIES),
    y = FATALITIES + INJURIES
  )
) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(
    y = "Injuries and fatalities",
    x = "Event type",
    title = "Injuries and fatalities per event type across the US"
  )
Question 2: Across the United States, which types of events have the greatest economic consequences?
Flood was the worst weather event type in terms of economic consequences, for both property and crop losses.
# Horizontal bar chart of combined property and crop loss for the top event
# types. Bars are ordered by value (largest at the top after the flip) instead
# of alphabetically, so the chart reads as a ranking. eDataTop already holds
# the top rows, so no further head() is needed.
ggplot(
  data = eDataTop,
  aes(
    x = reorder(EVTYPE, proploss + croploss),
    y = proploss + croploss
  )
) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(
    y = "Property and crop damage",
    x = "Event type",
    title = "Property and crop damage by event type across the US"
  )