This data analysis relies on U.S. National Oceanic and Atmospheric Administration (NOAA) storm data to determine 1) the types of events that are most harmful with respect to population health, and 2) the types of events that have the greatest economic consequences.
The analysis used to answer these questions involves a fair amount of processing: formatting dates, normalizing the damage figures (property and crop) because they are recorded in different dollar units, identifying the ten worst event types (most fatal, most injurious, and most economically damaging) in each dataset, and merging the datasets (property with crop, and fatalities with injuries) to determine the overall most damaging events economically and in terms of human health, respectively.
The analysis shows that the most damaging type of storm event with respect to population health is the tornado, and the most damaging type of storm event in terms of economic consequences is the flood.
library(data.table)
## Warning: package 'data.table' was built under R version 4.3.3
library(ggplot2)
library(knitr)
library(data.table)
# Locate the NOAA storm archive and its extracted CSV through explicit paths,
# so the session's working directory is never changed.
dataDir <- "C:/Users/Loaner - Kirsten/Desktop/Coursera"
zipPath <- file.path(dataDir, "repdata_data_StormData.zip")
csvPath <- file.path(dataDir, "repdata_data_StormData.csv")
# Extract only when the CSV is missing. The recorded run re-extracted on every
# execution and logged "error 1 in extracting from zip file" from unzip().
if (!file.exists(csvPath)) {
  unzip(zipPath, exdir = dataDir, overwrite = TRUE)
}
# Read the full storm table into a data frame.
stormData <- read.csv(csvPath, sep = ",")
# Structural overview of the raw table: row and column counts first,
# then the leading rows.
c(nrow(stormData), ncol(stormData))
## [1] 902297 37
head(stormData, n = 6L)
## STATE__ BGN_DATE BGN_TIME TIME_ZONE COUNTY COUNTYNAME STATE EVTYPE
## 1 1 4/18/1950 0:00:00 0130 CST 97 MOBILE AL TORNADO
## 2 1 4/18/1950 0:00:00 0145 CST 3 BALDWIN AL TORNADO
## 3 1 2/20/1951 0:00:00 1600 CST 57 FAYETTE AL TORNADO
## 4 1 6/8/1951 0:00:00 0900 CST 89 MADISON AL TORNADO
## 5 1 11/15/1951 0:00:00 1500 CST 43 CULLMAN AL TORNADO
## 6 1 11/15/1951 0:00:00 2000 CST 77 LAUDERDALE AL TORNADO
## BGN_RANGE BGN_AZI BGN_LOCATI END_DATE END_TIME COUNTY_END COUNTYENDN
## 1 0 0 NA
## 2 0 0 NA
## 3 0 0 NA
## 4 0 0 NA
## 5 0 0 NA
## 6 0 0 NA
## END_RANGE END_AZI END_LOCATI LENGTH WIDTH F MAG FATALITIES INJURIES PROPDMG
## 1 0 14.0 100 3 0 0 15 25.0
## 2 0 2.0 150 2 0 0 0 2.5
## 3 0 0.1 123 2 0 0 2 25.0
## 4 0 0.0 100 2 0 0 2 2.5
## 5 0 0.0 150 2 0 0 2 2.5
## 6 0 1.5 177 2 0 0 6 2.5
## PROPDMGEXP CROPDMG CROPDMGEXP WFO STATEOFFIC ZONENAMES LATITUDE LONGITUDE
## 1 K 0 3040 8812
## 2 K 0 3042 8755
## 3 K 0 3340 8742
## 4 K 0 3458 8626
## 5 K 0 3412 8642
## 6 K 0 3450 8748
## LATITUDE_E LONGITUDE_ REMARKS REFNUM
## 1 3051 8806 1
## 2 0 0 2
## 3 0 0 3
## 4 0 0 4
## 5 0 0 5
## 6 0 0 6
# List every column of the raw table before choosing which ones to keep.
colnames(stormData)
## [1] "STATE__" "BGN_DATE" "BGN_TIME" "TIME_ZONE" "COUNTY"
## [6] "COUNTYNAME" "STATE" "EVTYPE" "BGN_RANGE" "BGN_AZI"
## [11] "BGN_LOCATI" "END_DATE" "END_TIME" "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE" "END_AZI" "END_LOCATI" "LENGTH" "WIDTH"
## [21] "F" "MAG" "FATALITIES" "INJURIES" "PROPDMG"
## [26] "PROPDMGEXP" "CROPDMG" "CROPDMGEXP" "WFO" "STATEOFFIC"
## [31] "ZONENAMES" "LATITUDE" "LONGITUDE" "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS" "REFNUM"
# Carry forward only the start date, event type, casualty and damage columns.
keepCols <- c(
  "BGN_DATE", "EVTYPE",
  "FATALITIES", "INJURIES",
  "PROPDMG", "PROPDMGEXP",
  "CROPDMG", "CROPDMGEXP"
)
stormData_used <- stormData[, keepCols, drop = FALSE]
# Parse the start date string and keep its calendar year as a number.
stormData_used$Year <- as.numeric(
  format(
    as.Date(stormData_used$BGN_DATE, format = "%m/%d/%Y %H:%M:%S"),
    format = "%Y"
  )
)
# Decode a damage "exponent" column into a numeric power of ten.
#
# codes: vector of exponent codes as found in PROPDMGEXP / CROPDMGEXP.
# Returns a numeric vector of the same length as `codes`:
#   - H / K / M / B (case-insensitive) become 2 / 3 / 6 / 9;
#   - codes that already parse as numbers are used as given;
#   - every other code (e.g. "", "+", "-", "?") becomes 0, i.e. a
#     multiplier of 10^0 = 1.
decodeDamageExp <- function(codes) {
  codes <- as.character(codes)
  letterPowers <- c(H = "2", K = "3", M = "6", B = "9")
  matched <- match(toupper(codes), names(letterPowers))
  isLetter <- !is.na(matched)
  codes[isLetter] <- unname(letterPowers[matched[isLetter]])
  # Non-numeric codes are expected here: they coerce to NA and are zeroed on
  # the next line, so the coercion warning carries no information.
  powers <- suppressWarnings(as.numeric(codes))
  powers[is.na(powers)] <- 0
  powers
}

# Property damage: show the raw codes, decode them, then scale PROPDMG.
unique(stormData_used$PROPDMGEXP)
## [1] "K" "M" "" "B" "m" "+" "0" "5" "6" "?" "4" "2" "3" "h" "7" "H" "-" "1" "8"
stormData_used$PROPDMGEXP <- decodeDamageExp(stormData_used$PROPDMGEXP)
stormData_used$TOTALPROPDMG <- stormData_used$PROPDMG * 10^stormData_used$PROPDMGEXP
# Crop damage: same treatment as property damage.
unique(stormData_used$CROPDMGEXP)
## [1] "" "M" "K" "m" "B" "?" "0" "k" "2"
stormData_used$CROPDMGEXP <- decodeDamageExp(stormData_used$CROPDMGEXP)
stormData_used$TOTALCROPDMG <- stormData_used$CROPDMG * 10^stormData_used$CROPDMGEXP
head(stormData_used)
## BGN_DATE EVTYPE FATALITIES INJURIES PROPDMG PROPDMGEXP CROPDMG
## 1 4/18/1950 0:00:00 TORNADO 0 15 25.0 3 0
## 2 4/18/1950 0:00:00 TORNADO 0 0 2.5 3 0
## 3 2/20/1951 0:00:00 TORNADO 0 2 25.0 3 0
## 4 6/8/1951 0:00:00 TORNADO 0 2 2.5 3 0
## 5 11/15/1951 0:00:00 TORNADO 0 2 2.5 3 0
## 6 11/15/1951 0:00:00 TORNADO 0 6 2.5 3 0
## CROPDMGEXP Year TOTALPROPDMG TOTALCROPDMG
## 1 0 1950 25000 0
## 2 0 1950 2500 0
## 3 0 1951 25000 0
## 4 0 1951 2500 0
## 5 0 1951 2500 0
## 6 0 1951 2500 0
# Sum fatalities per event type, then keep the ten event types with the
# highest totals.
TotFatalities <- with(
  stormData_used,
  aggregate(FATALITIES, by = list(EVTYPE), FUN = sum)
)
colnames(TotFatalities) <- c("Event", "Fatalities")
TotFatalities <- TotFatalities[order(-TotFatalities$Fatalities), ]
TotFatalities <- TotFatalities[1:10, ]
TotFatalities
## Event Fatalities
## 834 TORNADO 5633
## 130 EXCESSIVE HEAT 1903
## 153 FLASH FLOOD 978
## 275 HEAT 937
## 464 LIGHTNING 816
## 856 TSTM WIND 504
## 170 FLOOD 470
## 585 RIP CURRENT 368
## 359 HIGH WIND 248
## 19 AVALANCHE 224
# Sum injuries per event type, then keep the ten event types with the
# highest totals.
TotInjuries <- with(
  stormData_used,
  aggregate(INJURIES, by = list(EVTYPE), FUN = sum)
)
colnames(TotInjuries) <- c("Event", "Injuries")
TotInjuries <- TotInjuries[order(-TotInjuries$Injuries), ]
TotInjuries <- TotInjuries[1:10, ]
TotInjuries
## Event Injuries
## 834 TORNADO 91346
## 856 TSTM WIND 6957
## 170 FLOOD 6789
## 130 EXCESSIVE HEAT 6525
## 464 LIGHTNING 5230
## 275 HEAT 2100
## 427 ICE STORM 1975
## 153 FLASH FLOOD 1777
## 760 THUNDERSTORM WIND 1488
## 244 HAIL 1361
# Compare fatalities and injuries for every event type that appears in either
# top-ten list.
#
# The two top-ten tables are not merged directly: an event present in only one
# of them would get NA for the other measure, and its bar would be dropped from
# the chart. Instead, both totals are recomputed from stormData_used for the
# union of the listed events.
healthEvents <- union(TotFatalities$Event, TotInjuries$Event)
healthRows <- stormData_used$EVTYPE %in% healthEvents
TotHealthDamage <- aggregate(
  stormData_used[healthRows, c("FATALITIES", "INJURIES")],
  by = list(Event = stormData_used$EVTYPE[healthRows]),
  FUN = sum
)
# Keep the measure names used by the top-ten tables so the legend is unchanged.
names(TotHealthDamage) <- c("Event", "Fatalities", "Injuries")
# Long format (one row per event/measure pair) so the two measures can be
# drawn side by side.
TotHealthDamage <- melt(data.table(TotHealthDamage), id.vars = "Event")
# The y axis counts people, not dollars.
ggplot(TotHealthDamage, aes(Event, value)) +
  geom_bar(aes(fill = variable), position = "dodge", stat = "identity") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  xlab("Event Type") +
  ylab("Number of people") +
  ggtitle("Fatalities and injuries by event type")
##### Conclusion: Tornado
# Sum the normalised property damage per event type, then keep the ten event
# types with the highest totals.
TotPropDmg <- with(
  stormData_used,
  aggregate(TOTALPROPDMG, by = list(EVTYPE), FUN = sum)
)
colnames(TotPropDmg) <- c("Event", "Prop_Cost")
TotPropDmg <- TotPropDmg[order(-TotPropDmg$Prop_Cost), ]
TotPropDmg <- TotPropDmg[1:10, ]
TotPropDmg
## Event Prop_Cost
## 170 FLOOD 144657709807
## 411 HURRICANE/TYPHOON 69305840000
## 834 TORNADO 56947380677
## 670 STORM SURGE 43323536000
## 153 FLASH FLOOD 16822673979
## 244 HAIL 15735267513
## 402 HURRICANE 11868319010
## 848 TROPICAL STORM 7703890550
## 972 WINTER STORM 6688497251
## 359 HIGH WIND 5270046295
# Sum the normalised crop damage per event type, then keep the ten event
# types with the highest totals.
TotCropDmg <- with(
  stormData_used,
  aggregate(TOTALCROPDMG, by = list(EVTYPE), FUN = sum)
)
colnames(TotCropDmg) <- c("Event", "Crop_Cost")
TotCropDmg <- TotCropDmg[order(-TotCropDmg$Crop_Cost), ]
TotCropDmg <- TotCropDmg[1:10, ]
TotCropDmg
## Event Crop_Cost
## 95 DROUGHT 13972566000
## 170 FLOOD 5661968450
## 590 RIVER FLOOD 5029459000
## 427 ICE STORM 5022113500
## 244 HAIL 3025954473
## 402 HURRICANE 2741910000
## 411 HURRICANE/TYPHOON 2607872800
## 153 FLASH FLOOD 1421317100
## 140 EXTREME COLD 1292973000
## 212 FROST/FREEZE 1094086000
# Compare property and crop damage for every event type that appears in either
# top-ten list.
#
# The two top-ten tables are not merged directly: an event present in only one
# of them would get NA for the other cost, and its bar would be dropped from
# the chart. Instead, both totals are recomputed from stormData_used for the
# union of the listed events.
econEvents <- union(TotPropDmg$Event, TotCropDmg$Event)
econRows <- stormData_used$EVTYPE %in% econEvents
TotEcoDamage <- aggregate(
  stormData_used[econRows, c("TOTALPROPDMG", "TOTALCROPDMG")],
  by = list(Event = stormData_used$EVTYPE[econRows]),
  FUN = sum
)
# Keep the cost names used by the top-ten tables so the legend is unchanged.
names(TotEcoDamage) <- c("Event", "Prop_Cost", "Crop_Cost")
# Long format (one row per event/cost pair) so the two costs can be drawn
# side by side.
TotEcoDamage <- melt(data.table(TotEcoDamage), id.vars = "Event")
ggplot(TotEcoDamage, aes(Event, value)) +
  geom_bar(aes(fill = variable), position = "dodge", stat = "identity") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  xlab("Event Type") +
  ylab("Damage, USD") +
  ggtitle("Crop/Property damage by type")
##### Conclusion: Flood
Note that the echo = FALSE parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.