The data are from the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database. This database tracks characteristics of major storms and weather events in the United States from 1950 to 2011, including when and where they occur, as well as estimates of any fatalities, injuries, and property damage. This report shows a process to clean and organise the data and then presents the top five severe weather events that cost the U.S. economy the most in property and crop damage. Further analysis presents the human cost in fatality and injury. Both economic and human costs span the whole database, rather than specific years, to give an overall picture of the effects of severe weather conditions in the United States.
The first operation is to create a data directory, if none exists, and then download and store the data.
# downloading and storing the data
# The archive is fetched only when it is not already on disk, so re-running
# the report does not repeat the download.
dataDir <- "./data"
archivePath <- file.path(dataDir, "StormData.csv.bz2")
fileUrl <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"

if (!dir.exists(dataDir)) {
  dir.create(dataDir)
}

if (!file.exists(archivePath)) {
  tryCatch(
    # Default download method (no external curl binary required); "wb" keeps
    # the compressed file byte-exact on every platform.
    download.file(fileUrl, destfile = archivePath, mode = "wb"),
    error = function(e) {
      # Do not leave a truncated archive behind: the existence check above
      # would otherwise skip the download on the next run.
      unlink(archivePath)
      stop(e)
    }
  )
}
Read the .csv into a named variable and inspect the structure of the data with str(). It is also a good idea to have the National Weather Service Storm Data Documentation to hand, as it shows how the original variables are constructed and defined. From here we can determine which variables are needed for the analysis and thereby reduce the size of the data set to work on.
# loading the data
# The compressed archive is handed straight to read.csv(); the complete
# table is kept in StormData.
StormData <- read.csv(
  file = "./data/StormData.csv.bz2"
)
Subset the data to keep only the variables needed for the analysis.
# organising and viewing the original data
library(tidyverse)

# Columns kept for the analysis: event date and type, casualty counts, and
# the damage amounts together with their magnitude codes.
keepColumns <- c(
  "BGN_DATE", "EVTYPE",
  "FATALITIES", "INJURIES",
  "PROPDMG", "PROPDMGEXP",
  "CROPDMG", "CROPDMGEXP"
)
stormTidy <- StormData[, keepColumns]

# Parse the begin date as month/day/year into a Date.
stormTidy$BGN_DATE <- as.Date(stormTidy$BGN_DATE, format = "%m/%d/%Y")

# Preview the first ten rows.
head(stormTidy, n = 10)
## BGN_DATE EVTYPE FATALITIES INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP
## 1 1950-04-18 TORNADO 0 15 25.0 K 0
## 2 1950-04-18 TORNADO 0 0 2.5 K 0
## 3 1951-02-20 TORNADO 0 2 25.0 K 0
## 4 1951-06-08 TORNADO 0 2 2.5 K 0
## 5 1951-11-15 TORNADO 0 2 2.5 K 0
## 6 1951-11-15 TORNADO 0 6 2.5 K 0
## 7 1951-11-16 TORNADO 0 1 2.5 K 0
## 8 1952-01-22 TORNADO 0 0 2.5 K 0
## 9 1952-02-13 TORNADO 1 14 25.0 K 0
## 10 1952-02-13 TORNADO 0 0 25.0 K 0
It is also necessary to multiply the numerical values in ‘PROPDMG’ and ‘CROPDMG’ by the numerical equivalents of the character magnitude codes shown in ‘PROPDMGEXP’ and ‘CROPDMGEXP’, whereby K = 1,000, M = 1,000,000 and B = 1,000,000,000, in order to obtain the total value of damages in USD ($) for later use.
# property and crop damage
# Translate a vector of magnitude codes into numeric multipliers.
#
# The codes are single letters, so they are matched as whole values and
# case-insensitively. The earlier pattern "k | K" only matched a letter next
# to a space, so "K" was never converted, became NA in as.numeric(), and every
# thousand-dollar record dropped out of the aggregated totals.
#
# code: character (or factor) vector of magnitude codes.
# Returns a numeric vector of the same length; unrecognised codes are NA.
damageMultiplier <- function(code) {
  magnitude <- c(K = 1e3, M = 1e6, B = 1e9)
  code <- toupper(trimws(as.character(code)))
  multiplier <- unname(magnitude[code])

  # Codes that are already plain digits keep their numeric value, as before;
  # anything else (for example a blank) stays NA.
  # NOTE(review): only K, M and B are handled by name here - confirm how
  # digit codes should be interpreted.
  isDigits <- is.na(multiplier) & grepl("^[0-9]+$", code)
  multiplier[isDigits] <- as.numeric(code[isDigits])

  multiplier
}

stormTidy$PROPDMGEXP <- damageMultiplier(stormTidy$PROPDMGEXP)
stormTidy$CROPDMGEXP <- damageMultiplier(stormTidy$CROPDMGEXP)

# multiply damage variables for cost
# NOTE(review): damage totals rendered later in this report were produced
# before the "K" fix and need regenerating.
stormTidy$PROPDMG <- stormTidy$PROPDMG * stormTidy$PROPDMGEXP
stormTidy$CROPDMG <- stormTidy$CROPDMG * stormTidy$CROPDMGEXP
Next, final variables are introduced which total (sum) the fatalities, injuries, and crop and property damage costs for each storm event type from 1950 to 2011.
# totalling the numbers
# Sum one measure of stormTidy within each event type; the result has the
# columns EVTYPE and the measure itself.
sumByEventType <- function(measure) {
  aggregate(
    reformulate("EVTYPE", response = measure),
    data = stormTidy,
    FUN = sum
  )
}

humanCostFatal <- sumByEventType("FATALITIES")
humanCostInjury <- sumByEventType("INJURIES")
propertyDamageCost <- sumByEventType("PROPDMG")
cropDamageCost <- sumByEventType("CROPDMG")
These final variables are ordered and then ‘forced’ to show the top five events only.
# ordering the data for top five in each case
# Rank a totals table from largest to smallest on one column and keep the
# first five positions of that ranking.
keepTopFive <- function(totals, column) {
  ranking <- order(totals[[column]], decreasing = TRUE)
  totals[ranking[seq_len(5)], ]
}

humanCostFatal <- keepTopFive(humanCostFatal, "FATALITIES")
humanCostInjury <- keepTopFive(humanCostInjury, "INJURIES")
propertyDamageCost <- keepTopFive(propertyDamageCost, "PROPDMG")
cropDamageCost <- keepTopFive(cropDamageCost, "CROPDMG")
# top five property damage
# Give the columns presentation names and print with rank labels as row names.
propertyRanks <- c("1st", "2nd", "3rd", "4th", "5th")
colnames(propertyDamageCost) <- c("EventType", "PropertyDamage(USD)")
print(propertyDamageCost, row.names = propertyRanks)
## EventType PropertyDamage(USD)
## 1st FLOOD 143779180000
## 2nd HURRICANE/TYPHOON 69303870000
## 3rd TORNADO 53762180511
## 4th STORM SURGE 43304930000
## 5th HAIL 15052260150
# property damage plot
# Horizontal bars of the totals in billions of USD, each labelled with its
# value rounded to two decimals.
ggplot(
  data = propertyDamageCost,
  mapping = aes(
    x = reorder(EventType, -`PropertyDamage(USD)`),
    y = `PropertyDamage(USD)` / 1e9
  )
) +
  geom_col(fill = "steelblue") +
  geom_text(
    aes(
      label = round(`PropertyDamage(USD)` / 1e9, digits = 2),
      hjust = 1.2
    )
  ) +
  coord_flip() +
  theme_minimal() +
  labs(
    title = "Top 5 US storm events causing property damage",
    x = "Storm Event Type",
    y = "Property Damage Cost (Billion USD)"
  )
# top five crop damage
# Give the columns presentation names and print with rank labels as row names.
cropRanks <- c("1st", "2nd", "3rd", "4th", "5th")
colnames(cropDamageCost) <- c("EventType", "CropDamage(USD)")
print(cropDamageCost, row.names = cropRanks)
## EventType CropDamage(USD)
## 1st DROUGHT 13951120000
## 2nd FLOOD 5499430000
## 3rd RIVER FLOOD 5026000000
## 4th ICE STORM 5020450000
## 5th HURRICANE 2739310000
# crop damage plot
# Horizontal bars of the totals in billions of USD, each labelled with its
# value rounded to two decimals.
ggplot(
  data = cropDamageCost,
  mapping = aes(
    x = reorder(EventType, -`CropDamage(USD)`),
    y = `CropDamage(USD)` / 1e9
  )
) +
  geom_col(fill = "steelblue") +
  geom_text(
    aes(
      label = round(`CropDamage(USD)` / 1e9, digits = 2),
      hjust = 1.2
    )
  ) +
  coord_flip() +
  theme_minimal() +
  labs(
    title = "Top 5 US storm events causing crop damage",
    x = "Storm Event Type",
    y = "Crop Damage Cost (Billion USD)"
  )
# top five human fatality events
# Give the columns presentation names and print with rank labels as row names.
fatalityRanks <- c("1st", "2nd", "3rd", "4th", "5th")
colnames(humanCostFatal) <- c("EventType", "Fatalities")
print(humanCostFatal, row.names = fatalityRanks)
## EventType Fatalities
## 1st TORNADO 5633
## 2nd EXCESSIVE HEAT 1903
## 3rd FLASH FLOOD 978
## 4th HEAT 937
## 5th LIGHTNING 816
# top five human injury events
# Give the columns presentation names and print with rank labels as row names.
injuryRanks <- c("1st", "2nd", "3rd", "4th", "5th")
colnames(humanCostInjury) <- c("EventType", "Injuries")
print(humanCostInjury, row.names = injuryRanks)
## EventType Injuries
## 1st TORNADO 91346
## 2nd TSTM WIND 6957
## 3rd FLOOD 6789
## 4th EXCESSIVE HEAT 6525
## 5th LIGHTNING 5230
# reforming fatality and injury data to plot
# Merging the two top-five tables left NA wherever an event appeared in only
# one of them, which leaves gaps in the plot for those events. Instead, total
# both measures from stormTidy for every event that is in either top five.
topEvents <- union(
  as.character(humanCostFatal$EventType),
  as.character(humanCostInjury$EventType)
)

humanCost <- stormTidy %>%
  filter(EVTYPE %in% topEvents) %>%
  group_by(EVTYPE) %>%
  summarise(Fatalities = sum(FATALITIES), Injuries = sum(INJURIES)) %>%
  ungroup() %>%
  rename(EventType = EVTYPE) %>%
  # Long format: one row per event type and kind of human cost.
  pivot_longer(c(`Fatalities`, `Injuries`), names_to = "HumanCost",
               values_to = "Casualties")
# human fatality plot
# Side-by-side horizontal bars per event type, one colour per kind of human
# cost, with counts shown in thousands.
ggplot(
  data = humanCost,
  mapping = aes(
    x = reorder(EventType, -`Casualties`),
    y = `Casualties` / 1000,
    fill = HumanCost
  )
) +
  geom_col(position = "dodge") +
  coord_flip() +
  theme_minimal() +
  labs(
    title = "Top 5 US storm events causing human injury or fatality",
    x = "Storm Event Type",
    y = "Number of casualties (Thousands)"
  )
This analysis is one small contribution, showing the top five severe weather events that cost the U.S. economy the most in property and crop damage, as well as the human cost in fatalities and injuries. Although this presentation concentrated on the totals across the whole period between 1950 and 2011, further analysis may be of benefit to see whether, and how, severe weather events have increased over time, and possibly to predict future events so that funds can be directed towards better protection.