Storms and other severe weather events can cause both public health and economic problems for communities and municipalities. Many severe events can result in fatalities, injuries, and property damage, and preventing such outcomes to the extent possible is a key concern.
This project involves exploring the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database. This database tracks characteristics of major storms and weather events in the United States, including when and where they occur, as well as estimates of any fatalities, injuries, and property damage. The events in the database start in the year 1950 and end in November 2011.
The basic goal of this assignment is to explore the NOAA Storm Database and answer some basic questions about severe weather events. The data analysis address the following questions:
Across the United States, which types of events (as indicated in the EVTYPE variable) are most harmful with respect to population health?
Across the United States, which types of events have the greatest economic consequences?
library(pander)
# Load Data
stormData<- read.csv("repdata-data-StormData.csv", header = TRUE, stringsAsFactors = FALSE,
strip.white=TRUE)
# Subset the interest columns for the analysis
KeepCol <- c("EVTYPE", "FATALITIES", "INJURIES", "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP")
stormData <- subset(stormData, select = KeepCol)
# Replace all Na with 0
stormData[is.na(stormData)] <- 0
# Subset only data where fatalities or injuries occurred
stormData <- subset(stormData, FATALITIES > 0 | INJURIES > 0 | PROPDMG > 0 | CROPDMG > 0)
head(stormData)
## EVTYPE FATALITIES INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP
## 1 TORNADO 0 15 25.0 K 0
## 2 TORNADO 0 0 2.5 K 0
## 3 TORNADO 0 2 25.0 K 0
## 4 TORNADO 0 2 2.5 K 0
## 5 TORNADO 0 2 2.5 K 0
## 6 TORNADO 0 6 2.5 K 0
# Change all damage exponents to uppercase.
stormData$CROPDMGEXP <- toupper(stormData$CROPDMGEXP)
stormData$PROPDMGEXP <- toupper(stormData$PROPDMGEXP)
# Crate new variable with the valid multipliers and their respective values
multiplier <- c("0" = 10^0, "1" = 10^1, "2" = 10^2, "3" = 10^3, "4" = 10^4, "5" = 10^5,
"6" = 10^6, "7" = 10^7, "8" = 10^8, "9" = 10^9, "H" = 10^2, "K" = 10^3,
"M" = 10^6, "B" = 10^9)
# The values will be replace by their respective multipliers values (e.g K = 1000)
stormData$CROPDMGEXP <- multiplier[stormData$CROPDMGEXP]
stormData$PROPDMGEXP <- multiplier[stormData$PROPDMGEXP]
# If the values of the columns don't have a valid code then use multiplier equal 1
stormData[is.na(stormData$CROPDMGEXP), "CROPDMGEXP"] <- 1
stormData[is.na(stormData$PROPDMGEXP), "PROPDMGEXP"] <- 1
# Change the characters columns by numeric columns in preparation for calculation
stormData$CROPDMGEXP <- as.numeric(stormData$CROPDMGEXP)
stormData$PROPDMGEXP <- as.numeric(stormData$PROPDMGEXP)
# Calculate the real values of CROPDMG and PROPDMG
stormData$CROPCOST <- stormData$CROPDMG * stormData$CROPDMGEXP
stormData$PROPCOST <- stormData$PROPDMG * stormData$PROPDMGEXP
stormData$TOTALCOST <- stormData$CROPCOST + stormData$PROPCOST
# Subset the interest columns for the analysis
KeepCol <- c("EVTYPE", "FATALITIES", "INJURIES", "PROPCOST", "CROPCOST", "TOTALCOST")
stormData <- subset(stormData, select = KeepCol)
head(stormData)
## EVTYPE FATALITIES INJURIES PROPCOST CROPCOST TOTALCOST
## 1 TORNADO 0 15 25000 0 25000
## 2 TORNADO 0 0 2500 0 2500
## 3 TORNADO 0 2 25000 0 25000
## 4 TORNADO 0 2 2500 0 2500
## 5 TORNADO 0 2 2500 0 2500
## 6 TORNADO 0 6 2500 0 2500
# Change all events to uppercase
stormData$EVTYPE <- toupper(stormData$EVTYPE)
# Create copy of EVTYPE column
stormData$EVENT <- stormData$EVTYPE
# Create a keyword/value dataframe
keyReplace <- data.frame(key = c("TORNADO", "HURRICANE", "TYPHOON", "LIGHTNING", "FLOOD", "HAIL",
"ICE", "FOG", "BLIZZARD", "SNOW", "RIP CURRENT", "FIRE",
"THUNDER", "TSTM", "WIND", "HEAT", "TROPICAL", "FROST",
"WINTER", "RAIN", "FREEZ", "COLD", "PRECI", "WARM", "MUD"),
val = c("TORNADO", "HURRICANE", "HURRICANE", "THUNDERSTORM", "FLOOD", "HAIL",
"ICE", "FOG", "SNOW", "SNOW", "RIP CURRENT", "FIRE",
"THUNDERSTORM", "THUNDERSTORM", "WIND", "HEAT", "TROPICAL STORM", "ICE",
"SNOW", "RAIN", "ICE", "COLD", "RAIN", "HEAT", "MUD"))
# Search for keywords and replace value in EVENT column
for (key in keyReplace$key) {
stormData$EVENT[grep(key, stormData$EVENT)] <- as.character(keyReplace[keyReplace$key == key, "val"]) }
# Change event column as factor
stormData$EVENT <- as.factor(stormData$EVENT)
# Subset the interest columns for the analysis
KeepCol <- c("EVENT", "FATALITIES", "INJURIES", "PROPCOST", "CROPCOST", "TOTALCOST")
stormData <- subset(stormData, select = KeepCol)
head(stormData)
## EVENT FATALITIES INJURIES PROPCOST CROPCOST TOTALCOST
## 1 TORNADO 0 15 25000 0 25000
## 2 TORNADO 0 0 2500 0 2500
## 3 TORNADO 0 2 25000 0 25000
## 4 TORNADO 0 2 2500 0 2500
## 5 TORNADO 0 2 2500 0 2500
## 6 TORNADO 0 6 2500 0 2500
stormDataSum <- aggregate(. ~ EVENT, stormData, sum)
head(stormDataSum)
## EVENT FATALITIES INJURIES PROPCOST CROPCOST TOTALCOST
## 1 HIGH SURF ADVISORY 0 0 200000 0 200000
## 2 ? 0 0 5000 0 5000
## 3 APACHE COUNTY 0 0 5000 0 5000
## 4 ASTRONOMICAL HIGH TIDE 0 0 9425000 0 9425000
## 5 ASTRONOMICAL LOW TIDE 0 0 320000 0 320000
## 6 AVALANCE 1 0 0 0 0
# Get the top 10 fatalities events
fatalitiesData <- stormDataSum[order(-stormDataSum$FATALITIES), c("EVENT", "FATALITIES")]
rownames(fatalitiesData) <- NULL
head(fatalitiesData, 10)
## EVENT FATALITIES
## 1 TORNADO 5661
## 2 HEAT 3178
## 3 THUNDERSTORM 1542
## 4 FLOOD 1525
## 5 WIND 690
## 6 RIP CURRENT 577
## 7 SNOW 542
## 8 AVALANCHE 224
## 9 COLD 214
## 10 HURRICANE 135
# Plot the top 10 fatalities events
library(ggplot2)
ggplot(data = fatalitiesData[1:10, ], aes(x = reorder(EVENT, -FATALITIES), y = FATALITIES)) +
geom_bar(colour="black", stat="identity") +
theme(axis.text.x=element_text(angle = 90, hjust = 1, vjust = 0.3),
plot.title = element_text(hjust = 0.5)) +
xlab("Event Type") + ylab("Fatalities") +
ggtitle("Fatalities by Event Type")
# Get the top 10 injuries events
injuriesData <- stormDataSum[order(-stormDataSum$INJURIES), c("EVENT", "INJURIES")]
rownames(injuriesData) <- NULL
head(injuriesData, 10)
## EVENT INJURIES
## 1 TORNADO 91407
## 2 THUNDERSTORM 14679
## 3 HEAT 9243
## 4 FLOOD 8604
## 5 SNOW 3835
## 6 ICE 2182
## 7 WIND 1951
## 8 FIRE 1608
## 9 HAIL 1467
## 10 HURRICANE 1333
# Plot the top 10 injuries events
library(ggplot2)
ggplot(data = injuriesData[1:10, ], aes(x = reorder(EVENT, -INJURIES), y = INJURIES)) +
geom_bar(colour="black", stat="identity") +
theme(axis.text.x=element_text(angle = 90, hjust = 1, vjust = 0.3),
plot.title = element_text(hjust = 0.5)) +
xlab("Event Type") + ylab("Injuries") +
ggtitle("Injuries by Event Type")
As you can see in the tables / graphs, “TORNADO” are still by far the leading cause of death and injury in the United States, followed by “HEAT”, “THUNDERSTORM” and “FLOOD”.
# Get the top 10 property cost events
propcostData <- stormDataSum[order(-stormDataSum$PROPCOST), c("EVENT", "PROPCOST")]
rownames(propcostData) <- NULL
head(propcostData, 10)
## EVENT PROPCOST
## 1 FLOOD 168212215835
## 2 HURRICANE 85356410010
## 3 TORNADO 58603317927
## 4 STORM SURGE 43323536000
## 5 HAIL 16022991537
## 6 THUNDERSTORM 12079856106
## 7 FIRE 8496628500
## 8 SNOW 8396338901
## 9 TROPICAL STORM 7716127550
## 10 WIND 6262196123
As you can see in the table, “FLOOD” creates by far the largest property damage bill in the United States, followed by “HURRICANE”, “TORNADO”, STORM SURGE" and “HAIL”.
# Get the top 10 crop cost events
cropcostData <- stormDataSum[order(-stormDataSum$CROPCOST), c("EVENT", "CROPCOST")]
rownames(cropcostData) <- NULL
head(cropcostData, 10)
## EVENT CROPCOST
## 1 DROUGHT 13972566000
## 2 FLOOD 12380109100
## 3 ICE 7019175300
## 4 HURRICANE 5516117800
## 5 HAIL 3111712873
## 6 COLD 1409115500
## 7 THUNDERSTORM 1218945828
## 8 HEAT 904479280
## 9 RAIN 806152800
## 10 WIND 780480400
As you can see in the table, “DROUGHT” and “FLOOD” creates by far the largest crops damage bill in the United States, followed by “ICE” and “HURRICANE”.
# Get the top 10 total cost events
totalcostData <- stormDataSum[order(-stormDataSum$TOTALCOST), c("EVENT", "TOTALCOST")]
rownames(totalcostData) <- NULL
head(totalcostData, 10)
## EVENT TOTALCOST
## 1 FLOOD 180592324935
## 2 HURRICANE 90872527810
## 3 TORNADO 59020779447
## 4 STORM SURGE 43323541000
## 5 HAIL 19134704410
## 6 DROUGHT 15018672000
## 7 THUNDERSTORM 13298801934
## 8 ICE 11006814710
## 9 FIRE 8899910130
## 10 SNOW 8685506001
# Plot the top 10 total cost events
library(ggplot2)
ggplot(data = totalcostData[1:10, ], aes(x = reorder(EVENT, -TOTALCOST), y = TOTALCOST)) +
geom_bar(colour="black", stat="identity") +
theme(axis.text.x=element_text(angle = 90, hjust = 1, vjust = 0.3),
plot.title = element_text(hjust = 0.5)) +
xlab("Event Type") + ylab("Total Cost") +
ggtitle("Total Cost by Event Type")
As you can see in the table / graph, “FLOOD” creates by far the largest total damage bill in the United States, followed by “HURRICANE”, “TTORNADO”, STORM SURGE" and “HAIL”.