Summary

Storms and other severe weather events can cause both public health and economic problems for communities and municipalities. Many severe events can result in fatalities, injuries, and property damage, and preventing such outcomes to the extent possible is a key concern.

This project involves exploring the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database. This database tracks characteristics of major storms and weather events in the United States, including when and where they occur, as well as estimates of any fatalities, injuries, and property damage. The data is for the United States for the time period 1950-2011.

The basic goal of this assignment is to explore the NOAA Storm Database and answer the following questions about severe weather events.

Questions

  1. Across the United States, which types of events (as indicated in the EVTYPE variable) are most harmful with respect to population health?

  2. Across the United States, which types of events have the greatest economic consequences?

Synopsis

To answer the above questions, an analysis has been performed using the NOAA data. The NOAA data set has been processed to obtain the information needed. This processing includes cleaning the data and plotting the results.

Data Processing

The data used can be downloaded from https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2 (a bzip2-compressed CSV file). Some documentation of the database is also available; it explains how some of the variables are constructed/defined.

National Weather Service Storm Data Documentation https://d396qusza40orc.cloudfront.net/repdata%2Fpeer2_doc%2Fpd01016005curr.pdf

National Climatic Data Center Storm Events FAQ https://d396qusza40orc.cloudfront.net/repdata%2Fpeer2_doc%2FNCDC%20Storm%20Events-FAQ%20Page.pdf


Loading the data

# Download the NOAA storm data once and load it into `my_data`.
# The local copy keeps the .bz2 extension because the file is a bzip2
# archive; read.csv() reads compressed files directly.
fileURL <- "https://d396qusza40orc.cloudfront.net/repdata/data/StormData.csv.bz2"
storm_file <- "StormData.csv.bz2"
if (!file.exists(storm_file)) {
  # mode = "wb" writes the archive as binary so it is not altered on download.
  download.file(fileURL, destfile = storm_file, mode = "wb")
}
# Empty strings and single blanks are read as NA, alongside the literal "NA".
my_data <- read.csv(storm_file, header = TRUE, sep = ",", na.strings = c("", " ", "NA"))

Creating the data set

First I select the variables that are needed for the task. Then I convert all the values to lower case.

library(ggplot2)
library(gridExtra)
library(reshape2)

# Columns needed for the population-health and economic analysis.
needed_cols <- c("EVTYPE", "FATALITIES", "INJURIES", "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP")
data <- my_data[, needed_cols]

# Lower-case every column in place. tolower() returns character vectors, so
# the numeric columns become character too and are converted back with
# as.numeric() where they are used.
for (col_name in names(data)) {
  data[[col_name]] <- tolower(data[[col_name]])
}

# Count the missing and non-missing cells, then show them as proportions.
table(is.na(data))
## 
##   FALSE    TRUE 
## 5231732 1084347
prop.table(table(is.na(data)))
## 
##     FALSE      TRUE 
## 0.8283196 0.1716804

Here I change all the values into the ones stated in the documentation.

# Translate each PROPDMGEXP code into its multiplier with an exact-match
# lookup. Chained gsub() calls are unsafe here: a later pattern such as "1"
# also rewrites the digits produced by an earlier replacement ("1000" became
# "10000"), which inflated the k/m/b multipliers tenfold, and "h"/"2" were
# mapped to 1000 instead of 100.
# Mapping: "-", "+", "?" and "0" -> 0; digit d in 1..8 -> 10^d;
# h -> 100; k -> 1000; m -> 1e6; b -> 1e9.
# Multipliers are stored as character so the column type stays unchanged.
prop_exp_multiplier <- c(
  "-" = "0", "+" = "0", "?" = "0", "0" = "0",
  "1" = "10",
  "2" = "100", "h" = "100",
  "3" = "1000", "k" = "1000",
  "4" = "10000",
  "5" = "100000",
  "6" = "1000000", "m" = "1000000",
  "7" = "10000000",
  "8" = "100000000",
  "b" = "1000000000"
)
prop_exp <- as.character(data$PROPDMGEXP)
known_code <- prop_exp %in% names(prop_exp_multiplier)
prop_exp[known_code] <- prop_exp_multiplier[prop_exp[known_code]]
# A missing exponent is given a multiplier of 0.
prop_exp[is.na(prop_exp)] <- "0"
data$PROPDMGEXP <- prop_exp

# NOTE(review): table output rendered before this fix shows the inflated
# multipliers; re-knit the report to refresh it.
table(data$PROPDMGEXP)
## 
##           0          10       10000      100000    10000000   100000000 
##      466164          25      424693          28       11346           1 
## 10000000000 
##          40
# Replace each CROPDMGEXP code by its multiplier. The substitutions run in
# the order listed, one gsub() pass per pattern.
crop_patterns <- c("\\-|\\+|\\?|0", "k", "m", "b", "2")
crop_values <- c("0", "1000", "1000000", "1000000000", "100")
crop_exp <- data$CROPDMGEXP
for (i in seq_along(crop_patterns)) {
  crop_exp <- gsub(crop_patterns[i], crop_values[i], crop_exp)
}
# A missing exponent is given a multiplier of 0.
data$CROPDMGEXP <- ifelse(is.na(crop_exp), 0, crop_exp)

table(data$CROPDMGEXP)
## 
##          0        100       1000    1000000 1000000000 
##     618439          1     281853       1995          9

Compute the property damage value and the crop damage value.

# Damage value = reported figure x exponent multiplier. Both operands are
# stored as character at this point, hence the as.numeric() conversions.
data$PROPDMG_VALUE <- with(data, as.numeric(PROPDMGEXP) * as.numeric(PROPDMG))
data$CROPDMG_VALUE <- with(data, as.numeric(CROPDMG) * as.numeric(CROPDMGEXP))

# Inspect the resulting structure.
str(data)
## 'data.frame':    902297 obs. of  9 variables:
##  $ EVTYPE       : chr  "tornado" "tornado" "tornado" "tornado" ...
##  $ FATALITIES   : chr  "0" "0" "0" "0" ...
##  $ INJURIES     : chr  "15" "0" "2" "2" ...
##  $ PROPDMG      : chr  "25" "2.5" "25" "2.5" ...
##  $ PROPDMGEXP   : chr  "10000" "10000" "10000" "10000" ...
##  $ CROPDMG      : chr  "0" "0" "0" "0" ...
##  $ CROPDMGEXP   : chr  "0" "0" "0" "0" ...
##  $ PROPDMG_VALUE: num  250000 25000 250000 25000 25000 25000 25000 25000 250000 250000 ...
##  $ CROPDMG_VALUE: num  0 0 0 0 0 0 0 0 0 0 ...

Aggregate the data by event and prepare the data for plotting.

# Total one measure per group and rank the groups by that total.
#
# value_col: name of the column in `data` to sum (coerced with as.numeric()).
# group_col: name of the grouping column in `data`.
# Returns a data frame with the columns <group_col> and <value_col>, sorted
# by descending total. The group column is a factor whose levels follow the
# same order, so a discrete ggplot axis shows the largest total first.
# Rows with NA in either column are dropped by aggregate()'s formula method.
sum_by_group <- function(value_col, group_col = "EVTYPE") {
  totals <- aggregate(
    as.formula(paste0("as.numeric(", value_col, ") ~ ", group_col)),
    data = data, FUN = sum
  )
  names(totals)[2] <- value_col
  totals <- totals[order(totals[[value_col]], decreasing = TRUE), ]
  # The rows are already sorted, so their current order is the level order.
  totals[[group_col]] <- factor(totals[[group_col]], levels = totals[[group_col]])
  totals
}

fatalities <- sum_by_group("FATALITIES")
injuries <- sum_by_group("INJURIES")
propdmgval <- sum_by_group("PROPDMG_VALUE")
cropdmgval <- sum_by_group("CROPDMG_VALUE")

Results

Here we can see that tornados cause the most fatalities and injuries.

# Theme shared by both charts: visible axis lines, black axis text and
# titles, and x labels rotated 45 degrees so long event names stay readable.
health_bar_theme <- theme(
  axis.line = element_line(),
  axis.text = element_text(color = "black"),
  axis.title = element_text(colour = "black"),
  legend.text = element_text(),
  legend.title = element_text(),
  axis.text.x = element_text(angle = 45, hjust = 1)
)

# Ten event types with the highest fatality totals.
top_fatalities <- head(fatalities, 10)
plot1 <- ggplot(top_fatalities, aes(x = EVTYPE, y = FATALITIES, fill = EVTYPE)) +
  geom_bar(stat = "identity", color = "black") +
  health_bar_theme +
  labs(x = "", y = "Total Number of Fatalities", title = "Worst 10 Events by Fatalities")

# Ten event types with the highest injury totals.
top_injuries <- head(injuries, 10)
plot2 <- ggplot(top_injuries, aes(x = EVTYPE, y = INJURIES, fill = EVTYPE)) +
  geom_bar(stat = "identity", color = "black") +
  health_bar_theme +
  labs(x = "", y = "Total Number of Injuries", title = "Worst 10 Events by Injuries")

# Show the two charts side by side.
grid.arrange(plot1, plot2, ncol = 2)

Next we can see that floods and droughts cause the most property damage and crop damage respectively.

# Theme shared by both charts: visible axis lines, black axis text and
# titles, and x labels rotated 45 degrees so long event names stay readable.
damage_bar_theme <- theme(
  axis.line = element_line(),
  axis.text = element_text(color = "black"),
  axis.title = element_text(colour = "black"),
  legend.text = element_text(),
  legend.title = element_text(),
  axis.text.x = element_text(angle = 45, hjust = 1)
)

# Ten event types with the highest property damage totals.
top_propdmg <- head(propdmgval, 10)
plot3 <- ggplot(top_propdmg, aes(x = EVTYPE, y = PROPDMG_VALUE, fill = EVTYPE)) +
  geom_bar(stat = "identity", color = "black") +
  damage_bar_theme +
  labs(x = "", y = "", title = "Worst 10 Events by Prop Damage")

# Ten event types with the highest crop damage totals.
top_cropdmg <- head(cropdmgval, 10)
plot4 <- ggplot(top_cropdmg, aes(x = EVTYPE, y = CROPDMG_VALUE, fill = EVTYPE)) +
  geom_bar(stat = "identity", color = "black") +
  damage_bar_theme +
  labs(x = "", y = "", title = "Worst 10 Events by Crop Damage")

# Show the two charts side by side.
grid.arrange(plot3, plot4, ncol = 2)

Here we see the combined values for the public health figures.

# Keep the event types that appear in both top-20 lists and reshape to long
# format: one row per (EVTYPE, variable) with the count in `value`.
fata_inj <- merge(head(fatalities, 20), head(injuries, 20), by = "EVTYPE")
SH <- melt(fata_inj, id.vars = "EVTYPE")

# Look shared by both panels.
combined_theme <- theme(
  axis.line = element_line(),
  axis.text = element_text(color = "black"),
  axis.title = element_text(colour = "black"),
  legend.text = element_text(),
  legend.title = element_text()
)

# Log-scale panel. The bars are dodged, not stacked: stacking log10 values
# adds logarithms, so a stacked bar's length is not the log of any total.
# The +1 keeps zero counts finite.
plot1 <- ggplot(SH, aes(EVTYPE, log10(value + 1), fill = variable)) +
  geom_bar(stat = "identity", position = "dodge", color = "black") +
  xlab("") +
  geom_text(aes(label = value, group = variable),
            position = position_dodge(width = 0.9), hjust = 2) +
  scale_y_continuous(name = "Number of Fatalities & Injuries (Log10 Scale)") +
  coord_flip() +
  combined_theme

# Linear-scale panel: stacking is meaningful here (fatalities + injuries).
plot2 <- ggplot(SH, aes(EVTYPE, value, fill = variable)) +
  geom_bar(stat = "identity", position = "stack", color = "black") +
  xlab("") +
  scale_y_continuous(name = "Number of Fatalities & Injuries") +
  coord_flip() +
  combined_theme

grid.arrange(plot1, plot2, ncol = 1)

Next we see the combined values for the economic figures.

# Keep the event types that appear in both top-20 lists and reshape to long
# format: one row per (EVTYPE, variable) with the damage value in `value`.
prop_crop <- merge(head(propdmgval, 20), head(cropdmgval, 20), by = "EVTYPE")
SH <- melt(prop_crop, id.vars = "EVTYPE")

# Look shared by both panels.
combined_theme <- theme(
  axis.line = element_line(),
  axis.text = element_text(color = "black"),
  axis.title = element_text(colour = "black"),
  legend.text = element_text(),
  legend.title = element_text()
)

# Log-scale panel. The bars are dodged, not stacked: stacking log10 values
# adds logarithms, so a stacked bar's length is not the log of any total.
# The text labels show the value in millions (value / 1e6), rounded.
plot3 <- ggplot(SH, aes(EVTYPE, log10(value + 1), fill = variable)) +
  geom_bar(stat = "identity", position = "dodge", color = "black") +
  xlab("") +
  geom_text(aes(label = round(value / 1000000), group = variable),
            position = position_dodge(width = 0.9), hjust = 2) +
  scale_y_continuous(name = "Damage to Properties and Crop (Log10 Scale)") +
  coord_flip() +
  combined_theme

# Linear-scale panel: stacking is meaningful here (property + crop damage).
plot4 <- ggplot(SH, aes(EVTYPE, value, fill = variable)) +
  geom_bar(stat = "identity", position = "stack", color = "black") +
  xlab("") +
  scale_y_continuous(name = "Damage to Properties and Crop") +
  coord_flip() +
  combined_theme

grid.arrange(plot3, plot4, ncol = 1)


Combining labels

# Collapse the free-text EVTYPE labels into broad EVENT categories.
# Each rule is c(category, regular expression). Rules run top to bottom and
# a later match overwrites an earlier one, so the order is significant.
# Rows that match no rule keep EVENT = NA.
#
# The "High seas" rule is tightened relative to a plain substring match:
#   - "\\bseas\\b" matches the word "seas" only, so labels containing
#     "season" (e.g. "unseasonably warm") are no longer caught;
#   - "wave" is skipped when preceded by "heat " or "cold ", so heat/cold
#     waves keep their temperature category.
event_rules <- list(
  c("Precipitation & Fog", "precipitation|rain|hail|drizzle|wet|percip|burst|depression|fog|wall cloud"),
  c("Wind & Storm", "wind|storm|wnd|hurricane|typhoon"),
  c("Landslide & Erosion", "slide|erosion|slump"),
  c("Heat & Drought", "warmth|warm|heat|dry|hot|drought|thermia|temperature record|record temperature|record high"),
  c("Snow & Ice", "cold|cool|ice|icy|frost|freeze|snow|winter|wintry|wintery|blizzard|chill|freezing|avalanche|glaze|sleet"),
  c("Flooding & High Surf", "flood|surf|blow-out|swells|fld|dam break"),
  c("High seas", "\\bseas\\b|high water|tide|tsunami|(?<!heat )(?<!cold )wave|current|marine|drowning"),
  c("Dust & Saharan winds", "dust|saharan"),
  c("Thunderstorm & Lightning", "tstm|thunderstorm|lightning"),
  c("Tornado", "tornado|spout|funnel|whirlwind"),
  c("Fire & Volcanic activity", "fire|smoke|volcanic")
)

# Start from a clean column so re-running this chunk gives the same result.
data$EVENT <- NA_character_
for (rule in event_rules) {
  # perl = TRUE is required for the lookbehind assertions in the seas rule.
  data[grepl(rule[2], data$EVTYPE, perl = TRUE), "EVENT"] <- rule[1]
}

# NOTE(review): table output rendered before this fix counted the
# "season" and heat/cold-wave labels under "High seas"; re-knit to refresh it.
table(data$EVENT)
## 
##     Dust & Saharan winds Fire & Volcanic activity     Flooding & High Surf 
##                      588                     4290                    87197 
##           Heat & Drought                High seas      Landslide & Erosion 
##                     5568                     2242                      653 
##      Precipitation & Fog               Snow & Ice Thunderstorm & Lightning 
##                   302685                    47322                   352567 
##                  Tornado             Wind & Storm 
##                    71544                    27402

Aggregate the data by event

# For each measure, total it per EVENT category, sort the categories by
# descending total and turn EVENT into a factor whose levels follow that
# order. Rows whose EVENT is NA are dropped by aggregate()'s formula method.
event_measures <- c("FATALITIES", "INJURIES", "PROPDMG_VALUE", "CROPDMG_VALUE")
event_totals <- lapply(event_measures, function(measure) {
  summed <- aggregate(
    as.formula(paste0("as.numeric(", measure, ") ~ EVENT")),
    data = data, FUN = sum
  )
  names(summed)[2] <- measure
  summed <- summed[order(summed[[measure]], decreasing = TRUE), ]
  level_order <- summed$EVENT[order(summed[[measure]], decreasing = TRUE)]
  summed$EVENT <- factor(summed$EVENT, levels = level_order)
  summed
})
names(event_totals) <- event_measures

fatalities <- event_totals[["FATALITIES"]]
injuries <- event_totals[["INJURIES"]]
propdmgval <- event_totals[["PROPDMG_VALUE"]]
cropdmgval <- event_totals[["CROPDMG_VALUE"]]

Results

Here we can see that tornados cause the most fatalities and injuries.

# Theme shared by both charts: visible axis lines, black axis text and
# titles. The x tick marks and labels are hidden, so categories are
# identified through the fill legend.
grouped_health_theme <- theme(
  axis.line = element_line(),
  axis.text = element_text(color = "black"),
  axis.title = element_text(colour = "black"),
  legend.text = element_text(),
  legend.title = element_text(),
  axis.ticks.x = element_blank(),
  axis.text.x = element_blank()
)

# Up to ten categories with the highest fatality totals.
top_fatalities <- head(fatalities, 10)
plot1 <- ggplot(top_fatalities, aes(x = EVENT, y = FATALITIES, fill = EVENT)) +
  geom_bar(stat = "identity", color = "black") +
  grouped_health_theme +
  labs(x = "", y = "Total Number of Fatalities", title = "Worst 10 Events by Fatalities")

# Up to ten categories with the highest injury totals.
top_injuries <- head(injuries, 10)
plot2 <- ggplot(top_injuries, aes(x = EVENT, y = INJURIES, fill = EVENT)) +
  geom_bar(stat = "identity", color = "black") +
  grouped_health_theme +
  labs(x = "", y = "Total Number of Injuries", title = "Worst 10 Events by Injuries")

# Show the two charts side by side.
grid.arrange(plot1, plot2, ncol = 2)

Next we can see that floods and droughts cause the most property damage and crop damage respectively.

# Theme shared by both charts: visible axis lines, black axis text and
# titles. The x tick marks and labels are hidden, so categories are
# identified through the fill legend.
grouped_damage_theme <- theme(
  axis.line = element_line(),
  axis.text = element_text(color = "black"),
  axis.title = element_text(colour = "black"),
  legend.text = element_text(),
  legend.title = element_text(),
  axis.ticks.x = element_blank(),
  axis.text.x = element_blank()
)

# Up to ten categories with the highest property damage totals.
top_propdmg <- head(propdmgval, 10)
plot3 <- ggplot(top_propdmg, aes(x = EVENT, y = PROPDMG_VALUE, fill = EVENT)) +
  geom_bar(stat = "identity", color = "black") +
  grouped_damage_theme +
  labs(x = "", y = "", title = "Worst 10 Events by Prop Damage")

# Up to ten categories with the highest crop damage totals.
top_cropdmg <- head(cropdmgval, 10)
plot4 <- ggplot(top_cropdmg, aes(x = EVENT, y = CROPDMG_VALUE, fill = EVENT)) +
  geom_bar(stat = "identity", color = "black") +
  grouped_damage_theme +
  labs(x = "", y = "", title = "Worst 10 Events by Crop Damage")

# Show the two charts side by side.
grid.arrange(plot3, plot4, ncol = 2)

Here we see the combined values for the public health figures.

# Keep the categories that appear in both top-20 lists and reshape to long
# format: one row per (EVENT, variable) with the count in `value`.
fata_inj <- merge(head(fatalities, 20), head(injuries, 20), by = "EVENT")
SH <- melt(fata_inj, id.vars = "EVENT")

# Look shared by both panels.
combined_theme <- theme(
  axis.line = element_line(),
  axis.text = element_text(color = "black"),
  axis.title = element_text(colour = "black"),
  legend.text = element_text(),
  legend.title = element_text()
)

# Log-scale panel. The bars are dodged, not stacked: stacking log10 values
# adds logarithms, so a stacked bar's length is not the log of any total.
# The +1 keeps zero counts finite.
plot1 <- ggplot(SH, aes(EVENT, log10(value + 1), fill = variable)) +
  geom_bar(stat = "identity", position = "dodge", color = "black") +
  xlab("") +
  geom_text(aes(label = value, group = variable),
            position = position_dodge(width = 0.9), hjust = 2) +
  scale_y_continuous(name = "Number of Fatalities & Injuries (Log10 Scale)") +
  coord_flip() +
  combined_theme

# Linear-scale panel: stacking is meaningful here (fatalities + injuries).
plot2 <- ggplot(SH, aes(EVENT, value, fill = variable)) +
  geom_bar(stat = "identity", position = "stack", color = "black") +
  xlab("") +
  scale_y_continuous(name = "Number of Fatalities & Injuries") +
  coord_flip() +
  combined_theme

grid.arrange(plot1, plot2, ncol = 1)

Next we see the combined values for the economic figures.

# Keep the categories that appear in both top-20 lists and reshape to long
# format: one row per (EVENT, variable) with the damage value in `value`.
prop_crop <- merge(head(propdmgval, 20), head(cropdmgval, 20), by = "EVENT")
SH <- melt(prop_crop, id.vars = "EVENT")

# Look shared by both panels.
combined_theme <- theme(
  axis.line = element_line(),
  axis.text = element_text(color = "black"),
  axis.title = element_text(colour = "black"),
  legend.text = element_text(),
  legend.title = element_text()
)

# Log-scale panel. The bars are dodged, not stacked: stacking log10 values
# adds logarithms, so a stacked bar's length is not the log of any total.
# The text labels show the value in millions (value / 1e6), the same unit as
# the per-EVTYPE damage chart; a divisor of 100000 would be ten times too big.
plot3 <- ggplot(SH, aes(EVENT, log10(value + 1), fill = variable)) +
  geom_bar(stat = "identity", position = "dodge", color = "black") +
  xlab("") +
  geom_text(aes(label = round(value / 1000000), group = variable),
            position = position_dodge(width = 0.9), hjust = 2) +
  scale_y_continuous(name = "Damage to Properties and Crop (Log10 Scale)") +
  coord_flip() +
  combined_theme

# Linear-scale panel: stacking is meaningful here (property + crop damage).
plot4 <- ggplot(SH, aes(EVENT, value, fill = variable)) +
  geom_bar(stat = "identity", position = "stack", color = "black") +
  xlab("") +
  scale_y_continuous(name = "Damage to Properties and Crop") +
  coord_flip() +
  combined_theme

grid.arrange(plot3, plot4, ncol = 1)