Storms and other severe weather events can cause both public health and economic problems for communities and municipalities. Many severe events can result in fatalities, injuries, and property damage, and preventing such outcomes to the extent possible is a key concern.
This project involves exploring the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database. This database tracks characteristics of major storms and weather events in the United States, including when and where they occur, as well as estimates of any fatalities, injuries, and property damage. The data is for the United States for the time period 1950-2011.
The basic goal of this assignment is to explore the NOAA Storm Database and answer the following questions about severe weather events.
Across the United States, which types of events (as indicated in the EVTYPE variable) are most harmful with respect to population health?
Across the United States, which types of events have the greatest economic consequences?
To answer the above questions, an analysis has been performed using the NOAA data. The NOAA data set has been processed to obtain the information needed. This processing includes cleaning the data and plotting the results.
The data used in this analysis can be downloaded from https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2 (a bzip2-compressed CSV file). There is also some documentation of the database available, where you can find out how some of the variables are constructed/defined:
National Weather Service Storm Data Documentation https://d396qusza40orc.cloudfront.net/repdata%2Fpeer2_doc%2Fpd01016005curr.pdf
National Climatic Data Center Storm Events FAQ https://d396qusza40orc.cloudfront.net/repdata%2Fpeer2_doc%2FNCDC%20Storm%20Events-FAQ%20Page.pdf
# Fetch the raw NOAA storm data and load it into `my_data`.
fileURL <- "https://d396qusza40orc.cloudfront.net/repdata/data/StormData.csv.bz2"
# The payload is a bzip2 archive even though it is stored under a .csv name;
# request a binary transfer explicitly so the bytes are written unchanged.
download.file(fileURL, destfile = "StormData.csv", mode = "wb")
# Empty strings and single blanks are read as NA alongside the literal "NA".
my_data <- read.csv(
  "StormData.csv",
  header = TRUE,
  sep = ",",
  na.strings = c("", " ", "NA")
)
First I take all the variables that are needed for the task. Then I transform all the values into lower case.
library(ggplot2)
library(gridExtra)
library(reshape2)
# Keep only the columns used in this analysis.
keep_cols <- c(
  "EVTYPE", "FATALITIES", "INJURIES",
  "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP"
)
data <- my_data[keep_cols]
# Lower-case every kept column; tolower() returns character vectors, so the
# numeric columns are stored as text from here on.
for (col in keep_cols) {
  data[[col]] <- tolower(data[[col]])
}
# Count missing (TRUE) versus present (FALSE) cells.
table(is.na(data))
##
## FALSE TRUE
## 5231732 1084347
# Share of missing (TRUE) versus present (FALSE) cells across all of `data`.
prop.table(table(is.na(data)))
##
## FALSE TRUE
## 0.8283196 0.1716804
Here I convert the damage exponent codes (PROPDMGEXP and CROPDMGEXP) into the numeric multipliers stated in the documentation.
# Translate the property-damage exponent codes into multipliers.
#
# Each code is looked up exactly once. Chaining gsub() calls is unsafe here
# because the replacement text consists of digits that are themselves codes
# (e.g. the "1" in "1000"), so a later substitution would rewrite the result
# of an earlier one.
#
# Mapping used by this report:
#   "-", "+", "?", "0" -> 0          (record contributes no damage)
#   "1".."8"           -> 10^digit   ("2" = 100, "3" = 1000, ...)
#   "h" -> 100, "k" -> 1000, "m" -> 1e6, "b" -> 1e9
prop_exp_multiplier <- c(
  "-" = "0", "+" = "0", "?" = "0", "0" = "0",
  "1" = "10",
  "h" = "100", "2" = "100",
  "k" = "1000", "3" = "1000",
  "4" = "10000",
  "5" = "100000",
  "m" = "1000000", "6" = "1000000",
  "7" = "10000000",
  "8" = "100000000",
  "b" = "1000000000"
)
prop_codes <- tolower(as.character(data$PROPDMGEXP))
# Codes outside the table are left untouched.
prop_known <- prop_codes %in% names(prop_exp_multiplier)
prop_codes[prop_known] <- unname(prop_exp_multiplier[prop_codes[prop_known]])
# A missing exponent is treated as a multiplier of 0.
prop_codes[is.na(prop_codes)] <- "0"
data$PROPDMGEXP <- prop_codes
table(data$PROPDMGEXP)
##
## 0 10 10000 100000 10000000 100000000
## 466164 25 424693 28 11346 1
## 10000000000
## 40
# Translate the crop-damage exponent codes into multipliers by applying the
# substitutions below in order (column 1 = regex, column 2 = replacement).
crop_exp_rules <- rbind(
  c("\\-|\\+|\\?|0", "0"),
  c("k", "1000"),
  c("m", "1000000"),
  c("b", "1000000000"),
  c("2", "100")
)
for (i in seq_len(nrow(crop_exp_rules))) {
  data$CROPDMGEXP <- gsub(
    crop_exp_rules[i, 1],
    crop_exp_rules[i, 2],
    data$CROPDMGEXP
  )
}
# A missing exponent is treated as a multiplier of 0.
data$CROPDMGEXP <- ifelse(is.na(data$CROPDMGEXP), 0, data$CROPDMGEXP)
table(data$CROPDMGEXP)
##
## 0 100 1000 1000000 1000000000
## 618439 1 281853 1995 9
Compute the property damage value and the crop damage value.
# Damage value = reported amount times its multiplier. Both inputs are stored
# as text at this point, so they are converted to numbers first.
data[["PROPDMG_VALUE"]] <- with(
  data,
  as.numeric(PROPDMGEXP) * as.numeric(PROPDMG)
)
data[["CROPDMG_VALUE"]] <- with(
  data,
  as.numeric(CROPDMG) * as.numeric(CROPDMGEXP)
)
# Show the resulting structure, including the two new numeric columns.
str(data)
## 'data.frame': 902297 obs. of 9 variables:
## $ EVTYPE : chr "tornado" "tornado" "tornado" "tornado" ...
## $ FATALITIES : chr "0" "0" "0" "0" ...
## $ INJURIES : chr "15" "0" "2" "2" ...
## $ PROPDMG : chr "25" "2.5" "25" "2.5" ...
## $ PROPDMGEXP : chr "10000" "10000" "10000" "10000" ...
## $ CROPDMG : chr "0" "0" "0" "0" ...
## $ CROPDMGEXP : chr "0" "0" "0" "0" ...
## $ PROPDMG_VALUE: num 250000 25000 250000 25000 25000 25000 25000 25000 250000 250000 ...
## $ CROPDMG_VALUE: num 0 0 0 0 0 0 0 0 0 0 ...
Aggregate the data by event and prepare the data for plotting.
# Sum one measure of `data` per EVTYPE and rank the event types by that total.
#
# measure: name of the column in `data` to total (converted with as.numeric).
# Returns a data frame with columns EVTYPE and <measure>, sorted from the
# largest total to the smallest; EVTYPE is a factor whose levels follow the
# same order, so plots keep the ranking on the axis.
rank_evtype_totals <- function(measure) {
  frame <- data.frame(
    EVTYPE = data$EVTYPE,
    amount = as.numeric(data[[measure]]),
    stringsAsFactors = FALSE
  )
  totals <- aggregate(amount ~ EVTYPE, data = frame, FUN = sum)
  names(totals)[2] <- measure
  totals <- totals[order(totals[[measure]], decreasing = TRUE), ]
  totals$EVTYPE <- factor(
    totals$EVTYPE,
    levels = totals$EVTYPE[order(totals[[measure]], decreasing = TRUE)]
  )
  totals
}

fatalities <- rank_evtype_totals("FATALITIES")
injuries <- rank_evtype_totals("INJURIES")
propdmgval <- rank_evtype_totals("PROPDMG_VALUE")
cropdmgval <- rank_evtype_totals("CROPDMG_VALUE")
Here we can see that tornados cause the most fatalities and injuries.
# Bar charts of the ten event types with the highest fatality and injury
# totals, shown side by side.
evtype_health_theme <- theme(
  axis.line = element_line(),
  axis.text = element_text(color = 'black'),
  axis.title = element_text(colour = 'black'),
  legend.text = element_text(),
  legend.title = element_text(),
  # Event names are long, so tilt them to avoid overlap.
  axis.text.x = element_text(angle = 45, hjust = 1)
)

top_fatalities <- head(fatalities, 10)
plot1 <- ggplot(top_fatalities, aes(x = EVTYPE, y = FATALITIES, fill = EVTYPE)) +
  geom_bar(stat = "identity", color = "black") +
  xlab("") +
  ylab("Total Number of Fatalities") +
  ggtitle("Worst 10 Events by Fatalities") +
  evtype_health_theme

top_injuries <- head(injuries, 10)
plot2 <- ggplot(top_injuries, aes(x = EVTYPE, y = INJURIES, fill = EVTYPE)) +
  geom_bar(stat = "identity", color = "black") +
  xlab("") +
  ylab("Total Number of Injuries") +
  ggtitle("Worst 10 Events by Injuries") +
  evtype_health_theme

grid.arrange(plot1, plot2, ncol = 2)
Next we can see that floods and droughts cause the most property damage and crop damage respectively.
# Bar charts of the ten event types with the highest property and crop damage
# totals, shown side by side.
evtype_damage_theme <- theme(
  axis.line = element_line(),
  axis.text = element_text(color = 'black'),
  axis.title = element_text(colour = 'black'),
  legend.text = element_text(),
  legend.title = element_text(),
  # Event names are long, so tilt them to avoid overlap.
  axis.text.x = element_text(angle = 45, hjust = 1)
)

top_propdmg <- head(propdmgval, 10)
plot3 <- ggplot(top_propdmg, aes(x = EVTYPE, y = PROPDMG_VALUE, fill = EVTYPE)) +
  geom_bar(stat = "identity", color = "black") +
  xlab("") +
  ylab("") +
  ggtitle("Worst 10 Events by Prop Damage") +
  evtype_damage_theme

top_cropdmg <- head(cropdmgval, 10)
plot4 <- ggplot(top_cropdmg, aes(x = EVTYPE, y = CROPDMG_VALUE, fill = EVTYPE)) +
  geom_bar(stat = "identity", color = "black") +
  xlab("") +
  ylab("") +
  ggtitle("Worst 10 Events by Crop Damage") +
  evtype_damage_theme

grid.arrange(plot3, plot4, ncol = 2)
Here we see the combined values for the public health figures.
# Combined fatalities/injuries view per event type.
# merge() keeps only event types present in BOTH top-20 lists.
fata_inj <- merge(head(fatalities, 20), head(injuries, 20), by = "EVTYPE")
# Long format: one row per (EVTYPE, measure) pair, as needed for stacked bars.
SH <- melt(fata_inj[, 1:3], id.vars = "EVTYPE")

evtype_health_stack_theme <- theme(
  axis.line = element_line(),
  axis.text = element_text(color = 'black'),
  axis.title = element_text(colour = 'black'),
  legend.text = element_text(),
  legend.title = element_text()
)

# Log10 view: bar lengths use log10(value + 1) (the +1 keeps zero counts
# defined); the text labels show the untransformed counts.
plot1 <- ggplot(SH, aes(EVTYPE, log10(value + 1), fill = variable)) +
  geom_bar(stat = "identity", position = "stack", color = "black") +
  xlab("") +
  geom_text(aes(label = value), position = "stack", hjust = 2) +
  scale_y_continuous(name = "Number of Fatalities & Injuries (Log10 Scale)") +
  coord_flip() +
  evtype_health_stack_theme

# Linear view of the same data.
plot2 <- ggplot(SH, aes(EVTYPE, value, fill = variable)) +
  geom_bar(stat = "identity", position = "stack", color = "black") +
  xlab("") +
  scale_y_continuous(name = "Number of Fatalities & Injuries") +
  coord_flip() +
  evtype_health_stack_theme

grid.arrange(plot1, plot2, ncol = 1)
Next we see the combined values for the economic figures.
# Combined property/crop damage view per event type.
# merge() keeps only event types present in BOTH top-20 lists.
prop_crop <- merge(head(propdmgval, 20), head(cropdmgval, 20), by = "EVTYPE")
# Long format: one row per (EVTYPE, measure) pair, as needed for stacked bars.
SH <- melt(prop_crop[, 1:3], id.vars = "EVTYPE")

evtype_damage_stack_theme <- theme(
  axis.line = element_line(),
  axis.text = element_text(color = 'black'),
  axis.title = element_text(colour = 'black'),
  legend.text = element_text(),
  legend.title = element_text()
)

# Log10 view: bar lengths use log10(value + 1) (the +1 keeps zero totals
# defined); the text labels show the damage value divided by one million.
plot3 <- ggplot(SH, aes(EVTYPE, log10(value + 1), fill = variable)) +
  geom_bar(stat = "identity", position = "stack", color = "black") +
  xlab("") +
  geom_text(aes(label = round(value / 1000000)), position = "stack", hjust = 2) +
  scale_y_continuous(name = "Damage to Properties and Crop (Log10 Scale)") +
  coord_flip() +
  evtype_damage_stack_theme

# Linear view of the same data.
plot4 <- ggplot(SH, aes(EVTYPE, value, fill = variable)) +
  geom_bar(stat = "identity", position = "stack", color = "black") +
  xlab("") +
  scale_y_continuous(name = "Damage to Properties and Crop") +
  coord_flip() +
  evtype_damage_stack_theme

grid.arrange(plot3, plot4, ncol = 1)
# Collapse the free-text EVTYPE values into broad EVENT categories.
# Each rule is c(<regex matched against EVTYPE>, <category label>). Rules are
# applied top to bottom, so when several patterns match the same row the LAST
# matching rule determines its category. Rows matching no rule keep EVENT = NA.
event_rules <- list(
  c("precipitation|rain|hail|drizzle|wet|percip|burst|depression|fog|wall cloud", "Precipitation & Fog"),
  c("wind|storm|wnd|hurricane|typhoon", "Wind & Storm"),
  c("slide|erosion|slump", "Landslide & Erosion"),
  c("warmth|warm|heat|dry|hot|drought|thermia|temperature record|record temperature|record high", "Heat & Drought"),
  c("cold|cool|ice|icy|frost|freeze|snow|winter|wintry|wintery|blizzard|chill|freezing|avalanche|glaze|sleet", "Snow & Ice"),
  c("flood|surf|blow-out|swells|fld|dam break", "Flooding & High Surf"),
  c("seas|high water|tide|tsunami|wave|current|marine|drowning", "High seas"),
  c("dust|saharan", "Dust & Saharan winds"),
  c("tstm|thunderstorm|lightning", "Thunderstorm & Lightning"),
  c("tornado|spout|funnel|whirlwind", "Tornado"),
  c("fire|smoke|volcanic", "Fire & Volcanic activity")
)
for (rule in event_rules) {
  data[grepl(rule[1], data$EVTYPE), "EVENT"] <- rule[2]
}
# Number of records per category.
table(data$EVENT)
##
## Dust & Saharan winds Fire & Volcanic activity Flooding & High Surf
## 588 4290 87197
## Heat & Drought High seas Landslide & Erosion
## 5568 2242 653
## Precipitation & Fog Snow & Ice Thunderstorm & Lightning
## 302685 47322 352567
## Tornado Wind & Storm
## 71544 27402
# Sum one measure of `data` per EVENT category and rank the categories by that
# total.
#
# measure: name of the column in `data` to total (converted with as.numeric).
# Returns a data frame with columns EVENT and <measure>, sorted from the
# largest total to the smallest; EVENT is a factor whose levels follow the
# same order, so plots keep the ranking on the axis.
rank_event_totals <- function(measure) {
  frame <- data.frame(
    EVENT = data$EVENT,
    amount = as.numeric(data[[measure]]),
    stringsAsFactors = FALSE
  )
  totals <- aggregate(amount ~ EVENT, data = frame, FUN = sum)
  names(totals)[2] <- measure
  totals <- totals[order(totals[[measure]], decreasing = TRUE), ]
  totals$EVENT <- factor(
    totals$EVENT,
    levels = totals$EVENT[order(totals[[measure]], decreasing = TRUE)]
  )
  totals
}

fatalities <- rank_event_totals("FATALITIES")
injuries <- rank_event_totals("INJURIES")
propdmgval <- rank_event_totals("PROPDMG_VALUE")
cropdmgval <- rank_event_totals("CROPDMG_VALUE")
Here we can see that tornados cause the most fatalities and injuries.
# Bar charts of the (up to ten) event categories with the highest fatality and
# injury totals, shown side by side.
event_health_theme <- theme(
  axis.line = element_line(),
  axis.text = element_text(color = 'black'),
  axis.title = element_text(colour = 'black'),
  legend.text = element_text(),
  legend.title = element_text(),
  # Categories are identified by the fill legend, so the x axis is left bare.
  axis.ticks.x = element_blank(),
  axis.text.x = element_blank()
)

top_fatalities <- head(fatalities, 10)
plot1 <- ggplot(top_fatalities, aes(x = EVENT, y = FATALITIES, fill = EVENT)) +
  geom_bar(stat = "identity", color = "black") +
  xlab("") +
  ylab("Total Number of Fatalities") +
  ggtitle("Worst 10 Events by Fatalities") +
  event_health_theme

top_injuries <- head(injuries, 10)
plot2 <- ggplot(top_injuries, aes(x = EVENT, y = INJURIES, fill = EVENT)) +
  geom_bar(stat = "identity", color = "black") +
  xlab("") +
  ylab("Total Number of Injuries") +
  ggtitle("Worst 10 Events by Injuries") +
  event_health_theme

grid.arrange(plot1, plot2, ncol = 2)
Next we can see that floods and droughts cause the most property damage and crop damage respectively.
# Bar charts of the (up to ten) event categories with the highest property and
# crop damage totals, shown side by side.
event_damage_theme <- theme(
  axis.line = element_line(),
  axis.text = element_text(color = 'black'),
  axis.title = element_text(colour = 'black'),
  legend.text = element_text(),
  legend.title = element_text(),
  # Categories are identified by the fill legend, so the x axis is left bare.
  axis.ticks.x = element_blank(),
  axis.text.x = element_blank()
)

top_propdmg <- head(propdmgval, 10)
plot3 <- ggplot(top_propdmg, aes(x = EVENT, y = PROPDMG_VALUE, fill = EVENT)) +
  geom_bar(stat = "identity", color = "black") +
  xlab("") +
  ylab("") +
  ggtitle("Worst 10 Events by Prop Damage") +
  event_damage_theme

top_cropdmg <- head(cropdmgval, 10)
plot4 <- ggplot(top_cropdmg, aes(x = EVENT, y = CROPDMG_VALUE, fill = EVENT)) +
  geom_bar(stat = "identity", color = "black") +
  xlab("") +
  ylab("") +
  ggtitle("Worst 10 Events by Crop Damage") +
  event_damage_theme

grid.arrange(plot3, plot4, ncol = 2)
Here we see the combined values for the public health figures.
# Combined fatalities/injuries view per event category.
# merge() keeps only categories present in both inputs.
fata_inj <- merge(head(fatalities, 20), head(injuries, 20), by = "EVENT")
# Long format: one row per (EVENT, measure) pair, as needed for stacked bars.
SH <- melt(fata_inj[, 1:3], id.vars = "EVENT")

event_health_stack_theme <- theme(
  axis.line = element_line(),
  axis.text = element_text(color = 'black'),
  axis.title = element_text(colour = 'black'),
  legend.text = element_text(),
  legend.title = element_text()
)

# Log10 view: bar lengths use log10(value + 1) (the +1 keeps zero counts
# defined); the text labels show the untransformed counts.
plot1 <- ggplot(SH, aes(EVENT, log10(value + 1), fill = variable)) +
  geom_bar(stat = "identity", position = "stack", color = "black") +
  xlab("") +
  geom_text(aes(label = value), position = "stack", hjust = 2) +
  scale_y_continuous(name = "Number of Fatalities & Injuries (Log10 Scale)") +
  coord_flip() +
  event_health_stack_theme

# Linear view of the same data.
plot2 <- ggplot(SH, aes(EVENT, value, fill = variable)) +
  geom_bar(stat = "identity", position = "stack", color = "black") +
  xlab("") +
  scale_y_continuous(name = "Number of Fatalities & Injuries") +
  coord_flip() +
  event_health_stack_theme

grid.arrange(plot1, plot2, ncol = 1)
Next we see the combined values for the economic figures.
# Combined property/crop damage view per event category.
# merge() keeps only categories present in both inputs.
prop_crop <- merge(head(propdmgval, 20), head(cropdmgval, 20), by = "EVENT")
# Long format: one row per (EVENT, measure) pair, as needed for stacked bars.
SH <- melt(prop_crop[, 1:3], id.vars = "EVENT")

event_damage_stack_theme <- theme(
  axis.line = element_line(),
  axis.text = element_text(color = 'black'),
  axis.title = element_text(colour = 'black'),
  legend.text = element_text(),
  legend.title = element_text()
)

# Log10 view: bar lengths use log10(value + 1) (the +1 keeps zero totals
# defined); the text labels show the damage value divided by one million,
# the same unit as the per-EVTYPE version of this chart.
plot3 <- ggplot(SH, aes(EVENT, log10(value + 1), fill = variable)) +
  geom_bar(stat = "identity", position = "stack", color = "black") +
  xlab("") +
  geom_text(aes(label = round(value / 1000000)), position = "stack", hjust = 2) +
  scale_y_continuous(name = "Damage to Properties and Crop (Log10 Scale)") +
  coord_flip() +
  event_damage_stack_theme

# Linear view of the same data.
plot4 <- ggplot(SH, aes(EVENT, value, fill = variable)) +
  geom_bar(stat = "identity", position = "stack", color = "black") +
  xlab("") +
  scale_y_continuous(name = "Damage to Properties and Crop") +
  coord_flip() +
  event_damage_stack_theme

grid.arrange(plot3, plot4, ncol = 1)