The goal of this project (the second project of the Reproducible Research course on Coursera) is to explore the U.S. NOAA (National Oceanic and Atmospheric Administration) Storm Database and examine the effects of severe weather events on both population health and the economy (economic consequences).
The analysis aims to investigate which types of severe weather events are most harmful to population health, measured by injuries and fatalities. Further, the economic consequences will be analyzed by exploring the financial damage done to both general property and agriculture (i.e. crops).
The database covers the time period between 1950 and November 2011. In the earlier years of the database there are generally fewer events recorded, most likely due to a lack of good records. More recent years should be considered more complete.
The database contains information about storms and other significant weather phenomena having sufficient intensity to cause loss of life, injuries, significant property damage, and/or disruption to commerce. The database also contains information on other significant meteorological events, such as snow flurries or precipitation, that occur in connection with another event.
We load the required libraries and the database.
# Load Libraries
library(plyr)
library(tidyr)
library(ggplot2)
library(gridExtra)
library(grid)
# Load datasets
## The project root is the current working directory
dir.principal <- getwd()
# Load Storm Data: the bzip2-compressed CSV lives in the Data/ subfolder
dir <- file.path(dir.principal, "Data", "StormData.csv.bz2")
frame <- read.csv(file = bzfile(dir), header = TRUE)
We show the size of the database.
# Explore Data frame: number of rows and columns of the raw storm data
dim(frame)
## [1] 902297 37
We show the name of the variables in the database.
# Column names of the raw storm data
names(frame)
## [1] "STATE__" "BGN_DATE" "BGN_TIME" "TIME_ZONE" "COUNTY"
## [6] "COUNTYNAME" "STATE" "EVTYPE" "BGN_RANGE" "BGN_AZI"
## [11] "BGN_LOCATI" "END_DATE" "END_TIME" "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE" "END_AZI" "END_LOCATI" "LENGTH" "WIDTH"
## [21] "F" "MAG" "FATALITIES" "INJURIES" "PROPDMG"
## [26] "PROPDMGEXP" "CROPDMG" "CROPDMGEXP" "WFO" "STATEOFFIC"
## [31] "ZONENAMES" "LATITUDE" "LONGITUDE" "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS" "REFNUM"
Because more recent years are more complete than the early years of the record (which begins in 1950), we work only with data from 1990 onward. Then we transform the variables “PROPDMGEXP” and “CROPDMGEXP” into numerical multipliers expressed in base 10.
# Only get Data from 1990
# BGN_DATE is "/"-separated (month/day/year) with the year followed by a
# space and a time; pull out the year field and compare it as a number
# instead of relying on string comparison.
begin_year <- as.integer(
  sub("^[^/]*/[^/]*/([^/ ]*).*$", "\\1", as.character(frame$BGN_DATE))
)
# which() drops rows whose year could not be parsed (NA)
frame2 <- frame[which(begin_year >= 1990L), ]
# Subset Data frame
position <- c("EVTYPE", "FATALITIES", "INJURIES", "PROPDMG",
              "PROPDMGEXP", "CROPDMG", "CROPDMGEXP")
frame2 <- frame2[, position]
# transform Data exponent
# Explicit lookup from exponent code to multiplier. The multipliers are tied
# to the codes themselves rather than to the position of each code in the
# sorted factor levels, so the result does not depend on locale collation
# order or on which codes happen to be present after filtering.
# Two parallel vectors are used (instead of a named vector) because the
# empty-string code cannot be looked up by name; match() handles it.
# NOTE(review): code "0" maps to a multiplier of 0 while "1" maps to 10;
# this reproduces the original mapping - confirm it is intended.
exponent_codes <- c(
  "", "-", "?", "+",
  "0", "1", "2", "3", "4", "5", "6", "7", "8",
  "h", "H", "k", "K", "m", "M", "B"
)
exponent_multipliers <- c(
  "1", "0", "0", "0",
  "0", "10", "100", "1000", "10000", "100000", "1000000", "10000000",
  "100000000",
  "100", "100", "1000", "1000", "1000000", "1000000", "1000000000"
)
# Translate a vector of exponent codes into multiplier strings, failing
# loudly on any code that is missing from the lookup table.
multiplier_for <- function(codes) {
  idx <- match(codes, exponent_codes)
  if (anyNA(idx)) {
    stop(
      "Unrecognised damage exponent code(s): ",
      paste(unique(codes[is.na(idx)]), collapse = ", "),
      call. = FALSE
    )
  }
  exponent_multipliers[idx]
}
# Rebuild the factors so only codes actually present remain as levels, then
# rename the levels; codes sharing a multiplier are merged into one level.
frame2$PROPDMGEXP <- as.factor(as.character(frame2$PROPDMGEXP))
frame2$CROPDMGEXP <- as.factor(as.character(frame2$CROPDMGEXP))
levels(frame2$PROPDMGEXP) <- multiplier_for(levels(frame2$PROPDMGEXP))
levels(frame2$CROPDMGEXP) <- multiplier_for(levels(frame2$CROPDMGEXP))
We keep only the variables we need for the analysis, deriving the damage and casualty totals and summing them by event type.
# Create New Variables
# Damage amount = reported figure multiplied by its (already recoded)
# exponent multiplier.
frame2$PropertyDamage <- with(
  frame2,
  as.numeric(as.character(PROPDMG)) * as.numeric(as.character(PROPDMGEXP))
)
frame2$CropDamage <- with(
  frame2,
  as.numeric(as.character(CROPDMG)) * as.numeric(as.character(CROPDMGEXP))
)
# Rename the first three columns
colnames(frame2)[seq_len(3)] <- c("Event", "Fatality", "Injury")
# Drop columns 4 to 7, the raw damage figures and exponents
frame2 <- frame2[, -seq(4, 7)]
# Combined totals per record
frame2$FatalityInjury <- with(
  frame2,
  as.numeric(as.character(Fatality)) + as.numeric(as.character(Injury))
)
frame2$PropertyCropDamage <- with(
  frame2,
  as.numeric(as.character(PropertyDamage)) +
    as.numeric(as.character(CropDamage))
)
# Create New Frame with the counts of Variables: one row per event type,
# each measure summed over all records of that type
frame3 <- ddply(
  frame2,
  "Event",
  summarize,
  Fatality = sum(Fatality),
  Injury = sum(Injury),
  FatalityInjury = sum(FatalityInjury),
  PropertyDamage = sum(PropertyDamage),
  CropDamage = sum(CropDamage),
  PropertyCropDamage = sum(PropertyCropDamage)
)
We show the size of the new database.
# Explore Data frame: number of rows and columns of the per-event summary
dim(frame3)
## [1] 985 7
We show the name of the variables in the new database.
# Column names of the per-event summary
names(frame3)
## [1] "Event" "Fatality" "Injury"
## [4] "FatalityInjury" "PropertyDamage" "CropDamage"
## [7] "PropertyCropDamage"
We show a part of the new database we created to solve the problem.
# Print rows 5 through 10 of the per-event summary
frame3[5:10,]
## Event Fatality Injury FatalityInjury PropertyDamage
## 5 ACCUMULATED SNOWFALL 0 0 0 0
## 6 AGRICULTURAL FREEZE 0 0 0 0
## 7 APACHE COUNTY 0 0 0 5000
## 8 ASTRONOMICAL HIGH TIDE 0 0 0 9425000
## 9 ASTRONOMICAL LOW TIDE 0 0 0 320000
## 10 AVALANCE 1 0 1 0
## CropDamage PropertyCropDamage
## 5 0 0
## 6 28820000 28820000
## 7 0 5000
## 8 0 9425000
## 9 0 320000
## 10 0 0
We use injuries and fatalities as a representation of population health. Then we plot the results as bar charts.
# Plot population health based on injuries
# head() keeps at most 10 rows, so a summary with fewer than 10 event types
# is not padded with all-NA rows the way `[1:10, ]` would pad it.
injuries <- head(frame3[order(frame3$Injury, decreasing = TRUE), ], 10)
g1 <- ggplot(data = injuries, aes(x = reorder(Event, Injury), y = Injury)) +
  geom_bar(fill = "olivedrab", stat = "identity") +
  coord_flip() +
  xlab("Event Type") +
  ylab("Total Number of Injuries") +
  theme(legend.position = "none")
# Plot population health based on fatalities
fatalities <- head(frame3[order(frame3$Fatality, decreasing = TRUE), ], 10)
g2 <- ggplot(
  data = fatalities,
  aes(x = reorder(Event, Fatality), y = Fatality)
) +
  geom_bar(fill = "orange", stat = "identity") +
  coord_flip() +
  xlab("Event Type") +
  ylab("Total Number of Fatalities") +
  theme(legend.position = "none")
# Plot population health based on injuries and fatalities combined
fatalitiesInjurities <- head(
  frame3[order(frame3$FatalityInjury, decreasing = TRUE), ],
  10
)
g3 <- ggplot(
  data = fatalitiesInjurities,
  aes(x = reorder(Event, FatalityInjury), y = FatalityInjury)
) +
  geom_bar(fill = "red4", stat = "identity") +
  coord_flip() +
  xlab("Event Type") +
  ylab("Total Number of Fatalities and Injuries") +
  ggtitle("Top 10 Most Harmful Events to Population Health") +
  theme(legend.position = "none")
# Stack the three charts: combined total on top, then fatalities, then injuries
grid.arrange(g3, g2, g1, nrow = 3)
Top 10 most harmful events based on injuries, fatalities, and the sum of both.
We print the top 10 most harmful events in a table.
# Side-by-side ranking table: one column of event names per health measure,
# plus the rank position (1 to 10)
populationDamage <- data.frame(
  lapply(
    list(
      Injuries = injuries,
      Fatalities = fatalities,
      FatalitiesInjuries = fatalitiesInjurities
    ),
    function(top) as.character(top$Event)
  ),
  Ranking = seq_len(10)
)
populationDamage
## Injuries Fatalities FatalitiesInjuries Ranking
## 1 TORNADO EXCESSIVE HEAT TORNADO 1
## 2 FLOOD TORNADO EXCESSIVE HEAT 2
## 3 EXCESSIVE HEAT FLASH FLOOD FLOOD 3
## 4 LIGHTNING HEAT LIGHTNING 4
## 5 TSTM WIND LIGHTNING TSTM WIND 5
## 6 HEAT FLOOD HEAT 6
## 7 ICE STORM RIP CURRENT FLASH FLOOD 7
## 8 FLASH FLOOD TSTM WIND ICE STORM 8
## 9 THUNDERSTORM WIND HIGH WIND THUNDERSTORM WIND 9
## 10 WINTER STORM AVALANCHE WINTER STORM 10
We use property damage and crop damage as a representation of economic consequences. Then we plot the results as bar charts.
# Plot economic consequences based on property damage
# head() keeps at most 10 rows, so a summary with fewer than 10 event types
# is not padded with all-NA rows the way `[1:10, ]` would pad it.
properties <- head(
  frame3[order(frame3$PropertyDamage, decreasing = TRUE), ],
  10
)
g1 <- ggplot(
  data = properties,
  aes(x = reorder(Event, PropertyDamage), y = PropertyDamage / 1000000)
) +
  geom_bar(fill = "olivedrab", stat = "identity") +
  coord_flip() +
  xlab("Event Type") +
  ylab("Property Damage in Millions of Dollars") +
  theme(legend.position = "none")
# Plot economic consequences based on crop damage
crops <- head(frame3[order(frame3$CropDamage, decreasing = TRUE), ], 10)
g2 <- ggplot(
  data = crops,
  aes(x = reorder(Event, CropDamage), y = CropDamage / 1000000)
) +
  geom_bar(fill = "orange", stat = "identity") +
  coord_flip() +
  xlab("Event Type") +
  ylab("Crop Damage in Millions of Dollars") +
  theme(legend.position = "none")
# Plot economic consequences based on property and crop damage combined
propertiesCrops <- head(
  frame3[order(frame3$PropertyCropDamage, decreasing = TRUE), ],
  10
)
g3 <- ggplot(
  data = propertiesCrops,
  aes(
    x = reorder(Event, PropertyCropDamage),
    y = PropertyCropDamage / 1000000
  )
) +
  geom_bar(fill = "red4", stat = "identity") +
  coord_flip() +
  xlab("Event Type") +
  ylab("Property and Crop Damage in Millions of Dollars") +
  ggtitle("Top 10 Events with Greatest Economic Consequences") +
  theme(legend.position = "none")
# Plot all together: combined total on top, then crops, then property
grid.arrange(g3, g2, g1, nrow = 3)
Top 10 events with the greatest economic consequences based on property damage, crop damage, and the sum of both.
We print the top 10 events with the greatest economic consequences.
# Side-by-side ranking table: one column of event names per damage measure,
# plus the rank position (1 to 10)
economicDamage <- data.frame(
  lapply(
    list(
      PropertyDamage = properties,
      CropDamage = crops,
      PropertyCropDamage = propertiesCrops
    ),
    function(top) as.character(top$Event)
  ),
  Ranking = seq_len(10)
)
economicDamage
## PropertyDamage CropDamage PropertyCropDamage Ranking
## 1 FLOOD DROUGHT FLOOD 1
## 2 HURRICANE/TYPHOON FLOOD HURRICANE/TYPHOON 2
## 3 STORM SURGE RIVER FLOOD STORM SURGE 3
## 4 TORNADO ICE STORM TORNADO 4
## 5 FLASH FLOOD HAIL HAIL 5
## 6 HAIL HURRICANE FLASH FLOOD 6
## 7 HURRICANE HURRICANE/TYPHOON DROUGHT 7
## 8 TROPICAL STORM FLASH FLOOD HURRICANE 8
## 9 WINTER STORM EXTREME COLD RIVER FLOOD 9
## 10 HIGH WIND FROST/FREEZE ICE STORM 10