Synopsis

The goal of this project (the second project of the Reproducible Research course from Coursera) is to explore the U.S. National Oceanic and Atmospheric Administration’s (NOAA) Storm Database and examine the effects of severe weather events on both population health and the economy.

The analysis investigates which types of severe weather events are most harmful to population health, measured by injuries and fatalities. It then analyzes the economic consequences by exploring the financial damage done to both property and agriculture (i.e. crops).

Database

The database covers the time period between 1950 and November 2011. In the earlier years of the database there are generally fewer events recorded, most likely due to a lack of good records. More recent years should be considered more complete.

The database contains information about storms and other significant weather phenomena with sufficient intensity to cause loss of life, injuries, significant property damage, and/or disruption to commerce. It also contains information about other significant meteorological events, such as snow flurries or precipitation, that occur in connection with another event.

Data Processing

We load the required libraries and the database.

# Load Libraries
library(plyr)
library(tidyr)
library(ggplot2)
library(gridExtra)
library(grid)


# Load datasets
## Project root: the data file is looked up relative to the current working
## directory, so the script must be run from the folder that contains "Data".
dir.principal <- getwd()


# Load Storm Data
## file.path() joins the pieces with "/" and yields the same path the earlier
## paste()-based construction produced. bzfile() opens the bzip2 archive so
## read.csv() can read the compressed CSV directly.
dir <- file.path(dir.principal, "Data", "StormData.csv.bz2")
frame <- read.csv(bzfile(dir), header = TRUE)

We show the size of the database.

# Explore Data frame
# dim() reports the number of rows and columns of the raw data frame
dim(frame)
## [1] 902297     37

We show the name of the variables in the database.

# List the column names of the raw data frame
names(frame)
##  [1] "STATE__"    "BGN_DATE"   "BGN_TIME"   "TIME_ZONE"  "COUNTY"    
##  [6] "COUNTYNAME" "STATE"      "EVTYPE"     "BGN_RANGE"  "BGN_AZI"   
## [11] "BGN_LOCATI" "END_DATE"   "END_TIME"   "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE"  "END_AZI"    "END_LOCATI" "LENGTH"     "WIDTH"     
## [21] "F"          "MAG"        "FATALITIES" "INJURIES"   "PROPDMG"   
## [26] "PROPDMGEXP" "CROPDMG"    "CROPDMGEXP" "WFO"        "STATEOFFIC"
## [31] "ZONENAMES"  "LATITUDE"   "LONGITUDE"  "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS"    "REFNUM"

Because the more recent years in the database should be considered more complete, we work only with data from 1990 onwards. Then we transform the variables “PROPDMGEXP” and “CROPDMGEXP” into numerical multipliers written out in base 10.

# Only get Data from 1990
## Split BGN_DATE on "/" into Month / Day / Year, then split the last piece on
## the space so that Year holds only the year and Hour holds the remaining text.
frame2 <- separate(frame, BGN_DATE, into = c("Month", "Day", "Year"), sep = "/")
frame2 <- separate(frame2, Year, into = c("Year", "Hour"), sep = " ")

## Compare the years as integers: comparing character strings is lexicographic
## and depends on the collation rules of the current locale. which() keeps only
## the rows where the test is TRUE, so rows with an unparseable year are dropped.
position <- which(as.integer(as.character(frame2$Year)) >= 1990L)
frame2 <- frame2[position, ]


# Subset Data frame
## Keep the event type, the two casualty counts and the damage amount/exponent
## pairs for property and crops; every other column is discarded.
analysis_columns <- c(
        "EVTYPE",
        "FATALITIES", "INJURIES",
        "PROPDMG", "PROPDMGEXP",
        "CROPDMG", "CROPDMGEXP"
)

frame2 <- frame2[, analysis_columns]


# transform Data exponent
## Translate the damage-exponent codes into numeric multipliers through an
## explicit code -> multiplier table. Assigning a positional vector to levels()
## only gives the intended result while the factor levels happen to sort in one
## particular order (which depends on the locale and on which codes are present
## after filtering); a lookup by code does not have that dependency.
##
## NOTE(review): the values below reproduce the previous positional assignment
## assuming the levels sorted as "", "-", "?", "+", "0".."8", "B", "h", "H",
## "K", "m", "M" (property) and "", "?", "0", "2", "B", "k", "K", "m", "M"
## (crop) -- confirm with levels() on the raw columns. In particular "0" is
## kept at a multiplier of 0 (not 10^0) so that previously reported totals do
## not change; confirm that this is intended.
exponent_multiplier <- c(
        "-" = 0, "?" = 0, "+" = 0, "0" = 0,
        "1" = 1e1, "2" = 1e2, "3" = 1e3, "4" = 1e4,
        "5" = 1e5, "6" = 1e6, "7" = 1e7, "8" = 1e8,
        "h" = 1e2, "H" = 1e2,
        "k" = 1e3, "K" = 1e3,
        "m" = 1e6, "M" = 1e6,
        "B" = 1e9
)

## Convert a vector of exponent codes to numeric multipliers.
## code: factor or character vector of exponent codes.
## Returns a numeric vector of the same length; stops on an unknown code.
exponent_to_multiplier <- function(code) {
        code <- as.character(code)
        multiplier <- unname(exponent_multiplier[code])
        ## A blank code cannot be used as a vector name, so it is handled
        ## here: the amount is taken as recorded (multiplier 1).
        multiplier[!is.na(code) & code == ""] <- 1
        unknown <- unique(code[is.na(multiplier) & !is.na(code)])
        if (length(unknown) > 0) {
                stop("Unrecognised damage exponent code(s): ",
                     paste(unknown, collapse = ", "), call. = FALSE)
        }
        multiplier
}

frame2$PROPDMGEXP <- exponent_to_multiplier(frame2$PROPDMGEXP)
frame2$CROPDMGEXP <- exponent_to_multiplier(frame2$CROPDMGEXP)

We keep only the variables we need for the analysis.

# Create New Variables
## Damage in dollars = recorded amount times the multiplier stored in the
## matching *DMGEXP column.
frame2[["PropertyDamage"]] <-
        as.numeric(as.character(frame2[["PROPDMG"]])) *
        as.numeric(as.character(frame2[["PROPDMGEXP"]]))
frame2[["CropDamage"]] <-
        as.numeric(as.character(frame2[["CROPDMG"]])) *
        as.numeric(as.character(frame2[["CROPDMGEXP"]]))

## Rename the first three columns, then drop columns 4 to 7 (the raw amount and
## exponent columns), which are no longer needed once the dollar values exist.
colnames(frame2)[1:3] <- c("Event", "Fatality", "Injury")
frame2 <- frame2[, -c(4, 5, 6, 7)]

## Combined totals per record: casualties, and property plus crop damage.
frame2[["FatalityInjury"]] <-
        as.numeric(as.character(frame2[["Fatality"]])) +
        as.numeric(as.character(frame2[["Injury"]]))
frame2[["PropertyCropDamage"]] <-
        as.numeric(as.character(frame2[["PropertyDamage"]])) +
        as.numeric(as.character(frame2[["CropDamage"]]))


# Create New Frame with the counts of Variables
## One row per event type, each measure summed over all records of that type.
frame3 <- ddply(
        .data = frame2,
        .variables = "Event",
        .fun = summarize,
        Fatality = sum(Fatality),
        Injury = sum(Injury),
        FatalityInjury = sum(FatalityInjury),
        PropertyDamage = sum(PropertyDamage),
        CropDamage = sum(CropDamage),
        PropertyCropDamage = sum(PropertyCropDamage)
)

We show the size of the new database.

# Explore Data frame
# dim() reports the number of rows and columns of the aggregated data frame
dim(frame3)
## [1] 985   7

We show the name of the variables in the new database.

# List the column names of the aggregated data frame
names(frame3)
## [1] "Event"              "Fatality"           "Injury"            
## [4] "FatalityInjury"     "PropertyDamage"     "CropDamage"        
## [7] "PropertyCropDamage"

We show a part of the new database we created to solve the problem.

# Preview rows 5 through 10 of the aggregated data frame
frame3[5:10,]
##                     Event Fatality Injury FatalityInjury PropertyDamage
## 5    ACCUMULATED SNOWFALL        0      0              0              0
## 6     AGRICULTURAL FREEZE        0      0              0              0
## 7           APACHE COUNTY        0      0              0           5000
## 8  ASTRONOMICAL HIGH TIDE        0      0              0        9425000
## 9   ASTRONOMICAL LOW TIDE        0      0              0         320000
## 10               AVALANCE        1      0              1              0
##    CropDamage PropertyCropDamage
## 5           0                  0
## 6    28820000           28820000
## 7           0               5000
## 8           0            9425000
## 9           0             320000
## 10          0                  0

Results

Events Most Harmful to Population Health

We use injuries and fatalities as a representation of population health. Then we plot the results as bar charts.

# Plot population health based on injuries
## Ten event types with the highest injury totals. head() returns at most ten
## rows, whereas indexing with 1:10 pads a shorter table with NA rows. Injury
## is already a numeric sum, so it is ordered directly.
injuries <- head(frame3[order(frame3$Injury, decreasing = TRUE), ], 10)
## reorder() sorts the event axis by injury total and coord_flip() turns the
## bars sideways so the event names are readable.
g1 <- ggplot(data = injuries, aes(x = reorder(Event, Injury), y = Injury)) +
        geom_bar(fill = "olivedrab", stat = "identity") +
        coord_flip() +
        xlab("Event Type") +
        ylab("Total Number of Injuries") +
        theme(legend.position = "none")


# Plot population health based on fatalities
## Ten event types with the highest fatality totals. head() returns at most ten
## rows, whereas indexing with 1:10 pads a shorter table with NA rows. Fatality
## is already a numeric sum, so it is ordered directly.
fatalities <- head(frame3[order(frame3$Fatality, decreasing = TRUE), ], 10)
## reorder() sorts the event axis by fatality total and coord_flip() turns the
## bars sideways so the event names are readable.
g2 <- ggplot(data = fatalities, aes(x = reorder(Event, Fatality), y = Fatality)) +
        geom_bar(fill = "orange", stat = "identity") +
        coord_flip() +
        xlab("Event Type") +
        ylab("Total Number of Fatalities") +
        theme(legend.position = "none")


# Plot population health based on injuries and fatalities
## Ten event types with the highest combined casualty totals. head() returns at
## most ten rows, whereas indexing with 1:10 pads a shorter table with NA rows.
## FatalityInjury is already a numeric sum, so it is ordered directly.
fatalitiesInjurities <- head(
        frame3[order(frame3$FatalityInjury, decreasing = TRUE), ],
        10
)
## This chart carries the title because it is drawn first when the three charts
## are stacked with grid.arrange(g3, g2, g1, nrow = 3).
g3 <- ggplot(
        data = fatalitiesInjurities,
        aes(x = reorder(Event, FatalityInjury), y = FatalityInjury)
) +
        geom_bar(fill = "red4", stat = "identity") +
        coord_flip() +
        xlab("Event Type") +
        ylab("Total Number of Fatalities and Injuries") +
        ggtitle("Top 10 Most Harmful Events to Population Health") +
        theme(legend.position = "none")

Plot Events Most Harmful to Population Health

# Stack the three charts in a single column, in the order g3, g2, g1
grid.arrange(g3, g2, g1, nrow = 3)
Top 10 most harmful events based on injuries, fatalities and the sum of both.

Top 10 most harmful events based on injuries, fatalities and the sum of both.

Table of Events Most Harmful to Population Health

We print the top 10 events most harmful in a table.

# Side-by-side ranking table: one column of event names per health measure,
# with row i holding the i-th ranked event type under each measure.
populationDamage <- data.frame(
        Injuries = as.character(injuries[["Event"]]),
        Fatalities = as.character(fatalities[["Event"]]),
        FatalitiesInjuries = as.character(fatalitiesInjurities[["Event"]]),
        Ranking = seq_len(10)
)

populationDamage
##             Injuries     Fatalities FatalitiesInjuries Ranking
## 1            TORNADO EXCESSIVE HEAT            TORNADO       1
## 2              FLOOD        TORNADO     EXCESSIVE HEAT       2
## 3     EXCESSIVE HEAT    FLASH FLOOD              FLOOD       3
## 4          LIGHTNING           HEAT          LIGHTNING       4
## 5          TSTM WIND      LIGHTNING          TSTM WIND       5
## 6               HEAT          FLOOD               HEAT       6
## 7          ICE STORM    RIP CURRENT        FLASH FLOOD       7
## 8        FLASH FLOOD      TSTM WIND          ICE STORM       8
## 9  THUNDERSTORM WIND      HIGH WIND  THUNDERSTORM WIND       9
## 10      WINTER STORM      AVALANCHE       WINTER STORM      10

Events with Greatest Economic Consequences

We use property damage and crop damage as a representation of economic consequences. Then we plot the results as bar charts.

# Plot economic consequences based on property damage
## Ten event types with the highest property damage. head() returns at most ten
## rows, whereas indexing with 1:10 pads a shorter table with NA rows.
## PropertyDamage is already a numeric sum, so it is ordered directly.
properties <- head(frame3[order(frame3$PropertyDamage, decreasing = TRUE), ], 10)
## Amounts are divided by 1,000,000 so the axis reads in millions of dollars.
g1 <- ggplot(
        data = properties,
        aes(x = reorder(Event, PropertyDamage), y = PropertyDamage / 1000000)
) +
        geom_bar(fill = "olivedrab", stat = "identity") +
        coord_flip() +
        xlab("Event Type") +
        ylab("Property Damage in Millions of Dollars") +
        theme(legend.position = "none")


# Plot economic consequences based on crop damage
## Ten event types with the highest crop damage. head() returns at most ten
## rows, whereas indexing with 1:10 pads a shorter table with NA rows.
## CropDamage is already a numeric sum, so it is ordered directly.
crops <- head(frame3[order(frame3$CropDamage, decreasing = TRUE), ], 10)
## Amounts are divided by 1,000,000 so the axis reads in millions of dollars.
g2 <- ggplot(
        data = crops,
        aes(x = reorder(Event, CropDamage), y = CropDamage / 1000000)
) +
        geom_bar(fill = "orange", stat = "identity") +
        coord_flip() +
        xlab("Event Type") +
        ylab("Crop Damage in Millions of Dollars") +
        theme(legend.position = "none")


# Plot economic consequences based on property and crop damage
## Ten event types with the highest combined damage. head() returns at most ten
## rows, whereas indexing with 1:10 pads a shorter table with NA rows.
## PropertyCropDamage is already a numeric sum, so it is ordered directly.
propertiesCrops <- head(
        frame3[order(frame3$PropertyCropDamage, decreasing = TRUE), ],
        10
)
## Amounts are divided by 1,000,000 so the axis reads in millions of dollars.
## This chart carries the title because it is drawn first when the three charts
## are stacked with grid.arrange(g3, g2, g1, nrow = 3).
g3 <- ggplot(
        data = propertiesCrops,
        aes(x = reorder(Event, PropertyCropDamage), y = PropertyCropDamage / 1000000)
) +
        geom_bar(fill = "red4", stat = "identity") +
        coord_flip() +
        xlab("Event Type") +
        ylab("Property and Crop Damage in Millions of Dollars") +
        ggtitle("Top 10 Events with Greatest Economic Consequences") +
        theme(legend.position = "none")

Plot Events with Greatest Economic Consequences

# Plot all together
# Stack the three charts in a single column, in the order g3, g2, g1
grid.arrange(g3, g2, g1, nrow = 3)
Top 10 events with greatest economic consequences based on property damage, crop damage and the sum of both.

Top 10 events with greatest economic consequences based on property damage, crop damage and the sum of both.

Table of Events with Greatest Economic Consequences

We print the top 10 events with greatest economic consequences.

# Side-by-side ranking table: one column of event names per damage measure,
# with row i holding the i-th ranked event type under each measure.
economicDamage <- data.frame(
        PropertyDamage = as.character(properties[["Event"]]),
        CropDamage = as.character(crops[["Event"]]),
        PropertyCropDamage = as.character(propertiesCrops[["Event"]]),
        Ranking = seq_len(10)
)

economicDamage
##       PropertyDamage        CropDamage PropertyCropDamage Ranking
## 1              FLOOD           DROUGHT              FLOOD       1
## 2  HURRICANE/TYPHOON             FLOOD  HURRICANE/TYPHOON       2
## 3        STORM SURGE       RIVER FLOOD        STORM SURGE       3
## 4            TORNADO         ICE STORM            TORNADO       4
## 5        FLASH FLOOD              HAIL               HAIL       5
## 6               HAIL         HURRICANE        FLASH FLOOD       6
## 7          HURRICANE HURRICANE/TYPHOON            DROUGHT       7
## 8     TROPICAL STORM       FLASH FLOOD          HURRICANE       8
## 9       WINTER STORM      EXTREME COLD        RIVER FLOOD       9
## 10         HIGH WIND      FROST/FREEZE          ICE STORM      10