Synopsis

The Storm Data undertaken in this analysis consists of different variables pertaining to storms. The objectives of the following analysis would be to analyse and create corresponding graphs that could answer questions pertaining to the type of events that can be generally considered harmful towards the population's health Another thing the analysis is trying to answer, is finding the economic impacts the different types of storms have caused towards the US economic market. This, in turn would provide some insights about possible preventative measures that can be undertaken when tied this analysis together with the measures undertaken after these events, and comparing the results.

Data Processing

The data for this assignment come in the form of a comma-separated-value file compressed via the bzip2 algorithm to reduce its size. You can download the file from the course web site:

Dataset: Storm Data [47Mb]

There is also some documentation of the database available. Here you will find how some of the variables are constructed/defined.

The events in the database start in the year 1950 and end in November 2011. In the earlier years of the database there are generally fewer events recorded, most likely due to a lack of good records. More recent years should be considered more complete.

# Load Libraries
library(ggplot2)
library(RColorBrewer)
library(dplyr)
library(lattice)
library(gridExtra)

# Download and unzip the data
fileurl = 'https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2'
if (!file.exists('./NOAA_Storm')){
  download.file(fileurl,'./NOAA_Storm.csv.bz')
  dateDownloaded <- date()
}



NOAA_raw <- read.csv(bzfile('./NOAA_Storm.csv.bz'), stringsAsFactors = FALSE)

# Getting a peak at the dataset
head(NOAA_raw[1:10,1:8])
##   STATE__           BGN_DATE BGN_TIME TIME_ZONE COUNTY COUNTYNAME STATE
## 1       1  4/18/1950 0:00:00     0130       CST     97     MOBILE    AL
## 2       1  4/18/1950 0:00:00     0145       CST      3    BALDWIN    AL
## 3       1  2/20/1951 0:00:00     1600       CST     57    FAYETTE    AL
## 4       1   6/8/1951 0:00:00     0900       CST     89    MADISON    AL
## 5       1 11/15/1951 0:00:00     1500       CST     43    CULLMAN    AL
## 6       1 11/15/1951 0:00:00     2000       CST     77 LAUDERDALE    AL
##    EVTYPE
## 1 TORNADO
## 2 TORNADO
## 3 TORNADO
## 4 TORNADO
## 5 TORNADO
## 6 TORNADO
str(NOAA_raw)
## 'data.frame':    902297 obs. of  37 variables:
##  $ STATE__   : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ BGN_DATE  : chr  "4/18/1950 0:00:00" "4/18/1950 0:00:00" "2/20/1951 0:00:00" "6/8/1951 0:00:00" ...
##  $ BGN_TIME  : chr  "0130" "0145" "1600" "0900" ...
##  $ TIME_ZONE : chr  "CST" "CST" "CST" "CST" ...
##  $ COUNTY    : num  97 3 57 89 43 77 9 123 125 57 ...
##  $ COUNTYNAME: chr  "MOBILE" "BALDWIN" "FAYETTE" "MADISON" ...
##  $ STATE     : chr  "AL" "AL" "AL" "AL" ...
##  $ EVTYPE    : chr  "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
##  $ BGN_RANGE : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ BGN_AZI   : chr  "" "" "" "" ...
##  $ BGN_LOCATI: chr  "" "" "" "" ...
##  $ END_DATE  : chr  "" "" "" "" ...
##  $ END_TIME  : chr  "" "" "" "" ...
##  $ COUNTY_END: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ COUNTYENDN: logi  NA NA NA NA NA NA ...
##  $ END_RANGE : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ END_AZI   : chr  "" "" "" "" ...
##  $ END_LOCATI: chr  "" "" "" "" ...
##  $ LENGTH    : num  14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
##  $ WIDTH     : num  100 150 123 100 150 177 33 33 100 100 ...
##  $ F         : int  3 2 2 2 2 2 2 1 3 3 ...
##  $ MAG       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ FATALITIES: num  0 0 0 0 0 0 0 0 1 0 ...
##  $ INJURIES  : num  15 0 2 2 2 6 1 0 14 0 ...
##  $ PROPDMG   : num  25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
##  $ PROPDMGEXP: chr  "K" "K" "K" "K" ...
##  $ CROPDMG   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CROPDMGEXP: chr  "" "" "" "" ...
##  $ WFO       : chr  "" "" "" "" ...
##  $ STATEOFFIC: chr  "" "" "" "" ...
##  $ ZONENAMES : chr  "" "" "" "" ...
##  $ LATITUDE  : num  3040 3042 3340 3458 3412 ...
##  $ LONGITUDE : num  8812 8755 8742 8626 8642 ...
##  $ LATITUDE_E: num  3051 0 0 0 0 ...
##  $ LONGITUDE_: num  8806 0 0 0 0 ...
##  $ REMARKS   : chr  "" "" "" "" ...
##  $ REFNUM    : num  1 2 3 4 5 6 7 8 9 10 ...

Question 1: Across the United States, which types of events (as indicated in the EVTYPE variable are most harmful with respect to population health?

We can notice two variables: Injuries and Fatalities for all events, so it would be advisable to plot graphs that would show how much injuries and how much fatalities each type of event as an aggregate had caused. Nevertheless, as this analysis requires at most 3 figures, I decided to go with fatalities as the one producing the most harm towards the population

# Total Affected
NOAA_copy <- NOAA_raw
NOAA_copy$TotalAFff <- NOAA_raw$FATALITIES + NOAA_raw$INJURIES

# Get the year out of the date time
NOAA_copy$DATE <- NA
NOAA_copy$DATE <- substr(NOAA_raw$BGN_DATE, 1, nchar(NOAA_raw$BGN_DATE)-8)
NOAA_copy$DATE <- as.Date(parse_date(NOAA_copy$DATE, "%m/%d/%Y"))
NOAA_copy$YEAR <- year(NOAA_copy$DATE)

NOAA_table <- data.table(NOAA_copy)
NOAA_table$EVTYPE <- toupper(NOAA_table$EVTYPE)

# There are rows indicating summary statistics as EVTYPE; This is dirty data
NOAA_table <- NOAA_table[!like(EVTYPE, "SUMMARY")]


# Many event types are with different names, but are pertaining to the same type of fatality, we need to clean this column before aggregating the data
NOAA_table$EVTYPE <- gsub(".*(WIND).*", "WIND", NOAA_table$EVTYPE)
NOAA_table$EVTYPE <- gsub(".*(VOLCANIC).*", "VOLCANO", NOAA_table$EVTYPE)
NOAA_table$EVTYPE <- gsub(".*(SNOW).*", "SNOW", NOAA_table$EVTYPE)
NOAA_table$EVTYPE <- gsub(".*(HURRICANE).*", "HURRICANE", NOAA_table$EVTYPE)
NOAA_table$EVTYPE <- gsub(".*(THUNDERSTORM).*", "THUNDERSTORM", NOAA_table$EVTYPE)
NOAA_table$EVTYPE <- gsub(".*(HAIL).*", "HAIL", NOAA_table$EVTYPE)
NOAA_table$EVTYPE <- gsub(".*(TORNADO).*", "TORNADO", NOAA_table$EVTYPE)
NOAA_table$EVTYPE <- gsub(".*(STORM).*", "STORM", NOAA_table$EVTYPE)
NOAA_table$EVTYPE <- gsub(".*(RAIN).*", "RAIN", NOAA_table$EVTYPE)
NOAA_table$EVTYPE <- gsub(".*(FLOOD).*", "FLOOD", NOAA_table$EVTYPE)
NOAA_table$EVTYPE <- gsub(".*(MICROBURST).*", "MICROBURST", NOAA_table$EVTYPE)
NOAA_table$EVTYPE <- gsub(".*(LIGHTNING).*", "LIGHTNING", NOAA_table$EVTYPE)
NOAA_table$EVTYPE <- gsub(".*(HEAVY SHOWER).*", "HEAVY SHOWER", NOAA_table$EVTYPE)
NOAA_table$EVTYPE <- gsub(".*(EXTREME).*", "EXTREME TEMPERATURE", NOAA_table$EVTYPE)
NOAA_table$EVTYPE <- gsub(".*(TEMPERATURE).*", "EXTREME TEMPERATURE", NOAA_table$EVTYPE)
NOAA_table$EVTYPE <- as.factor(NOAA_table$EVTYPE)
NOAA_table$DECADE <- as.factor(NOAA_table$YEAR - NOAA_table$YEAR%%10)
NOAA_table$DECADE <- as.factor(substr(NOAA_table$DECADE, 3, 4))





# Aggregation per fatality, injury and total (sum of fatalities + injuries)
Fatal_Agg <- aggregate(FATALITIES~EVTYPE, data=NOAA_table, sum)

Results

# The worst disasters according to how many fatalities there were
Fatal_Agg[order(-Fatal_Agg[,2]),][1:10,1:2]
##                  EVTYPE FATALITIES
## 209             TORNADO       5636
## 59       EXCESSIVE HEAT       1903
## 68                FLOOD       1524
## 266                WIND       1451
## 93                 HEAT        937
## 142           LIGHTNING        817
## 208               STORM        422
## 188         RIP CURRENT        368
## 65  EXTREME TEMPERATURE        268
## 12            AVALANCHE        224
worst_disasters <- Fatal_Agg[order(-Fatal_Agg[,2]),][1:5,1]


cleanFatalities <- filter(NOAA_table, EVTYPE %in% worst_disasters)
Fatal_Decade <- aggregate(FATALITIES~EVTYPE+DECADE, data=CleanFatalities, sum)
## Error in eval(m$data, parent.frame()): object 'CleanFatalities' not found
# The analysis could also be done with injurred or with total, and different results can be seen as determining as worst disasters
# Injur_Agg <- aggregate(INJURIES~DECADE+EVTYPE, data=NOAA_table, sum)
# Total_Agg <- aggregate(INJURIES~DECADE+EVTYPE, data=NOAA_table, sum)

ggplot(data=Fatal_Decade[Fatal_Decade$EVTYPE %in% c('TORNADO', "EXCESSIVE HEAT", "FLOOD", "WIND", "HEAT"),], aes(DECADE, FATALITIES, fill=FATALITIES)) + geom_bar(stat = "identity", width = 0.7) + facet_grid(.~EVTYPE) + scale_x_discrete(limits=c("50", "60", "70", "80", "90", "00", "10")) + labs(x = "Decade", y = "Fatalities", title = "Decade comparisons of worst fatalities") + theme(axis.text.x = element_text(angle=70, hjust=1))  + scale_fill_gradient(low = "#56B1F7", high = "#132B43")

plot of chunk Answer 1 Section

Question 2: Across the United States, which types of events have the greatest economic consequences?

The events causing the greatest economic consequences would be the property damages or crop damage. These two are comparable monetarily, so a total damage column is created as a response.

# The worst disasters according to how many fatalities there were
NOAA_table$TotalDMG <- NOAA_table$CROPDMG + NOAA_table$PROPDMG;

DMGS_Agg <- aggregate(TotalDMG~EVTYPE, data=NOAA_table, sum)

# The worst damage divided by events
DMGS_Agg[order(-DMGS_Agg[,2]),][1:10,1:2]
##               EVTYPE   TotalDMG
## 266             WIND 3358545.10
## 209          TORNADO 3315774.68
## 68             FLOOD 2798047.53
## 90              HAIL 1270729.14
## 142        LIGHTNING  606967.39
## 208            STORM  297062.99
## 206             SNOW  154375.15
## 264         WILDFIRE   88823.54
## 168             RAIN   71242.74
## 262 WILD/FOREST FIRE   43534.49
worst_damages <- DMGS_Agg[order(-Fatal_Agg[,2]),][1:5,1]


cleanDamages <- filter(NOAA_table, EVTYPE %in% worst_damages)
Damage_Decade <- aggregate(TotalDMG~EVTYPE+DECADE, data=NOAA_table, sum)

Results

ggplot(data=Damage_Decade[Damage_Decade$EVTYPE %in% worst_damages,], aes(DECADE, TotalDMG, fill=TotalDMG)) + geom_bar(stat = "identity", width = 0.7) + facet_grid(.~EVTYPE) + scale_x_discrete(limits=c("50", "60", "70", "80", "90", "00", "10")) + labs(x = "Decade", y = "Fatalities", title = "Decade comparisons of worst damages per event type") + theme(axis.text.x = element_text(angle=70, hjust=1)) + scale_fill_gradient(low = "#56B1F7", high = "#132B43")

plot of chunk Answer 2 Results