The Storm Data undertaken in this analysis consists of different variables pertaining to storms. The objectives of the following analysis would be to analyse and create corresponding graphs that could answer questions pertaining to the type of events that can be generally considered harmful towards the population's health Another thing the analysis is trying to answer, is finding the economic impacts the different types of storms have caused towards the US economic market. This, in turn would provide some insights about possible preventative measures that can be undertaken when tied this analysis together with the measures undertaken after these events, and comparing the results.
The data for this assignment come in the form of a comma-separated-value file compressed via the bzip2 algorithm to reduce its size. You can download the file from the course web site:
Dataset: Storm Data [47Mb]
There is also some documentation of the database available. Here you will find how some of the variables are constructed/defined.
The events in the database start in the year 1950 and end in November 2011. In the earlier years of the database there are generally fewer events recorded, most likely due to a lack of good records. More recent years should be considered more complete.
# Load Libraries
library(ggplot2)
library(RColorBrewer)
library(dplyr)
library(lattice)
library(gridExtra)
# Download and unzip the data
fileurl = 'https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2'
if (!file.exists('./NOAA_Storm')){
download.file(fileurl,'./NOAA_Storm.csv.bz')
dateDownloaded <- date()
}
NOAA_raw <- read.csv(bzfile('./NOAA_Storm.csv.bz'), stringsAsFactors = FALSE)
# Getting a peak at the dataset
head(NOAA_raw[1:10,1:8])
## STATE__ BGN_DATE BGN_TIME TIME_ZONE COUNTY COUNTYNAME STATE
## 1 1 4/18/1950 0:00:00 0130 CST 97 MOBILE AL
## 2 1 4/18/1950 0:00:00 0145 CST 3 BALDWIN AL
## 3 1 2/20/1951 0:00:00 1600 CST 57 FAYETTE AL
## 4 1 6/8/1951 0:00:00 0900 CST 89 MADISON AL
## 5 1 11/15/1951 0:00:00 1500 CST 43 CULLMAN AL
## 6 1 11/15/1951 0:00:00 2000 CST 77 LAUDERDALE AL
## EVTYPE
## 1 TORNADO
## 2 TORNADO
## 3 TORNADO
## 4 TORNADO
## 5 TORNADO
## 6 TORNADO
str(NOAA_raw)
## 'data.frame': 902297 obs. of 37 variables:
## $ STATE__ : num 1 1 1 1 1 1 1 1 1 1 ...
## $ BGN_DATE : chr "4/18/1950 0:00:00" "4/18/1950 0:00:00" "2/20/1951 0:00:00" "6/8/1951 0:00:00" ...
## $ BGN_TIME : chr "0130" "0145" "1600" "0900" ...
## $ TIME_ZONE : chr "CST" "CST" "CST" "CST" ...
## $ COUNTY : num 97 3 57 89 43 77 9 123 125 57 ...
## $ COUNTYNAME: chr "MOBILE" "BALDWIN" "FAYETTE" "MADISON" ...
## $ STATE : chr "AL" "AL" "AL" "AL" ...
## $ EVTYPE : chr "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
## $ BGN_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ BGN_AZI : chr "" "" "" "" ...
## $ BGN_LOCATI: chr "" "" "" "" ...
## $ END_DATE : chr "" "" "" "" ...
## $ END_TIME : chr "" "" "" "" ...
## $ COUNTY_END: num 0 0 0 0 0 0 0 0 0 0 ...
## $ COUNTYENDN: logi NA NA NA NA NA NA ...
## $ END_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ END_AZI : chr "" "" "" "" ...
## $ END_LOCATI: chr "" "" "" "" ...
## $ LENGTH : num 14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
## $ WIDTH : num 100 150 123 100 150 177 33 33 100 100 ...
## $ F : int 3 2 2 2 2 2 2 1 3 3 ...
## $ MAG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ FATALITIES: num 0 0 0 0 0 0 0 0 1 0 ...
## $ INJURIES : num 15 0 2 2 2 6 1 0 14 0 ...
## $ PROPDMG : num 25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
## $ PROPDMGEXP: chr "K" "K" "K" "K" ...
## $ CROPDMG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CROPDMGEXP: chr "" "" "" "" ...
## $ WFO : chr "" "" "" "" ...
## $ STATEOFFIC: chr "" "" "" "" ...
## $ ZONENAMES : chr "" "" "" "" ...
## $ LATITUDE : num 3040 3042 3340 3458 3412 ...
## $ LONGITUDE : num 8812 8755 8742 8626 8642 ...
## $ LATITUDE_E: num 3051 0 0 0 0 ...
## $ LONGITUDE_: num 8806 0 0 0 0 ...
## $ REMARKS : chr "" "" "" "" ...
## $ REFNUM : num 1 2 3 4 5 6 7 8 9 10 ...
We can notice two variables: Injuries and Fatalities for all events, so it would be advisable to plot graphs that would show how much injuries and how much fatalities each type of event as an aggregate had caused. Nevertheless, as this analysis requires at most 3 figures, I decided to go with fatalities as the one producing the most harm towards the population
# Total Affected
NOAA_copy <- NOAA_raw
NOAA_copy$TotalAFff <- NOAA_raw$FATALITIES + NOAA_raw$INJURIES
# Get the year out of the date time
NOAA_copy$DATE <- NA
NOAA_copy$DATE <- substr(NOAA_raw$BGN_DATE, 1, nchar(NOAA_raw$BGN_DATE)-8)
NOAA_copy$DATE <- as.Date(parse_date(NOAA_copy$DATE, "%m/%d/%Y"))
NOAA_copy$YEAR <- year(NOAA_copy$DATE)
NOAA_table <- data.table(NOAA_copy)
NOAA_table$EVTYPE <- toupper(NOAA_table$EVTYPE)
# There are rows indicating summary statistics as EVTYPE; This is dirty data
NOAA_table <- NOAA_table[!like(EVTYPE, "SUMMARY")]
# Many event types are with different names, but are pertaining to the same type of fatality, we need to clean this column before aggregating the data
NOAA_table$EVTYPE <- gsub(".*(WIND).*", "WIND", NOAA_table$EVTYPE)
NOAA_table$EVTYPE <- gsub(".*(VOLCANIC).*", "VOLCANO", NOAA_table$EVTYPE)
NOAA_table$EVTYPE <- gsub(".*(SNOW).*", "SNOW", NOAA_table$EVTYPE)
NOAA_table$EVTYPE <- gsub(".*(HURRICANE).*", "HURRICANE", NOAA_table$EVTYPE)
NOAA_table$EVTYPE <- gsub(".*(THUNDERSTORM).*", "THUNDERSTORM", NOAA_table$EVTYPE)
NOAA_table$EVTYPE <- gsub(".*(HAIL).*", "HAIL", NOAA_table$EVTYPE)
NOAA_table$EVTYPE <- gsub(".*(TORNADO).*", "TORNADO", NOAA_table$EVTYPE)
NOAA_table$EVTYPE <- gsub(".*(STORM).*", "STORM", NOAA_table$EVTYPE)
NOAA_table$EVTYPE <- gsub(".*(RAIN).*", "RAIN", NOAA_table$EVTYPE)
NOAA_table$EVTYPE <- gsub(".*(FLOOD).*", "FLOOD", NOAA_table$EVTYPE)
NOAA_table$EVTYPE <- gsub(".*(MICROBURST).*", "MICROBURST", NOAA_table$EVTYPE)
NOAA_table$EVTYPE <- gsub(".*(LIGHTNING).*", "LIGHTNING", NOAA_table$EVTYPE)
NOAA_table$EVTYPE <- gsub(".*(HEAVY SHOWER).*", "HEAVY SHOWER", NOAA_table$EVTYPE)
NOAA_table$EVTYPE <- gsub(".*(EXTREME).*", "EXTREME TEMPERATURE", NOAA_table$EVTYPE)
NOAA_table$EVTYPE <- gsub(".*(TEMPERATURE).*", "EXTREME TEMPERATURE", NOAA_table$EVTYPE)
NOAA_table$EVTYPE <- as.factor(NOAA_table$EVTYPE)
NOAA_table$DECADE <- as.factor(NOAA_table$YEAR - NOAA_table$YEAR%%10)
NOAA_table$DECADE <- as.factor(substr(NOAA_table$DECADE, 3, 4))
# Aggregation per fatality, injury and total (sum of fatalities + injuries)
Fatal_Agg <- aggregate(FATALITIES~EVTYPE, data=NOAA_table, sum)
# The worst disasters according to how many fatalities there were
Fatal_Agg[order(-Fatal_Agg[,2]),][1:10,1:2]
## EVTYPE FATALITIES
## 209 TORNADO 5636
## 59 EXCESSIVE HEAT 1903
## 68 FLOOD 1524
## 266 WIND 1451
## 93 HEAT 937
## 142 LIGHTNING 817
## 208 STORM 422
## 188 RIP CURRENT 368
## 65 EXTREME TEMPERATURE 268
## 12 AVALANCHE 224
worst_disasters <- Fatal_Agg[order(-Fatal_Agg[,2]),][1:5,1]
cleanFatalities <- filter(NOAA_table, EVTYPE %in% worst_disasters)
Fatal_Decade <- aggregate(FATALITIES~EVTYPE+DECADE, data=CleanFatalities, sum)
## Error in eval(m$data, parent.frame()): object 'CleanFatalities' not found
# The analysis could also be done with injurred or with total, and different results can be seen as determining as worst disasters
# Injur_Agg <- aggregate(INJURIES~DECADE+EVTYPE, data=NOAA_table, sum)
# Total_Agg <- aggregate(INJURIES~DECADE+EVTYPE, data=NOAA_table, sum)
ggplot(data=Fatal_Decade[Fatal_Decade$EVTYPE %in% c('TORNADO', "EXCESSIVE HEAT", "FLOOD", "WIND", "HEAT"),], aes(DECADE, FATALITIES, fill=FATALITIES)) + geom_bar(stat = "identity", width = 0.7) + facet_grid(.~EVTYPE) + scale_x_discrete(limits=c("50", "60", "70", "80", "90", "00", "10")) + labs(x = "Decade", y = "Fatalities", title = "Decade comparisons of worst fatalities") + theme(axis.text.x = element_text(angle=70, hjust=1)) + scale_fill_gradient(low = "#56B1F7", high = "#132B43")
The events causing the greatest economic consequences would be the property damages or crop damage. These two are comparable monetarily, so a total damage column is created as a response.
# The worst disasters according to how many fatalities there were
NOAA_table$TotalDMG <- NOAA_table$CROPDMG + NOAA_table$PROPDMG;
DMGS_Agg <- aggregate(TotalDMG~EVTYPE, data=NOAA_table, sum)
# The worst damage divided by events
DMGS_Agg[order(-DMGS_Agg[,2]),][1:10,1:2]
## EVTYPE TotalDMG
## 266 WIND 3358545.10
## 209 TORNADO 3315774.68
## 68 FLOOD 2798047.53
## 90 HAIL 1270729.14
## 142 LIGHTNING 606967.39
## 208 STORM 297062.99
## 206 SNOW 154375.15
## 264 WILDFIRE 88823.54
## 168 RAIN 71242.74
## 262 WILD/FOREST FIRE 43534.49
worst_damages <- DMGS_Agg[order(-Fatal_Agg[,2]),][1:5,1]
cleanDamages <- filter(NOAA_table, EVTYPE %in% worst_damages)
Damage_Decade <- aggregate(TotalDMG~EVTYPE+DECADE, data=NOAA_table, sum)
ggplot(data=Damage_Decade[Damage_Decade$EVTYPE %in% worst_damages,], aes(DECADE, TotalDMG, fill=TotalDMG)) + geom_bar(stat = "identity", width = 0.7) + facet_grid(.~EVTYPE) + scale_x_discrete(limits=c("50", "60", "70", "80", "90", "00", "10")) + labs(x = "Decade", y = "Fatalities", title = "Decade comparisons of worst damages per event type") + theme(axis.text.x = element_text(angle=70, hjust=1)) + scale_fill_gradient(low = "#56B1F7", high = "#132B43")