library(dplyr, quietly=T)
##
## Attaching package: 'dplyr'
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2, quietly=T)
library(RCurl, quietly=T)
library(lubridate, quietly=T)
library(stringr, quietly=T)
The basic goal of this assignment is to explore the NOAA Storm Database and to answer some basic questions about severe weather events. By exploring weather events we will be able to explain its impact on the U.S. economy as well as losses to human beings in terms of lives and injuries.
Data is in the form of a compressed file bz2 and cvs format located here below:
We must use the database to answer some questions and show the code we have used for the entire analysis. The analysis will consist of tables, figures, or other summaries. We will use several R packages to support our analysis.
The data for this assignment in the form of a comma-separated-value file is compressed via the bzip2 algorithm to reduce its size. We have downloaded the file from the course web site as indicated above.
# Data Input
# Loading the Storm database file
Url <-"https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
sdfile <- "StormData.csv.bz2"
if (!file.exists(sdfile)) {
download.file(Url, destfile = sdfile, quiet=T, method = "libcurl")
}
sd <- tbl_df(read.table(file = sdfile, stringsAsFactors = F, sep = ",",
header = T))
# We are only interested on registered events in USA territory
usstates <- c("AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "FL", "GA", "HI",
"ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD", "MA", "MI",
"MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ", "NM", "NY", "NC",
"ND", "OH", "OK", "OR", "PA", "RI", "SC", "SD", "TN", "TX", "UT",
"VT", "VA", "WA", "WV", "WI", "WY")
sd1 <- select(sd, BGN_DATE, STATE, EVTYPE, FATALITIES, INJURIES, PROPDMG,
PROPDMGEXP) %>%
filter(STATE %in% usstates) %>%
mutate(BGN_DATE = as.Date(BGN_DATE, "%m/%d/%Y"))
# Investigating data quality issues
# We will select only those records were these conditions are met:
# (1) FATALITIES != 0 or
# (2) INJURIES != 0 or
# (3) PROPDMG != 0
sd1 <- sd1 %>% filter(FATALITIES != 0 | INJURIES != 0 | PROPDMG != 0)
# Review units on PROPDMGEXP - We need to confirm the diversity of units
# used on this database to see if we can fix it.
unique(sd1$PROPDMGEXP)
# Convert all unit to upper-case
sd1$PROPDMGEXP <- toupper(sd1$PROPDMGEXP)
# We will eliminate all non-proper units on PROPDMGEXP
# We do not have access to the originator of the data
# In a real situation we should investigate more and find out
# what kind of units are those 0's, 1's, ... etc.
sd1 <- sd1 %>% filter(!(PROPDMGEXP %in%
c("0", "1", "2", "3", "4", "5", "6", "7", "8",
"", "+", "-")
)) %>%
select(BGN_DATE, STATE, EVTYPE, FATALITIES, INJURIES, PROPDMG,
PROPDMGEXP)
# We create a function to convert all units to "M"
# We will normalize all units to the Million represented by "M"
toMillion <- function(x) {
as.numeric(unlist(c(H=c(1e-06), K=c(1e-03), M=c(1), B=c(1e+03))[x]))
}
# Converting all units = ["H","K","B"] to "M" for Million
sd1$PROPDMG <- sd1$PROPDMG*toMillion(sd1$PROPDMGEXP)
sd1$PROPDMGEXP <- "M"
# Investigating cardinality of EVTYPE and clean it up if needed
sort(unique(sd1$EVTYPE))
There are 384 types of Events. We will create a new meta-category to group similar Events which appear in this case to be the kind of the same type.
# Coverting all Events to capital letter
sd1$EVTYPE <- toupper(sd1$EVTYPE)
# We observe many rows with blank spaces in the EVTYPE column, so we need to
# trim them up
# Function to trim all blanks on EVTYPE
trim <- function (x) gsub("(^ +)|( +$)", "", toupper(x))
sd1$EVTYPE <- trim(sd1$EVTYPE)
# Creating a Meta-Category based on EVTYPE to minimize number of different
# types of EVTYPE as they appear to have many typos
sd1$METAEVTYPE <- sd1$EVTYPE
# Changing all variants of Heat
sd1$METAEVTYPE[grep("(^HEAT)|(HEAT$)|
(^DROUGHT)|(^DRY)|(^DOWNBURST)",
sd1$METAEVTYPE)] <- "HEAT"
# Changing all variants of TSTM or THUNDER to THUNDERSTORM WIND
sd1$METAEVTYPE[grep("(^TST)|(^THUN)|(^THU)|(^TROP)|(RAIN)|
(^WIND)|(WIND$)|(WINDS$)|(WIND)|
(^WINT)|(^DUST)|(DUST$)|(PRECIPIT)|
(STORM$)|(^STORM)|(STORM)|
(^GUSTNADO)|(^TIDE)|(TIDE$)",
sd1$METAEVTYPE)] <- "STORM"
# Changing all variants of Wildfire
sd1$METAEVTYPE[grep("(^WIL)|(FOG)|
(SMOKE)|(FIRES$)", sd1$METAEVTYPE)] <- "FIRES"
# Changing all variants of Tornado
sd1$METAEVTYPE[grep("(^TORND)|(^TORNAD)|(^WATE)|
(^FUNNEL)|(TURBULE)", sd1$METAEVTYPE)] <- "TORNADO"
# Changing all variants of Urban Flooding
sd1$METAEVTYPE[grep("(^URB)|(^FLASH)|(^FLOOD)|
(^COASTAL)|(CSTL)|(HEAVY )|(^MARINE)|
(^RIVER)|(MUD)|(^LAND)|(^HIGH)|
(FLOOD$)|(FLOOD)|(^DAM)|
(SURF)|(^RIP)|(SURF$)", sd1$METAEVTYPE)] <- "FLOOD"
# Changing all variants of Hurricane
sd1$METAEVTYPE[grep("(^HURRIC)", sd1$METAEVTYPE)] <- "HURRICANE"
# Changing all variants of Snow
sd1$METAEVTYPE[grep("(^SNOW)|(SNOW$)|(^HEAVY SNOW)|
(^WINTRY)|(HAIL)|
(^WINTER)", sd1$METAEVTYPE)] <- "SNOW"
# Changing all variants of Lightning
sd1$METAEVTYPE[grep("(LIGHT)|(LIG)", sd1$METAEVTYPE)] <- "LIGHTNING"
# Changing all variants of Freezing Weather
sd1$METAEVTYPE[grep("(FREEZ)|(ICE)|(^COLD)|
(COLD)|(^ICY)|(COLD$)|
(^BLIZZARD)|(BLIZZARD$)|
(^GLAZE)", sd1$METAEVTYPE)] <- "FROST/FREEZE"
| BGN_DATE | STATE | EVTYPE | FATALITIES | INJURIES | PROPDMG | PROPDMGEXP | METAEVTYPE |
|---|---|---|---|---|---|---|---|
| 1950-04-18 | AL | TORNADO | 0 | 15 | 0.0250 | M | TORNADO |
| 1950-04-18 | AL | TORNADO | 0 | 0 | 0.0025 | M | TORNADO |
| 1951-02-20 | AL | TORNADO | 0 | 2 | 0.0250 | M | TORNADO |
| 1951-06-08 | AL | TORNADO | 0 | 2 | 0.0025 | M | TORNADO |
| 1951-11-15 | AL | TORNADO | 0 | 2 | 0.0025 | M | TORNADO |
| 1951-11-15 | AL | TORNADO | 0 | 6 | 0.0025 | M | TORNADO |
| 1951-11-16 | AL | TORNADO | 0 | 1 | 0.0025 | M | TORNADO |
| 1952-01-22 | AL | TORNADO | 0 | 0 | 0.0025 | M | TORNADO |
| 1952-02-13 | AL | TORNADO | 1 | 14 | 0.0250 | M | TORNADO |
| 1952-02-13 | AL | TORNADO | 0 | 0 | 0.0250 | M | TORNADO |
| 1952-02-13 | AL | TORNADO | 0 | 3 | 2.5000 | M | TORNADO |
| 1952-02-13 | AL | TORNADO | 0 | 3 | 2.5000 | M | TORNADO |
| 1952-02-13 | AL | TORNADO | 1 | 26 | 0.2500 | M | TORNADO |
| 1952-02-29 | AL | TORNADO | 0 | 12 | 0.0000 | M | TORNADO |
| 1952-03-03 | AL | TORNADO | 0 | 6 | 0.0250 | M | TORNADO |
| 1952-03-22 | AL | TORNADO | 4 | 50 | 0.0250 | M | TORNADO |
| 1952-04-04 | AL | TORNADO | 0 | 2 | 0.0250 | M | TORNADO |
| 1952-05-10 | AL | TORNADO | 0 | 0 | 0.0250 | M | TORNADO |
| 1952-05-10 | AL | TORNADO | 0 | 0 | 0.0250 | M | TORNADO |
| 1952-05-10 | AL | TORNADO | 0 | 0 | 0.0250 | M | TORNADO |
- Across the United States, the type of event (as indicated in the EVTYPE variable) most harmful with respect to population health is TORNADO: Number of Fatalities: 158 Number of Injuries : 1150
- Across the United States, the type of event having the greatest economic consequences is: Type of Event : FLOOD Economic Consequence: 1.1510^{5}
qplot(data = sd1, x = METAEVTYPE, y = PROPDMG, log = "y",
xlab = "Type of Event", ylab = "Property Damages (M$)") +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))