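This analysis is based on the data available in the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database. It addresses two questions: which types of severe weather events are most harmful to population health, and which have the greatest economic consequences.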
This section involves the following processes:
- Loading the data
- Preprocessing the data
- Defining the scope
- Subsetting the data
- Cleaning the data
- Further grouping the data for plotting the result
## Download the compressed storm data and load it into `data`.
## read.csv() decompresses bzip2 input transparently, so no separate
## extraction step is needed; unzip() only understands zip archives and
## fails on a .bz2 file.
url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
## Skip the download when a local copy already exists; delete
## data.csv.bz2 to force a fresh download.
if (!file.exists("data.csv.bz2")) {
  ## mode = "wb" keeps the binary archive intact on every platform
  download.file(url, "data.csv.bz2", mode = "wb")
}
data <- read.csv("data.csv.bz2")
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:dplyr':
##
## intersect, setdiff, union
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(stringdist)
library(ggplot2)
library(directlabels)
library(colorspace)
## Scope of the analysis: the 50 US states (two-letter codes) and the
## 48 event types of the Storm Data Event Table.
Ustates <- c(
  "AL", "AK", "AZ", "AR", "CA",
  "CO", "CT", "DE", "FL", "GA",
  "HI", "ID", "IL", "IN", "IA",
  "KS", "KY", "LA", "ME", "MD",
  "MA", "MI", "MN", "MS", "MO",
  "MT", "NE", "NV", "NH", "NJ",
  "NM", "NY", "NC", "ND", "OH",
  "OK", "OR", "PA", "RI", "SC",
  "SD", "TN", "TX", "UT", "VT",
  "VA", "WA", "WV", "WI", "WY"
)
## The element order of eventlist is kept as-is because the list is later
## used as the lookup table for approximate matching.
eventlist <- c(
  "Astronomical Low Tide", "Avalanche",
  "Blizzard", "Coastal Flood",
  "Cold/Wind Chill", "Debris Flow",
  "Dense Fog", "Dense Smoke",
  "Drought", "Dust Devil",
  "Dust Storm", "Excessive Heat",
  "Extreme Cold/Wind Chill", "Flash Flood",
  "Flood", "Frost/Freeze",
  "Funnel Cloud", "Freezing Fog",
  "Hail", "Heat",
  "Heavy Rain", "Heavy Snow",
  "High Surf", "High Wind",
  "Hurricane (Typhoon)", "Ice Storm",
  "Lake-Effect Snow", "Lakeshore Flood",
  "Lightning", "Marine Hail",
  "Marine High Wind", "Marine Strong Wind",
  "Marine Thunderstorm Wind", "Rip Current",
  "Seiche", "Sleet",
  "Storm Surge/Tide", "Strong Wind",
  "Thunderstorm Wind", "Tornado",
  "Tropical Depression", "Tropical Storm",
  "Tsunami", "Volcanic Ash",
  "Waterspout", "Wildfire",
  "Winter Storm", "Winter Weather"
)
## Parse the event start date; only the leading month/day/year part of
## BGN_DATE is consumed by the format string.
data$BGN_DATE <- as.Date(data$BGN_DATE, "%m/%d/%Y")
## Subset the data from 1 Jan 1996 (inclusive) across the 50 US states for all
## event types (since only limited events were recorded before 1996), keeping
## just the date, event type, casualty and damage columns.
## `>=` is used so that events starting on 1 Jan 1996 itself are retained, and
## the cut-off is an explicit Date rather than a string coerced on comparison.
newdata <- subset(data,
                  BGN_DATE >= as.Date("1996-01-01") & STATE %in% Ustates,
                  select = c(BGN_DATE, EVTYPE,
                             FATALITIES, INJURIES,
                             PROPDMG, PROPDMGEXP,
                             CROPDMG, CROPDMGEXP))
## Format the Crop & Prop Damage Exp columns to get the actual numeric damage
## Note: Alphabetical characters used to signify magnitude include "K" for
## thousands, "M" for millions, and "B" for billions.

## Translate damage exponent codes into numeric multipliers.
##   code: vector of exponent codes (character or factor).
## Returns a numeric vector of the same length: 1e3 / 1e6 / 1e9 for
## K / M / B, 1 for "" and "0", and NA for any other code.
## Codes are upper-cased first so lower-case variants are recognised too.
damage_multiplier <- function(code) {
  known_codes <- c("K", "M", "B", "", "0")
  multipliers <- c(1e3, 1e6, 1e9, 1, 1)
  multipliers[match(toupper(as.character(code)), known_codes)]
}

## The EXP columns are overwritten with their numeric multiplier, and the
## total damage is the reported amount scaled by that multiplier.
newdata$PROPDMGEXP <- damage_multiplier(newdata$PROPDMGEXP)
newdata$TTLPROPDMG <- newdata$PROPDMG * newdata$PROPDMGEXP
newdata$CROPDMGEXP <- damage_multiplier(newdata$CROPDMGEXP)
newdata$TTLCROPDMG <- newdata$CROPDMG * newdata$CROPDMGEXP
## Further reduce the data set before cleaning it: total the casualties and
## damage per raw event type and calendar year, then drop event/year
## combinations whose four totals sum to exactly zero.
## ungroup() is applied explicitly so that the result does not stay grouped
## by EVTYPE, a column that is rewritten in the cleaning step that follows.
newdata <- newdata %>%
  group_by(EVTYPE, YEAR = year(BGN_DATE)) %>%
  summarise(FATALITIES = sum(FATALITIES),
            INJURIES = sum(INJURIES),
            TTLPROPDMG = sum(TTLPROPDMG),
            TTLCROPDMG = sum(TTLCROPDMG)) %>%
  ungroup() %>%
  filter(FATALITIES + INJURIES + TTLPROPDMG + TTLCROPDMG != 0)
## Normalise EVTYPE and map each value onto one of the 48 defined event types.
## The raw label is lower-cased and the abbreviation "tstm" is expanded, then
## approximately matched (Jaro-Winkler distance, at most 0.45) against the
## lower-cased event list; labels with no match within that distance get NA.
evtype_clean <- gsub("tstm", "thunderstorm", tolower(newdata$EVTYPE), fixed = TRUE)
newdata$EVTYPE <- evtype_clean
match_index <- amatch(evtype_clean,
                      tolower(eventlist),
                      method = "jw",
                      maxDist = 0.45)
newdata$EVMATCH <- eventlist[match_index]
## plot1data: casualty and damage totals per matched event type and year.
by_event_year <- group_by(newdata, EventType = EVMATCH, YEAR)
plot1data <- summarise(by_event_year,
                       FATALITIES = sum(FATALITIES),
                       INJURIES = sum(INJURIES),
                       TTLPROPDMG = sum(TTLPROPDMG),
                       TTLCROPDMG = sum(TTLCROPDMG))
## plot2data: overall health and economic totals per event type, restricted
## to the event types whose economic total exceeds the mean of those totals.
by_event <- group_by(plot1data, EventType)
event_totals <- summarise(by_event,
                          HEALTHDMG = sum(FATALITIES) + sum(INJURIES),
                          ECONDMG = sum(TTLPROPDMG) + sum(TTLCROPDMG))
plot2data <- filter(event_totals, ECONDMG > mean(ECONDMG))
The plot below shows the total number of fatalities and injuries caused by severe weather events in the US from 1996 to 2011.
## Line chart of yearly fatalities plus injuries, one line per event type.
## Each line is labelled directly with its event type (directlabels) in
## addition to the colour legend placed below the plot.
ggplot(plot1data, aes(x = YEAR, y = FATALITIES + INJURIES, colour = EventType)) +
  geom_line() +
  geom_dl(aes(label = EventType), method = "top.points") +
  ggtitle("Damage of Severe Weather Events on Population Health in the US (1996-2011)") +
  xlab("Year") +
  ylab("Total number of fatalities and injuries") +
  theme(legend.position = "bottom")
The plot below shows the total economic damage (i.e. to both crops and property) caused by severe weather events in the US from 1996 to 2011. To simplify the plot, only event types whose total damage exceeds the mean are shown.
## Bar chart of total economic damage for the event types kept in plot2data.
## barplot() has no `by` argument: passing one was forwarded to the low-level
## plotting calls and only produced '"by" is not a graphical parameter'
## warnings. The bars are labelled through names.arg alone.
with(plot2data,
     barplot(ECONDMG,
             names.arg = EventType,
             xlab = "Event type",
             ylab = "Total economic damage",
             main = "Damage of Key Weather Events on Economic in the US (1996-2011)",
             col = "blue")
)
The event that has the greatest economic consequences is Flood.