Reproducible Research - Programming Assignment 2
output: pdf_document
The aim of this analysis is to identify which extreme weather events are most harmful to public health and economy.
Many severe events can result in fatalities, injuries, and property damage, and identifying which events are most harmful can help in deploying preventive measures.
Data used in the analysis come from the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database.
This database tracks characteristics of major storms and weather events in the United States, including when and where they occur, as well as estimates of any fatalities, injuries, and property damage.
To compare the impact of different events, bar charts of the highest-ranked event types are used.
The results show that tornadoes are the most harmful events for human health, while floods cause the greatest economic losses.
Data Processing: Initializing, reading and converting data
The .bz2 file is in the working directory
knitr::opts_chunk$set(echo = TRUE)
library("ggplot2")
library("gridExtra")
## Load the storm data once: the compressed CSV is read through a bzfile()
## connection, and the read is skipped when an object named "stormdata"
## already exists.
if (!exists("stormdata")) {
  stormdata <- read.csv(
    file = bzfile(description = "repdata_data_StormData.csv.bz2"),
    header = TRUE
  )
}
## Inspect the loaded data frame: dim() reports its row and column counts,
## str() lists every column with its type and first few values.
dim(stormdata)
## [1] 902297 37
str(stormdata)
## 'data.frame': 902297 obs. of 37 variables:
## $ STATE__ : num 1 1 1 1 1 1 1 1 1 1 ...
## $ BGN_DATE : chr "4/18/1950 0:00:00" "4/18/1950 0:00:00" "2/20/1951 0:00:00" "6/8/1951 0:00:00" ...
## $ BGN_TIME : chr "0130" "0145" "1600" "0900" ...
## $ TIME_ZONE : chr "CST" "CST" "CST" "CST" ...
## $ COUNTY : num 97 3 57 89 43 77 9 123 125 57 ...
## $ COUNTYNAME: chr "MOBILE" "BALDWIN" "FAYETTE" "MADISON" ...
## $ STATE : chr "AL" "AL" "AL" "AL" ...
## $ EVTYPE : chr "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
## $ BGN_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ BGN_AZI : chr "" "" "" "" ...
## $ BGN_LOCATI: chr "" "" "" "" ...
## $ END_DATE : chr "" "" "" "" ...
## $ END_TIME : chr "" "" "" "" ...
## $ COUNTY_END: num 0 0 0 0 0 0 0 0 0 0 ...
## $ COUNTYENDN: logi NA NA NA NA NA NA ...
## $ END_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ END_AZI : chr "" "" "" "" ...
## $ END_LOCATI: chr "" "" "" "" ...
## $ LENGTH : num 14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
## $ WIDTH : num 100 150 123 100 150 177 33 33 100 100 ...
## $ F : int 3 2 2 2 2 2 2 1 3 3 ...
## $ MAG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ FATALITIES: num 0 0 0 0 0 0 0 0 1 0 ...
## $ INJURIES : num 15 0 2 2 2 6 1 0 14 0 ...
## $ PROPDMG : num 25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
## $ PROPDMGEXP: chr "K" "K" "K" "K" ...
## $ CROPDMG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CROPDMGEXP: chr "" "" "" "" ...
## $ WFO : chr "" "" "" "" ...
## $ STATEOFFIC: chr "" "" "" "" ...
## $ ZONENAMES : chr "" "" "" "" ...
## $ LATITUDE : num 3040 3042 3340 3458 3412 ...
## $ LONGITUDE : num 8812 8755 8742 8626 8642 ...
## $ LATITUDE_E: num 3051 0 0 0 0 ...
## $ LONGITUDE_: num 8806 0 0 0 0 ...
## $ REMARKS : chr "" "" "" "" ...
## $ REFNUM : num 1 2 3 4 5 6 7 8 9 10 ...
To work with lighter data sets let’s create subsets with the needed variables
One set for health issues: fatalities, injuries
Another for economic loss: property damage, crop damage
Event type is common to both
Variable names can be found with str(stormdata):
EVTYPE (type of event),
FATALITIES (number of deaths),
INJURIES (number of injuries),
PROPDMG (property damage),
PROPDMGEXP (units for property damage),
CROPDMG (crop damage),
CROPDMGEXP (units for crop damage)
## Health subset: keep only the event type and the two casualty counts
## (EVTYPE, FATALITIES, INJURIES), then preview the first rows.
health_columns <- c("EVTYPE", "FATALITIES", "INJURIES")
stormdata_health <- stormdata[, health_columns]
head(stormdata_health)
## EVTYPE FATALITIES INJURIES
## 1 TORNADO 0 15
## 2 TORNADO 0 0
## 3 TORNADO 0 2
## 4 TORNADO 0 2
## 5 TORNADO 0 2
## 6 TORNADO 0 6
## Economic subset: keep the event type plus the property and crop damage
## amounts together with their magnitude codes
## (EVTYPE, PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP), then preview it.
damage_columns <- c("EVTYPE", "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP")
stormdata_damage <- stormdata[, damage_columns]
head(stormdata_damage)
## EVTYPE PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP
## 1 TORNADO 25.0 K 0
## 2 TORNADO 2.5 K 0
## 3 TORNADO 25.0 K 0
## 4 TORNADO 2.5 K 0
## 5 TORNADO 2.5 K 0
## 6 TORNADO 2.5 K 0
The values of damage are in different scales (thousands, millions, billions)
It is necessary to put values in the same base, so they can be compared
The strategy for this is to convert the letter codes K, M, B to the numeric multipliers
1,000, 1,000,000 and 1,000,000,000, and then multiply each value by its multiplier so that all damages are expressed in the same unit
A sequence of ifelse’s defines the multipliers (10^3, 10^6, 10^9)
## Translate damage magnitude codes into numeric multipliers:
## "K" -> 10^3, "M" -> 10^6, "B" -> 10^9; every other code gives 0.
## Codes are upper-cased before the ifelse chain so that lower-case entries
## ("k", "m", "b") are scaled like their upper-case forms instead of being
## silently multiplied by 0.
damage_multiplier <- function(exp_code) {
  exp_code <- toupper(exp_code)
  ifelse(exp_code == "K", 1000,
    ifelse(exp_code == "M", 1000000,
      ifelse(exp_code == "B", 1000000000, 0)
    )
  )
}
## property damage: multiplier and amount scaled by it
stormdata_damage$multiplierp <- damage_multiplier(stormdata_damage$PROPDMGEXP)
stormdata_damage$PROPDMG_unique <- stormdata_damage$PROPDMG * stormdata_damage$multiplierp
## crop damage: same conversion, applied to the crop columns
stormdata_damage$multiplierc <- damage_multiplier(stormdata_damage$CROPDMGEXP)
stormdata_damage$CROPDMG_unique <- stormdata_damage$CROPDMG * stormdata_damage$multiplierc
## aggregating the damage values from property and crop into a single total value
stormdata_damage$TOTAL <- stormdata_damage$PROPDMG_unique + stormdata_damage$CROPDMG_unique
head(stormdata_damage)
## EVTYPE PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP multiplierp PROPDMG_unique
## 1 TORNADO 25.0 K 0 1000 25000
## 2 TORNADO 2.5 K 0 1000 2500
## 3 TORNADO 25.0 K 0 1000 25000
## 4 TORNADO 2.5 K 0 1000 2500
## 5 TORNADO 2.5 K 0 1000 2500
## 6 TORNADO 2.5 K 0 1000 2500
## multiplierc CROPDMG_unique TOTAL
## 1 0 0 25000
## 2 0 0 2500
## 3 0 0 25000
## 4 0 0 2500
## 5 0 0 2500
## 6 0 0 2500
Results
Analysis: which events are most harmful to human health - question 1
Since there are many types of events only the most critical ones in terms
of fatalities and injuries will be displayed.
The bars are drawn in orange, a conventional alert color.
## Total fatalities per event type (missing values are ignored in the sums).
storm_fat <- aggregate(
  stormdata_health["FATALITIES"],
  by = list(EVTYPE = stormdata_health$EVTYPE),
  FUN = sum,
  na.rm = TRUE
)
## Rank event types from most to fewest fatalities and keep the first nine.
storm_fat <- storm_fat[order(-storm_fat$FATALITIES), ]
top9_fatalities <- storm_fat[seq_len(9), ]
## Bar chart of those nine event types, bars ordered by fatality count.
p <- ggplot(
  top9_fatalities,
  aes(x = reorder(EVTYPE, FATALITIES), y = FATALITIES)
)
p +
  geom_bar(stat = "identity", fill = "orange") +
  labs(
    title = "Top Weather Events by number of Fatalities",
    x = "Event Type",
    y = " n Fatalities"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Similar analysis for injuries - question 1 (cont)
## Total injuries per event type (missing values are ignored in the sums).
storm_inj <- aggregate(
  stormdata_health["INJURIES"],
  by = list(EVTYPE = stormdata_health$EVTYPE),
  FUN = sum,
  na.rm = TRUE
)
## Rank event types from most to fewest injuries and keep the first nine.
storm_inj <- storm_inj[order(-storm_inj$INJURIES), ]
top9_injuries <- storm_inj[seq_len(9), ]
## Bar chart of those nine event types, bars ordered by injury count.
p <- ggplot(
  top9_injuries,
  aes(x = reorder(EVTYPE, INJURIES), y = INJURIES)
)
p +
  geom_bar(stat = "identity", fill = "green") +
  labs(
    title = "Top Weather Events by number of Injuries",
    x = "Event Type",
    y = " n Injuries"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Identifying the events most harmful to the economy and property
Property and Crop losses
## Total (property + crop) damage per event type; missing values are ignored
## in the sums.
storm_dmg <- aggregate(
  stormdata_damage["TOTAL"],
  by = list(EVTYPE = stormdata_damage$EVTYPE),
  FUN = sum,
  na.rm = TRUE
)
## Rank event types from largest to smallest total and keep the first nine.
storm_dmg <- storm_dmg[order(-storm_dmg$TOTAL), ]
top9_dmg <- storm_dmg[seq_len(9), ]
## Bar chart of those nine event types; totals are divided by 10^9 so the
## bar heights match the "billion US$" wording of the title.
p <- ggplot(
  top9_dmg,
  aes(x = reorder(EVTYPE, TOTAL / 1000000000), y = TOTAL / 1000000000)
)
p +
  geom_bar(stat = "identity", fill = "red") +
  labs(
    title = "Top Weather Events by Economic Loss [billion US$]",
    x = "Event Type",
    y = "$ Loss"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
