This report describes the steps for downloading and processing the data from the NOAA Storm Database and performs a simple statistical analysis of the impact of physical events on population health and the economy. The analysis focuses on the data from 1996 to 2011 because of quality concerns with earlier data collection techniques.
Based on a high-level investigation, we found that tornadoes had the highest impact in terms of fatalities and injuries, followed by floods, excessive heat, wind and lightning, whereas the greatest causes of economic damage were floods, followed by hurricanes, storm surges, tornadoes and hail.
Download the bzip2-compressed CSV file and load the data into the stormdata data frame.
# Remote location of the storm data archive and the local name it is saved as.
url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
fileName <- "repdata-data-StormData.csv.bz2"

# Download only when no local copy exists; mode = "wb" writes the file in
# binary mode.
if (!file.exists(fileName)) {
  download.file(url, destfile = fileName, mode = "wb")
}

# read.csv is handed the .bz2 path directly; text columns stay character.
stormdata <- read.csv(file = fileName, stringsAsFactors = FALSE)
Load the necessary libraries.
library(lubridate)
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:lubridate':
##
## intersect, setdiff, union
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(car)
library(Hmisc)
## Loading required package: grid
## Loading required package: lattice
## Loading required package: survival
## Loading required package: splines
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
##
## The following objects are masked from 'package:dplyr':
##
## src, summarize
##
## The following objects are masked from 'package:base':
##
## format.pval, round.POSIXt, trunc.POSIXt, units
library(ggplot2)
library(reshape)
##
## Attaching package: 'reshape'
##
## The following object is masked from 'package:lubridate':
##
## stamp
Check data structure.
# Show each column's name, type and first few values.
str(stormdata)
## 'data.frame': 902297 obs. of 37 variables:
## $ STATE__ : num 1 1 1 1 1 1 1 1 1 1 ...
## $ BGN_DATE : chr "4/18/1950 0:00:00" "4/18/1950 0:00:00" "2/20/1951 0:00:00" "6/8/1951 0:00:00" ...
## $ BGN_TIME : chr "0130" "0145" "1600" "0900" ...
## $ TIME_ZONE : chr "CST" "CST" "CST" "CST" ...
## $ COUNTY : num 97 3 57 89 43 77 9 123 125 57 ...
## $ COUNTYNAME: chr "MOBILE" "BALDWIN" "FAYETTE" "MADISON" ...
## $ STATE : chr "AL" "AL" "AL" "AL" ...
## $ EVTYPE : chr "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
## $ BGN_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ BGN_AZI : chr "" "" "" "" ...
## $ BGN_LOCATI: chr "" "" "" "" ...
## $ END_DATE : chr "" "" "" "" ...
## $ END_TIME : chr "" "" "" "" ...
## $ COUNTY_END: num 0 0 0 0 0 0 0 0 0 0 ...
## $ COUNTYENDN: logi NA NA NA NA NA NA ...
## $ END_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ END_AZI : chr "" "" "" "" ...
## $ END_LOCATI: chr "" "" "" "" ...
## $ LENGTH : num 14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
## $ WIDTH : num 100 150 123 100 150 177 33 33 100 100 ...
## $ F : int 3 2 2 2 2 2 2 1 3 3 ...
## $ MAG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ FATALITIES: num 0 0 0 0 0 0 0 0 1 0 ...
## $ INJURIES : num 15 0 2 2 2 6 1 0 14 0 ...
## $ PROPDMG : num 25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
## $ PROPDMGEXP: chr "K" "K" "K" "K" ...
## $ CROPDMG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CROPDMGEXP: chr "" "" "" "" ...
## $ WFO : chr "" "" "" "" ...
## $ STATEOFFIC: chr "" "" "" "" ...
## $ ZONENAMES : chr "" "" "" "" ...
## $ LATITUDE : num 3040 3042 3340 3458 3412 ...
## $ LONGITUDE : num 8812 8755 8742 8626 8642 ...
## $ LATITUDE_E: num 3051 0 0 0 0 ...
## $ LONGITUDE_: num 8806 0 0 0 0 ...
## $ REMARKS : chr "" "" "" "" ...
## $ REFNUM : num 1 2 3 4 5 6 7 8 9 10 ...
Parse the date and drop the data before 1996 due to data quality concerns (see NOAA reference).
# Parse BGN_DATE and keep only the events that began on or after 1 Jan 1996.

# BGN_DATE values look like "4/18/1950 0:00:00"; strip the constant time
# suffix so only the month/day/year part is left to parse.
stormdata$Date <- gsub(" 0:00:00", "", stormdata$BGN_DATE)
stormdata$Date <- parse_date_time(stormdata$Date, "mdY")

# Parse the cutoff once and reuse it in the filter below.
cutOff <- parse_date_time("1.1.1996", "dmY")

# which() skips rows whose date could not be parsed (NA); a bare logical
# index would turn each of those into an all-NA row in the result.
mystormdata <- stormdata[which(stormdata$Date >= cutOff), ]
Clean up data.
# Collapse EVTYPE variants into broad categories, then format the labels.
# Rules run top to bottom; each one overwrites every EVTYPE that contains the
# pattern (case-insensitive). Order matters: a value rewritten by an earlier
# rule (e.g. anything containing "THUNDERST") no longer contains the text a
# later rule (e.g. "WIND") would have matched.
mystormdata$EVTYPE <- local({
  rules <- c(
    "THUNDERST"   = "THUNDERSTORM",
    "TORNADO"     = "TORNADO",
    "FLOOD"       = "FLOOD",
    "HAIL"        = "HAIL",
    "HOT"         = "HEAT",
    "WARM"        = "HEAT",
    "HEAT"        = "HEAT",
    "COLD"        = "COLD",
    "WIND"        = "WIND",
    "RAIN"        = "RAIN",
    "SNOW"        = "SNOW",
    "ICE"         = "ICE",
    "RIP CURRENT" = "RIP CURRENT"
  )

  evtype <- mystormdata$EVTYPE
  for (pattern in names(rules)) {
    matched <- grepl(pattern, evtype, ignore.case = TRUE)
    evtype[matched] <- rules[[pattern]]
  }

  # Lower-case everything, then capitalise the first letter of each label.
  capitalize(tolower(evtype))
})
Calculations:
The damages data frame aggregates fatalities and injuries by event type, and the economic data frame aggregates property and crop damage by event type. The human and econ data frames hold the top 10 event types by impact on population health and on the economy, respectively.
#recode the values.
# Convert the PROPDMG / CROPDMG figures to dollar amounts by multiplying each
# one by the multiplier encoded in its *EXP column.

# Translate exponent codes into numeric multipliers.
#   code: character vector of exponent codes (e.g. "K", "M", "B", "5", "").
#   Returns a numeric vector of the same length:
#     - a digit d      -> 10^d
#     - H / K / M / B  -> 1e2 / 1e3 / 1e6 / 1e9 (either case)
#     - anything else  -> 0 (this covers "", "-", "?", "+" and NA)
expMultiplier <- function(code) {
  lookup <- c(
    setNames(10^(0:9), as.character(0:9)),
    H = 1e2, K = 1e3, M = 1e6, B = 1e9
  )
  multiplier <- unname(lookup[toupper(code)])
  # Unknown or empty codes must become a number, not NA: an NA damage value
  # makes aggregate()'s formula interface drop the whole row, which would
  # also discard the other damage column recorded on that row.
  multiplier[is.na(multiplier)] <- 0
  multiplier
}

# The same table is used for both columns so they are scaled consistently.
mystormdata$PROPDMG <- mystormdata$PROPDMG * expMultiplier(mystormdata$PROPDMGEXP)
mystormdata$CROPDMG <- mystormdata$CROPDMG * expMultiplier(mystormdata$CROPDMGEXP)
# Sum fatalities and injuries for every event type.
damages <- aggregate(
  cbind(FATALITIES, INJURIES) ~ EVTYPE,
  data = mystormdata,
  FUN = sum
)

# Sort by fatalities, then injuries (both descending), keep the first ten
# event types and melt them into long form (EVTYPE / variable / value).
human <- melt(
  head(damages[with(damages, order(-FATALITIES, -INJURIES)), ], n = 10)
)
## Using EVTYPE as id variables
# Sum property and crop damage for every event type.
economic <- aggregate(
  cbind(PROPDMG, CROPDMG) ~ EVTYPE,
  data = mystormdata,
  FUN = sum
)

# Sort by property damage, then crop damage (both descending), keep the
# first ten event types and melt them into long form.
econ <- melt(
  head(economic[with(economic, order(-PROPDMG, -CROPDMG)), ], n = 10)
)
## Using EVTYPE as id variables
# Horizontal bar chart of the top event types, one bar per EVTYPE, filled by
# the melted measure (legend relabelled as Deaths / Injuries).
ggplot(data = human, mapping = aes(x = EVTYPE, y = value, fill = variable)) +
  geom_bar(stat = "identity") +
  scale_fill_manual(
    values = c("red", "yellow"),
    labels = c("Deaths", "Injuries")
  ) +
  coord_flip() +
  labs(x = "", y = "number of people impacted") +
  ggtitle("Harmful Events in USA from 1996 to 2011")
plot of chunk HarmfulEvents
The plot shows that tornadoes caused the highest impact in terms of fatalities and injuries followed by floods, excessive heat, wind and lightning.
# Horizontal bar chart of the costliest event types, one bar per EVTYPE,
# filled by the melted measure (legend relabelled as Property / Crop Damage).
ggplot(data = econ, mapping = aes(x = EVTYPE, y = value, fill = variable)) +
  geom_bar(stat = "identity") +
  scale_fill_manual(
    values = c("red", "yellow"),
    labels = c("Property Damage", "Crop Damage")
  ) +
  coord_flip() +
  labs(x = "", y = "cost of damages in dollars") +
  ggtitle("Economic Consequences in USA from 1996 to 2011")
plot of chunk EconomicConsequences
The plot shows that the greatest causes of economic damage are floods, followed by hurricanes, storm surges, tornadoes and hail.
We found that, from 1996 to 2011, tornadoes had the highest impact in terms of fatalities and injuries, followed by floods, and that the greatest causes of economic damage were floods, followed by hurricanes, storm surges, tornadoes and hail. However, these findings are highly dependent on the quality of the data. Poor data collection or data entry could substantially affect the results of the analysis, which could lead the government to devote large resources to preventing the wrong events. Therefore, a collaborative effort to clean up the data is needed so that it is accurate enough for analysis.