Based on storm data gathered by the National oceanic and Atmosphereic Administration (NOAA), the study compares severe weathers impact on population health, as measured by injuries and fatalities, and on economy, as measured by estimated damage on properties and crops. Based on data beginning January of 2011, the study finds, with regard to their effect on population health, TORNADO has the highest impact, with Heat and Thuderstorm Wind being distant 2nd and 3rd. Additionally, FLOOD and TORNADO have the highest impact on economy.
library(RCurl)
library(ggplot2)
library(lattice)
library(reshape2)
#download file
if(!file.exists("StormData.csv.bz2")) {
Data_URL <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
download.file(url=Data_URL, destfile="StormData.csv.bz2", method='libcurl')
}
#read data, this may take awhile
StormData <- read.csv("StormData.csv.bz2", stringsAsFactors=F)
#examine data,
# via head, to get a quick view of column names and snippet of data
head(StormData)
## STATE__ BGN_DATE BGN_TIME TIME_ZONE COUNTY COUNTYNAME STATE
## 1 1 4/18/1950 0:00:00 0130 CST 97 MOBILE AL
## 2 1 4/18/1950 0:00:00 0145 CST 3 BALDWIN AL
## 3 1 2/20/1951 0:00:00 1600 CST 57 FAYETTE AL
## 4 1 6/8/1951 0:00:00 0900 CST 89 MADISON AL
## 5 1 11/15/1951 0:00:00 1500 CST 43 CULLMAN AL
## 6 1 11/15/1951 0:00:00 2000 CST 77 LAUDERDALE AL
## EVTYPE BGN_RANGE BGN_AZI BGN_LOCATI END_DATE END_TIME COUNTY_END
## 1 TORNADO 0 0
## 2 TORNADO 0 0
## 3 TORNADO 0 0
## 4 TORNADO 0 0
## 5 TORNADO 0 0
## 6 TORNADO 0 0
## COUNTYENDN END_RANGE END_AZI END_LOCATI LENGTH WIDTH F MAG FATALITIES
## 1 NA 0 14.0 100 3 0 0
## 2 NA 0 2.0 150 2 0 0
## 3 NA 0 0.1 123 2 0 0
## 4 NA 0 0.0 100 2 0 0
## 5 NA 0 0.0 150 2 0 0
## 6 NA 0 1.5 177 2 0 0
## INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP WFO STATEOFFIC ZONENAMES
## 1 15 25.0 K 0
## 2 0 2.5 K 0
## 3 2 25.0 K 0
## 4 2 2.5 K 0
## 5 2 2.5 K 0
## 6 6 2.5 K 0
## LATITUDE LONGITUDE LATITUDE_E LONGITUDE_ REMARKS REFNUM
## 1 3040 8812 3051 8806 1
## 2 3042 8755 0 0 2
## 3 3340 8742 0 0 3
## 4 3458 8626 0 0 4
## 5 3412 8642 0 0 5
## 6 3450 8748 0 0 6
# via Summary, note Fatalities (mean 0.0168 / max 583) & Injuries (mean 0.1557 / max 17000)
summary(StormData)
## STATE__ BGN_DATE BGN_TIME TIME_ZONE
## Min. : 1.0 Length:902297 Length:902297 Length:902297
## 1st Qu.:19.0 Class :character Class :character Class :character
## Median :30.0 Mode :character Mode :character Mode :character
## Mean :31.2
## 3rd Qu.:45.0
## Max. :95.0
##
## COUNTY COUNTYNAME STATE EVTYPE
## Min. : 0.0 Length:902297 Length:902297 Length:902297
## 1st Qu.: 31.0 Class :character Class :character Class :character
## Median : 75.0 Mode :character Mode :character Mode :character
## Mean :100.6
## 3rd Qu.:131.0
## Max. :873.0
##
## BGN_RANGE BGN_AZI BGN_LOCATI
## Min. : 0.000 Length:902297 Length:902297
## 1st Qu.: 0.000 Class :character Class :character
## Median : 0.000 Mode :character Mode :character
## Mean : 1.484
## 3rd Qu.: 1.000
## Max. :3749.000
##
## END_DATE END_TIME COUNTY_END COUNTYENDN
## Length:902297 Length:902297 Min. :0 Mode:logical
## Class :character Class :character 1st Qu.:0 NA's:902297
## Mode :character Mode :character Median :0
## Mean :0
## 3rd Qu.:0
## Max. :0
##
## END_RANGE END_AZI END_LOCATI
## Min. : 0.0000 Length:902297 Length:902297
## 1st Qu.: 0.0000 Class :character Class :character
## Median : 0.0000 Mode :character Mode :character
## Mean : 0.9862
## 3rd Qu.: 0.0000
## Max. :925.0000
##
## LENGTH WIDTH F MAG
## Min. : 0.0000 Min. : 0.000 Min. :0.0 Min. : 0.0
## 1st Qu.: 0.0000 1st Qu.: 0.000 1st Qu.:0.0 1st Qu.: 0.0
## Median : 0.0000 Median : 0.000 Median :1.0 Median : 50.0
## Mean : 0.2301 Mean : 7.503 Mean :0.9 Mean : 46.9
## 3rd Qu.: 0.0000 3rd Qu.: 0.000 3rd Qu.:1.0 3rd Qu.: 75.0
## Max. :2315.0000 Max. :4400.000 Max. :5.0 Max. :22000.0
## NA's :843563
## FATALITIES INJURIES PROPDMG
## Min. : 0.0000 Min. : 0.0000 Min. : 0.00
## 1st Qu.: 0.0000 1st Qu.: 0.0000 1st Qu.: 0.00
## Median : 0.0000 Median : 0.0000 Median : 0.00
## Mean : 0.0168 Mean : 0.1557 Mean : 12.06
## 3rd Qu.: 0.0000 3rd Qu.: 0.0000 3rd Qu.: 0.50
## Max. :583.0000 Max. :1700.0000 Max. :5000.00
##
## PROPDMGEXP CROPDMG CROPDMGEXP
## Length:902297 Min. : 0.000 Length:902297
## Class :character 1st Qu.: 0.000 Class :character
## Mode :character Median : 0.000 Mode :character
## Mean : 1.527
## 3rd Qu.: 0.000
## Max. :990.000
##
## WFO STATEOFFIC ZONENAMES LATITUDE
## Length:902297 Length:902297 Length:902297 Min. : 0
## Class :character Class :character Class :character 1st Qu.:2802
## Mode :character Mode :character Mode :character Median :3540
## Mean :2875
## 3rd Qu.:4019
## Max. :9706
## NA's :47
## LONGITUDE LATITUDE_E LONGITUDE_ REMARKS
## Min. :-14451 Min. : 0 Min. :-14455 Length:902297
## 1st Qu.: 7247 1st Qu.: 0 1st Qu.: 0 Class :character
## Median : 8707 Median : 0 Median : 0 Mode :character
## Mean : 6940 Mean :1452 Mean : 3509
## 3rd Qu.: 9605 3rd Qu.:3549 3rd Qu.: 8735
## Max. : 17124 Max. :9706 Max. :106220
## NA's :40
## REFNUM
## Min. : 1
## 1st Qu.:225575
## Median :451149
## Mean :451149
## 3rd Qu.:676723
## Max. :902297
##
# via unique, take a look at type of events in table for the purpose of this analysis what's "harmful with respect to population health" will be defined by mortality (fatality) & injuries an event has caused
# via aggregate, do all storms have equal amout of historical data? (no)
StormData.EarliestTracking <- aggregate(BGN_DATE ~ EVTYPE, data=StormData, FUN=min)
head(StormData.EarliestTracking)
## EVTYPE BGN_DATE
## 1 HIGH SURF ADVISORY 12/3/2001 0:00:00
## 2 COASTAL FLOOD 12/13/1996 0:00:00
## 3 FLASH FLOOD 4/2/2000 0:00:00
## 4 LIGHTNING 8/24/1998 0:00:00
## 5 TSTM WIND 6/4/1999 0:00:00
## 6 TSTM WIND (G45) 1/8/1998 0:00:00
tail(StormData.EarliestTracking)
## EVTYPE BGN_DATE
## 980 WINTER WEATHER/MIX 1/1/2003 0:00:00
## 981 WINTERY MIX 3/21/1998 0:00:00
## 982 Wintry mix 1/8/1997 0:00:00
## 983 Wintry Mix 11/22/1996 0:00:00
## 984 WINTRY MIX 1/13/1999 0:00:00
## 985 WND 11/22/2001 0:00:00
# since not all weather events are not track w/ the same start time, naturally making weathers with longer tracking history more impactful overall. as a result, this study will use max of BGN_DATE to find a more recent cutoff date for more even analysis
StormData.EarliestTracking$BGN_DATE <- as.Date(StormData.EarliestTracking$BGN_DATE , "%m/%d/%Y")
max(StormData.EarliestTracking$BGN_DATE)
## [1] "2011-01-01"
# create smaller dataset for analysis: BGN_DATE (>= 1/1/2011), EVTYPE, INJURIES, FATALITIES
StormRecent <- subset(StormData,as.Date(BGN_DATE,"%m/%d/%Y")>= as.Date(max(StormData.EarliestTracking$BGN_DATE)))
# further filter out events causing no injuries and no fatalities (i.e. keeping inj > 0 OR fat > 0)
StormPopEffect <- subset(StormRecent,(INJURIES>0 | FATALITIES>0))
# aggregat data into two sets,
# one w/ sum of injuries by event
StormPopEffect.INJ <- aggregate(INJURIES ~ EVTYPE, data=StormPopEffect, FUN=sum)
# one w/ sum of fatalities by event
StormPopEffect.FAT <- aggregate(FATALITIES ~ EVTYPE, data=StormPopEffect, FUN=sum)
# merge injuries and fatalities by event
StormPopEffect.Overall <- merge(StormPopEffect.FAT,StormPopEffect.INJ,by=c("EVTYPE"))
nrow(StormPopEffect.Overall)
## [1] 28
#28 events are too many to plot, take the top six events from each set
StormPopEffect.Overall$SUM <- StormPopEffect.Overall$FATALITIES+StormPopEffect.Overall$INJURIES
StormPopEffect.Overall <- head(StormPopEffect.Overall[order(-StormPopEffect.Overall$SUM),])
# melt it for charting
StormPopEffect.Overall <- melt(StormPopEffect.Overall, id.vars = "EVTYPE", measure.vars = c("FATALITIES", "INJURIES"))
# create subset for storm effect by property & crop damage, still looking at post 9/9/1998 data
StormEconEffect <- subset(StormRecent,(PROPDMG>0 | CROPDMG>0))
# need to reformat data by PROPDMGEXP & CROPDMGEXP by following description -
#
# "Estimates should be rounded to three significant digits, followed by an alphabetical
# character signifying the magnitude of the number, i.e., 1.55B for $1,550,000,000.
# Alphabetical characters used to signify magnitude include “K” for thousands, “M” for
# millions, and “B” for billions."
# - https://d396qusza40orc.cloudfront.net/repdata%2Fpeer2_doc%2Fpd01016005curr.pdf, Pg. 12
#
# via unique, take a look at types of estimates used in table
# K & M used for PROPDMG
unique(unlist(StormEconEffect$PROPDMGEXP, use.names = FALSE))
## [1] "K" "M" "B"
# K used for CROPDMG
unique(unlist(StormEconEffect$CROPDMGEXP, use.names = FALSE))
## [1] "K" "M"
# determine if the data set make damage estimate below K (thousands)? (no)
subset(StormEconEffect,(PROPDMGEXP==''))$PROPDMG
## numeric(0)
subset(StormEconEffect,(CROPDMGEXP==''))$CROPDMG
## numeric(0)
# Since damage estimates are made by thousands (K) or millions (M), data will be normalized by
# the lowest denominator, thousands (K).
# Further, normalization is only needed for PROPDMG as CROPDMG are only in the thousands range
StormEconEffect.Norm <- within(StormEconEffect, PROPDMG[PROPDMGEXP=='M']<-PROPDMG[PROPDMGEXP=='M']*1000)
StormEconEffect.Norm <- aggregate(PROPDMG ~ EVTYPE, data=StormEconEffect.Norm, FUN=sum)
StormEconEffect.Norm <- cbind(StormEconEffect.Norm, setNames(aggregate(CROPDMG ~ EVTYPE, data=StormEconEffect, FUN=sum), list("EVT2","CROPDMG")))
nrow(StormEconEffect.Norm)
## [1] 38
#38 rows are too much to plot, take top 6 instead
StormEconEffect.Norm$TTLDMG <- StormEconEffect.Norm$PROPDMG + StormEconEffect.Norm$CROPDMG
StormEconEffect.Norm <- head(StormEconEffect.Norm[order(-StormEconEffect.Norm$TTLDMG),])
StormEconEffect.Norm <- melt(StormEconEffect.Norm, id.vars = "EVTYPE", measure.vars = c("CROPDMG", "PROPDMG"))
## Chart Populaton Health Impact Result
ggplot(data=StormPopEffect.Overall, aes(x=EVTYPE, y=value, fill=variable)) +
geom_bar(stat="identity", position=position_dodge()) +
ylab("Frequency") +
xlab("Event Type") +
scale_fill_discrete(name="Effect to\nPopulation Health") +
ggtitle("Severe Weather Events Most Harmful to Population Health\n(normalized, since 1/1/2011)") +
theme(plot.title = element_text(lineheight=.8, face="bold"), axis.text.x = element_text(angle = 30, hjust = 1) )
## Chart Economy Impact Result
ggplot(data=StormEconEffect.Norm, aes(x=EVTYPE, y=value, fill=variable)) +
geom_bar(stat="identity") +
ylab("Damage Est. (K)") +
xlab("Event Type") +
scale_fill_discrete(name="Effect to\nEconomy") +
ggtitle("Severe Weather Events Most Harmful to Economy\n(normalized, since 1/1/2011)") +
theme(plot.title = element_text(lineheight=.8, face="bold"), axis.text.x = element_text(angle = 30, hjust = 1) )