##Introduction The aim of this assignment is to explore the NOAA Storm Database and the relationship between storm type (EVTYPE) and public health impact and the relationship between storm type and economic impact.
The events in the database start in the year 1950 and end in November 2011. In the earlier years of the database there are generally fewer events recorded, most likely due to a lack of good records. More recent years should be considered more complete.
##Reading Raw Data
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(ggplot2)
#Load data
StormRaw <- read.csv("repdata-data-StormData.csv")
#review structure of data
str(StormRaw)
## 'data.frame': 902297 obs. of 37 variables:
## $ STATE__ : num 1 1 1 1 1 1 1 1 1 1 ...
## $ BGN_DATE : Factor w/ 16335 levels "1/1/1966 0:00:00",..: 6523 6523 4242 11116 2224 2224 2260 383 3980 3980 ...
## $ BGN_TIME : Factor w/ 3608 levels "00:00:00 AM",..: 272 287 2705 1683 2584 3186 242 1683 3186 3186 ...
## $ TIME_ZONE : Factor w/ 22 levels "ADT","AKS","AST",..: 7 7 7 7 7 7 7 7 7 7 ...
## $ COUNTY : num 97 3 57 89 43 77 9 123 125 57 ...
## $ COUNTYNAME: Factor w/ 29601 levels "","5NM E OF MACKINAC BRIDGE TO PRESQUE ISLE LT MI",..: 13513 1873 4598 10592 4372 10094 1973 23873 24418 4598 ...
## $ STATE : Factor w/ 72 levels "AK","AL","AM",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ EVTYPE : Factor w/ 985 levels " HIGH SURF ADVISORY",..: 834 834 834 834 834 834 834 834 834 834 ...
## $ BGN_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ BGN_AZI : Factor w/ 35 levels ""," N"," NW",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ BGN_LOCATI: Factor w/ 54429 levels ""," Christiansburg",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ END_DATE : Factor w/ 6663 levels "","1/1/1993 0:00:00",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ END_TIME : Factor w/ 3647 levels ""," 0900CST",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ COUNTY_END: num 0 0 0 0 0 0 0 0 0 0 ...
## $ COUNTYENDN: logi NA NA NA NA NA NA ...
## $ END_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ END_AZI : Factor w/ 24 levels "","E","ENE","ESE",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ END_LOCATI: Factor w/ 34506 levels ""," CANTON"," TULIA",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ LENGTH : num 14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
## $ WIDTH : num 100 150 123 100 150 177 33 33 100 100 ...
## $ F : int 3 2 2 2 2 2 2 1 3 3 ...
## $ MAG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ FATALITIES: num 0 0 0 0 0 0 0 0 1 0 ...
## $ INJURIES : num 15 0 2 2 2 6 1 0 14 0 ...
## $ PROPDMG : num 25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
## $ PROPDMGEXP: Factor w/ 19 levels "","-","?","+",..: 17 17 17 17 17 17 17 17 17 17 ...
## $ CROPDMG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CROPDMGEXP: Factor w/ 9 levels "","?","0","2",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ WFO : Factor w/ 542 levels ""," CI","%SD",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ STATEOFFIC: Factor w/ 250 levels "","ALABAMA, Central",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ ZONENAMES : Factor w/ 25112 levels ""," "| __truncated__,..: 1 1 1 1 1 1 1 1 1 1 ...
## $ LATITUDE : num 3040 3042 3340 3458 3412 ...
## $ LONGITUDE : num 8812 8755 8742 8626 8642 ...
## $ LATITUDE_E: num 3051 0 0 0 0 ...
## $ LONGITUDE_: num 8806 0 0 0 0 ...
## $ REMARKS : Factor w/ 436781 levels "","\t","\t\t",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ REFNUM : num 1 2 3 4 5 6 7 8 9 10 ...
head(StormRaw$BGN_DATE)
## [1] 4/18/1950 0:00:00 4/18/1950 0:00:00 2/20/1951 0:00:00 6/8/1951 0:00:00
## [5] 11/15/1951 0:00:00 11/15/1951 0:00:00
## 16335 Levels: 1/1/1966 0:00:00 1/1/1972 0:00:00 ... 9/9/2011 0:00:00
##Preprocessing In this section the year of the event is parsed into its own variable so that data can be filtered to only include 1996 and beyond.
#transform start dates to date class and parse out just the year for easier filtering
StormRaw$BGN_DATE <- as.Date(StormRaw$BGN_DATE, format="%m/%d/%Y")
StormRaw$Year <- year(StormRaw$BGN_DATE)
summary(StormRaw$Year)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1950 1995 2002 1999 2007 2011
class(StormRaw$Year)
## [1] "numeric"
#data prior to 1996 is very limited and in order to have fair comparisons
#data inclusion dates for this analysis are from 1996-2011
Storm <- filter(StormRaw, Year >= 1996)
summary(Storm$Year)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1996 2000 2005 2004 2008 2011
##Data processing of economic impact 1- A datafrome that focuses on the variables needed for this analysis was created
#Focus the dataframe on only the variables that we are using in damage analysis
StormDamFocused <- select(Storm, Year, EVTYPE, PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP)
StormDamFocused <- filter(StormDamFocused,PROPDMG > 0 | CROPDMG > 0 )
##Storm Damage Data Transformation 2- Damage is represented by a combination of crop damage and property damage. Both crop damage and property damage have both a based column and a exponent that needs to be transfored so that a single total cost can be calculated.
#Fix issues related to damage class type must be character for grep to work
StormDamFocused$PROPDMGEXP <- as.character(StormDamFocused$PROPDMGEXP)
StormDamFocused$CROPDMGEXP <- as.character(StormDamFocused$CROPDMGEXP)
str(StormDamFocused) #check that class transformations were successful
## 'data.frame': 194525 obs. of 6 variables:
## $ Year : num 1996 1996 1996 1996 1996 ...
## $ EVTYPE : Factor w/ 985 levels " HIGH SURF ADVISORY",..: 972 834 856 856 856 359 856 856 856 153 ...
## $ PROPDMG : num 380 100 3 5 2 400 12 8 12 75 ...
## $ PROPDMGEXP: chr "K" "K" "K" "K" ...
## $ CROPDMG : num 38 0 0 0 0 0 0 0 0 0 ...
## $ CROPDMGEXP: chr "K" "" "" "" ...
#transform exponent characters for property damage into numbers that can be multiplied
StormDamFocused$PROPDMGEXP[!grepl("K|M|B", StormDamFocused$PROPDMGEXP, ignore.case=T)]<- "1"
StormDamFocused$PROPDMGEXP[grep("K", StormDamFocused$PROPDMGEXP, ignore.case=T)]<- "1000"
StormDamFocused$PROPDMGEXP[grep("M", StormDamFocused$PROPDMGEXP, ignore.case=T)]<- "1000000"
StormDamFocused$PROPDMGEXP[grep("B", StormDamFocused$PROPDMGEXP, ignore.case=T)]<- "1000000000"
StormDamFocused$PROPDMGEXP<- as.numeric(StormDamFocused$PROPDMGEXP)
#transform exponents for crop damage
StormDamFocused$CROPDMGEXP[!grepl("K|M|B", StormDamFocused$CROPDMGEXP, ignore.case=T)]<- "1"
StormDamFocused$CROPDMGEXP[grep("K", StormDamFocused$CROPDMGEXP, ignore.case=T)]<- "1000"
StormDamFocused$CROPDMGEXP[grep("M", StormDamFocused$CROPDMGEXP, ignore.case=T)]<- "1000000"
StormDamFocused$CROPDMGEXP[grep("B", StormDamFocused$CROPDMGEXP, ignore.case=T)]<- "1000000000"
StormDamFocused$CROPDMGEXP<- as.numeric(StormDamFocused$CROPDMGEXP)
#Create variable that takes property and crop damage into account
StormDamFocused$TotalDamage <- (StormDamFocused$PROPDMG * StormDamFocused$PROPDMGEXP)+
(StormDamFocused$CROPDMG * StormDamFocused$CROPDMGEXP)
str(StormDamFocused)
## 'data.frame': 194525 obs. of 7 variables:
## $ Year : num 1996 1996 1996 1996 1996 ...
## $ EVTYPE : Factor w/ 985 levels " HIGH SURF ADVISORY",..: 972 834 856 856 856 359 856 856 856 153 ...
## $ PROPDMG : num 380 100 3 5 2 400 12 8 12 75 ...
## $ PROPDMGEXP : num 1000 1000 1000 1000 1000 1000 1000 1000 1000 1000 ...
## $ CROPDMG : num 38 0 0 0 0 0 0 0 0 0 ...
## $ CROPDMGEXP : num 1000 1 1 1 1 1 1 1 1 1 ...
## $ TotalDamage: num 418000 100000 3000 5000 2000 400000 12000 8000 12000 75000 ...
##Focus on Most Damaging Event Types 3- Because we want to know about the event types that cause the most damage a subset that contains only the top 5% of most damaging event types is created.
#focus on top 5% most damaging
StormDamFocused$EVTYPE<-toupper(StormDamFocused$EVTYPE)
AggTotalDamage <- aggregate(TotalDamage ~ EVTYPE, StormDamFocused, sum)
Top5<-subset(AggTotalDamage, TotalDamage > quantile(TotalDamage, prob = 0.95))
Top5$EVTYPE<-as.factor(Top5$EVTYPE)
str(Top5)
## 'data.frame': 8 obs. of 2 variables:
## $ EVTYPE : Factor w/ 8 levels "DROUGHT","FLASH FLOOD",..: 1 2 3 4 5 6 7 8
## $ TotalDamage: num 1.44e+10 1.66e+10 1.49e+11 1.71e+10 1.46e+10 ...
Top5$EVTYPE[grep("FLASH FLOOD|FLOOD", Top5$EVTYPE)]<- "FLOOD"
Top5$EVTYPE[grep("HURRICANE|HURRICANE/TYPHOON", Top5$EVTYPE)]<- "HURRICANE"
Top5<-aggregate(TotalDamage ~ EVTYPE, Top5, sum)
##Economic Impact Results 4- A plot representing total damage is created which shows that flooding events cause the most economic damage as measured by crop damage plus property damage (in US$).
qplot(Top5$TotalDamage, Top5$EVTYPE, xlab="Total Damage (US$)", ylab="Event Type",
main="Total Damage for top 5% of most costly Event Types") #plot
##Processing of Health Related Data 1- A dataframe that focuses on on the variables needed for this analysis is created and then filtered to only include events where there were either fatalities or injuries
#Focus the dataframe on only the variables that we are using in health analysis
StormHealthFocused <- select(Storm, Year, EVTYPE, FATALITIES, INJURIES)
StormHealthFocused <- filter(StormHealthFocused,FATALITIES > 0 | INJURIES > 0)
summary(StormHealthFocused)
## Year EVTYPE FATALITIES INJURIES
## Min. :1996 LIGHTNING :2649 Min. : 0.0000 Min. : 0.000
## 1st Qu.:1999 TORNADO :1968 1st Qu.: 0.0000 1st Qu.: 0.000
## Median :2002 TSTM WIND :1628 Median : 0.0000 Median : 1.000
## Mean :2003 FLASH FLOOD : 847 Mean : 0.6841 Mean : 4.542
## 3rd Qu.:2007 EXCESSIVE HEAT : 665 3rd Qu.: 1.0000 3rd Qu.: 2.000
## Max. :2011 THUNDERSTORM WIND: 656 Max. :158.0000 Max. :1150.000
## (Other) :4351
str(StormHealthFocused)
## 'data.frame': 12764 obs. of 4 variables:
## $ Year : num 1996 1996 1996 1996 1996 ...
## $ EVTYPE : Factor w/ 985 levels " HIGH SURF ADVISORY",..: 834 153 140 140 856 464 834 834 464 856 ...
## $ FATALITIES: num 1 1 1 1 0 0 4 2 0 0 ...
## $ INJURIES : num 3 0 0 0 15 1 40 17 1 3 ...
##Health Related Data Transformation 2- A variable is created that represents the total human lives impacted as fatalities plus injuries. Event types are then grouped for similar items and aggregated. To maintain congruent processes the most harmful 5% of Event types are the turned into a subset.
StormHealthFocused$HealthImpact <- (StormHealthFocused$FATALITIES + StormHealthFocused$INJURIES)
StormHealthFocused$EVTYPE<-toupper(StormHealthFocused$EVTYPE)
StormHealthFocused$EVTYPE<- as.factor(StormHealthFocused$EVTYPE)
AggHealth<-aggregate(HealthImpact ~ EVTYPE, StormHealthFocused, sum)
Top5Health<-subset(AggHealth, HealthImpact > quantile(HealthImpact, prob = 0.95))
##Results for Health Impact 3- A plot that shows the total impact of the most harmful 5% of event types is generated. This plot shows that tornados are by far the most harmful.
qplot(Top5Health$HealthImpact, Top5Health$EVTYPE, xlab="Fatalities + Injuries", ylab="Event Type",
main="Total Health Impact for Top 5% Most Harmful Events")