##Introduction The aim of this assignment is to explore the NOAA Storm Database and the relationship between storm type (EVTYPE) and public health impact and the relationship between storm type and economic impact.

The events in the database start in the year 1950 and end in November 2011. In the earlier years of the database there are generally fewer events recorded, most likely due to a lack of good records. More recent years should be considered more complete.

##Reading Raw Data

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(ggplot2)
#Load data
StormRaw <- read.csv("repdata-data-StormData.csv")

#review structure of data
str(StormRaw)
## 'data.frame':    902297 obs. of  37 variables:
##  $ STATE__   : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ BGN_DATE  : Factor w/ 16335 levels "1/1/1966 0:00:00",..: 6523 6523 4242 11116 2224 2224 2260 383 3980 3980 ...
##  $ BGN_TIME  : Factor w/ 3608 levels "00:00:00 AM",..: 272 287 2705 1683 2584 3186 242 1683 3186 3186 ...
##  $ TIME_ZONE : Factor w/ 22 levels "ADT","AKS","AST",..: 7 7 7 7 7 7 7 7 7 7 ...
##  $ COUNTY    : num  97 3 57 89 43 77 9 123 125 57 ...
##  $ COUNTYNAME: Factor w/ 29601 levels "","5NM E OF MACKINAC BRIDGE TO PRESQUE ISLE LT MI",..: 13513 1873 4598 10592 4372 10094 1973 23873 24418 4598 ...
##  $ STATE     : Factor w/ 72 levels "AK","AL","AM",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ EVTYPE    : Factor w/ 985 levels "   HIGH SURF ADVISORY",..: 834 834 834 834 834 834 834 834 834 834 ...
##  $ BGN_RANGE : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ BGN_AZI   : Factor w/ 35 levels "","  N"," NW",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ BGN_LOCATI: Factor w/ 54429 levels ""," Christiansburg",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ END_DATE  : Factor w/ 6663 levels "","1/1/1993 0:00:00",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ END_TIME  : Factor w/ 3647 levels ""," 0900CST",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ COUNTY_END: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ COUNTYENDN: logi  NA NA NA NA NA NA ...
##  $ END_RANGE : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ END_AZI   : Factor w/ 24 levels "","E","ENE","ESE",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ END_LOCATI: Factor w/ 34506 levels ""," CANTON"," TULIA",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ LENGTH    : num  14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
##  $ WIDTH     : num  100 150 123 100 150 177 33 33 100 100 ...
##  $ F         : int  3 2 2 2 2 2 2 1 3 3 ...
##  $ MAG       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ FATALITIES: num  0 0 0 0 0 0 0 0 1 0 ...
##  $ INJURIES  : num  15 0 2 2 2 6 1 0 14 0 ...
##  $ PROPDMG   : num  25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
##  $ PROPDMGEXP: Factor w/ 19 levels "","-","?","+",..: 17 17 17 17 17 17 17 17 17 17 ...
##  $ CROPDMG   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CROPDMGEXP: Factor w/ 9 levels "","?","0","2",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ WFO       : Factor w/ 542 levels ""," CI","%SD",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ STATEOFFIC: Factor w/ 250 levels "","ALABAMA, Central",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ ZONENAMES : Factor w/ 25112 levels "","                                                                                                               "| __truncated__,..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ LATITUDE  : num  3040 3042 3340 3458 3412 ...
##  $ LONGITUDE : num  8812 8755 8742 8626 8642 ...
##  $ LATITUDE_E: num  3051 0 0 0 0 ...
##  $ LONGITUDE_: num  8806 0 0 0 0 ...
##  $ REMARKS   : Factor w/ 436781 levels "","\t","\t\t",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ REFNUM    : num  1 2 3 4 5 6 7 8 9 10 ...
head(StormRaw$BGN_DATE)
## [1] 4/18/1950 0:00:00  4/18/1950 0:00:00  2/20/1951 0:00:00  6/8/1951 0:00:00  
## [5] 11/15/1951 0:00:00 11/15/1951 0:00:00
## 16335 Levels: 1/1/1966 0:00:00 1/1/1972 0:00:00 ... 9/9/2011 0:00:00

##Preprocessing In this section the year of the event is parsed into its own variable so that data can be filtered to only include 1996 and beyond.

#transform start dates to date class and parse out just the year for easier filtering
StormRaw$BGN_DATE <- as.Date(StormRaw$BGN_DATE, format="%m/%d/%Y")
StormRaw$Year <- year(StormRaw$BGN_DATE)
summary(StormRaw$Year)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1950    1995    2002    1999    2007    2011
class(StormRaw$Year)
## [1] "numeric"
#data prior to 1996 is very limited and in order to have fair comparisons 
#data inclusion dates for this analysis are from 1996-2011
Storm <- filter(StormRaw, Year >= 1996)
summary(Storm$Year)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1996    2000    2005    2004    2008    2011

##Data processing of economic impact 1- A datafrome that focuses on the variables needed for this analysis was created

#Focus the dataframe on only the variables that we are using in damage analysis 
StormDamFocused <- select(Storm, Year, EVTYPE, PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP)
StormDamFocused <- filter(StormDamFocused,PROPDMG > 0 | CROPDMG > 0 )

##Storm Damage Data Transformation 2- Damage is represented by a combination of crop damage and property damage. Both crop damage and property damage have both a based column and a exponent that needs to be transfored so that a single total cost can be calculated.

 #Fix issues related to damage class type must be character for grep to work
StormDamFocused$PROPDMGEXP <- as.character(StormDamFocused$PROPDMGEXP)
StormDamFocused$CROPDMGEXP <- as.character(StormDamFocused$CROPDMGEXP)
str(StormDamFocused) #check that class transformations were successful
## 'data.frame':    194525 obs. of  6 variables:
##  $ Year      : num  1996 1996 1996 1996 1996 ...
##  $ EVTYPE    : Factor w/ 985 levels "   HIGH SURF ADVISORY",..: 972 834 856 856 856 359 856 856 856 153 ...
##  $ PROPDMG   : num  380 100 3 5 2 400 12 8 12 75 ...
##  $ PROPDMGEXP: chr  "K" "K" "K" "K" ...
##  $ CROPDMG   : num  38 0 0 0 0 0 0 0 0 0 ...
##  $ CROPDMGEXP: chr  "K" "" "" "" ...
#transform exponent characters for property damage into numbers that can be multiplied
StormDamFocused$PROPDMGEXP[!grepl("K|M|B", StormDamFocused$PROPDMGEXP, ignore.case=T)]<- "1"
StormDamFocused$PROPDMGEXP[grep("K", StormDamFocused$PROPDMGEXP, ignore.case=T)]<- "1000"
StormDamFocused$PROPDMGEXP[grep("M", StormDamFocused$PROPDMGEXP, ignore.case=T)]<- "1000000"
StormDamFocused$PROPDMGEXP[grep("B", StormDamFocused$PROPDMGEXP, ignore.case=T)]<- "1000000000"
StormDamFocused$PROPDMGEXP<- as.numeric(StormDamFocused$PROPDMGEXP)

#transform exponents for crop damage
StormDamFocused$CROPDMGEXP[!grepl("K|M|B", StormDamFocused$CROPDMGEXP, ignore.case=T)]<- "1"
StormDamFocused$CROPDMGEXP[grep("K", StormDamFocused$CROPDMGEXP, ignore.case=T)]<- "1000"
StormDamFocused$CROPDMGEXP[grep("M", StormDamFocused$CROPDMGEXP, ignore.case=T)]<- "1000000"
StormDamFocused$CROPDMGEXP[grep("B", StormDamFocused$CROPDMGEXP, ignore.case=T)]<- "1000000000"
StormDamFocused$CROPDMGEXP<- as.numeric(StormDamFocused$CROPDMGEXP)

#Create variable that takes property and crop damage into account
StormDamFocused$TotalDamage <- (StormDamFocused$PROPDMG * StormDamFocused$PROPDMGEXP)+
  (StormDamFocused$CROPDMG * StormDamFocused$CROPDMGEXP)
str(StormDamFocused)
## 'data.frame':    194525 obs. of  7 variables:
##  $ Year       : num  1996 1996 1996 1996 1996 ...
##  $ EVTYPE     : Factor w/ 985 levels "   HIGH SURF ADVISORY",..: 972 834 856 856 856 359 856 856 856 153 ...
##  $ PROPDMG    : num  380 100 3 5 2 400 12 8 12 75 ...
##  $ PROPDMGEXP : num  1000 1000 1000 1000 1000 1000 1000 1000 1000 1000 ...
##  $ CROPDMG    : num  38 0 0 0 0 0 0 0 0 0 ...
##  $ CROPDMGEXP : num  1000 1 1 1 1 1 1 1 1 1 ...
##  $ TotalDamage: num  418000 100000 3000 5000 2000 400000 12000 8000 12000 75000 ...

##Focus on Most Damaging Event Types 3- Because we want to know about the event types that cause the most damage a subset that contains only the top 5% of most damaging event types is created.

#focus on top 5% most damaging 
StormDamFocused$EVTYPE<-toupper(StormDamFocused$EVTYPE)
AggTotalDamage <- aggregate(TotalDamage ~ EVTYPE, StormDamFocused, sum)
Top5<-subset(AggTotalDamage, TotalDamage > quantile(TotalDamage, prob = 0.95))
Top5$EVTYPE<-as.factor(Top5$EVTYPE)
str(Top5)
## 'data.frame':    8 obs. of  2 variables:
##  $ EVTYPE     : Factor w/ 8 levels "DROUGHT","FLASH FLOOD",..: 1 2 3 4 5 6 7 8
##  $ TotalDamage: num  1.44e+10 1.66e+10 1.49e+11 1.71e+10 1.46e+10 ...
Top5$EVTYPE[grep("FLASH FLOOD|FLOOD", Top5$EVTYPE)]<- "FLOOD"
Top5$EVTYPE[grep("HURRICANE|HURRICANE/TYPHOON", Top5$EVTYPE)]<- "HURRICANE"

Top5<-aggregate(TotalDamage ~ EVTYPE, Top5, sum)

##Economic Impact Results 4- A plot representing total damage is created which shows that flooding events cause the most economic damage as measured by crop damage plus property damage (in US$).

qplot(Top5$TotalDamage, Top5$EVTYPE, xlab="Total Damage (US$)", ylab="Event Type", 
    main="Total Damage for top 5% of most costly Event Types") #plot

##Processing of Health Related Data 1- A dataframe that focuses on on the variables needed for this analysis is created and then filtered to only include events where there were either fatalities or injuries

#Focus the dataframe on only the variables that we are using in health analysis
StormHealthFocused <- select(Storm, Year, EVTYPE, FATALITIES, INJURIES)
StormHealthFocused <- filter(StormHealthFocused,FATALITIES > 0 | INJURIES > 0)

summary(StormHealthFocused)
##       Year                    EVTYPE       FATALITIES          INJURIES       
##  Min.   :1996   LIGHTNING        :2649   Min.   :  0.0000   Min.   :   0.000  
##  1st Qu.:1999   TORNADO          :1968   1st Qu.:  0.0000   1st Qu.:   0.000  
##  Median :2002   TSTM WIND        :1628   Median :  0.0000   Median :   1.000  
##  Mean   :2003   FLASH FLOOD      : 847   Mean   :  0.6841   Mean   :   4.542  
##  3rd Qu.:2007   EXCESSIVE HEAT   : 665   3rd Qu.:  1.0000   3rd Qu.:   2.000  
##  Max.   :2011   THUNDERSTORM WIND: 656   Max.   :158.0000   Max.   :1150.000  
##                 (Other)          :4351
str(StormHealthFocused)
## 'data.frame':    12764 obs. of  4 variables:
##  $ Year      : num  1996 1996 1996 1996 1996 ...
##  $ EVTYPE    : Factor w/ 985 levels "   HIGH SURF ADVISORY",..: 834 153 140 140 856 464 834 834 464 856 ...
##  $ FATALITIES: num  1 1 1 1 0 0 4 2 0 0 ...
##  $ INJURIES  : num  3 0 0 0 15 1 40 17 1 3 ...

##Health Related Data Transformation 2- A variable is created that represents the total human lives impacted as fatalities plus injuries. Event types are then grouped for similar items and aggregated. To maintain congruent processes the most harmful 5% of Event types are the turned into a subset.

StormHealthFocused$HealthImpact <- (StormHealthFocused$FATALITIES + StormHealthFocused$INJURIES) 

StormHealthFocused$EVTYPE<-toupper(StormHealthFocused$EVTYPE)

StormHealthFocused$EVTYPE<- as.factor(StormHealthFocused$EVTYPE)


AggHealth<-aggregate(HealthImpact ~ EVTYPE, StormHealthFocused, sum)

Top5Health<-subset(AggHealth, HealthImpact > quantile(HealthImpact, prob = 0.95))

##Results for Health Impact 3- A plot that shows the total impact of the most harmful 5% of event types is generated. This plot shows that tornados are by far the most harmful.

qplot(Top5Health$HealthImpact, Top5Health$EVTYPE, xlab="Fatalities + Injuries", ylab="Event Type", 
    main="Total Health Impact for Top 5% Most Harmful Events")