Data Processing

The first step is to download (if necessary) and read the data.

library(dplyr)
# Fetch the storm data only when no local copy exists, then load it.
# NOTE(review): the download is a bzip2 archive saved under a ".csv" name;
# read.csv() is expected to decompress it transparently -- confirm.
storm_file <- "Storm_Data.csv"
fileURL <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
if (!file.exists(storm_file)) {
  download.file(url = fileURL, destfile = storm_file)
}
storm_data <- read.csv(storm_file, stringsAsFactors = FALSE)

Convert the event type into a categorical variable (factor).

# Store the event type as a factor so each distinct label becomes one level.
storm_data[["EVTYPE"]] <- as.factor(storm_data[["EVTYPE"]])

Events are categorized into 985 different groups. For example, the first few levels are:

# EVTYPE is already a factor at this point, so its levels can be read directly.
head(levels(storm_data$EVTYPE), n = 6L)
## [1] "   HIGH SURF ADVISORY" " COASTAL FLOOD"        " FLASH FLOOD"         
## [4] " LIGHTNING"            " TSTM WIND"            " TSTM WIND (G45)"

The data cover events from January 1950 to the end of November 2011.

# Parse the event start timestamps and summarise the period they span.
begin_dates <- strptime(storm_data$BGN_DATE, format = "%m/%d/%Y %H:%M:%S")
summary(begin_dates)
##                  Min.               1st Qu.                Median 
## "1950-01-03 00:00:00" "1995-04-20 00:00:00" "2002-03-18 00:00:00" 
##                  Mean               3rd Qu.                  Max. 
## "1998-12-27 22:49:50" "2007-07-28 00:00:00" "2011-11-30 00:00:00"

part 1: The most harmful events with respect to population health across the United States

I considered the following variables from the storm data set: EVTYPE (event type), FATALITIES, and INJURIES.

First, we group the data by event type.

# One group per event type, ready for per-event aggregation.
grouped_by_event <- storm_data %>%
  group_by(EVTYPE)

Then we calculate the summation of all fatalities and all injuries for each event category

# Total fatalities and total injuries per event type.
# A plain summarise() with named sums is used instead of
# summarize_each()/funs(), which dplyr has deprecated; the result keeps the
# same columns (EVTYPE, FATALITIES, INJURIES) in the same order.
event_harm <- summarise(
  grouped_by_event,
  FATALITIES = sum(FATALITIES),
  INJURIES = sum(INJURIES)
)

‘event_with_injury’ and ‘event_with_fatality’ contain the event types with the highest cumulative injuries and fatalities, i.e. those whose totals lie above the 99th percentile (the top 1% of event types).

# Keep only the event types whose totals exceed the 99th percentile.
injury_cutoff <- quantile(event_harm$INJURIES, probs = 0.99)
fatality_cutoff <- quantile(event_harm$FATALITIES, probs = 0.99)

event_with_injury <- event_harm[event_harm$INJURIES > injury_cutoff, ]
event_with_fatality <- event_harm[event_harm$FATALITIES > fatality_cutoff, ]

Then we make a bar plot for events with the highest fatality and injury

# Stack two bar charts in one figure: injuries on top, fatalities below.
par(mfrow = c(2, 1), mar = c(4, 4, 2, 2))

# Total injuries for the event types above the 99th percentile of injuries.
barplot(
  height = event_with_injury$INJURIES,
  names.arg = event_with_injury$EVTYPE,
  ylab = "Injury",
  cex.axis = 0.65,
  cex.names = 0.65,
  las = 2
)

# Total fatalities for the event types above the 99th percentile of
# fatalities. The bar heights must come from FATALITIES so that they match
# the "Fatality" axis label (plotting INJURIES here would mislabel the chart).
barplot(
  height = event_with_fatality$FATALITIES,
  names.arg = event_with_fatality$EVTYPE,
  ylab = "Fatality",
  cex.axis = 0.65,
  cex.names = 0.65,
  las = 2
)


Also the tables show the result

# Show only the injury totals: drop the fatality column before printing.
event_with_injury <- select(event_with_injury, -FATALITIES)
event_with_injury
## Source: local data frame [10 x 2]
## 
##               EVTYPE INJURIES
##               (fctr)    (dbl)
## 1     EXCESSIVE HEAT     6525
## 2        FLASH FLOOD     1777
## 3              FLOOD     6789
## 4               HAIL     1361
## 5               HEAT     2100
## 6          ICE STORM     1975
## 7          LIGHTNING     5230
## 8  THUNDERSTORM WIND     1488
## 9            TORNADO    91346
## 10         TSTM WIND     6957
# Show only the fatality totals: drop the injury column before printing.
event_with_fatality <- select(event_with_fatality, -INJURIES)
event_with_fatality
## Source: local data frame [10 x 2]
## 
##            EVTYPE FATALITIES
##            (fctr)      (dbl)
## 1       AVALANCHE        224
## 2  EXCESSIVE HEAT       1903
## 3     FLASH FLOOD        978
## 4           FLOOD        470
## 5            HEAT        937
## 6       HIGH WIND        248
## 7       LIGHTNING        816
## 8     RIP CURRENT        368
## 9         TORNADO       5633
## 10      TSTM WIND        504

As the graphs and tables show, tornadoes caused the highest total number of both fatalities and injuries.

part 2: Events with greatest economic consequences across the United States

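I take into account the following variables: EVTYPE (event type), PROPDMG and PROPDMGEXP (property damage and its magnitude code), and CROPDMG and CROPDMGEXP (crop damage and its magnitude code).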

First, selecting the data for damage analysis

# Keep only the event type and the damage amounts with their magnitude codes.
damage_data <- storm_data %>%
  select(EVTYPE, PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP)

Apply the magnitude code to the property and crop damage so that all amounts are expressed in dollars:
“K” stands for thousands, “M” for millions, and “B” for billions of dollars.

# Dollar multipliers for the magnitude codes: "K" thousands, "M" millions,
# "B" billions.
damage_multiplier <- c(K = 1e3, M = 1e6, B = 1e9)

# Convert damage amounts to dollars using their magnitude codes.
#   amount:   numeric vector of damage figures.
#   exponent: vector of magnitude codes, one per amount.
# Codes are matched case-insensitively so that "k"/"m"/"b" rows are not
# silently left unscaled (assumes lowercase codes mean the same units as
# their uppercase forms -- TODO confirm against the data documentation).
# Any other code, including the empty string, leaves the amount unchanged.
scale_to_dollars <- function(amount, exponent) {
  multiplier <- unname(damage_multiplier[toupper(exponent)])
  multiplier[is.na(multiplier)] <- 1
  amount * multiplier
}

damage_data$PROPDMG <- scale_to_dollars(
  damage_data$PROPDMG,
  damage_data$PROPDMGEXP
)
damage_data$CROPDMG <- scale_to_dollars(
  damage_data$CROPDMG,
  damage_data$CROPDMGEXP
)

Calculating total damage (dollar) = property damage + crop damage

# Total damage per record = property damage + crop damage (both in dollars).
damage_data$total_damage <- damage_data$PROPDMG + damage_data$CROPDMG

Grouping the data by event and calculate the summation of total damage for each event category

# Sum the total damage within each event type.
# A plain summarise() is used instead of summarize_each()/funs(), which dplyr
# has deprecated; the result keeps the columns EVTYPE and total_damage.
grouped_by_event_damage <- group_by(damage_data, EVTYPE)
event_damage <- summarise(
  grouped_by_event_damage,
  total_damage = sum(total_damage)
)

‘event_total_damage’ contains the event types with the highest cumulative damage, i.e. those whose totals lie above the 99th percentile (the top 1% of event types).

# Keep only the event types whose total damage exceeds the 99th percentile.
damage_cutoff <- quantile(event_damage$total_damage, probs = 0.99)
event_total_damage <- event_damage[event_damage$total_damage > damage_cutoff, ]

Plotting and table

# Return to a single-panel layout for the damage chart.
par(mfrow = c(1, 1))

# Divide by 1e9 (one billion) so the bar heights match the axis label.
# Note that 10e9 is 1e10, which would shrink every bar by a factor of ten.
barplot(
  height = event_total_damage$total_damage / 1e9,
  names.arg = event_total_damage$EVTYPE,
  ylab = "Total damage (billion dollars)",
  cex.axis = 0.65,
  cex.names = 0.65,
  las = 2
)


# Print the same totals (in dollars) as a table.
event_total_damage
## Source: local data frame [10 x 2]
## 
##               EVTYPE total_damage
##               (fctr)        (dbl)
## 1            DROUGHT  15018672000
## 2        FLASH FLOOD  17562129167
## 3              FLOOD 150319678257
## 4               HAIL  18752904943
## 5          HURRICANE  14610229010
## 6  HURRICANE/TYPHOON  71913712800
## 7          ICE STORM   8967041360
## 8        RIVER FLOOD  10148404500
## 9        STORM SURGE  43323541000
## 10           TORNADO  57340614060

As the bar plot and table show, floods have the greatest economic consequences.

Result

Among the events that occurred across the United States from 1950 to 2011, the results of this analysis show that tornadoes caused the highest total number of fatalities and injuries, and floods had the greatest economic consequences.