First step is to get and read the data
library(dplyr)
# Download the NOAA storm data archive once and cache it locally.
# The remote file is bzip2-compressed, so it is transferred explicitly in
# binary mode ("wb") to avoid any text-mode line-ending translation.
# Note: the cached file keeps the name "Storm_Data.csv" although its content
# is still bzip2-compressed; read.csv() opens the path through file(), which
# detects the compression and decompresses it on the fly.
if (!file.exists("Storm_Data.csv")) {
  fileURL <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
  download.file(fileURL, destfile = "Storm_Data.csv", mode = "wb")
}
# Keep text columns as character vectors rather than factors on load.
storm_data <- read.csv("Storm_Data.csv", stringsAsFactors = FALSE)
Converting the event type into a categorical variable
# Treat the event type as a categorical variable (one level per distinct label).
storm_data[["EVTYPE"]] <- factor(storm_data[["EVTYPE"]])
Events are categorized into 985 different groups. For example
# Show the first six event-type labels.
head(levels(factor(storm_data[["EVTYPE"]])), n = 6L)
## [1] " HIGH SURF ADVISORY" " COASTAL FLOOD" " FLASH FLOOD"
## [4] " LIGHTNING" " TSTM WIND" " TSTM WIND (G45)"
Data is collected from 1950 to the end of 2011
# Parse the event start dates ("month/day/year hour:minute:second") and
# summarise their range.
summary(
  strptime(storm_data[["BGN_DATE"]], format = "%m/%d/%Y %H:%M:%S")
)
## Min. 1st Qu. Median
## "1950-01-03 00:00:00" "1995-04-20 00:00:00" "2002-03-18 00:00:00"
## Mean 3rd Qu. Max.
## "1998-12-27 22:49:50" "2007-07-28 00:00:00" "2011-11-30 00:00:00"
I considered the following variables from the storm data set: EVTYPE (event type), FATALITIES and INJURIES
first we group the data by event
# One group per event type, so later summaries are computed per event.
grouped_by_event <- storm_data %>%
  group_by(EVTYPE)
Then we calculate the summation of all fatalities and all injuries for each event category
# Total fatalities and total injuries per event type.
# Uses plain summarise() instead of the deprecated summarize_each()/funs();
# the result has the same columns: EVTYPE, FATALITIES, INJURIES.
event_harm <- summarise(
  grouped_by_event,
  FATALITIES = sum(FATALITIES),
  INJURIES = sum(INJURIES)
)
‘event_with_injury’ and ‘event_with_fatality’ contain the events with the highest cumulative injuries and fatalities (the top 1% of event types, i.e. those above the 99th percentile)
# Keep only the event types whose totals lie strictly above the 99th
# percentile of the per-event totals.
injury_cutoff <- quantile(event_harm$INJURIES, probs = 0.99)
event_with_injury <- event_harm[event_harm$INJURIES > injury_cutoff, ]

fatality_cutoff <- quantile(event_harm$FATALITIES, probs = 0.99)
event_with_fatality <- event_harm[event_harm$FATALITIES > fatality_cutoff, ]
Then we make a bar plot for events with the highest fatality and injury
# Two stacked panels: total injuries on top, total fatalities below.
par(mfrow = c(2, 1), mar = c(4, 4, 2, 2))
barplot(
  height = event_with_injury$INJURIES,
  names.arg = event_with_injury$EVTYPE,
  ylab = "Injury",
  cex.axis = 0.65, cex.names = 0.65, las = 2
)
# The fatality panel plots FATALITIES; it previously plotted the INJURIES
# column under a "Fatality" axis label.
barplot(
  height = event_with_fatality$FATALITIES,
  names.arg = event_with_fatality$EVTYPE,
  ylab = "Fatality",
  cex.axis = 0.65, cex.names = 0.65, las = 2
)
Also the tables show the result
# Drop the fatality column so the table shows injuries only, then print it.
event_with_injury <- select(event_with_injury, -FATALITIES)
event_with_injury
## Source: local data frame [10 x 2]
##
## EVTYPE INJURIES
## (fctr) (dbl)
## 1 EXCESSIVE HEAT 6525
## 2 FLASH FLOOD 1777
## 3 FLOOD 6789
## 4 HAIL 1361
## 5 HEAT 2100
## 6 ICE STORM 1975
## 7 LIGHTNING 5230
## 8 THUNDERSTORM WIND 1488
## 9 TORNADO 91346
## 10 TSTM WIND 6957
# Drop the injury column so the table shows fatalities only, then print it.
event_with_fatality <- select(event_with_fatality, -INJURIES)
event_with_fatality
## Source: local data frame [10 x 2]
##
## EVTYPE FATALITIES
## (fctr) (dbl)
## 1 AVALANCHE 224
## 2 EXCESSIVE HEAT 1903
## 3 FLASH FLOOD 978
## 4 FLOOD 470
## 5 HEAT 937
## 6 HIGH WIND 248
## 7 LIGHTNING 816
## 8 RIP CURRENT 368
## 9 TORNADO 5633
## 10 TSTM WIND 504
As the graphs and tables show, tornadoes caused the highest total number of fatalities and injuries
I take into account the following variables: EVTYPE, PROPDMG, PROPDMGEXP, CROPDMG and CROPDMGEXP
First, selecting the data for damage analysis
# Keep only the event type plus the damage amounts and their unit codes.
damage_columns <- c("EVTYPE", "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP")
damage_data <- storm_data[, damage_columns]
Applying the scale to the property damage and setting all units as dollar
“K” for thousands, “M” for millions, and “B” for billions dollars
# Convert damage amounts to dollars using their unit code.
# Only the exact upper-case codes "K", "M" and "B" are scaled; rows with any
# other code are left unchanged.
unit_scale <- c(K = 1e3, M = 1e6, B = 1e9)
for (unit in names(unit_scale)) {
  is_prop <- damage_data$PROPDMGEXP == unit
  damage_data$PROPDMG[is_prop] <- damage_data$PROPDMG[is_prop] * unit_scale[[unit]]

  is_crop <- damage_data$CROPDMGEXP == unit
  damage_data$CROPDMG[is_crop] <- damage_data$CROPDMG[is_crop] * unit_scale[[unit]]
}
Calculating total damage (dollar) = property damage + crop damage
# Total damage in dollars is property damage plus crop damage, row by row.
damage_data$total_damage <- damage_data$PROPDMG + damage_data$CROPDMG
Grouping the data by event and calculate the summation of total damage for each event category
# Sum the total damage (dollars) per event type.
# Uses plain summarise() instead of the deprecated summarize_each()/funs();
# the result has the same columns: EVTYPE, total_damage.
grouped_by_event_damage <- group_by(damage_data, EVTYPE)
event_damage <- summarise(
  grouped_by_event_damage,
  total_damage = sum(total_damage)
)
‘event_total_damage’ contains the events with the highest cumulative damage (the top 1% of event types, i.e. those above the 99th percentile)
# Keep only the event types whose total damage lies strictly above the
# 99th percentile of the per-event totals.
damage_cutoff <- quantile(event_damage$total_damage, probs = 0.99)
event_total_damage <- event_damage[event_damage$total_damage > damage_cutoff, ]
Plotting and table
# Single panel: total damage per event type, expressed in billions of dollars.
par(mfrow = c(1, 1))
# Divide by 1e9 (one billion). The previous divisor 10e9 equals 1e10 and
# understated every bar by a factor of ten relative to the axis label.
barplot(
  height = event_total_damage$total_damage / 1e9,
  names.arg = event_total_damage$EVTYPE,
  ylab = "Total damage (billion dollars)",
  cex.axis = 0.65, cex.names = 0.65, las = 2
)
event_total_damage
## Source: local data frame [10 x 2]
##
## EVTYPE total_damage
## (fctr) (dbl)
## 1 DROUGHT 15018672000
## 2 FLASH FLOOD 17562129167
## 3 FLOOD 150319678257
## 4 HAIL 18752904943
## 5 HURRICANE 14610229010
## 6 HURRICANE/TYPHOON 71913712800
## 7 ICE STORM 8967041360
## 8 RIVER FLOOD 10148404500
## 9 STORM SURGE 43323541000
## 10 TORNADO 57340614060
As the bar plot and table show, floods have the greatest economic consequences
Among the events that occurred across the United States from 1950 to 2011, the results of this analysis show that tornadoes caused the highest total number of fatalities and injuries, and floods had the greatest economic consequences