Synopsis

Data processing

This analysis depends on the following CRAN packages: plyr, dplyr, reshape2, ggplot2 and knitr.

# attaching libraries
sapply(c("plyr", "dplyr", "reshape2", "ggplot2", "knitr"), 
       library, character.only= T)

Data file download

# Fetch the raw storm data into a subdirectory of the current working
# directory, then load it into the NOAAWeatherDf data frame.
data_dir <- "RepResProj2"
if (!file.exists(data_dir)) {
  dir.create(data_dir)
}
url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
# The archive is bzip2-compressed, so store it under a matching extension
# rather than ".zip".
data_file <- file.path(data_dir, "StormData.csv.bz2")
# Skip the download when a local copy already exists, so re-running the
# analysis does not fetch the archive again.
if (!file.exists(data_file)) {
  # mode = "wb" keeps the binary archive intact on every platform; the default
  # download method is used so no external curl binary is required.
  download.file(url, destfile = data_file, mode = "wb")
}
# read.csv() opens the compressed file transparently.
# "?" entries are read as NA.
# stringsAsFactors = TRUE is spelled out because the rest of the analysis
# calls levels() on text columns, and the default became FALSE in R 4.0.
NOAAWeatherDf <- read.csv(data_file, header = TRUE, na.strings = "?",
                          stringsAsFactors = TRUE)

Preliminary overview of the data set

# how many types of weather events?
# factor() makes this work whether EVTYPE was read as a factor or as plain
# character strings; levels() alone returns NULL for a character column.
event_types <- levels(factor(NOAAWeatherDf$EVTYPE))
length(event_types)
## [1] 984
# quick look at the start and end of the weather events' set, sorted after
# stripping leading whitespace from the labels.
trimmed_event_types <- sort(sub("^\\s*", "", event_types))
head(trimmed_event_types, n = 1)
## [1] "ABNORMAL WARMTH"
head(rev(trimmed_event_types), n = 2)
## [1] "WND"        "WINTRY MIX"
# time frame of the data collection
range(as.Date(NOAAWeatherDf$BGN_DATE, "%m/%d/%Y"))
## [1] "1950-01-03" "2011-11-30"

Ranking weather events by their effect on population.

# Total fatalities and injuries recorded for each event type.
EventOnPopulationDf <- ddply(
  NOAAWeatherDf,
  .variables = "EVTYPE",
  .fun = summarize,
  Deaths = sum(FATALITIES),
  Injuries = sum(INJURIES)
)
# Summary statistics of the two numeric columns (Deaths and Injuries).
kable(summary(EventOnPopulationDf)[, 2:3])
Deaths Injuries
Min. : 0.00 Min. : 0.0
1st Qu.: 0.00 1st Qu.: 0.0
Median : 0.00 Median : 0.0
Mean : 15.38 Mean : 142.7
3rd Qu.: 0.00 3rd Qu.: 0.0
Max. :5633.00 Max. :91346.0
NA NA
# Share of all recorded deaths attributable to the 20 deadliest event types.
with(
  arrange(EventOnPopulationDf, desc(Deaths)),
  sum(head(Deaths, n = 20)) / sum(Deaths)
)
## [1] 0.8922417
  • The cumulative number of deaths and injuries per event is highly skewed.
  • The vast majority of the referenced 984 weather events are not reported to cause either deaths or injuries.
  • 20 events account for ~90% of all deaths caused by extreme weather.
# The 20 event types with the highest cumulative death toll.
SevereEvents <- EventOnPopulationDf %>%
  arrange(desc(Deaths)) %>%
  head(n = 20) %>%
  .[[1]]
# Records of the full data set that belong to those 20 event types.
NOAAWeatherDf_subset1 <- NOAAWeatherDf %>%
  filter(EVTYPE %in% SevereEvents)
# Only the three attributes needed for the plot.
EvtypePopHealthDf <- NOAAWeatherDf_subset1 %>%
  select(EVTYPE, FATALITIES, INJURIES)
# Long (tidy) layout: one row per record and outcome.
EvtypePopHealthDfMelted <- melt(
  EvtypePopHealthDf,
  variable.name = "Outcome",
  value.name = "Occurrences"
)
## Using EVTYPE as id variables

Ranking weather events by their effect on economic activities.

# Value ranges of the raw property and crop damage figures.
range(NOAAWeatherDf[["PROPDMG"]])
## [1]    0 5000
range(NOAAWeatherDf[["CROPDMG"]])
## [1]   0 990
# Exponent codes that accompany the property damage figures.
levels(NOAAWeatherDf[["PROPDMGEXP"]])
##  [1] ""  "-" "+" "0" "1" "2" "3" "4" "5" "6" "7" "8" "B" "h" "H" "K" "m"
## [18] "M"
# Keep only the records that report some property or crop damage.
NOAAWeatherDf_subset2 <- filter(NOAAWeatherDf, PROPDMG > 0 | CROPDMG > 0)
# Named vector mapping the character codes of the PROPDMGEXP and CROPDMGEXP
# attributes to the actual numeric multiplier.
multiplier <- c("H" = 100, "K" = 1000, "M" = 1000000, "B" = 1000000000)
# Translate a vector of exponent codes into numeric multipliers.
# Codes are matched case-insensitively. Any code without an entry in
# `multiplier` (empty string, digits, "+", "-", NA) gets a multiplier of 1,
# i.e. the recorded amount is taken at face value.
# NOTE(review): confirm that face value is the intended reading of those codes.
# Without this fallback the lookup yields NA, and one NA row turns the whole
# per-event total into NA.
exp_to_multiplier <- function(exp_code) {
  factors <- unname(multiplier[toupper(as.character(exp_code))])
  factors[is.na(factors)] <- 1
  factors
}
# Two new variables holding the actual $ values of the property and crop
# damages.
NOAAWeatherDf_subset2 <-
  mutate(NOAAWeatherDf_subset2,
         PropDamage = PROPDMG * exp_to_multiplier(PROPDMGEXP),
         CropDamage = CROPDMG * exp_to_multiplier(CROPDMGEXP))

# Total property damage per event type.
# na.rm = TRUE: a single NA damage value would otherwise turn the whole event
# total into NA. That happens when an exponent code has no entry in the
# multiplier lookup. arrange() sorts NA totals last, which silently removes
# the event from the ranking.
EventOnPropertiesDf <- ddply(NOAAWeatherDf_subset2, .(EVTYPE),
                             summarize,
                             Cost = sum(PropDamage, na.rm = TRUE))

# selecting the top 20 for subsequent barchart
EventOnPropertiesDf_top20 <- head(arrange(EventOnPropertiesDf,
                                          desc(Cost)), n = 20)

# Total crop damage per event type.
# na.rm = TRUE: a single NA damage value would otherwise turn the whole event
# total into NA. That happens when an exponent code has no entry in the
# multiplier lookup. arrange() sorts NA totals last, which silently removes
# the event from the ranking.
EventOnCropsDf <- ddply(NOAAWeatherDf_subset2, .(EVTYPE),
                        summarize,
                        Cost = sum(CropDamage, na.rm = TRUE))

# selecting the top 20 for subsequent barchart
EventOnCropsDf_top20 <- head(arrange(EventOnCropsDf,
                                     desc(Cost)), n = 20)

Results

Bar chart displaying the cumulative number of deaths and injuries for the top 20 most harmful weather events

# Sum the per-record counts first. The melted data holds one row per record,
# and dodged identity bars do not stack: rows of the same event and outcome
# are drawn on top of each other, so the bar shows the largest single record
# instead of the cumulative total.
EvtypePopHealthTotals <- ddply(EvtypePopHealthDfMelted, .(EVTYPE, Outcome),
                               summarize, Occurrences = sum(Occurrences))

# Event types are ordered by their combined total (fatalities + injuries).
ggplot(EvtypePopHealthTotals,
       aes(reorder(factor(EVTYPE), Occurrences, FUN = sum), Occurrences,
           fill = Outcome)) +
        geom_bar(stat = "identity", position = position_dodge()) +
        ggtitle("Fatalities and injuries caused by weather events") +
        labs(x = "Weather Event Type") +
        theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 9))

# Table of the 20 event types with the most deaths.
EventOnPopulationDf %>%
  arrange(desc(Deaths)) %>%
  head(n = 20) %>%
  kable()
EVTYPE Deaths Injuries
TORNADO 5633 91346
EXCESSIVE HEAT 1903 6525
FLASH FLOOD 978 1777
HEAT 937 2100
LIGHTNING 816 5230
TSTM WIND 504 6957
FLOOD 470 6789
RIP CURRENT 368 232
HIGH WIND 248 1137
AVALANCHE 224 170
WINTER STORM 206 1321
RIP CURRENTS 204 297
HEAT WAVE 172 309
EXTREME COLD 160 231
THUNDERSTORM WIND 133 1488
HEAVY SNOW 127 1021
EXTREME COLD/WIND CHILL 125 24
STRONG WIND 103 280
BLIZZARD 101 805
HIGH SURF 101 152

Bar chart displaying the amount of property damage for the top 20 most damaging weather events

# Bars are ordered by cost, like the fatalities/injuries chart, instead of
# alphabetically by event name. factor() drops event types that are not part
# of the top 20 before reordering.
ggplot(EventOnPropertiesDf_top20,
       aes(reorder(factor(EVTYPE), Cost), Cost)) +
        geom_bar(stat = "identity") +
        ggtitle("Property damages caused by weather events") +
        labs(x = "Weather Event Type", y = "Cost (USD)") +
        theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 9))

# Table of the 20 costliest event types for property; this is the same
# selection that was stored for the bar chart.
kable(EventOnPropertiesDf_top20)
EVTYPE Cost
HURRICANE/TYPHOON 69305840000
STORM SURGE 43323536000
STORM SURGE/TIDE 4641188000
HURRICANE OPAL 3172846000
HEAVY RAIN/SEVERE WEATHER 2500000000
TORNADOES, TSTM WIND, HAIL 1600000000
BLIZZARD 659213950
WILD FIRES 624100000
TYPHOON 600230000
LANDSLIDE 324596000
HAILSTORM 241000000
COASTAL FLOOD 237665560
TSUNAMI 144062000
COASTAL FLOODING 126640500
HIGH WINDS/COLD 110500000
River Flooding 106155000
MAJOR FLOOD 105000000
WILDFIRES 100500000
HURRICANE OPAL/HIGH WINDS 100000000
HIGH SURF 89575000

Bar chart displaying the amount of crop damage for the top 20 most damaging weather events

# Bars are ordered by cost, like the fatalities/injuries chart, instead of
# alphabetically by event name. factor() drops event types that are not part
# of the top 20 before reordering.
ggplot(EventOnCropsDf_top20,
       aes(reorder(factor(EVTYPE), Cost), Cost)) +
        geom_bar(stat = "identity") +
        ggtitle("Crop damages caused by weather events") +
        labs(x = "Weather Event Type", y = "Cost (USD)") +
        theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 9))

# Table of the 20 costliest event types for crops; this is the same selection
# that was stored for the bar chart.
kable(EventOnCropsDf_top20)
EVTYPE Cost
FROST/FREEZE 1094086000
EXCESSIVE WETNESS 142000000
FLOOD/RAIN/WINDS 112800000
COLD AND WET CONDITIONS 66000000
Early Frost 42000000
Damaging Freeze 34130000
AGRICULTURAL FREEZE 28820000
UNSEASONABLY COLD 25042500
Extreme Cold 20000000
TROPICAL STORM JERRY 16000000
HARD FREEZE 13100000
Freeze 10500000
HURRICANE OPAL/HIGH WINDS 10000000
UNSEASONAL RAIN 10000000
HIGH WINDS/COLD 7000000
Unseasonable Cold 5100000
COOL AND WET 5000000
WINTER STORM HIGH WINDS 5000000
TORNADOES, TSTM WIND, HAIL 2500000
Heavy Rain/High Surf 1500000

1 the National Oceanic and Atmospheric Administration.
2 the National Weather Service.