Synopsis:-

NOAA Storm Data was analysed to identify the event types that cause the greatest economic losses and those that have the greatest impact on population health, in order to help decision-making authorities be better prepared in future. The focus is on the top ten events in each category, which should help in prioritising resources.

Data Processing :-

Loaded the required packages

library(R.utils)
library(plyr);library(dplyr)
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:plyr':
## 
##     arrange, desc, failwith, id, mutate, summarise, summarize
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(reshape)
## 
## Attaching package: 'reshape'
## 
## The following objects are masked from 'package:plyr':
## 
##     rename, round_any

Unzipping the raw file and loading the dataset with desired columns

# Decompress the archive into a CSV alongside it. The .bz2 is kept
# (remove = FALSE) and any previously extracted CSV is overwritten.
bunzip2(
  "repdata_data_StormData.csv.bz2",
  "repdata_data_StormData.csv",
  overwrite = TRUE,
  remove = FALSE
)

# Column spec for read.csv: "NULL" skips a column, NA lets the type be
# inferred. Only positions 8 and 23-28 are read (EVTYPE through CROPDMGEXP in
# the preview printed below). The spec has 30 entries; read.csv recycles an
# unnamed colClasses vector when the file has more columns than that.
col_spec <- rep("NULL", 30)
col_spec[c(8, 23:28)] <- NA
mydf1 <- read.csv(
  "repdata_data_StormData.csv",
  stringsAsFactors = FALSE,
  colClasses = col_spec
)

# Wrap in dplyr's table class for the grouped summaries that follow.
mydf2 <- tbl_df(mydf1)
head(mydf2)
## Source: local data frame [6 x 7]
## 
##    EVTYPE FATALITIES INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP
## 1 TORNADO          0       15    25.0          K       0           
## 2 TORNADO          0        0     2.5          K       0           
## 3 TORNADO          0        2    25.0          K       0           
## 4 TORNADO          0        2     2.5          K       0           
## 5 TORNADO          0        2     2.5          K       0           
## 6 TORNADO          0        6     2.5          K       0

To address the first question:-

“Across the United States, which types of events (as indicated in the EVTYPE variable) are most harmful with respect to population health?”

First the records were grouped by event type and summarised: injuries and fatalities were summed per event. The totals were then arranged in descending order to find the top 10 events. The plot is a stacked bar chart showing the contribution of both injuries and fatalities to the total impact.

# Per-event-type totals: fatalities, injuries, and the two combined.
by_evt <- group_by(mydf2, EVTYPE)
sumInj <- summarize(
  by_evt,
  sum(FATALITIES),
  sum(INJURIES),
  sum(FATALITIES + INJURIES)
)

# Exclude the "?" event type, then replace the auto-generated summary column
# names (columns 2-4) with readable ones in a single assignment.
sumInj1 <- filter(sumInj, EVTYPE != "?")
names(sumInj1)[2:4] <- c("Fatalities", "Injuries", "T.Injuries")

# Event with maximum impact

sumInj1[with(sumInj1, which(T.Injuries == max(T.Injuries))), ]
## Source: local data frame [1 x 4]
## 
##      EVTYPE Fatalities Injuries T.Injuries
## 825 TORNADO       5633    91346      96979
# Rank event types by combined casualties and keep the ten worst. head() is
# used rather than [1:10, ] so a table with fewer than ten rows is returned
# as-is instead of being padded with NA rows.
mydf3 <- arrange(sumInj1, desc(T.Injuries))
mydf4 <- head(mydf3, 10)

# Reshape to long form (one row per event type and casualty kind) for the
# stacked bar chart. The id and measure columns are named explicitly so that
# only Fatalities and Injuries are melted; the T.Injuries total is left out by
# construction rather than by slicing the first 20 rows of the result.
mydf5 <- melt(
  as.data.frame(mydf4),
  id.vars = "EVTYPE",
  measure.vars = c("Fatalities", "Injuries")
)
mydf6 <- mydf5
head(mydf6)
##           EVTYPE   variable value
## 1        TORNADO Fatalities  5633
## 2 EXCESSIVE HEAT Fatalities  1903
## 3      TSTM WIND Fatalities   504
## 4          FLOOD Fatalities   470
## 5      LIGHTNING Fatalities   816
## 6           HEAT Fatalities   937

To address the second question:-

Across the United States, which types of events have the greatest economic consequences?

First the property and crop damage exponent variables were recoded from categorical codes to numeric multipliers. The plyr package was used for the recoding. The codes were recoded as follows:

“K” = 1,000, “k” = 1,000, “M” = 1,000,000, “” = 0, “B” = 1,000,000,000, “m” = 1,000,000, “+” = 1, “0” = 0, “5” = 100,000, “6” = 1,000,000, “?” = 0, “4” = 10,000, “2” = 100, “3” = 1,000, “h” = 100, “7” = 10,000,000, “H” = 100, “-” = 0, “1” = 10, “8” = 100,000,000.

library(plyr)
u1 <- unique(mydf2$PROPDMGEXP)
u2 <- unique(mydf2$CROPDMGEXP)
u1
##  [1] "K" "M" ""  "B" "m" "+" "0" "5" "6" "?" "4" "2" "3" "h" "7" "H" "-"
## [18] "1" "8"
u2
## [1] ""  "M" "K" "m" "B" "?" "0" "k" "2"

# Exponent code -> multiplier, written out explicitly and shared by both
# damage columns. Each code is paired with its multiplier on the same row, so
# the recoding no longer depends on the order in which unique() happens to
# return the codes. The same table applies to property and crop damage, which
# makes "0" map to 0 in both columns, as in the documented recoding.
exp_table <- matrix(
  c(
    "",  "0",
    "-", "0",
    "?", "0",
    "+", "1",
    "0", "0",
    "1", "10",
    "2", "100",
    "3", "1000",
    "4", "10000",
    "5", "100000",
    "6", "1000000",
    "7", "10000000",
    "8", "100000000",
    "h", "100",
    "H", "100",
    "k", "1000",
    "K", "1000",
    "m", "1000000",
    "M", "1000000",
    "B", "1000000000"
  ),
  ncol = 2,
  byrow = TRUE
)

# Fail loudly if the data contains a code the table does not cover, instead
# of letting an unmapped code slip through as.numeric() below.
stopifnot(all(c(u1, u2) %in% exp_table[, 1]))

# Translate a vector of exponent codes into numeric multipliers.
# warn_missing = FALSE because not every code occurs in every column.
recode_exp <- function(codes) {
  as.numeric(
    mapvalues(
      codes,
      from = exp_table[, 1],
      to = exp_table[, 2],
      warn_missing = FALSE
    )
  )
}

mydf2$PROPDMGEXP1 <- recode_exp(mydf2$PROPDMGEXP)
mydf2$CROPDMGEXP1 <- recode_exp(mydf2$CROPDMGEXP)

First the records were grouped by event type and summarised: property damage and crop damage were summed per event. The totals were then arranged in descending order to find the top 10 events. The plot is a stacked bar chart showing the contribution of both property damage and crop damage to the total impact.

# Per-event-type dollar totals: each damage figure is scaled by its recoded
# multiplier, then summed for property, for crops, and for both combined.
by_evt1 <- group_by(mydf2, EVTYPE)
sumeco <- summarize(
  by_evt1,
  sum((PROPDMG * PROPDMGEXP1)),
  sum((CROPDMG * CROPDMGEXP1)),
  sum((PROPDMG * PROPDMGEXP1) + (CROPDMG * CROPDMGEXP1))
)

# Exclude the "?" event type, then rename the three summary columns
# (columns 2-4) in a single assignment.
sumeco1 <- filter(sumeco, EVTYPE != "?")
names(sumeco1)[2:4] <- c("PropDamage", "CropDamage", "T.Damag")
max(sumeco1$T.Damag)
## [1] 1.503e+11
# Event with maximum impact

sumeco1[with(sumeco1, which(T.Damag == max(T.Damag))), ]
## Source: local data frame [1 x 4]
## 
##     EVTYPE PropDamage CropDamage   T.Damag
## 166  FLOOD  1.447e+11  5.662e+09 1.503e+11
# Rank event types by total damage and keep the ten costliest. head() is used
# rather than [1:10, ] so a table with fewer than ten rows is returned as-is
# instead of being padded with NA rows.
mydf7 <- arrange(sumeco1, desc(T.Damag))
mydf8 <- head(mydf7, 10)

# Reshape to long form (one row per event type and damage kind) for the
# stacked bar chart. The id and measure columns are named explicitly so that
# only PropDamage and CropDamage are melted; the T.Damag total is left out by
# construction rather than by slicing the first 20 rows of the result.
mydf9 <- melt(
  as.data.frame(mydf8),
  id.vars = "EVTYPE",
  measure.vars = c("PropDamage", "CropDamage")
)
mydf10 <- mydf9
head(mydf10)
##              EVTYPE   variable     value
## 1             FLOOD PropDamage 1.447e+11
## 2 HURRICANE/TYPHOON PropDamage 6.931e+10
## 3           TORNADO PropDamage 5.695e+10
## 4       STORM SURGE PropDamage 4.332e+10
## 5              HAIL PropDamage 1.574e+10
## 6       FLASH FLOOD PropDamage 1.682e+10
# Stacked bar chart of casualties per event type: one bar per EVTYPE, split
# by the melted `variable` column. Axis labels are rotated 90 degrees so the
# event names stay legible.
a <- ggplot(data = mydf6, aes(x = EVTYPE, y = value, fill = variable)) +
  geom_bar(stat = "identity", alpha = 0.8) +
  xlab("Event Type") +
  ylab("Number of People Affected") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
  ggtitle("Top Ten Natural Events Harmful to Population ") +
  scale_fill_manual(values = c("#599ad3", "#f9a65a"))
a

plot of chunk unnamed-chunk-6

# Stacked bar chart of dollar damage per event type: one bar per EVTYPE,
# split by the melted `variable` column. Axis labels are rotated 90 degrees
# so the event names stay legible.
b <- ggplot(data = mydf10, aes(x = EVTYPE, y = value, fill = variable)) +
  geom_bar(stat = "identity", alpha = 0.8) +
  xlab("Event Type") +
  ylab("Damage in $") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
  ggtitle("Events causing the Maximum property and Crop Damage") +
  scale_fill_manual(values = c("#599ad3", "#f9a65a"))
b

plot of chunk unnamed-chunk-7

Result and conclusion:-

The plots clearly show that tornadoes cause the maximum impact on population health, whereas floods cause the maximum economic damage.

Data source: NOAA. The events in the database start in the year 1950 and end in November 2011.