Synopsis:

In this report, we’ll analyze the National Weather Service Storm Data to assess the effects of weather events on human health and the damage they cause to property and crops. The original data set contains records from 1950 to 2011.

However, the analysis is restricted to data from 1996 to 2011, because data collection has been better and more comprehensive since 1996.

Health Effects: Tornadoes cause the most injuries; excessive heat causes the most fatalities.

Property Damage: Floods cause the most property damage; drought causes the most crop damage.

Data Processing

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(knitr)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
library(reshape2)
# Load the raw storm data. The file is expected in the current working
# directory; bzfile() lets read.csv() read the bzip2-compressed CSV directly.
storm <- read.csv(
  file = bzfile(description = "repdata-data-StormData.csv.bz2")
)

Let us explore the uploaded data set.

# Size of the raw data set as (rows, columns).
c(nrow(storm), ncol(storm))
## [1] 902297     37

The data set contains 902297 rows and 37 columns.

# Parse the event start timestamps (month/day/year hour:minute:second) and
# derive the calendar year of each record.
storm$BGN_DATE <- mdy_hms(storm$BGN_DATE)
storm$year <- year(storm$BGN_DATE)

# Histogram of records per year, requesting as many breaks as there are
# distinct years in the data.
hist(
  storm$year,
  breaks = length(unique(storm$year)),
  main = "Number of data points by year in Storm Data",
  xlab = "Year",
  ylab = ""
)

The histogram shows that far fewer observations were recorded before 1994-95 than in later years.
Also, based on the information in this NOAA page, all event types were recorded only from 1996 onwards, so we’ll confine our analysis to the period starting in 1996.

# Keep only records from 1996 onwards, then report the size of what is left.
storm <- storm %>%
  filter(year >= 1996)
dim(storm)
## [1] 653530     38

The new data set contains 653530 observations, which is more than 70% of the initial number of observations.

The monetary damage data is provided as a numeric value plus a separate column giving its magnitude, e.g., 1.55B for $1,550,000,000. The alphabetical characters used to signify magnitude include “K” for thousands, “M” for millions, and “B” for billions.

Results

Events most harmful to human health

# Total fatalities and injuries per event type.
#
# The totals are computed with group_by()/summarise(), which yields plain
# numeric columns. The earlier tapply() + add_rownames() approach stored 1-d
# arrays in the data frame; after melt() their "dim" attribute no longer
# matched the column length and had to be patched by hand before ggplot would
# accept the data. add_rownames() is also deprecated in dplyr.
health <- data.frame(
  Event_Type = as.character(storm$EVTYPE),
  Fatalities = storm$FATALITIES,
  Injuries = storm$INJURIES,
  stringsAsFactors = FALSE
)
harmful <- summarise(
  group_by(health, Event_Type),
  Fatalities = sum(Fatalities),
  Injuries = sum(Injuries)
)
harmful <- as.data.frame(harmful)

# Rank event types by combined casualties (fatalities + injuries).
harmful <- mutate(harmful, Total.Harmful = Fatalities + Injuries)
harmful <- arrange(harmful, desc(Total.Harmful))

# Keep the ten highest-ranked event types; the helper total is only needed
# for ranking, so it is dropped before reshaping.
Top10harmful <- select(head(harmful, 10), -Total.Harmful)

# Long format: one row per (event type, measure) pair for a dodged bar chart.
mTop10harmful <- melt(
  Top10harmful,
  id.vars = "Event_Type",
  value.name = "Total"
)

# Plotting. The y axis counts people injured or killed, not events, so the
# axis label says so.
p1 <- ggplot(mTop10harmful, aes(x = Event_Type, y = Total, fill = variable)) +
  geom_bar(stat = "identity", position = "dodge") +
  ggtitle("Number of injuries and fatalities by storm event for 1996-2011 period") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ylab("Number of people") +
  xlab("Event type")
print(p1)

The top 10 harmful events are represented in the chart above.

Tornadoes cause by far the most injuries.
Excessive heat causes the most fatalities, with tornadoes a close second.

Events causing the most monetary damage (property, crops)

#resizing, reducing the data frame to needed variables
storm_econ = select(storm, EVTYPE, PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP, year)
storm_econ$PROPDMGEXP = as.character(storm_econ$PROPDMGEXP)
storm_econ$CROPDMGEXP = as.character(storm_econ$CROPDMGEXP)

# to see what are the exponential values used
unique(storm_econ$PROPDMGEXP)
## [1] "K" ""  "M" "B" "0"
unique(storm_econ$CROPDMGEXP)
## [1] "K" ""  "M" "B"

We see that the damage values are expressed with the suffixes K, M, and B, which stand for thousands, millions, and billions respectively.

# Translate a vector of magnitude codes into numeric multipliers.
#
# "K" = 10^3, "M" = 10^6, "B" = 10^9. Any other non-missing code (this data
# also contains "" and "0") gets a multiplier of 1, so the reported amount is
# read as plain dollars. Previously such rows were multiplied by 0, which
# silently discarded any damage recorded without a K/M/B suffix.
# NOTE(review): treating "0" as a multiplier of 1 is an assumption; confirm
# against the NWS Storm Data documentation.
# Missing codes (NA) keep an NA multiplier, as before.
exp_multiplier <- function(code) {
  code <- as.character(code) # guard against factor input being used as index
  known <- c(K = 10^3, M = 10^6, B = 10^9)
  multiplier <- unname(known[code])
  multiplier[is.na(multiplier) & !is.na(code)] <- 1
  multiplier
}

storm_econ$PROP_exp <- exp_multiplier(storm_econ$PROPDMGEXP)
storm_econ$CROP_exp <- exp_multiplier(storm_econ$CROPDMGEXP)

# Dollar value = reported amount x multiplier.
storm_econ$Property.Damage <- storm_econ$PROPDMG * storm_econ$PROP_exp
storm_econ$Crop.Damage <- storm_econ$CROPDMG * storm_econ$CROP_exp

# Reduce the data frame to the event type and the two dollar-value columns.
storm_econ <- select(storm_econ, EVTYPE, Property.Damage, Crop.Damage)

# Total property and crop damage per event type.
#
# The totals are computed with group_by()/summarise(), which yields plain
# numeric columns. The earlier tapply() + add_rownames() approach stored 1-d
# arrays in the data frame; after melt() their "dim" attribute no longer
# matched the column length, which is why ggplot failed unless the attribute
# was patched by hand. add_rownames() is also deprecated in dplyr.
damage_input <- data.frame(
  Event_Type = as.character(storm_econ$EVTYPE),
  Property.Damage = storm_econ$Property.Damage,
  Crop.Damage = storm_econ$Crop.Damage,
  stringsAsFactors = FALSE
)
damages <- summarise(
  group_by(damage_input, Event_Type),
  Property.Damage = sum(Property.Damage),
  Crop.Damage = sum(Crop.Damage)
)
damages <- as.data.frame(damages)

# Rank event types by combined property + crop damage.
damages <- mutate(damages, Total.Damages = Property.Damage + Crop.Damage)
damages <- arrange(damages, desc(Total.Damages))

# Keep the ten highest-ranked event types; the helper total is only needed
# for ranking, so it is dropped before reshaping.
Top10damages <- select(head(damages, 10), -Total.Damages)

# Express both measures in billions of dollars for the plot.
Top10damages$Property.Damage <- Top10damages$Property.Damage / 10^9
Top10damages$Crop.Damage <- Top10damages$Crop.Damage / 10^9

# Long format: one row per (event type, measure) pair for a dodged bar chart.
mTop10damages <- melt(
  Top10damages,
  id.vars = "Event_Type",
  value.name = "Total"
)

# Plotting
q1 <- ggplot(mTop10damages, aes(x = Event_Type, y = Total, fill = variable)) +
  geom_bar(stat = "identity", position = "dodge") +
  ggtitle("Economic damage by storm event for 1996-2011 period") +
  ylab("Total damage in Billion $") +
  xlab("Event type") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(q1)

The top 10 damaging economic event types are represented in the chart above.

Floods cause the most property damage, and drought causes the most crop damage.

Appendix

  1. R Session Info
# Record the R version, platform and package versions used for this report.
print(sessionInfo())
## R version 3.3.0 (2016-05-03)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 7 x64 (build 7601) Service Pack 1
## 
## locale:
## [1] LC_COLLATE=English_United States.1252 
## [2] LC_CTYPE=English_United States.1252   
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.1252    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] reshape2_1.4.1  lubridate_1.5.6 knitr_1.12.3    ggplot2_2.1.0  
## [5] dplyr_0.4.3    
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_0.12.4      digest_0.6.9     assertthat_0.1   plyr_1.8.3      
##  [5] grid_3.3.0       R6_2.1.2         gtable_0.2.0     DBI_0.4-1       
##  [9] formatR_1.3      magrittr_1.5     scales_0.4.0     evaluate_0.9    
## [13] stringi_1.0-1    rmarkdown_0.9.6  tools_3.3.0      stringr_1.0.0   
## [17] munsell_0.4.3    yaml_2.1.13      parallel_3.3.0   colorspace_1.2-6
## [21] htmltools_0.3.5