This report analyzes U.S. weather-event data to identify which types of events cause the most harm to population health and the greatest economic damage. The data were collected between January 1950 and November 2011 by the National Oceanic and Atmospheric Administration (NOAA). The database tracks characteristics of major storms and weather events in the U.S., including when and where they occur, as well as estimates of fatalities, injuries, and property damage. The data is available at https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2. Documentation for the data is available at https://d396qusza40orc.cloudfront.net/repdata%2Fpeer2_doc%2Fpd01016005curr.pdf. The National Climatic Data Center Storm Events FAQ is available at https://d396qusza40orc.cloudfront.net/repdata%2Fpeer2_doc%2FNCDC%20Storm%20Events-FAQ%20Page.pdf.
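The code below carries out the data processing in four steps: it loads the raw data and keeps only events that began after 2000, consolidates the free-text event types into a small set of groups, scales the crop and property damage figures by their exponent codes (K, M, B), and summarizes fatalities, injuries, and damage by event group.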
library(plyr)
## Warning: package 'plyr' was built under R version 3.1.2
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.1.2
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
## Warning: package 'stringr' was built under R version 3.1.2
# Load the raw NOAA storm extract from the analysis directory.
# NOTE(review): the directory below is specific to one machine -- confirm or
# adjust it before running this script anywhere else.
setwd("C:/Users/Bhawna Arora/Desktop/Coursera")
storm.data <- read.csv("repdata-data-StormData.csv")

# Derive the calendar year in which each event began from BGN_DATE.
storm.data$year <- storm.data$BGN_DATE %>%
  as.Date(format = "%m/%d/%Y %H:%M:%S") %>%
  format("%Y") %>%
  as.numeric()

# Keep only events that began after 2000; rows whose year could not be
# parsed (NA) are dropped as well.
storm.data1 <- storm.data[which(storm.data$year > 2000), ]
dim(storm.data1)
## [1] 488692 38
# Values already handed out by exclude(); updated on every call.
evt_aggr <- integer()

# Return the (de-duplicated) members of `set1` that are not yet recorded in
# `evt_aggr`, and record every member of `set1` in `evt_aggr` as a side effect.
exclude <- function(set1) {
  already_claimed <- evt_aggr
  evt_aggr <<- union(already_claimed, set1)
  setdiff(set1, already_claimed)
}
# Consolidate the free-text EVTYPE values into a small set of event groups.
# EVTYPE_GRP starts as the whitespace-trimmed EVTYPE; rows that match no
# group below keep that trimmed value.
storm.data1$EVTYPE_GRP <- str_trim(storm.data1$EVTYPE)

# One row per group: the label written to EVTYPE_GRP followed by up to nine
# regular-expression terms (NA-padded). A row of storm.data1 belongs to the
# group if any of the terms matches its EVTYPE_GRP, ignoring case.
# Row order matters: each storm.data1 row is claimed by the FIRST group that
# matches it, because exclude() drops indices claimed by earlier groups.
evt_groups <- data.frame(
  group = character(),
  term1 = character(), term2 = character(), term3 = character(),
  term4 = character(), term5 = character(), term6 = character(),
  term7 = character(), term8 = character(), term9 = character(),
  stringsAsFactors = FALSE
)
evt_groups[1, ] <- c("HEAT (GRP)", "HEAT", "HOT", "WARMTH", "WARM", rep(NA, 5))
evt_groups[2, ] <- c("TORNADO (GRP)", "TORNADO", "WATERSPOUT", "WAYTERSPOUT", "TORNDAO", "GUSTNADO", rep(NA, 4))
evt_groups[3, ] <- c("COLD (GRP)", "COLD", "HYPOTHERMIA", "EXPOSURE", "WIND CHILL", "LOW.*TEMP", "UNSEASON.*COOL", rep(NA, 3))
evt_groups[4, ] <- c("WIND (GRP)", "WIND", rep(NA, 8))
evt_groups[5, ] <- c("RIP CURRENT (GRP)", "RIP CURRENT", rep(NA, 8))
evt_groups[6, ] <- c("WINTER (GRP)", "WINTER", "WINTRY", "SNOW", "BLIZZARD", "ICE", "ICY", "FROST", "FREEZE", "FREEZING")
evt_groups[7, ] <- c("HURRICANE (GRP)", "HURRICANE", "TYPHOON", rep(NA, 7))
evt_groups[8, ] <- c("FLOOD (GRP)", "FLOOD", "FLDG", rep(NA, 7))
evt_groups[9, ] <- c("FIRE (GRP)", "FIRE", rep(NA, 8))
evt_groups[10, ] <- c("HAIL (GRP)", "HAIL", rep(NA, 8))
evt_groups[11, ] <- c("TROP. STORM (GRP)", "TROPICAL STORM", rep(NA, 8))
evt_groups[12, ] <- c("T-STORM (GRP)", "THUNDERSTORM", "TSTM", rep(NA, 7))
evt_groups[13, ] <- c("LIGHTNING (GRP)", "LIGHTNING", "LIGNTNING", "LIGHTING", rep(NA, 6))
evt_groups[14, ] <- c("MUD SLIDE (GRP)", "MUD.*SLIDE", rep(NA, 8))

# Reset the set of already-claimed row indices that exclude() maintains.
evt_aggr <- integer()
# Row indices of storm.data1 assigned to each group, keyed by group label.
evt_groups_data <- list()

# seq_len() keeps the loop from running when the table is empty (1:0 would
# iterate over c(1, 0)).
for (i in seq_len(nrow(evt_groups))) {
  group_name <- evt_groups$group[i]

  # Build "(TERM1)|(TERM2)|..." from the non-NA terms of this group.
  terms <- unlist(evt_groups[i, 2:10], use.names = FALSE)
  terms <- terms[!is.na(terms)]
  pattern <- paste0("(", terms, ")", collapse = "|")

  matched <- grep(pattern, storm.data1$EVTYPE_GRP, ignore.case = TRUE)
  # Drop rows that an earlier group has already claimed.
  matched <- exclude(matched)

  evt_groups_data[[group_name]] <- matched
  # Index the column rather than the data frame: this is a no-op when
  # `matched` is empty (the data-frame form raises "replacement has 1 row,
  # data has 0") and avoids copying every column of the matched rows.
  storm.data1$EVTYPE_GRP[matched] <- group_name
}
# Scale the recorded crop and property damage by their exponent codes.
# CROPDMGEXP / PROPDMGEXP hold a letter that selects the multiplier for the
# CROPDMG / PROPDMG value: k/K = 1e3, m/M = 1e6, b/B = 1e9.
dmg_exp_multiplier <- c(K = 1e3, M = 1e6, B = 1e9)

# Map a vector of exponent codes to numeric multipliers.
#   exp_code: character or factor vector of exponent codes.
#   returns:  numeric vector of the same length; any code other than
#             k/K, m/M, b/B (including blank and NA) maps to 0, so such rows
#             contribute nothing to the adjusted totals.
# A single lookup replaces three row-subset assignments of the form
# df[mask, ]$col = v, which raise an error whenever `mask` selects no rows.
dmg_factor <- function(exp_code) {
  multiplier <- unname(dmg_exp_multiplier[toupper(as.character(exp_code))])
  multiplier[is.na(multiplier)] <- 0
  multiplier
}

storm.data1$CROPDMGFACTOR <- dmg_factor(storm.data1$CROPDMGEXP)
storm.data1$CROPDMGADJ <- storm.data1$CROPDMG * storm.data1$CROPDMGFACTOR

storm.data1$PROPDMGFACTOR <- dmg_factor(storm.data1$PROPDMGEXP)
storm.data1$PROPDMGADJ <- storm.data1$PROPDMG * storm.data1$PROPDMGFACTOR
# Total fatalities, injuries, and exponent-adjusted crop/property damage
# for each event group.
storm.data1.summary <- storm.data1 %>%
  group_by(EVTYPE_GRP) %>%
  summarize(
    FATALITIES = sum(FATALITIES),
    INJURIES = sum(INJURIES),
    CROPDMG = sum(CROPDMGADJ),
    PROPDMG = sum(PROPDMGADJ)
  )
# Up to 20 event groups with the most fatalities (groups with none are
# dropped), largest first.
top_fatalities <- storm.data1.summary %>%
  filter(FATALITIES > 0) %>%
  arrange(desc(FATALITIES)) %>%
  select(EVTYPE_GRP, FATALITIES) %>%
  head(20)
# Fatalities across all events vs. those covered by the top groups.
total_fatalities <- sum(storm.data1$FATALITIES)
total_fatalities_top_causes <- sum(top_fatalities$FATALITIES)
par(las = 2, mar = c(9, 4, 4, 4))

# Up to 20 event groups with the most injuries (groups with none are
# dropped), largest first.
top_injuries <- storm.data1.summary %>%
  filter(INJURIES > 0) %>%
  arrange(desc(INJURIES)) %>%
  select(EVTYPE_GRP, INJURIES) %>%
  head(20)
# Injuries across all events vs. those covered by the top groups.
total_injuries <- sum(storm.data1$INJURIES)
total_injuries_top_causes <- sum(top_injuries$INJURIES)
# Up to 20 event groups with the most crop damage (groups with none are
# dropped), largest first.
top_cropdmg <- storm.data1.summary %>%
  filter(CROPDMG > 0) %>%
  arrange(desc(CROPDMG)) %>%
  select(EVTYPE_GRP, CROPDMG) %>%
  head(20)
# Crop damage across all groups vs. the amount covered by the top groups.
total_cropdmg <- sum(storm.data1.summary$CROPDMG)
total_cropdmg_top_causes <- sum(top_cropdmg$CROPDMG)

# Up to 20 event groups with the most property damage (groups with none are
# dropped), largest first.
top_propdmg <- storm.data1.summary %>%
  filter(PROPDMG > 0) %>%
  arrange(desc(PROPDMG)) %>%
  select(EVTYPE_GRP, PROPDMG) %>%
  head(20)
# Property damage across all groups vs. the amount covered by the top groups.
total_propdmg <- sum(storm.data1.summary$PROPDMG)
total_propdmg_top_causes <- sum(top_propdmg$PROPDMG)
# Figure 1: event groups causing the most fatalities and injuries, drawn as
# two side-by-side bar charts with perpendicular (las = 2) axis labels.
par(las = 2, mfrow = c(1, 2), mar = c(12, 3, 3, 3))
barplot(top_fatalities$FATALITIES, names.arg = top_fatalities$EVTYPE_GRP, cex.names = 0.95, main = "Top Weather-Related Causes of Fatality")
barplot(top_injuries$INJURIES, names.arg = top_injuries$EVTYPE_GRP, cex.names = 0.95, main = "Top Weather-Related Causes of Injury")
par(las = 1)
# The original had a bare `mtext` here, which only prints the function's
# source. Call it so this figure gets a caption like the second one.
# NOTE(review): caption wording is inferred from the parallel call below --
# confirm the intended text.
mtext("Weather events causing the most harm to population health", side = 3, line = -35, outer = TRUE)

# Figure 2: event groups causing the most crop and property damage.
par(las = 2, mfrow = c(1, 2), mar = c(14, 7, 3, 3))
# Label bars with EVTYPE_GRP, the column these tables actually contain.
# `$EVTYPE` only worked through partial matching, which tibbles do not do.
barplot(top_cropdmg$CROPDMG, names.arg = top_cropdmg$EVTYPE_GRP, cex.names = 0.95, main = "Top Weather-Related Causes of Crop Damage")
barplot(top_propdmg$PROPDMG, names.arg = top_propdmg$EVTYPE_GRP, cex.names = 0.95, main = "Top Weather-Related Causes of Property Damage")
par(las = 1)
mtext("Weather events causing the most economic damage", side = 3, line = -35, outer = TRUE)