library(plyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(knitr)
library(ggplot2)

Title: Analysis of Weather Damage, Injuries and Fatalities in the United States

By Joel G. Polanco

Synopsis: This project involves exploring the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database. This database tracks characteristics of major storms and weather events in the United States, including when and where they occur, as well as estimates of any fatalities, injuries, and property damage. Storms and other severe weather events can cause both public health and economic problems for communities and municipalities. Many severe events can result in fatalities, injuries, and property damage, and preventing such outcomes to the extent possible is a key concern. This analysis summarizes the top 10 weather events that cause the most injuries and fatalities plus property and crop damage in the United States.

Data Load and Preprocessing

# Point the session at the project folder, then read the raw storm extract
# from its data/ sub-directory.
# NOTE(review): the path is machine-specific; adjust it before running elsewhere.
setwd("C:/Users/jgpolanc/Desktop/Coursera/c5p2/StormData_PeerAssessment2")
dat <- read.csv(
  paste0(getwd(), "/data/StormData.csv")
)

Across the United States, which types of events (as indicated in the EVTYPE variable) are most harmful with respect to population health?

# Population-health impact: total fatalities and injuries per event type.
event_dat <- select(dat, EVTYPE, FATALITIES, INJURIES)

# One row per EVTYPE with summed fatalities, injuries and their combined
# total, ordered from most to least harmful. The explicit summarise() replaces
# the deprecated summarise_each(funs(sum)) and yields the same columns.
sum_event <- event_dat %>%
  group_by(EVTYPE) %>%
  summarise(
    FATALITIES = sum(FATALITIES),
    INJURIES = sum(INJURIES)
  ) %>%
  mutate(TOTAL = FATALITIES + INJURIES) %>%
  arrange(desc(TOTAL))

# Keep only event types with more than 1,500 combined casualties. sum_event is
# already sorted and filter() preserves row order, so no re-sorting is needed.
clean_event <- sum_event %>%
  filter(TOTAL > 1500)

# Freeze the factor levels in ranked order so plots follow the ranking instead
# of falling back to alphabetical order.
clean_event$EVTYPE <- factor(clean_event$EVTYPE, levels = clean_event$EVTYPE)

# Long format (one row per event type and casualty kind) for stacked bars.
clean_event_breakout <- clean_event %>%
  select(EVTYPE, FATALITIES, INJURIES) %>%
  gather(IJ_TYPE, COUNT, FATALITIES:INJURIES)

Across the United States, which types of events have the greatest economic consequences?

# Economic impact: keep the event type plus the damage columns
# (every column from PROPDMG through CROPDMGEXP).
event_dat2 <- select(dat, EVTYPE, PROPDMG:CROPDMGEXP)

# Distinct magnitude codes used for property and crop damage, collected so
# they can be inspected before a multiplier is assigned to each code.
PROP_EXP <- event_dat2 %>%
  select(PROPDMG, PROPDMGEXP) %>%
  distinct(PROPDMGEXP)
CROP_EXP <- event_dat2 %>%
  select(CROPDMG, CROPDMGEXP) %>%
  distinct(CROPDMGEXP)

# Print both lookups; each needs its own statement to be valid R.
PROP_EXP
CROP_EXP

After reviewing the distinct values returned, I referred back to the NOAA documentation and FAQ, which were not very helpful. I then performed a Google search and discovered that several prior Data Science course students (several of them actuaries by profession) had documented what each of the EXP values for property and crop damage means by analyzing individual events in the NOAA database. As a result, I decided to leverage their mappings to create a multiplier variable that could be used to convert the numeric crop/property damage columns into actual damage amounts.


These are possible values of CROPDMGEXP and PROPDMGEXP:

H,h,K,k,M,m,B,b,+,-,?,0,1,2,3,4,5,6,7,8, and blank-character

H,h = hundreds = 100

K,k = kilos = thousands = 1,000

M,m = millions = 1,000,000

B,b = billions = 1,000,000,000

(+) = 1

(-) = 0

(?) = 0

blank/empty character = 0

numeric 0..8 = 10

Proof: https://rstudio-pubs-static.s3.amazonaws.com/58957_37b6723ee52b455990e149edde45e5b6.html

# Multiplier implied by each PROPDMGEXP / CROPDMGEXP code.
# "H" is 100 (hundreds), matching "h"; it was previously mapped to 1000.
# "9" is kept from the original mapping.
# NOTE(review): blank codes in the data are presumably empty strings rather
# than " "; either way an unmatched code falls through to 0 below.
exp_multiplier <- c(
  "H" = 100, "h" = 100,
  "K" = 1000, "k" = 1000,
  "M" = 1000000, "m" = 1000000,
  "B" = 1000000000, "b" = 1000000000,
  "+" = 1, "-" = 0, "?" = 0, " " = 0,
  "0" = 10, "1" = 10, "2" = 10, "3" = 10, "4" = 10,
  "5" = 10, "6" = 10, "7" = 10, "8" = 10, "9" = 10
)

# Same mapping as a two-column lookup table (code, multiplier), kept so the
# `multiplier` object retains the shape it had before.
multiplier <- data.frame(
  factor = as.factor(names(exp_multiplier)),
  multiplier = unname(exp_multiplier),
  row.names = names(exp_multiplier)
)

# Translate a vector of exponent codes into numeric multipliers.
# Codes that are missing or not present in the mapping become 0.
exp_to_multiplier <- function(exp_code) {
  mult <- unname(exp_multiplier[as.character(exp_code)])
  mult[is.na(mult)] <- 0
  mult
}

# Replace the exponent codes with their numeric multipliers.
event_dat2$PROPDMGEXP <- exp_to_multiplier(event_dat2$PROPDMGEXP)
event_dat2$CROPDMGEXP <- exp_to_multiplier(event_dat2$CROPDMGEXP)

# Missing damage amounts contribute nothing to the totals. Only the numeric
# damage columns are touched, so EVTYPE is never rewritten.
event_dat2$PROPDMG[is.na(event_dat2$PROPDMG)] <- 0
event_dat2$CROPDMG[is.na(event_dat2$CROPDMG)] <- 0

# Damage per record = amount * multiplier.
event_dat_final <- event_dat2 %>%
  mutate(
    CROP_DAMAGE = CROPDMGEXP * CROPDMG,
    PROP_DAMAGE = PROPDMGEXP * PROPDMG
  ) %>%
  select(EVTYPE, PROP_DAMAGE, CROP_DAMAGE)

# Property, crop and combined damage per event type, costliest first. The
# explicit summarise() replaces the deprecated summarise_each(funs(sum)).
sum_event_damage <- event_dat_final %>%
  group_by(EVTYPE) %>%
  summarise(
    PROP_DAMAGE = sum(PROP_DAMAGE),
    CROP_DAMAGE = sum(CROP_DAMAGE)
  ) %>%
  mutate(TOTAL_DAMAGE = PROP_DAMAGE + CROP_DAMAGE) %>%
  arrange(desc(TOTAL_DAMAGE))

# Ten costliest event types; the input is already sorted, so the first ten
# rows are the top ten and no further sorting is required.
clean_event_damage <- sum_event_damage %>%
  slice(1:10)

# Freeze the factor levels in ranked order so plots follow the ranking.
clean_event_damage$EVTYPE <- factor(
  clean_event_damage$EVTYPE,
  levels = clean_event_damage$EVTYPE
)

# Long format (one row per event type and damage kind) for stacked bars.
clean_event_damage_breakout <- clean_event_damage %>%
  select(EVTYPE, PROP_DAMAGE, CROP_DAMAGE) %>%
  gather(DAMAGE_TYPE, TOTAL, PROP_DAMAGE:CROP_DAMAGE)

Results

Across the United States, which types of events (as indicated in the EVTYPE variable) are most harmful with respect to population health?

# Horizontal stacked bars: casualties per event type, split into fatalities
# and injuries by fill colour.
ggplot(clean_event_breakout) +
  geom_bar(
    aes(x = EVTYPE, y = COUNT, fill = IJ_TYPE),
    stat = "identity"
  ) +
  coord_flip()

Across the United States, which types of events have the greatest economic consequences?

# Show the table of the costliest event types.
clean_event_damage

# Horizontal bars of combined (property + crop) damage per event type.
# The stray `group = 1` that was passed to ggplot() outside aes() had no
# effect on the plot and has been dropped.
ggplot(data = clean_event_damage, aes(x = EVTYPE, y = TOTAL_DAMAGE)) +
  geom_bar(stat = "identity") +
  coord_flip()

# Horizontal stacked bars: damage per event type, split into property and
# crop damage by fill colour.
ggplot(clean_event_damage_breakout) +
  geom_bar(
    aes(x = EVTYPE, y = TOTAL, fill = DAMAGE_TYPE),
    stat = "identity"
  ) +
  coord_flip()