Synopsis: 1. Across the United States, tornadoes are the most harmful events to population health. According to the National Weather Service Storm Database, they caused over 9378 fatalities, 1064 injuries, and 11960 casualties.

1.1 In the Central Standard Time zone, floods had the biggest impact on population health in 1998, but across all years tornadoes are the most harmful.

  2. Across the United States, hail has the greatest economic consequences. According to the National Weather Service Storm Database, it caused over 7,991,783,690 in property damage, 127,374,570,870 in crop damage, and 135,366,354,560 in total damage.
    2.1. In the Eastern Standard Time zone, floods caused the biggest damage in 2011, followed by flash floods in 1996, but the damage is smaller than in the CST time zone.
    2.2. In the Central Standard Time zone, flash floods caused the biggest damage in 2008, but hail caused the largest damage in almost all other years.
    2.3. In the Mountain Standard Time zone, hail caused the biggest damage in 2010.

Data Processing

  1. Use fread to read the data; extract the CSV from the bz2 file before processing.
# Work from the project directory so the relative paths used below
# ("installPackage.R", the storm CSV) resolve.
setwd("/Users/hadoop/RepData_PeerAssessment2")
# fread is only used further down when data.table >= 1.9.5 is available;
# when this report was written that meant installing the development version.

# Judging by the console output that follows, this helper script installs
# data.table from GitHub and loads devtools, dplyr, ggplot2 and grid.
source("installPackage.R")
## Loading required package: devtools
## Removing package from '/Library/Frameworks/R.framework/Versions/3.2/Resources/library'
## (as 'lib' is unspecified)
## Downloading GitHub repo Rdatatable/data.table@master
## Installing data.table
## '/Library/Frameworks/R.framework/Resources/bin/R' --no-site-file  \
##   --no-environ --no-save --no-restore CMD INSTALL  \
##   '/private/var/folders/4d/dcchbvqj711gdzj0n8hd64000000gp/T/RtmpL1QGTh/devtools147810b69adf/Rdatatable-data.table-34aa2a4'  \
##   --library='/Library/Frameworks/R.framework/Versions/3.2/Resources/library'  \
##   --install-tests 
## 
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## 
## Loading required package: ggplot2
## Loading required package: grid
# Load the storm data set into raw.data.
setwd("/Users/hadoop/RepData_PeerAssessment2")
data.file <- "repdata-data-StormData.csv"
if (require(data.table) && packageVersion("data.table") >= "1.9.5") {
        # Fast path: infer the column classes from a 50-row sample, then let
        # fread parse the whole file with those classes.
        # read.csv is used instead of read.table(sep = ",") because read.table
        # also treats "'" as a quote and "#" as a comment marker, which
        # corrupts free-text CSV fields.
        topData <- read.csv(data.file, header = TRUE, nrows = 50)
        # vapply guarantees one class name per column (a named character vector).
        col.classes <- vapply(topData, function(column) class(column)[1], character(1))
        raw.data <- fread(data.file, header = TRUE, sep = ",", colClasses = col.classes)
} else {
        # Fallback when a recent data.table is unavailable: slower base R parser.
        raw.data <- read.csv(data.file, header = TRUE)
}
## Loading required package: data.table
## 
## Attaching package: 'data.table'
## 
## The following objects are masked from 'package:dplyr':
## 
##     between, last
## 
Read 34.1% of 967216 rows
Read 48.6% of 967216 rows
Read 67.2% of 967216 rows
Read 81.7% of 967216 rows
Read 91.0% of 967216 rows
Read 902297 rows and 37 (of 37) columns from 0.523 GB file in 00:00:07
## Warning in fread(data.file, header = TRUE, sep = ",", colClasses =
## col.classes): Read less rows (902297) than were allocated (967216). Run
## again with verbose=TRUE and please report.
  1. Clean the data.
require(dplyr)
# Clean the raw storm records for analysis.
#
# Args:
#   data: data frame with at least the columns EVTYPE, TIME_ZONE, PROPDMG,
#         PROPDMGEXP, CROPDMG, CROPDMGEXP and BGN_DATE (BGN_DATE as
#         "month/day/year ..." text).
#
# Returns:
#   The input restricted to rows whose property AND crop exponent codes are
#   both one of K/M/B, with:
#     * EVTYPE upper-cased and every character outside letters, digits,
#       "/", ",", "(" and ")" replaced by ".";
#     * TIME_ZONE, PROPDMGEXP, CROPDMGEXP upper-cased;
#     * BGN_DATE parsed to Date, plus BGN_YR (integer year) and BGN_DECADE
#       (factor, '1950-1959' ... '2000-2011');
#     * PROPDMG / CROPDMG scaled by their exponent code;
#     * rows with PROPDMG equal to 115000000000 removed.
cleanData <- function(data) {
        # Scale a damage figure by its exponent code (K = 1e3, M = 1e6, B = 1e9).
        # A figure already larger than its multiplier is assumed to be fully
        # expanded and is left untouched; unknown codes leave it untouched too.
        scaleDamage <- function(amount, exponent) {
                multiplier <- c(1e3, 1e6, 1e9)[match(exponent, c("K", "M", "B"))]
                ifelse(is.na(multiplier) | amount > multiplier,
                       amount,
                       amount * multiplier)
        }

        clean.data <- data %>%
                mutate(EVTYPE = gsub("[^(a-zA-Z0-9/,)]", ".", toupper(EVTYPE))) %>%
                mutate(TIME_ZONE = toupper(TIME_ZONE)) %>%
                mutate(PROPDMGEXP = toupper(PROPDMGEXP)) %>%
                mutate(CROPDMGEXP = toupper(CROPDMGEXP)) %>%
                # Only rows with a usable exponent on BOTH damage columns are kept.
                filter(PROPDMGEXP %in% c("B", "M", "K")) %>%
                filter(CROPDMGEXP %in% c("B", "M", "K")) %>%
                mutate(BGN_DATE = as.Date(BGN_DATE, format = "%m/%d/%Y")) %>%
                # Base R year extraction: does not depend on data.table being
                # attached, so the non-fread loading path works as well.
                mutate(BGN_YR = as.integer(format(BGN_DATE, "%Y"))) %>%
                mutate(BGN_DECADE = cut(BGN_YR, breaks = c(1949, 1959, 1969, 1979, 1989, 1999, 2012))) %>%
                mutate(PROPDMG = scaleDamage(PROPDMG, PROPDMGEXP)) %>%
                # Previously the "K" branch multiplied CROPDMG by itself on top of
                # the exponent scaling; crop damage now follows the same rule as
                # property damage.
                mutate(CROPDMG = scaleDamage(CROPDMG, CROPDMGEXP))
        levels(clean.data$BGN_DECADE) <- c('1950-1959', '1960-1969', '1970-1979', '1980-1989', '1990-1999', '2000-2011')
        # The biggest damage should be Hurricane Katrina: drop the single record
        # reporting 115 billion in property damage as an outlier.
        clean.data <- subset(clean.data, PROPDMG != 115000000000)
        clean.data
}

# Run the cleaning step on the raw records; clean.data is the input to the
# summaries and plots below.
clean.data <- cleanData(raw.data)
  1. Summarize the cleaned data by event type.
require(dplyr)
# Aggregate cleaned storm records to one row per event type.
#
# Args:
#   cleanData: data frame with columns EVTYPE, FATALITIES, INJURIES, PROPDMG
#              and CROPDMG.
#
# Returns:
#   One row per EVTYPE with the number of records (event.count), the NA-ignoring
#   totals sum.FATALITIES, sum.INJURIES, sum.PROPDMG and sum.CROPDMG, and the
#   derived columns
#     sum.DMG          = sum.PROPDMG + sum.CROPDMG
#     sum.casualties   = sum.FATALITIES + sum.INJURIES
#     index.casualties = 3 * sum.FATALITIES + sum.INJURIES (deaths weighted x3).
summaryEventData <- function(cleanData) {
        per.event <- summarise(
                group_by(cleanData, EVTYPE),
                event.count = n(),
                sum.FATALITIES = sum(FATALITIES, na.rm = TRUE),
                sum.INJURIES = sum(INJURIES, na.rm = TRUE),
                sum.PROPDMG = sum(PROPDMG, na.rm = TRUE),
                sum.CROPDMG = sum(CROPDMG, na.rm = TRUE)
        )
        with.damage <- mutate(per.event, sum.DMG = sum.PROPDMG + sum.CROPDMG)
        with.casualties <- mutate(with.damage, sum.casualties = sum.FATALITIES + sum.INJURIES)
        mutate(with.casualties, index.casualties = sum.FATALITIES * 3 + sum.INJURIES)
}

# Per-event-type totals over the whole cleaned data set; both top-20 rankings
# below are derived from this table.
summary.event.data <- summaryEventData(clean.data)

Result

Summary Damage by Event Type

  1. Order by sum.DMG descending, and take the top 20 events that caused the largest damage across all years.
require(ggplot2)
# Top 20 event types by total (property + crop) damage, costliest first.
# top_n() is given sum.DMG explicitly: without a weight it ranks by the LAST
# column of the table (index.casualties), which silently produced the top 20
# by casualties instead of by damage.
# Columns are selected by name so the result does not depend on column order.
summary.dmg.data <- summary.event.data %>%
        arrange(desc(sum.DMG)) %>%
        top_n(20, sum.DMG) %>%
        select(EVTYPE, sum.PROPDMG, sum.CROPDMG, sum.DMG)
# Bar chart of total damage per event type (one bar per EVTYPE).
p1 <- ggplot(summary.dmg.data, aes(x=EVTYPE, y=sum.DMG))  +
        geom_bar(stat="identity")  +
        labs(title = "Total damages casuse by Top 20 Weather Event") +
        labs(y = "Estimates of Damage (property + crop) ") +
        labs(x = "Event Type") +
        theme(axis.text.x= element_text(angle=90, vjust=0.6, size=8))+
        theme(title= element_text(vjust=0.6, size=8))

Summary Casualties by Event Type

  1. I use index.casualties = FATALITIES * 3 + INJURIES to give deaths a larger weight when measuring population health impact.
  2. Order by index.casualties descending, and take the top 20 events with the largest population health impact (3 * fatalities + injuries).
require(ggplot2)
# Top 20 event types by the weighted casualty index (3 * fatalities + injuries),
# most harmful first.
# The ranking column is passed to top_n() explicitly and the columns are
# selected by name, so the result no longer relies on index.casualties being
# the last column or on the positional layout of summary.event.data.
summary.casualties.data <- summary.event.data %>%
        arrange(desc(index.casualties)) %>%
        top_n(20, index.casualties) %>%
        select(EVTYPE, sum.FATALITIES, sum.INJURIES, sum.casualties, index.casualties)
# Bar chart of total casualties (fatalities + injuries) per event type.
p2 <- ggplot(summary.casualties.data, aes(x=EVTYPE, y=sum.casualties))  +
        geom_bar(stat="identity")  +
        labs(title = "Total Casualties casuse by Top 20 Weather Event") +
        labs(y = "Casualties (fatalities + injuries) ") +
        labs(x = "Event Type") +
        theme(axis.text.x= element_text(angle=90, vjust=0.6, size=8))+
        theme(title= element_text(vjust=0.6, size=8))

Plot total damage/casualties across all years by event type

  1. Across the United States, in past years, floods have had the greatest economic consequences. According to the National Weather Service Storm Data, they caused over 132,836,489,050 in property damage, 5,170,955,450 in crop damage, and 138,007,444,500 in total damage.
  2. Across the United States, in past years, tornadoes have been the most harmful events to population health. According to the National Weather Service Storm Data, they caused over 9378 fatalities, 1064 injuries, and 11960 casualties.
# Render the two top-20 bar charts (p1: damage, p2: casualties) side by side.
# The working directory is set so that the relative path "multiplot.R" resolves.
setwd("/Users/hadoop/RepData_PeerAssessment2") 
require(ggplot2)
require(grid)
# multiplot() is expected to be defined by multiplot.R (not shown here).
source("multiplot.R") 
multiplot(p1, p2, cols=2) 

Summary Damage by Top3 Event in different time zone in the United States during the past years

  1. Pick the time zones CST, EST, MST and PST.
  2. Pick the top 3 event types (Flood, Flash.Flood, Hail).
  3. Plot total damage (property + crop) in each time zone by year.
    http://www.timeanddate.com/time/zones/cst
  4. In the Eastern Standard Time zone, floods caused the biggest damage in 2011, followed by flash floods in 1996, but the damage is smaller than in the CST time zone.
  5. In the Central Standard Time zone, flash floods caused the biggest damage in 2008, but hail caused the largest damage in almost all other years.
  6. In the Mountain Standard Time zone, hail caused the biggest damage in 2010.
# Yearly damage per time zone for the three event types listed first in
# summary.dmg.data, restricted to the PST/MST/EST/CST time zones.
# One row per (EVTYPE, BGN_YR, TIME_ZONE) with record count, summed and mean
# property/crop damage, and their combined totals.
summary.damage <- clean.data %>%
        filter(EVTYPE %in% as.data.frame(summary.dmg.data)[1:3, 1]) %>%
        filter(TIME_ZONE %in% c("PST", "MST", "EST", "CST")) %>%
        group_by(EVTYPE, BGN_YR, TIME_ZONE) %>%
        summarise(
                event.count = n(),
                sum.PROPDMG = sum(PROPDMG, na.rm = TRUE),
                mean.PROPDMG = mean(PROPDMG, na.rm = TRUE),
                sum.CROPDMG = sum(CROPDMG, na.rm = TRUE),
                mean.CROPDMG = mean(CROPDMG, na.rm = TRUE)
        ) %>%
        mutate(
                sum.Damage = sum.PROPDMG + sum.CROPDMG,
                mean.Damage = mean.PROPDMG + mean.CROPDMG
        )

# Line + point chart of total damage by year, one panel per time zone
# (two panels per row), coloured and shaped by event type.
pd1 <- ggplot(summary.damage, aes(x = BGN_YR, y = sum.Damage, col = EVTYPE)) +
        geom_line() +
        geom_point(aes(shape = EVTYPE)) +
        facet_wrap(~ TIME_ZONE, ncol = 2) +
        labs(
                title = "Total damages by Top3 Event Across the United States",
                y = "Estimates of Damage (property + crop)",
                x = "Year of Event"
        ) +
        theme(
                title = element_text(vjust = 0.5, hjust = 0.5, size = 7),
                axis.text.x = element_text(angle = 0, vjust = 0.5, size = 8)
        )
plot(pd1)

Total Casualties by Top3 Event in different time zone in the United States during the past years

  1. Pick the time zones CST, EST, MST and PST.
  2. Pick the top 3 event types (Flood, Flash.Flood, Tornado).
  3. Plot casualties (FATALITIES + INJURIES) in each time zone by year.

In different time zones in the United States, the event type with the biggest population health impact differs little. 1. In the Central Standard Time zone, floods had the biggest impact on population health in 1998, but across all years tornadoes are the most harmful. 1. In the Pacific Standard Time zone, tornadoes caused the biggest damage in 2011.

# Yearly casualties per time zone for the three event types listed first in
# summary.casualties.data, restricted to the PST/MST/EST/CST time zones.
# One row per (EVTYPE, BGN_YR, TIME_ZONE) with record count, summed fatalities
# and injuries, and their total.
summary.casualties <- clean.data %>%
        filter(EVTYPE %in% as.data.frame(summary.casualties.data)[1:3, 1]) %>%
        filter(TIME_ZONE %in% c("PST", "MST", "EST", "CST")) %>%
        group_by(EVTYPE, BGN_YR, TIME_ZONE) %>%
        summarise(
                event.count = n(),
                sum.FATALITIES = sum(FATALITIES, na.rm = TRUE),
                sum.INJURIES = sum(INJURIES, na.rm = TRUE)
        ) %>%
        mutate(sum.Casualties = sum.FATALITIES + sum.INJURIES)


# Line + point chart of total casualties by year, one panel per time zone,
# coloured and shaped by event type, with an x-axis tick for every year
# present in the data.
pc1 <- ggplot(summary.casualties, aes(x = BGN_YR, y = sum.Casualties, col = EVTYPE)) +
        geom_line() +
        geom_point(aes(shape = EVTYPE)) +
        scale_x_continuous(breaks = unique(summary.casualties$BGN_YR)) +
        facet_wrap(~ TIME_ZONE) +
        labs(
                title = "Total Casualties by Top3 Event Across the United States",
                y = "Casualties(Fatalities + Injuries)",
                x = "Year of Event"
        ) +
        theme(
                axis.text.y = element_text(vjust = 0.5, hjust = 0.5, size = 8),
                axis.text.x = element_text(angle = 90, vjust = 0.5, size = 8)
        )
plot(pc1)