Synopsis: 1. Across the United States, tornadoes are the most harmful weather event for population health. According to the National Weather Service Storm Database, they caused over 9378 fatalities, 1064 injuries, and 11960 casualties.
1.1 In the Central Standard Time zone, floods had the biggest impact on population health in 1998, but across all years tornadoes are the most harmful.
# Work from the project directory so that the relative paths used below
# (helper scripts, the CSV data file) resolve.
setwd("/Users/hadoop/RepData_PeerAssessment2")
# If you want to use fread, please install the dev version of data.table.
# installPackage.R is a project script that is not shown here; judging by the
# console output that follows, it installs data.table from GitHub and loads
# devtools, dplyr, ggplot2 and grid -- confirm against that script.
source("installPackage.R")
## Loading required package: devtools
## Removing package from '/Library/Frameworks/R.framework/Versions/3.2/Resources/library'
## (as 'lib' is unspecified)
## Downloading GitHub repo Rdatatable/data.table@master
## Installing data.table
## '/Library/Frameworks/R.framework/Resources/bin/R' --no-site-file \
## --no-environ --no-save --no-restore CMD INSTALL \
## '/private/var/folders/4d/dcchbvqj711gdzj0n8hd64000000gp/T/RtmpL1QGTh/devtools147810b69adf/Rdatatable-data.table-34aa2a4' \
## --library='/Library/Frameworks/R.framework/Versions/3.2/Resources/library' \
## --install-tests
##
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
##
## Loading required package: ggplot2
## Loading required package: grid
# Load the storm data CSV into `raw.data`.
#
# When data.table >= 1.9.5 can be attached, the file is read with fread();
# otherwise the script falls back to base R.
setwd("/Users/hadoop/RepData_PeerAssessment2")
data.file <- "repdata-data-StormData.csv"
if (require(data.table) && packageVersion("data.table") >= "1.9.5") {
  # Infer column classes from a small sample so fread() does not have to guess.
  # read.csv() is used instead of read.table(sep = ","): read.table()'s
  # defaults treat ' as a quote character and # as a comment marker and do not
  # fill short rows, none of which is appropriate for a CSV file that carries
  # free-text columns.
  topData <- read.csv(data.file, nrows = 50)
  # vapply() keeps the result a plain character vector even if a column
  # reports more than one class.
  col.classes <- vapply(topData, function(column) class(column)[1L],
                        character(1))
  raw.data <- fread(data.file, header = TRUE, sep = ",",
                    colClasses = col.classes)
} else {
  # Fallback without data.table: same CSV-safe parsing defaults as above.
  raw.data <- read.csv(data.file)
}
## Loading required package: data.table
##
## Attaching package: 'data.table'
##
## The following objects are masked from 'package:dplyr':
##
## between, last
##
Read 34.1% of 967216 rows
Read 48.6% of 967216 rows
Read 67.2% of 967216 rows
Read 81.7% of 967216 rows
Read 91.0% of 967216 rows
Read 902297 rows and 37 (of 37) columns from 0.523 GB file in 00:00:07
## Warning in fread(data.file, header = TRUE, sep = ",", colClasses =
## col.classes): Read less rows (902297) than were allocated (967216). Run
## again with verbose=TRUE and please report.
require(dplyr)
# Clean the raw storm records for analysis.
#
# Args:
#   data: storm records containing at least the columns EVTYPE, TIME_ZONE,
#         BGN_DATE, PROPDMG, PROPDMGEXP, CROPDMG and CROPDMGEXP.
#
# Returns:
#   The records whose PROPDMGEXP and CROPDMGEXP are both one of "B", "M" or
#   "K" (compared after upper-casing), with:
#     * EVTYPE upper-cased and every character outside letters, digits,
#       "/", ",", "(" and ")" replaced by ".";
#     * TIME_ZONE, PROPDMGEXP and CROPDMGEXP upper-cased;
#     * BGN_DATE parsed as a Date (month/day/year), plus the derived columns
#       BGN_YR (integer year) and BGN_DECADE (factor, '1950-1959' ...
#       '2000-2011');
#     * PROPDMG and CROPDMG scaled by their exponent code;
#     * rows whose scaled PROPDMG equals exactly 115e9 removed.
cleanData <- function(data) {
  damage.multiplier <- c(K = 1e3, M = 1e6, B = 1e9)

  # Scale a damage amount by its exponent code. Amounts that already exceed
  # the multiplier are left unchanged (presumably they were recorded in
  # dollars already -- this keeps the original heuristic). Both damage columns
  # share this helper so their scaling rules cannot drift apart: the previous
  # copy-pasted version multiplied CROPDMG by itself in the "K" branch.
  scaleDamage <- function(amount, exponent) {
    multiplier <- unname(damage.multiplier[exponent])
    ifelse(is.na(multiplier) | amount > multiplier,
           amount,
           amount * multiplier)
  }

  clean.data <- data %>%
    mutate(
      EVTYPE = gsub("[^(a-zA-Z0-9/,)]", ".", toupper(EVTYPE)),
      TIME_ZONE = toupper(TIME_ZONE),
      PROPDMGEXP = toupper(PROPDMGEXP),
      CROPDMGEXP = toupper(CROPDMGEXP)
    ) %>%
    filter(
      PROPDMGEXP %in% names(damage.multiplier),
      CROPDMGEXP %in% names(damage.multiplier)
    ) %>%
    mutate(
      BGN_DATE = as.Date(BGN_DATE, format = "%m/%d/%Y"),
      # Base-R year extraction: does not depend on data.table's year() being
      # attached, so the non-data.table loading path works as well.
      BGN_YR = as.integer(format(BGN_DATE, "%Y")),
      BGN_DECADE = cut(
        BGN_YR,
        breaks = c(1949, 1959, 1969, 1979, 1989, 1999, 2012),
        labels = c("1950-1959", "1960-1969", "1970-1979",
                   "1980-1989", "1990-1999", "2000-2011")
      ),
      PROPDMG = scaleDamage(PROPDMG, PROPDMGEXP),
      CROPDMG = scaleDamage(CROPDMG, CROPDMGEXP)
    )

  # Original author's note: the biggest damage should be Hurricane Katrina.
  # Records with exactly 115 billion of property damage are therefore dropped,
  # presumably as a data-entry outlier.
  subset(clean.data, PROPDMG != 115000000000)
}
# Apply the cleaning step to the raw storm data; every later summary and plot
# is built from `clean.data`.
clean.data <- cleanData(raw.data)
require(dplyr)
# Aggregate cleaned storm records into one row per event type.
#
# Args:
#   cleanData: cleaned storm records with the columns EVTYPE, FATALITIES,
#              INJURIES, PROPDMG and CROPDMG.
#
# Returns:
#   One row per EVTYPE with, in this order: event.count, sum.FATALITIES,
#   sum.INJURIES, sum.PROPDMG, sum.CROPDMG, sum.DMG (property + crop),
#   sum.casualties (fatalities + injuries) and index.casualties (fatalities
#   weighted three times, plus injuries). Missing values are ignored in sums.
summaryEventData <- function(cleanData) {
  by.event <- group_by(cleanData, EVTYPE)

  totals <- summarise(
    by.event,
    event.count = n(),
    sum.FATALITIES = sum(FATALITIES, na.rm = TRUE),
    sum.INJURIES = sum(INJURIES, na.rm = TRUE),
    sum.PROPDMG = sum(PROPDMG, na.rm = TRUE),
    sum.CROPDMG = sum(CROPDMG, na.rm = TRUE)
  )

  mutate(
    totals,
    sum.DMG = sum.PROPDMG + sum.CROPDMG,
    sum.casualties = sum.FATALITIES + sum.INJURIES,
    index.casualties = sum.FATALITIES * 3 + sum.INJURIES
  )
}
# Per-event-type totals (counts, fatalities, injuries, damage) that the
# top-20 rankings below are derived from.
summary.event.data <- summaryEventData(clean.data)
require(ggplot2)
# Top 20 event types by total (property + crop) damage, highest first.
# The ranking column is passed to top_n() explicitly: without it top_n()
# ranks by the last column of its input (index.casualties here), which made
# this "damage" table hold the top 20 events by casualty index instead.
# Columns are selected by name so the result does not depend on column order.
summary.dmg.data <- summary.event.data %>%
  arrange(desc(sum.DMG)) %>%
  top_n(20, sum.DMG) %>%
  select(EVTYPE, sum.PROPDMG, sum.CROPDMG, sum.DMG)

# Bar chart of total damage for those event types.
p1 <- ggplot(summary.dmg.data, aes(x = EVTYPE, y = sum.DMG)) +
  geom_bar(stat = "identity") +
  labs(
    title = "Total damages casuse by Top 20 Weather Event",
    y = "Estimates of Damage (property + crop) ",
    x = "Event Type"
  ) +
  theme(
    # Vertical tick labels so the long event names do not overlap.
    axis.text.x = element_text(angle = 90, vjust = 0.6, size = 8),
    title = element_text(vjust = 0.6, size = 8)
  )
require(ggplot2)
# Top 20 event types by casualty index, highest first. top_n() is called
# without a ranking column, so it ranks by the last column of its input,
# which is index.casualties (see the message below).
summary.casualties.data <- summary.event.data %>%
  arrange(desc(index.casualties)) %>%
  top_n(20) %>%
  select(EVTYPE, sum.FATALITIES, sum.INJURIES, sum.casualties,
         index.casualties)
## Selecting by index.casualties

# Bar chart of total casualties (fatalities + injuries) for those event types.
p2 <- ggplot(summary.casualties.data, aes(x = EVTYPE, y = sum.casualties)) +
  geom_bar(stat = "identity") +
  labs(
    title = "Total Casualties casuse by Top 20 Weather Event",
    y = "Casualties (fatalities + injuries) ",
    x = "Event Type"
  ) +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.6, size = 8),
    title = element_text(vjust = 0.6, size = 8)
  )
# Work from the project directory so that multiplot.R can be found.
setwd("/Users/hadoop/RepData_PeerAssessment2")
require(ggplot2)
require(grid)
# multiplot.R is a project script that is not shown here; it is expected to
# define multiplot(), which is called below.
source("multiplot.R")
# Draw the damage chart (p1) and the casualties chart (p2) in two columns.
multiplot(p1, p2, cols=2)
# Yearly damage per time zone for the first three event types listed in
# summary.dmg.data, restricted to the PST, MST, EST and CST time zones.
# For each (event type, year, time zone) it reports the number of events,
# the total and mean property/crop damage, and their combined sum and mean.
summary.damage <- clean.data %>%
  filter(
    EVTYPE %in% as.data.frame(summary.dmg.data)[1:3, 1],
    TIME_ZONE %in% c("PST", "MST", "EST", "CST")
  ) %>%
  group_by(EVTYPE, BGN_YR, TIME_ZONE) %>%
  summarise(
    event.count = n(),
    sum.PROPDMG = sum(PROPDMG, na.rm = TRUE),
    mean.PROPDMG = mean(PROPDMG, na.rm = TRUE),
    sum.CROPDMG = sum(CROPDMG, na.rm = TRUE),
    mean.CROPDMG = mean(CROPDMG, na.rm = TRUE)
  ) %>%
  mutate(
    sum.Damage = sum.PROPDMG + sum.CROPDMG,
    mean.Damage = mean.PROPDMG + mean.CROPDMG
  )
# Line-and-point chart of yearly total damage per event type, one panel per
# time zone (two panels per row).
pd1 <- ggplot(summary.damage, aes(x = BGN_YR, col = EVTYPE)) +
  geom_line(aes(y = sum.Damage)) +
  geom_point(aes(y = sum.Damage, shape = EVTYPE)) +
  facet_wrap(~ TIME_ZONE, ncol = 2) +
  labs(
    title = "Total damages by Top3 Event Across the United States",
    y = "Estimates of Damage (property + crop)",
    x = "Year of Event"
  ) +
  theme(
    title = element_text(vjust = 0.5, hjust = 0.5, size = 7),
    axis.text.x = element_text(angle = 0, vjust = 0.5, size = 8)
  )
plot(pd1)
Across the different time zones in the United States, the event type with the biggest population health impact differs only slightly.

1. In the Central Standard Time zone, floods had the biggest impact on population health in 1998, but across all years tornadoes are the most harmful.
1. In the Pacific Standard Time zone, tornadoes caused the biggest damage in 2011.
# Yearly casualties per time zone for the first three event types listed in
# summary.casualties.data, restricted to the PST, MST, EST and CST time zones.
# For each (event type, year, time zone) it reports the number of events,
# total fatalities, total injuries and their sum.
summary.casualties <- clean.data %>%
  filter(
    EVTYPE %in% as.data.frame(summary.casualties.data)[1:3, 1],
    TIME_ZONE %in% c("PST", "MST", "EST", "CST")
  ) %>%
  group_by(EVTYPE, BGN_YR, TIME_ZONE) %>%
  summarise(
    event.count = n(),
    sum.FATALITIES = sum(FATALITIES, na.rm = TRUE),
    sum.INJURIES = sum(INJURIES, na.rm = TRUE)
  ) %>%
  mutate(sum.Casualties = sum.FATALITIES + sum.INJURIES)
# Line-and-point chart of yearly total casualties per event type, one panel
# per time zone, with an x-axis tick for every year present in the data.
pc1 <- ggplot(summary.casualties, aes(x = BGN_YR, col = EVTYPE)) +
  geom_line(aes(y = sum.Casualties, col = EVTYPE)) +
  geom_point(aes(y = sum.Casualties, shape = EVTYPE)) +
  scale_x_continuous(breaks = unique(summary.casualties$BGN_YR)) +
  facet_wrap(~ TIME_ZONE) +
  labs(
    title = "Total Casualties by Top3 Event Across the United States",
    y = "Casualties(Fatalities + Injuries)",
    x = "Year of Event"
  ) +
  theme(
    axis.text.y = element_text(vjust = 0.5, hjust = 0.5, size = 8),
    axis.text.x = element_text(angle = 90, vjust = 0.5, size = 8)
  )
plot(pc1)