Synopsis: 1. Across the United States, Tornado is most harmful to population health. Due to National Weather Service Storm Database, it caused over 9378 fatalities, 1064 injuries, 11960 casualities.
1.1 In Central Standard Time Flood cause biggest impact on population health on 1998, but among all year tornado is most harmful.
setwd("/Users/hadoop/RepData_PeerAssessment2")
suppressMessages(source("installPackage.R"))
suppressMessages(require(ggplot2))
suppressMessages(require(data.table))
suppressMessages(require(grid))
suppressMessages(require(dplyr))
suppressMessages(require(lubridate))
suppressMessages(require(tidyr))
suppressMessages(source("multiplot.R"))
#If you want to use fread, please install the dev version
data.file <-"repdata-data-StormData.csv"
if(packageVersion("data.table") >= "1.9.5" ) {
topData <- read.table(data.file, sep=",", nrows = 50, header=TRUE)
col.classes <- sapply(topData, class)
raw.data <- fread(data.file, header= TRUE, sep=",", colClasses = col.classes)
} else {
raw.data <- read.table(data.file, sep=",", header = TRUE)
}
##
Read 0.0% of 967216 rows
Read 32.1% of 967216 rows
Read 52.7% of 967216 rows
Read 69.3% of 967216 rows
Read 79.6% of 967216 rows
Read 92.0% of 967216 rows
Read 902297 rows and 37 (of 37) columns from 0.523 GB file in 00:00:08
## Warning in fread(data.file, header = TRUE, sep = ",", colClasses =
## col.classes): Read less rows (902297) than were allocated (967216). Run
## again with verbose=TRUE and please report.
transferDamage <- function(Dmg, Exp) {
value <- Dmg
if(value > 1000000) {
value <- Dmb
} else if(Exp == "K" || Exp == "k") {
value <- Dmg * 1000
} else if(Exp == "M" || Exp == "m") {
value <- Dmg * 1e+6
} else if(Exp == "B" || Exp == "b") {
value <- Dmg * 1e+9
} else if(Exp == "H" || Exp == "h") {
value <- Dmg * 100
}
return (value)
}
unifyEventType <- function(EVTYPE) {
EventType <- EVTYPE
if(grepl('WILD', EVTYPE) & grepl('FIRE', EVTYPE)) {
EventType <- 'WILDFIRE'
} else if(grepl('/', EVTYPE) ) {
EventType <- EVTYPE
} else if(grepl('.AND.', EVTYPE) ) {
EventType <- EVTYPE
} else if(grepl('TSTM.WIND', EVTYPE) ) {
EventType <- 'TSTM.WIND'
} else if(grepl('VOLCANIC', EVTYPE)) {
EventType <- 'VOLCANIC.ASH'
} else if(grepl('FLASH.FLOOD', EVTYPE)) {
EventType <- 'FLASH.FLOOD'
} else if(grepl('FLOOD', EVTYPE)) {
EventType <- 'FLOOD'
} else if(grepl('TORNADO', EVTYPE)) {
EventType <- 'TORNADO'
} else if(grepl('THUNDERSTORM', EVTYPE)) {
EventType <- 'THUNDERSTORM'
} else if(grepl('LIGHTNING', EVTYPE)) {
EventType <- 'LIGHTNING'
} else if(grepl('HIGH.WINDS', EVTYPE)){
EventType <- 'HIGH.WINDS'
}else if(grepl('HAIL', EVTYPE)){
EventType <- 'HAIL'
}
return (EventType)
}
cleanData <- function(data) {
clean.data <- data %>%
mutate(EVTYPE = gsub("[^(a-zA-Z0-9/,)]",".", toupper(EVTYPE)) ) %>%
mutate(TIME_ZONE = toupper(TIME_ZONE)) %>%
mutate(PROPDMGEXP = toupper(PROPDMGEXP)) %>%
mutate(CROPDMGEXP = toupper(CROPDMGEXP)) %>%
mutate(BGN_DATE = as.Date(BGN_DATE, format="%m/%d/%Y")) %>%
mutate(BGN_YR = year(BGN_DATE)) %>%
mutate(BGN_DECADE = cut(BGN_YR, breaks = c(1949, 1959, 1969, 1979, 1989, 1999,2012) ) )%>%
rowwise() %>%
mutate(EVTYPE = unifyEventType(EVTYPE)) %>%
mutate(PROPDMG = transferDamage(PROPDMG, PROPDMGEXP) ) %>%
mutate(CROPDMG = transferDamage(CROPDMG, CROPDMGEXP) )
levels(clean.data$BGN_DECADE) <- c('1950~', '1960~','1970~','1980~','1990~','2000~')
return (clean.data)
}
clean.data <- cleanData(raw.data)
max.PROPDMG <- clean.data[which(clean.data$PROPDMG == max(clean.data$PROPDMG, na.rm = TRUE) ), ]
max.PROPDMG[,c("EVTYPE","BGN_DATE","STATE","PROPDMG")]
## Source: local data frame [1 x 4]
##
## EVTYPE BGN_DATE STATE PROPDMG
## 1 FLOOD 2006-01-01 CA 1.15e+11
max.PROPDMG$REMARKS
## [1] "Major flooding continued into the early hours of January 1st, before the Napa River finally fell below flood stage and the water receeded. Flooding was severe in Downtown Napa from the Napa Creek and the City and Parks Department was hit with $6 million in damage alone. The City of Napa had 600 homes with moderate damage, 150 damaged businesses with costs of at least $70 million."
I saw discussion about this event in the course forum, the damage mentioned for this event is far below the PROPDMG, so I decided to remove this record in the following steps
max.CROPDMG <- clean.data[which(clean.data$CROPDMG == max(clean.data$CROPDMG, na.rm = TRUE) ),]
max.CROPDMG[,c("EVTYPE","BGN_DATE","STATE","CROPDMG")]
## Source: local data frame [2 x 4]
##
## EVTYPE BGN_DATE STATE CROPDMG
## 1 FLOOD 1993-08-31 IL 5e+09
## 2 ICE.STORM 1994-02-09 MS 5e+09
#max.CROPDMG$REMARKS
max.FATALITIES <- clean.data[which(clean.data$FATALITIES == max(clean.data$FATALITIES, na.rm = TRUE) ),]
max.FATALITIES[,c("EVTYPE","BGN_DATE","STATE","FATALITIES")]
## Source: local data frame [1 x 4]
##
## EVTYPE BGN_DATE STATE FATALITIES
## 1 HEAT 1995-07-12 IL 583
#max.FATALITIES$REMARKS
max.INJURIES <- clean.data[which(clean.data$INJURIES == max(clean.data$INJURIES, na.rm = TRUE) ),]
max.INJURIES[,c("EVTYPE","BGN_DATE","STATE","INJURIES")]
## Source: local data frame [1 x 4]
##
## EVTYPE BGN_DATE STATE INJURIES
## 1 TORNADO 1979-04-10 TX 1700
#max.INJURIES$REMARKS
noExpCnt <- sum(!raw.data$PROPDMGEXP %in% c("B","M","K","H","") |
!raw.data$CROPDMGEXP %in% c("B","M","K","H",""))
noExpCnt/ dim(raw.data)[1]
## [1] 0.0004100645
clean.data <- clean.data %>%
filter(PROPDMGEXP %in% c("B","M","K","H","")) %>%
filter(CROPDMGEXP %in% c("B","M","K","H","")) %>%
filter(PROPDMG != 115000000000)
dim(clean.data)
## [1] 901955 39
summaryEventData <- function(cleanData) {
summary.data <- cleanData %>%
group_by(EVTYPE) %>%
summarise(event.count=n() ,
sum.FATALITIES = sum(FATALITIES, na.rm= TRUE),
sum.INJURIES = sum(INJURIES, na.rm= TRUE),
sum.PROPDMG = sum(PROPDMG, na.rm= TRUE),
sum.CROPDMG = sum(CROPDMG, na.rm= TRUE)) %>%
mutate(sum.DMG = sum.PROPDMG + sum.CROPDMG) %>%
mutate(sum.casualties = sum.FATALITIES + sum.INJURIES) %>%
mutate(index.casualties = sum.FATALITIES * 3 + sum.INJURIES)
return (summary.data)
}
summary.event.data <- summaryEventData(clean.data)
summary.dmg.data <- summary.event.data %>%
ungroup() %>%
arrange(desc(sum.DMG)) %>%
top_n(10) %>%
select(EVTYPE, c(1,5,6,7))
## Selecting by index.casualties
dmg.data <- summary.dmg.data %>%
gather(damageType, damageValue, sum.PROPDMG:sum.CROPDMG)
p1 <- ggplot(dmg.data, aes(x=EVTYPE, y=damageValue/1e+9, fill=damageType, order=desc(damageType))) + #works
geom_bar(stat="identity") +
theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
#geom_text(aes(label=totalDeathInjury)) +
ggtitle(paste("Weather Events Casuse Most Damage ")) +
xlab("Weather Event") +
ylab("Damage(billion)") +
labs(fill="Damage Type") +
theme(axis.text.x= element_text(size=5)) +
theme(axis.text.y= element_text(size=7)) +
theme(strip.text.x= element_text(size=7)) +
theme(legend.text= element_text(size=7)) +
theme(title= element_text(size=7))
max.dmg <- summary.dmg.data[which(summary.dmg.data$sum.DMG == max(summary.dmg.data$sum.DMG)), ]
max.dmg
## Source: local data frame [1 x 4]
##
## EVTYPE sum.PROPDMG sum.CROPDMG sum.DMG
## 1 TORNADO 56941811733 364958360 57306770093
summary.casualties.data <- summary.event.data %>%
ungroup() %>%
arrange(desc(index.casualties)) %>%
top_n(10) %>%
select(EVTYPE, c(1,3,4,8,9))
## Selecting by index.casualties
casualties.data <- summary.casualties.data %>%
gather(harmType, harmValue, sum.FATALITIES:sum.INJURIES)
p2 <- ggplot(casualties.data, aes(x=EVTYPE, y=harmValue/1000, fill=harmType, order=(harmType))) + #works
geom_bar(stat="identity") +
theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
#geom_text(aes(label=totalDeathInjury)) +
ggtitle(paste("Weather Events Casuse Most Casualties ")) +
xlab("Weather Event") +
ylab("Casualties(thousand)") +
labs(fill="Harm Type") +
theme(axis.text.x= element_text( size=5)) +
theme(axis.text.y= element_text( size=7)) +
theme(strip.text.x= element_text( size=7)) +
theme(legend.text= element_text( size=5)) +
theme(title= element_text( size=7))
max.casualties <- summary.casualties.data[which(summary.casualties.data$sum.casualties == max(summary.casualties.data$sum.casualties)), ]
max.casualties
## Source: local data frame [1 x 5]
##
## EVTYPE sum.FATALITIES sum.INJURIES sum.casualties index.casualties
## 1 TORNADO 5630 91340 96970 108230
setwd("/Users/hadoop/RepData_PeerAssessment2")
multiplot(p1, p2, cols=2)
summaryYeadData <- function(cleanData) {
summary.data <- cleanData %>%
group_by(BGN_DECADE) %>%
summarise(event.count=n() ,
sum.FATALITIES = sum(FATALITIES, na.rm= TRUE),
sum.INJURIES = sum(INJURIES, na.rm= TRUE),
sum.PROPDMG = sum(PROPDMG, na.rm= TRUE),
sum.CROPDMG = sum(CROPDMG, na.rm= TRUE)) %>%
mutate(sum.DMG = sum.PROPDMG + sum.CROPDMG) %>%
mutate(sum.casualties = sum.FATALITIES + sum.INJURIES) %>%
mutate(index.casualties = sum.FATALITIES * 3 + sum.INJURIES)
return (summary.data)
}
summary.year.data <- summaryYeadData(clean.data)
year.dmg.data <- summary.year.data %>%
select(c(1,3,4,5,6)) %>%
ungroup() %>%
arrange(desc(sum.PROPDMG), desc(sum.INJURIES))
head(year.dmg.data)
## Source: local data frame [6 x 5]
##
## BGN_DECADE sum.FATALITIES sum.INJURIES sum.PROPDMG sum.CROPDMG
## 1 2000~ 5994 35133 215797717610 23563649040
## 2 1990~ 5083 38512 70041019047 25432443881
## 3 1980~ 699 13454 13180060840 0
## 4 1970~ 998 21640 8024638050 0
## 5 1960~ 942 17265 3757536860 0
## 6 1950~ 1419 14470 1516409420 0
summaryTimezoneData <- function(cleanData) {
summary.data <- cleanData %>%
group_by(TIME_ZONE) %>%
summarise(event.count=n() ,
sum.FATALITIES = sum(FATALITIES, na.rm= TRUE),
sum.INJURIES = sum(INJURIES, na.rm= TRUE),
sum.PROPDMG = sum(PROPDMG, na.rm= TRUE),
sum.CROPDMG = sum(CROPDMG, na.rm= TRUE)) %>%
mutate(sum.DMG = sum.PROPDMG + sum.CROPDMG) %>%
mutate(sum.casualties = sum.FATALITIES + sum.INJURIES) %>%
mutate(index.casualties = sum.FATALITIES * 3 + sum.INJURIES)
return (summary.data)
}
sum.tz.data <- summaryTimezoneData(clean.data)
tz.dmg.data <- sum.tz.data %>%
select(c(1,3,4,5,6)) %>%
ungroup() %>%
arrange(desc(sum.PROPDMG), desc(sum.INJURIES))
head(tz.dmg.data)
## Source: local data frame [6 x 5]
##
## TIME_ZONE sum.FATALITIES sum.INJURIES sum.PROPDMG sum.CROPDMG
## 1 CST 9246 105684 211243390715 33046623911
## 2 EST 3888 25386 74902879232 10276063080
## 3 PST 902 4421 11148174040 4073395200
## 4 MST 736 3826 10504919740 636586550
## 5 AST 189 164 2467821280 616858000
## 6 HST 46 95 843077250 7876630
summaryDamageByTimeZone <- function(data, tz, topn = 3) {
print(tz)
sum.dmg.data <- data %>%
filter(TIME_ZONE == tz) %>%
group_by(EVTYPE) %>%
summarise(sum.PROPDMG = sum(PROPDMG, na.rm= TRUE),
sum.CROPDMG = sum(CROPDMG, na.rm= TRUE)) %>%
mutate(sum.DMG = sum.PROPDMG + sum.CROPDMG) %>%
ungroup() %>%
arrange(desc(sum.DMG))
topEvent <- as.data.frame(sum.dmg.data)[1:topn,1]
summary.damage <- data %>%
filter(EVTYPE %in% topEvent) %>%
filter(TIME_ZONE == tz) %>%
group_by(EVTYPE,BGN_YR, TIME_ZONE) %>%
summarise(event.count=n() ,
sum.PROPDMG = sum(PROPDMG, na.rm= TRUE),
sum.CROPDMG = sum(CROPDMG, na.rm= TRUE)) %>%
mutate(sum.Damage = sum.PROPDMG + sum.CROPDMG)
return (summary.damage)
}
plotDamageByTimeZone <- function(data, tz, topn = 3) {
summary.dmg.data <- summaryDamageByTimeZone(data, tz, topn)
head(summary.dmg.data)
tzTitle <- paste("Damages by Top",topn,"Event at Time Zone" , tz)
require(ggplot2)
pd1 <- ggplot(aes(x=BGN_YR), data=summary.dmg.data) + geom_line(aes(y= sum.PROPDMG/1e+09, colour="PROPDMG"))+
geom_line(aes(y= sum.CROPDMG/1e+09, colour="CROPDMG"))+
facet_wrap(~ EVTYPE, ncol=topn) +
labs(title = tzTitle) +
labs(y = "Damage(billion)") +
labs(x = "Year of Event") +
theme(axis.text.x= element_text( size=7)) +
theme(axis.text.y= element_text( size=7)) +
theme(strip.text.x= element_text( size=6)) +
theme(strip.text.y= element_text( size=6)) +
theme(legend.text= element_text( size=5)) +
theme(title= element_text( size=7))
damage <- summary.dmg.data
max.dmg <- damage[which(damage$sum.Damage == max(damage$sum.Damage, na.rm = TRUE)),]
print(paste("Top Damage at TimeZone", tz))
print(max.dmg)
return (pd1)
}
pd1 <- plotDamageByTimeZone(clean.data, "CST", 4)
## [1] "CST"
## [1] "Top Damage at TimeZone CST"
## Source: local data frame [1 x 7]
## Groups: EVTYPE, BGN_YR
##
## EVTYPE BGN_YR TIME_ZONE event.count sum.PROPDMG sum.CROPDMG
## 1 STORM.SURGE 2005 CST 7 42995540000 0
## Variables not shown: sum.Damage (dbl)
pd2 <- plotDamageByTimeZone(clean.data, "EST", 4)
## [1] "EST"
## [1] "Top Damage at TimeZone EST"
## Source: local data frame [1 x 7]
## Groups: EVTYPE, BGN_YR
##
## EVTYPE BGN_YR TIME_ZONE event.count sum.PROPDMG sum.CROPDMG
## 1 HURRICANE/TYPHOON 2004 EST 19 11750225000 512250000
## Variables not shown: sum.Damage (dbl)
multiplot( pd1, pd2, cols = 1)
summaryTimeZoneEvent <- function(data, tz, topn) {
sum.data <- summaryDamageByTimeZone(data, tz, topn)
head(sum.data)
sum.tz.data <- sum.data %>%
group_by(EVTYPE, TIME_ZONE) %>%
summarise(sum.PROPDMG = sum(sum.PROPDMG, na.rm= TRUE),
sum.CROPDMG = sum(sum.CROPDMG, na.rm= TRUE),
sum.DMG = sum(sum.Damage, na.rm = TRUE)) %>%
ungroup() %>%
arrange(desc(sum.DMG))
return (sum.tz.data)
}
head(summaryTimeZoneEvent(clean.data, "CST", 5))
## [1] "CST"
## Source: local data frame [5 x 5]
##
## EVTYPE TIME_ZONE sum.PROPDMG sum.CROPDMG sum.DMG
## 1 TORNADO CST 48070077350 313851410 48383928760
## 2 HURRICANE/TYPHOON CST 45869775000 1627880800 47497655800
## 3 STORM.SURGE CST 43007805000 0 43007805000
## 4 FLOOD CST 20044169800 8328922000 28373091800
## 5 DROUGHT CST 1027051000 11568273000 12595324000
head(summaryTimeZoneEvent(clean.data, "EST", 5))
## [1] "EST"
## Source: local data frame [5 x 5]
##
## EVTYPE TIME_ZONE sum.PROPDMG sum.CROPDMG sum.DMG
## 1 HURRICANE/TYPHOON EST 23163325000 975620000 24138945000
## 2 FLOOD EST 11161732317 1686538450 12848270767
## 3 FLASH.FLOOD EST 8029923072 667079050 8697002122
## 4 TORNADO EST 7912645943 42505050 7955150993
## 5 HURRICANE EST 5853741000 1844430000 7698171000
plotCasualtiesByTimeZone <- function(clean.data, tz, topn =3) {
summary.tz.data <- clean.data %>%
filter(TIME_ZONE == tz) %>%
group_by(EVTYPE) %>%
summarise(sum.FATALITIES = sum(FATALITIES, na.rm= TRUE),
sum.INJURIES = sum(INJURIES, na.rm= TRUE)) %>%
mutate(sum.casualties = sum.FATALITIES + sum.INJURIES) %>%
mutate(index.casualties = sum.FATALITIES * 3 + sum.INJURIES) %>%
ungroup() %>%
arrange(desc(index.casualties))
summary.casualties <- clean.data %>%
filter(EVTYPE %in% as.data.frame(summary.tz.data)[1:topn,1]) %>%
filter(TIME_ZONE == tz) %>%
group_by(EVTYPE,BGN_DECADE,TIME_ZONE) %>%
summarise(sum.FATALITIES = sum(FATALITIES, na.rm= TRUE),
sum.INJURIES = sum(INJURIES, na.rm= TRUE)) %>%
mutate(sum.Casualties = sum.FATALITIES + sum.INJURIES)
tzTitle <- paste("Total Casualties by Top",topn," Event at TimeZone", tz)
casualties.data <- summary.casualties %>%
gather(harmType, harmValue, sum.FATALITIES:sum.Casualties)
pc1 <- ggplot(casualties.data, aes(x=BGN_DECADE, y=harmValue/1000, fill=harmType)) +
geom_bar(stat="identity") +
theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
labs(title = tzTitle) +
facet_wrap(~ EVTYPE, ncol=topn) +
xlab("Event Decade") +
ylab("Casualties(thousand)") +
labs(fill="Harm Type")
theme(axis.text.x= element_text( size=7)) +
theme(axis.text.y= element_text( size=7)) +
theme(strip.text.x= element_text( size=6)) +
theme(strip.text.y= element_text( size=6)) +
theme(legend.text= element_text( size=7)) +
theme(title= element_text( size=7))
casualties <- subset(summary.casualties, TIME_ZONE == tz )
max.casualties <- casualties[which(casualties$sum.Casualties == max(casualties$sum.Casualties)),]
print(paste("Top Casualties in TimeZone", tz))
print(max.casualties)
return (pc1)
}
pc1 <- plotCasualtiesByTimeZone(clean.data, "EST", 4)
## [1] "Top Casualties in TimeZone EST"
## Source: local data frame [1 x 6]
## Groups: EVTYPE, BGN_DECADE
##
## EVTYPE BGN_DECADE TIME_ZONE sum.FATALITIES sum.INJURIES sum.Casualties
## 1 TORNADO 2000~ EST 259 4540 4799
pc2 <- plotCasualtiesByTimeZone(clean.data, "CST", 4)
## [1] "Top Casualties in TimeZone CST"
## Source: local data frame [1 x 6]
## Groups: EVTYPE, BGN_DECADE
##
## EVTYPE BGN_DECADE TIME_ZONE sum.FATALITIES sum.INJURIES sum.Casualties
## 1 TORNADO 1970~ CST 995 21436 22431
multiplot(pc1, pc2, cols = 1)