This is prepared for Johns Hopkins’ Reproducible Research online class Course Project Reproducible Research Peer Assessment 2
The NOAA Storm Data is used for my entire analysis to explore the Storm Database and answer the following questions about severe weather events recorded across the United States
My analysis result is based on table summaries, figures, computation results between 1950 and 2011 across the United State.
which types of events are most harmful with respect to population health? Determined by the max value of sum by FATALITIES, INJURIES data
which types of events have the greatest economic consequences> Determined by the max value of sum by PROPDMG and CROPDMG data
library(dplyr)
library(ggplot2)
library(knitr)
opts_chunk$set(echo=TRUE, results="hide")
# helper function
checkDir <- function ( thedir ) {
# if datadir not exist, create datadir
if (dir.exists( thedir ) == FALSE) {
dir.create( thedir )
}
return(thedir)
}
downloadZipFile <- function( src.file.url, dest.dir, dest.file ) {
dest.filepath <- file.path(dest.dir, dest.file)
if (file.exists(dest.filepath)) {
} else {
download.file( src.file.url, destfile=dest.filepath, method="libcurl", mode="wb")
}
}
cur.dir <- getwd()
data.dir <- checkDir(file.path(cur.dir, "mydata"))
rawdata.filename <- "StormData.csv.bz2"
downloadZipFile(src.file.url="https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2",
dest.dir=data.dir, dest.file=rawdata.filename)
# helper function
loadbzDataset <- function( data.file ) { # load one data set at one time
df.data <- read.csv( bzfile( data.file, "rt"), header = TRUE)
return (df.data)
}
rawdata <- loadbzDataset( file.path(data.dir, rawdata.filename))
names(rawdata)
## [1] "STATE__" "BGN_DATE" "BGN_TIME" "TIME_ZONE" "COUNTY"
## [6] "COUNTYNAME" "STATE" "EVTYPE" "BGN_RANGE" "BGN_AZI"
## [11] "BGN_LOCATI" "END_DATE" "END_TIME" "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE" "END_AZI" "END_LOCATI" "LENGTH" "WIDTH"
## [21] "F" "MAG" "FATALITIES" "INJURIES" "PROPDMG"
## [26] "PROPDMGEXP" "CROPDMG" "CROPDMGEXP" "WFO" "STATEOFFIC"
## [31] "ZONENAMES" "LATITUDE" "LONGITUDE" "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS" "REFNUM"
data <- distinct( select(rawdata, EVTYPE, BGN_DATE, FATALITIES, INJURIES, PROPDMG, PROPDMGEXP,CROPDMG,CROPDMGEXP))
# FYI
summary(data)
## Warning: closing unused connection 5 (D:/2015/training/_coursera/
## 5.ReProducibleResearch.Jul2015/assignment.2/RepDataAssignment2/mydata/
## StormData.csv.bz2)
## EVTYPE BGN_DATE FATALITIES
## TSTM WIND :38082 4/27/2011 0:00:00: 263 Min. : 0.0000
## TORNADO :33836 6/27/1998 0:00:00: 190 1st Qu.: 0.0000
## HAIL :27231 6/4/2008 0:00:00 : 183 Median : 0.0000
## FLASH FLOOD :18083 5/31/1998 0:00:00: 176 Mean : 0.0692
## THUNDERSTORM WIND:15330 5/18/1995 0:00:00: 174 3rd Qu.: 0.0000
## LIGHTNING :12343 4/15/2011 0:00:00: 164 Max. :583.0000
## (Other) :66889 (Other) :210644
## INJURIES PROPDMG PROPDMGEXP CROPDMG
## Min. : 0.0000 Min. : 0.00 K :143575 Min. : 0.000
## 1st Qu.: 0.0000 1st Qu.: 0.00 : 58188 1st Qu.: 0.000
## Median : 0.0000 Median : 2.00 M : 9760 Median : 0.000
## Mean : 0.6449 Mean : 38.29 0 : 148 Mean : 5.402
## 3rd Qu.: 0.0000 3rd Qu.: 25.00 B : 40 3rd Qu.: 0.000
## Max. :1700.0000 Max. :5000.00 5 : 22 Max. :990.000
## (Other): 61
## CROPDMGEXP
## :139136
## K : 70816
## M : 1795
## k : 16
## 0 : 15
## B : 8
## (Other): 8
table(rawdata$PROPDMGEXP)
##
## - ? + 0 1 2 3 4 5
## 465934 1 8 5 216 25 13 4 4 28
## 6 7 8 B h H K m M
## 4 5 1 40 1 6 424665 7 11330
table(rawdata$CROPDMGEXP)
##
## ? 0 2 B k K m M
## 618413 7 19 1 9 21 281832 1 1994
# convert code value to uppercase
data$EVTYPE <- toupper( data$EVTYPE )
data$PROPDMGEXP <- toupper( data$PROPDMGEXP)
data$CROPDMGEXP <- toupper( data$CROPDMGEXP)
data <- mutate( data, EVYEARMO=format( as.POSIXlt( data$BGN_DATE, format="%m/%d/%Y %H:%M:%S"), "%Y-%m"))
# trim leading/ending spaces, Correction: comma, typos ...
data$EVTYPE <- gsub('^ +', '', data$EVTYPE)
data$EVTYPE <- gsub(' +$', '', data$EVTYPE)
data$EVTYPE <- gsub(' ', ' ', data$EVTYPE)
data$EVTYPE <- gsub(' , ', ',', data$EVTYPE)
data$EVTYPE <- gsub('AVALANCE', 'AVALANCHE', data$EVTYPE)
data$EVTYPE <- gsub('COASTALFLOOD', 'COASTAL FLOOD', data$EVTYPE)
data$EVTYPE <- gsub('COASTALSTORM', 'COASTAL STORM', data$EVTYPE)
data$EVTYPE <- gsub('TEMPERATURES', 'TEMPERATURE', data$EVTYPE)
data$EVTYPE <- gsub('DUST DEVEL', 'DUST DEVIL', data$EVTYPE)
data$EVTYPE <- gsub('PRECIPATATION', 'PRECIPITATION', data$EVTYPE)
# re-get unique data
data <- distinct( select(data, EVTYPE, BGN_DATE, FATALITIES, INJURIES, PROPDMG, PROPDMGEXP,CROPDMG,CROPDMGEXP) )
# convert to numbers, K= 1000, M=million, B= billion
data$PROPDMG <- as.numeric(data$PROPDMG) * as.numeric(
ifelse ((data$PROPDMGEXP == "K"), 1000,
ifelse ((data$PROPDMGEXP == "M"), 1000000,
ifelse ((data$PROPDMGEXP == "B"), 1000000000,
ifelse (is.numeric(data$PROPDMGEXP), data$PROPDMGEXP, 1))))
)
data$CROPDMG <- as.numeric(data$CROPDMG) * as.numeric(
ifelse ((data$CROPDMGEXP =="K"), 1000,
ifelse ((data$CROPDMGEXP =="M"), 1000000,
ifelse ((data$CROPDMGEXP =="B"), 1000000000,
ifelse (is.numeric(data$CROPDMGEXP), data$CROPDMGEXP, 1))))
)
# calculate sum and max value by event types
byevtype <- group_by( data, EVTYPE)
summarize.byevtype <- summarise( byevtype, count=n(),
FATALITIES_SUM=sum(FATALITIES, na.rm=TRUE ), FATALITIES_MAX = max(FATALITIES, na.rm=TRUE),
INJURIES_SUM = sum(INJURIES, na.rm=TRUE), INJURIES_MAX = max(INJURIES, na.rm=TRUE),
PROPDMG_SUM = sum(PROPDMG, na.rm = TRUE), PROPDMG_MAX = max(PROPDMG, na.rm=TRUE),
CROPDMG_SUM = sum(CROPDMG, na.rm=TRUE), CROPDMG_MAX = max(CROPDMG, na.rm=TRUE)
)
summary(summarize.byevtype)
## EVTYPE count FATALITIES_SUM FATALITIES_MAX
## Length:46 Min. : 1.0 Min. : 0.00 Min. : 0.000
## Class :character 1st Qu.: 18.0 1st Qu.: 0.00 1st Qu.: 0.000
## Mode :character Median : 59.0 Median : 1.00 Median : 1.000
## Mean : 258.6 Mean : 19.98 Mean : 4.935
## 3rd Qu.: 189.0 3rd Qu.: 8.25 3rd Qu.: 2.000
## Max. :3365.0 Max. :547.00 Max. :158.000
## INJURIES_SUM INJURIES_MAX PROPDMG_SUM
## Min. : 0.0 Min. : 0.00 Min. :0.000e+00
## 1st Qu.: 0.0 1st Qu.: 0.00 1st Qu.:7.970e+04
## Median : 0.5 Median : 0.50 Median :3.714e+06
## Mean : 153.6 Mean : 34.57 Mean :4.070e+08
## 3rd Qu.: 11.0 3rd Qu.: 5.50 3rd Qu.:2.501e+07
## Max. :5478.0 Max. :1150.00 Max. :9.417e+09
## PROPDMG_MAX CROPDMG_SUM CROPDMG_MAX
## Min. :0.000e+00 Min. : 0 Min. : 0
## 1st Qu.:5.125e+04 1st Qu.: 0 1st Qu.: 0
## Median :1.250e+06 Median : 0 Median : 0
## Mean :1.218e+08 Mean : 13235065 Mean : 5374935
## 3rd Qu.:1.180e+07 3rd Qu.: 9450250 3rd Qu.: 4500000
## Max. :2.800e+09 Max. :136525000 Max. :50000000
# order/sort computed results for sum, mean, max
# FATALITIES
cleandata.sum.1 <- arrange( summarize.byevtype, desc(FATALITIES_SUM))
data.1 <- cleandata.sum.1[1:10, c("EVTYPE","FATALITIES_SUM", "count")]
data.1
## Source: local data frame [10 x 3]
##
## EVTYPE FATALITIES_SUM count
## 1 TORNADO 547 1160
## 2 HEAT 60 71
## 3 FLASH FLOOD 57 1060
## 4 THUNDERSTORM WIND 51 3365
## 5 FLOOD 41 1062
## 6 EXCESSIVE HEAT 32 81
## 7 RIP CURRENT 28 44
## 8 LIGHTNING 25 637
## 9 COLD/WIND CHILL 19 35
## 10 HIGH SURF 11 58
fatality_sum_max <- round(max(cleandata.sum.1$FATALITIES_SUM, na.rm=TRUE))
fatality_sum_avg <- round(mean(cleandata.sum.1$FATALITIES_SUM, na.rm=TRUE))
#index/location of max value located
fatality_index_max <- which( round(cleandata.sum.1$FATALITIES_SUM) == fatality_sum_max)
paste("FATALITIES: EVENT TYPE= ", cleandata.sum.1$EVTYPE[fatality_index_max],
" max value= ", fatality_sum_max, " avg/mean value is ", fatality_sum_avg)
## [1] "FATALITIES: EVENT TYPE= TORNADO max value= 547 avg/mean value is 20"
# INJURIES
cleandata.sum.2 <- arrange( summarize.byevtype, desc(INJURIES_SUM))
data.2 <- cleandata.sum.2[1:10, c("EVTYPE","INJURIES_SUM", "count")]
data.2
## Source: local data frame [10 x 3]
##
## EVTYPE INJURIES_SUM count
## 1 TORNADO 5478 1160
## 2 HEAT 611 71
## 3 THUNDERSTORM WIND 355 3365
## 4 LIGHTNING 177 637
## 5 EXCESSIVE HEAT 138 81
## 6 WILDFIRE 110 486
## 7 STRONG WIND 32 277
## 8 HAIL 31 1128
## 9 FLASH FLOOD 30 1060
## 10 RIP CURRENT 27 44
injury_sum_avg <- round(mean(cleandata.sum.2$INJURIES_SUM, na.rm=TRUE))
injury_sum_max <- round(max(cleandata.sum.2$INJURIES_SUM, na.rm=TRUE))
injury_index_max <- which( round(cleandata.sum.2$INJURIES_SUM) == injury_sum_max)
paste("INJURIES: EVENT TYPE= ", cleandata.sum.2$EVTYPE[injury_index_max],
" max value= ", injury_sum_max, " avg/mean value is ", injury_sum_avg)
## [1] "INJURIES: EVENT TYPE= TORNADO max value= 5478 avg/mean value is 154"
# PROPDMG
cleandata.sum.3 <- arrange( summarize.byevtype, desc(PROPDMG_SUM))
data.3 <- cleandata.sum.3[1:10, c("EVTYPE", "PROPDMG_SUM", "count")]
data.3
## Source: local data frame [10 x 3]
##
## EVTYPE PROPDMG_SUM count
## 1 TORNADO 9417129900 1160
## 2 FLOOD 6309196950 1062
## 3 FLASH FLOOD 1244228700 1060
## 4 WILDFIRE 637917400 486
## 5 HAIL 428118800 1128
## 6 THUNDERSTORM WIND 270133560 3365
## 7 TROPICAL STORM 84112700 66
## 8 TSUNAMI 53554000 9
## 9 LIGHTNING 41343920 637
## 10 STORM SURGE/TIDE 40695000 13
propdmg_sum_avg <- round(mean(cleandata.sum.3$PROPDMG_SUM,na.rm=TRUE))
propdmg_sum_max <- round(max(cleandata.sum.3$PROPDMG_SUM,na.rm=TRUE))
propdmg_index_max <- which( round(cleandata.sum.3$PROPDMG_SUM) == propdmg_sum_max)
paste("PROPDMG: EVENT TYPE= ", cleandata.sum.3$EVTYPE[propdmg_index_max],
" max value= ", propdmg_sum_max, " avg/mean value is ", propdmg_sum_avg)
## [1] "PROPDMG: EVENT TYPE= TORNADO max value= 9417129900 avg/mean value is 406972765"
# CROPDMG
cleandata.sum.4 <- arrange( summarize.byevtype, desc(CROPDMG_SUM))
data.4 <- cleandata.sum.4[1:10, c("EVTYPE", "CROPDMG_SUM", "count")]
data.4
## Source: local data frame [10 x 3]
##
## EVTYPE CROPDMG_SUM count
## 1 FLOOD 136525000 1062
## 2 THUNDERSTORM WIND 130574000 3365
## 3 FLASH FLOOD 83648000 1060
## 4 HAIL 63578000 1128
## 5 HIGH WIND 44293000 488
## 6 DROUGHT 31269000 77
## 7 TORNADO 29602000 1160
## 8 TROPICAL STORM 24501000 66
## 9 HEAVY RAIN 20713000 246
## 10 STRONG WIND 15059000 277
cropdmg_sum_avg <- round(mean(cleandata.sum.4$CROPDMG_SUM, na.rm=TRUE))
cropdmg_sum_max <- round(max(cleandata.sum.4$CROPDMG_SUM, na.rm=TRUE))
cropdmg_index_max <- which( round(cleandata.sum.4$CROPDMG_SUM) == cropdmg_sum_max)
paste("CROPDMG: EVENT TYPE= ", cleandata.sum.4$EVTYPE[cropdmg_index_max],
" max value= ", cropdmg_sum_max, " avg/mean value is ", cropdmg_sum_avg)
## [1] "CROPDMG: EVENT TYPE= FLOOD max value= 136525000 avg/mean value is 13235065"
Create exploratory plots
ggplot( data.1, aes(reorder(EVTYPE, -FATALITIES_SUM),FATALITIES_SUM , fill=EVTYPE)) +
geom_histogram(stat="identity") + labs(x="Event Type", y="Total Fatalities") +
ggtitle("1950-2011 U.S. Total Fatalities of Top 10 Weather-Related Events ") +
theme(legend.position="bottom", axis.text.x=element_text(angle=25))
ggplot( data.3, aes(reorder(EVTYPE, -PROPDMG_SUM), PROPDMG_SUM, fill=EVTYPE)) +
geom_histogram(stat="identity") + geom_line() + labs(x="Event Type", y="Total Property Damage Estimate") +
ggtitle("1950-2011 U.S. Total Property Damage of Top 10 Weather-Related Events") +
theme(legend.position="bottom", axis.text.x=element_text(angle=25))
## geom_path: Each group consist of only one observation. Do you need to adjust the group aesthetic?
ggplot( data.4, aes(reorder(EVTYPE, -CROPDMG_SUM), CROPDMG_SUM, fill=EVTYPE)) +
geom_histogram(stat="identity") + labs(x="Event Type", y="Total Crop Damage Estimate") +
ggtitle("1950-2011 Total Crop Damage of Top 10 Weather-Related Events") +
theme(legend.position="bottom", axis.text.x=element_text(angle=25))