##The data file is decompressed and loaded into R through fread function
library(data.table, quietly = TRUE)
library(dplyr, quietly = TRUE, warn.conflicts = FALSE)
## Decompress the raw data once; -k keeps the .bz2 archive next to the CSV.
## system2() is used instead of shell() because shell() only exists on
## Windows builds of R, so the previous call failed on other platforms.
if (!file.exists("repdata_data_StormData.csv")) {
  bzip2_status <- system2(
    "bzip2",
    c("-d", "-k", "repdata_data_StormData.csv.bz2")
  )
  ## Fail here with a clear message instead of letting fread() fail later
  ## on a missing or partial file.
  if (bzip2_status != 0 || !file.exists("repdata_data_StormData.csv")) {
    stop(
      "Failed to decompress repdata_data_StormData.csv.bz2 with bzip2",
      call. = FALSE
    )
  }
}
## nrows is specified only to suppress a warning message; the file holds
## 902297 rows in total
DF_StormData <- fread(
  "repdata_data_StormData.csv",
  nrows = 902298,
  showProgress = FALSE
)
## NOTE(review): tbl_df() is deprecated in recent dplyr releases in favour of
## as_tibble() - confirm the minimum supported dplyr version before switching.
tbl_df_StormData <- tbl_df(DF_StormData)
## Keep rows where at least one of fatalities, injuries, property damage or
## crop damage is non-zero, and drop rows whose event type is "?"
filtered_StormData <- filter(
  tbl_df_StormData,
  FATALITIES > 0 | INJURIES > 0 | PROPDMG > 0 | CROPDMG > 0,
  EVTYPE != "?"
)
## Data Cleaning
## Add a normalized event-type column derived from the EVTYPE column, so that the standard type names defined in repdata_peer2_doc_pd01016005curr.pdf are used and the number of distinct event types caused by inconsistent spelling is reduced
## Function definition part
## Map raw EVTYPE labels onto a reduced set of normalized event type names.
##
## arg_EVTYPE: vector of raw event type labels (anything tolower() accepts).
## Returns a character vector of the same length holding the normalized
## names (lower case, letters only, e.g. "thunderstormwind", "flashflood").
func_eliminatedupeventtype <- function(arg_EVTYPE) {
  ## Lower-case, keep letters only, then strip one trailing "s" (plural form)
  event <- tolower(arg_EVTYPE)
  event <- gsub("[^A-Za-z]", "", event)
  event <- gsub("s$", "", event)

  ## Ordered rewrite rules: name = regular expression, value = replacement.
  ## The rules are applied one after another, so ORDER MATTERS. Several
  ## groups park a label under a short temporary token (lakeeffect, coastal,
  ## flash, exco, frefo, exhe, maha) so that a broader pattern applied next
  ## does not swallow it; the token is expanded to the final name afterwards.
  rules <- c(
    ## thunderstorm wind
    "^thu.*" = "thunderstormwind",
    "^tst.*" = "thunderstormwind",
    "^tun.*" = "thunderstormwind",
    "^severe.*storm.*" = "thunderstormwind",
    ".*burst.*" = "thunderstormwind",
    "apachecounty" = "thunderstormwind",
    "gustnado" = "thunderstormwind",
    ## lake effect snow and heavy snow
    "heavylakesnow" = "lakeeffect",
    "lakeeffectsnow" = "lakeeffect",
    ".*snow.*" = "heavysnow",
    "lakeeffect" = "lakeeffectsnow",
    ## flood, coastal flood and flash flood
    ".*coastal.*" = "coastal",
    ".*flash.*" = "flash",
    ".*flood.*" = "flood",
    ".*urban.*" = "flood",
    "coastal" = "coastalflood",
    "flash" = "flashflood",
    ## winter storm and winter weather
    ".*winterstorm.*" = "winterstorm",
    ".*winterweather.*" = "winterweather",
    ".*mixed.*" = "winterweather",
    "wintrymix" = "winterweather",
    "glaze" = "winterweather",
    "heavymix" = "winterweather",
    ## heavy rain and lightning
    ".*rain.*" = "heavyrain",
    "dambreak" = "heavyrain",
    "drowning" = "heavyrain",
    "excessivewetnes" = "heavyrain",
    "heavyprecipitation" = "heavyrain",
    "heavyshower" = "heavyrain",
    "rapidlyrisingwater" = "heavyrain",
    ".*lig.*" = "lightning",
    ## ice storm
    ".*ice.*" = "icestorm",
    "^ic.*" = "icestorm",
    ## hurricane/typhoon
    "hurri.*" = "hurricane",
    "(hurri|typh).*" = "hurricanetyphoon",
    ## waterspout
    ".*waterspout.*" = "waterspout",
    ## tornado
    "landspout" = "tornado",
    ".*torn.*" = "tornado",
    ## cold/wind chill and extreme cold/wind chill
    "^(ex|re).*cold.*" = "exco",
    ".*cold.*" = "coldwindchill",
    "lowtemp.*" = "coldwindchill",
    ".*hyp.*" = "coldwindchill",
    "coolandwet" = "coldwindchill",
    ".*exco.*" = "extremecoldwindchill",
    "extremewindchill" = "extremecoldwindchill",
    ## frost/freeze, dense fog and freezing fog
    "freezingspray|freezingfog" = "frefo",
    ".*free.*" = "frostfreeze",
    "^fog$" = "densefog",
    "frefo" = "freezingfog",
    ".*frost.*" = "frostfreeze",
    ## drought, excessive heat and heat
    ".*drou.*" = "drought",
    "^(ex|re).*heat.*" = "exhe",
    "heat.*" = "heat",
    ".*warm.*" = "heat",
    "exhe.*" = "excessiveheat",
    ## tropical storm
    "tropicals.*" = "tropicalstorm",
    ## marine winds
    "^marinet.*wind" = "marinethunderstormwind",
    ".*wind.*sea.*" = "marinehighwind",
    "marinemishap" = "marinestrongwind",
    ## high wind
    "^highwind.*" = "highwind",
    "duststormhighwind" = "highwind",
    "gustywind" = "highwind",
    "nontstmwind" = "highwind",
    "^high$" = "highwind",
    "^severe.*" = "highwind",
    ## strong wind
    "^wind.*" = "strongwind",
    "gradientwind" = "strongwind",
    "nonseverewinddamage" = "strongwind",
    "stormforcewind" = "strongwind",
    ## hail and marine hail
    "marinehail" = "maha",
    ".*hail.*" = "hail",
    "maha" = "marinehail",
    ## high surf
    ".*surf.*" = "highsurf",
    ".*swell.*" = "highsurf",
    "beacherosion" = "highsurf",
    "^highsea$" = "highsurf",
    "^highwater$" = "highsurf",
    "^highwave$" = "highsurf",
    "^landslump$" = "highsurf",
    "^marineaccident$" = "highsurf",
    "^roguewave$" = "highsurf",
    "^roughsea$" = "highsurf",
    ## dust devil
    "whirlwind" = "dustdevil",
    "blowingdust" = "dustdevil",
    ## blizzard, storm surge/tide, wildfire and debris flow
    ".*blizzard.*" = "blizzard",
    ".*stormsurge.*" = "stormsurgetide",
    ".*hightide.*" = "stormsurgetide",
    ".*fire.*" = "wildfire",
    ".*slide.*" = "debrisflow"
  )
  for (i in seq_along(rules)) {
    event <- gsub(names(rules)[i], rules[[i]], event)
  }

  ## Rows labelled "other" are re-labelled from a fixed, positional list of
  ## 34 names (presumably classified by hand for this dataset in file order -
  ## TODO confirm). The list is only meaningful when exactly that many
  ## "other" rows are present; previously it was assigned unconditionally,
  ## so any other count was silently recycled or truncated and mislabelled
  ## rows. With a different count the rows now keep the label "other".
  other_labels <- c(
    "winterweather", "dustdevil", rep("heavyrain", 30),
    "dustdevil", "dustdevil"
  )
  index_other <- which(event == "other")
  if (length(index_other) == length(other_labels)) {
    event[index_other] <- other_labels
  }

  ## Remaining rules, applied after the "other" re-labelling as before
  event <- gsub(".*aval.*", "avalanche", event)
  event <- gsub("^heavysea$", "ripcurrent", event)
  event
}
## Calculate the actual damage value from a DMG column and its DMGEXP column.
##
## arg_DMG:    numeric damage figures.
## arg_DMGEXP: exponent codes, one per figure.
## Returns the figure multiplied according to its code:
##   h/H -> 100, k/K -> 1000, m/M -> 1e6, b/B -> 1e9,
##   "-" or "" -> the value 0, a digit 0-8 -> 10, any other code -> 1.
func_dmgvalue <- function(arg_DMG, arg_DMGEXP) {
  exp_code <- tolower(arg_DMGEXP)

  ## Start from the lowest-priority case and let each later step override
  ## it; the codes are mutually exclusive, so the order only fixes which
  ## test shapes the final result.
  value <- ifelse(arg_DMGEXP %in% 0:8, arg_DMG * 10, arg_DMG)
  value <- ifelse(arg_DMGEXP == "-" | arg_DMGEXP == "", 0, value)
  value <- ifelse(exp_code == "b", arg_DMG * 1000000000, value)
  value <- ifelse(exp_code == "m", arg_DMG * 1000000, value)
  value <- ifelse(exp_code == "k", arg_DMG * 1000, value)
  value <- ifelse(exp_code == "h", arg_DMG * 100, value)
  value
}
## Execution part
## Append the normalized event type and the property / crop damage values
## (computed from each DMG figure and its DMGEXP code) to the filtered data.
filtered_StormData <- mutate(
  filtered_StormData,
  ProcessedEVTYPE = func_eliminatedupeventtype(EVTYPE),
  PROPDMGVALUE = func_dmgvalue(PROPDMG, PROPDMGEXP),
  CROPDMGVALUE = func_dmgvalue(CROPDMG, CROPDMGEXP)
)
library(data.table, quietly = TRUE)
library(dplyr, quietly = TRUE, warn.conflicts = FALSE)
library(grid, quietly = TRUE)
library(ggplot2, quietly = TRUE)
## Calculations and figures for the 1st question.
## Totals are computed per normalized event type; the two damage totals are
## divided by 10^9 so they are expressed in billions.
by_ProcessedEVTYPE <- filtered_StormData %>% group_by(ProcessedEVTYPE)
df_result <- by_ProcessedEVTYPE %>%
  summarise(
    sumFATALITIES = sum(FATALITIES),
    sumINJURIES = sum(INJURIES),
    sumPROPDMGVALUE = sum(PROPDMGVALUE) / 1e9,
    sumCROPDMGVALUE = sum(CROPDMGVALUE) / 1e9
  )

## Fatalities: event types sorted by total, first ten rows plotted
df_sumFATALITIES <- select(
  arrange(df_result, desc(sumFATALITIES)),
  ProcessedEVTYPE, sumFATALITIES
)
p1 <- ggplot(
  df_sumFATALITIES[1:10, ],
  aes(x = reorder(ProcessedEVTYPE, -sumFATALITIES), y = sumFATALITIES)
) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = sumFATALITIES), vjust = -0.2) +
  labs(
    x = "EVTYPE",
    y = "FATALITIES (unit:person)",
    title = "THE TOP 10 EVENT TYPES RESULTING \nIN FATALITIES(1950-2011)"
  ) +
  theme(plot.title = element_text(size = 18)) +
  coord_cartesian(ylim = c(0, 6500))

## Injuries: event types sorted by total, first ten rows plotted
df_sumINJURIES <- select(
  arrange(df_result, desc(sumINJURIES)),
  ProcessedEVTYPE, sumINJURIES
)
p2 <- ggplot(
  df_sumINJURIES[1:10, ],
  aes(x = reorder(ProcessedEVTYPE, -sumINJURIES), y = sumINJURIES)
) +
  geom_bar(stat = "identity", color = "darkgrey", fill = "darkgrey") +
  geom_text(aes(label = sumINJURIES), vjust = -0.2) +
  labs(
    x = "EVTYPE",
    y = "INJURIES (unit:person)",
    title = "THE TOP 10 EVENT TYPES RESULTING \nIN INJURIES(1950-2011)"
  ) +
  theme(plot.title = element_text(size = 18)) +
  coord_cartesian(ylim = c(0, 100000))

## Draw both charts on one page: a 2-row, 1-column grid layout
grid.newpage()
pushViewport(viewport(layout = grid.layout(nrow = 2, ncol = 1)))
vplayout <- function(x, y) {
  viewport(layout.pos.row = x, layout.pos.col = y)
}
print(p1, vp = vplayout(1, 1))
print(p2, vp = vplayout(2, 1))
Tornado, excessive heat and heat are the top 3 event types that are most harmful with respect to fatalities.
Tornado, thunderstorm wind and flood are the top 3 event types that are most harmful with respect to injuries.
## Calculations and figures for the 2nd question.
## df_result already holds the damage totals per event type, in billions.

## Property damage: event types sorted by total, first ten rows plotted
df_sumPROPDMG <- select(
  arrange(df_result, desc(sumPROPDMGVALUE)),
  ProcessedEVTYPE, sumPROPDMGVALUE
)
p3 <- ggplot(
  df_sumPROPDMG[1:10, ],
  aes(x = reorder(ProcessedEVTYPE, -sumPROPDMGVALUE), y = sumPROPDMGVALUE)
) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = format(sumPROPDMGVALUE, digits = 3)), vjust = -0.2) +
  labs(
    x = "EVTYPE",
    y = "PROPERTY DAMAGE (unit:billion)",
    title = "THE TOP 10 EVENT TYPES RESULTING \nIN PROPERTY DAMAGE(1950-2011)"
  ) +
  theme(plot.title = element_text(size = 18)) +
  coord_cartesian(ylim = c(0, 165))

## Crop damage: event types sorted by total, first ten rows plotted
df_sumCROPDMG <- select(
  arrange(df_result, desc(sumCROPDMGVALUE)),
  ProcessedEVTYPE, sumCROPDMGVALUE
)
p4 <- ggplot(
  df_sumCROPDMG[1:10, ],
  aes(x = reorder(ProcessedEVTYPE, -sumCROPDMGVALUE), y = sumCROPDMGVALUE)
) +
  geom_bar(stat = "identity", color = "darkgrey", fill = "darkgrey") +
  geom_text(aes(label = format(sumCROPDMGVALUE, digits = 2)), vjust = -0.2) +
  labs(
    x = "EVTYPE",
    y = "CROP DAMAGE (unit:billion)",
    title = "THE TOP 10 EVENT TYPES RESULTING \nIN CROP DAMAGE(1950-2011)"
  ) +
  theme(plot.title = element_text(size = 18)) +
  coord_cartesian(ylim = c(0, 16))

## Draw both charts on one page: a 2-row, 1-column grid layout
grid.newpage()
pushViewport(viewport(layout = grid.layout(nrow = 2, ncol = 1)))
vplayout <- function(x, y) {
  viewport(layout.pos.row = x, layout.pos.col = y)
}
print(p3, vp = vplayout(1, 1))
print(p4, vp = vplayout(2, 1))
Flood, hurricane/typhoon and tornado are the top 3 event types that are most harmful with respect to property damage.
Drought, flood and hurricane/typhoon are the top 3 event types that are most harmful with respect to crop damage.