Synopsis: The following analysis data, code and report are based on storm database from the U.S National Oceanic and Atmospheric Administration (NOAA). Downloaded data were processed and analyzed in R. The objective is to explore and identify the most “disastrous” storms or other weather events, that cause the major health and economic damage. The findings of this analysis may facilitate the further prevention of those events.
library(data.table)
library(plyr)
library(ggplot2)
library(reshape2)
##
## Attaching package: 'reshape2'
## The following objects are masked from 'package:data.table':
##
## dcast, melt
dataUrl <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
download.file(dataUrl, destfile=file.path(getwd(),"StormData.csv.bz2"),
method = "curl")
storm <-fread(sprintf("bzcat %s", "StormData.csv.bz2"))
##
Read 8.3% of 967216 rows
Read 33.1% of 967216 rows
Read 49.6% of 967216 rows
Read 64.1% of 967216 rows
Read 78.6% of 967216 rows
Read 87.9% of 967216 rows
Read 902297 rows and 37 (of 37) columns from 0.523 GB file in 00:00:09
storm <- storm[,c("EVTYPE", "FATALITIES",
"INJURIES", "PROPDMG","PROPDMGEXP","CROPDMG","CROPDMGEXP")]
storm$EVTYPE<-tolower(trimws(storm$EVTYPE))
# look at the original events.
event.table <- as.data.frame(table(storm$EVTYPE))
event.table <- event.table[order(-event.table$Freq),]
event.table$percent <- event.table$Freq/sum(event.table$Freq)
event.table$cum <- cumsum(event.table$percent)
plot(event.table$cum)+abline(h = 0.98, col = "red", lty = 3)
## numeric(0)
Even though there are nearly thousands of events types, but: 1) the top 10 events contributes more than 90% of total events, 2) we can tidy the original events.
events.csv is an mannully cleaned events data, the original events are downloaded from in Jun 8, 2017: https://d396qusza40orc.cloudfront.net/repdata%2Fpeer2_doc%2Fpd01016005curr.pdf
you can also download the tidy event file from my github repository: https://github.com/Rui-Lian/reproducible-research-project-2/blob/master/events.csv
# read "official" events data
events48 <- read.csv(file.path(getwd(),"events.csv"),
header = FALSE, sep = ",")
# paste the events by row
events48 <- apply(events48,1,paste,collapse=" ")
# remove the white spaces
events48 <- trimws(events48)
#lower case the events
events48 <- tolower(events48)
events48
## [1] "astronomical low tide" "avalanche"
## [3] "blizzard" "coastal flood"
## [5] "cold wind chill" "debris flow"
## [7] "dense fog" "dense smoke"
## [9] "drought" "dust devil"
## [11] "dust storm" "excessive heat"
## [13] "extreme cold wind chill" "flash flood"
## [15] "flood" "frost freeze"
## [17] "funnel cloud" "freezing fog"
## [19] "hail" "heat"
## [21] "heavy rain" "heavy snow"
## [23] "high surf" "high wind"
## [25] "hurricane typhoon" "ice storm"
## [27] "lake-effect snow" "lakeshore flood"
## [29] "lightning" "marine hail"
## [31] "marine high wind" "marine strong wind"
## [33] "marine thunderstorm wind" "rip current"
## [35] "seiche" "sleet"
## [37] "storm surge tide" "strong wind"
## [39] "thunderstorm wind" "tornado"
## [41] "tropical depression" "tropical storm"
## [43] "tsunami" "volcanic ash"
## [45] "waterspout" "wildfire"
## [47] "winter storm" "winter weather"
method: calculate the “string distance between character vectors”. this approach is adopted from : https://www.r-bloggers.com/fuzzy-string-matching-a-survival-skill-to-tackle-unstructured-information/, by Bigdata Doc. Check it out for more methodology For more information on Levenshtein (edit) distance, check: https://en.wikipedia.org/wiki/Levenshtein_distance
dist.name<-adist(event.table$Var1, events48, partial = TRUE, ignore.case = TRUE)
# dist.name is a matrix that iterate the distance between variables in source1.devices and source2.devices.
min.name<-apply(dist.name, 1, min) # the minimum distance value of each row.
# min.name is a vector that is the minimum value of each row.
match.s1.s2<-NULL
# initiate a empty dataframe.
for(i in 1:nrow(dist.name))
{
s2.i<-match(min.name[i],dist.name[i,]) # for each row, match (reture column number) the column that has the minimum distance of that row
s1.i<-i #row number
match.s1.s2<-rbind(data.frame(s2.i=s2.i,s1.i=s1.i,name48=events48[s2.i],
name.raw=event.table$Var1[s1.i],
adist=min.name[i]),
match.s1.s2)
}
# and we then can have a look at the results
head(match.s1.s2,10)
## s2.i s1.i name48 name.raw adist
## 1 5 890 cold wind chill wnd 1
## 2 31 889 marine high wind winter storm/high winds 10
## 3 31 888 marine high wind winter storm/high wind 9
## 4 31 887 marine high wind winter storm high winds 10
## 5 5 886 cold wind chill wind/hail 3
## 6 26 885 ice storm wind storm 3
## 7 31 884 marine high wind wind chill/high wind 8
## 8 48 883 winter weather wind and wave 7
## 9 16 882 frost freeze wild/forest fires 9
## 10 48 881 winter weather wet weather 2
storm.new <- merge(storm, match.s1.s2, by.x = "EVTYPE", by.y = "name.raw")
storm.new$FATALITIES[storm.new$FATALITIES == ""] <- 0
storm.new$INJURIES[storm.new$INJURIES == ""] <- 0
storm.new$PROPDMG [storm.new$PROPDMG == ""] <- 0
storm.new$CROPDMG[storm.new$CROPDMG == ""] <- 0
storm.new$PROPDMGEXP[(storm.new$PROPDMGEXP == "")] <- 0
storm.new$PROPDMGEXP[(storm.new$PROPDMGEXP == "+") | (storm.new$PROPDMGEXP == "-") | (storm.new$PROPDMGEXP == "?")] <- 1
storm.new$PROPDMGEXP[(storm.new$PROPDMGEXP == "h") | (storm.new$PROPDMGEXP == "H")] <- 2
storm.new$PROPDMGEXP[(storm.new$PROPDMGEXP == "k") | (storm.new$PROPDMGEXP == "K")] <- 3
storm.new$PROPDMGEXP[(storm.new$PROPDMGEXP == "m") | (storm.new$PROPDMGEXP == "M")] <- 6
storm.new$PROPDMGEXP[(storm.new$PROPDMGEXP == "B")] <- 9
storm.new$CROPDMGEXP[(storm.new$CROPDMGEXP == "")] <- 0
storm.new$CROPDMGEXP[(storm.new$CROPDMGEXP == "+") | (storm.new$CROPDMGEXP == "-") | (storm.new$CROPDMGEXP == "?")] <- 1
storm.new$CROPDMGEXP[(storm.new$CROPDMGEXP == "h") | (storm.new$CROPDMGEXP == "H")] <- 2
storm.new$CROPDMGEXP[(storm.new$CROPDMGEXP == "k") | (storm.new$CROPDMGEXP == "K")] <- 3
storm.new$CROPDMGEXP[(storm.new$CROPDMGEXP == "m") | (storm.new$CROPDMGEXP == "M")] <- 6
storm.new$CROPDMGEXP[(storm.new$CROPDMGEXP == "B")] <- 9
storm.new$PROPDMGEXP <- as.integer(storm.new$PROPDMGEXP)
storm.new$CROPDMGEXP <- as.integer(storm.new$CROPDMGEXP)
storm.new$prop.dollar <- storm.new$PROPDMG * 10^storm.new$PROPDMGEXP
storm.new$crop.dollar <- storm.new$CROPDMG * 10^storm.new$CROPDMGEXP
fatal <- ddply(
storm.new,
.(name48),
summarize,
fatal=sum(FATALITIES)
)
fatal <- fatal[order(-fatal$fatal),]
fatal <- fatal[1:10,]
g <- ggplot(fatal, aes(reorder(name48,-fatal), y = fatal, fill = name48))
g <- g + geom_bar(stat = "identity")
g <- g + guides (fill = FALSE)
g <- g + theme(axis.text.x = element_text(angle = 90))
g <- g + xlab("events") + ylab("total fatal cases")
g <- g + ggtitle("leaing causes of fatal cases is tornado")
print(g)
Tornado is the leading cause of fatal accidents from 1996.
injury <- ddply(
storm.new,
.(name48),
summarize,
Injury=sum(INJURIES)
)
injury <- injury[order(-injury$Injury),]
injury <- injury[1:10,]
injury
## name48 Injury
## 19 tornado 91364
## 7 marine thunderstorm wind 9467
## 27 excessive heat 9077
## 14 coastal flood 6894
## 17 lightning 5231
## 3 ice storm 2128
## 32 flash flood 1802
## 35 high wind 1482
## 36 winter storm 1415
## 38 hail 1361
g <- ggplot(injury, aes(x = reorder(name48, -Injury), y = Injury, fill = name48))
g<- g + geom_bar(stat = "identity")
g <- g + guides (fill = FALSE)
g <- g + theme(axis.text.x = element_text(angle = 90))
g <- g + xlab("events") + ylab("total injury")
g <- g + ggtitle("leaing causes of injury cases is tornado")
print(g)
tornado is also the leading cause of injuries.
property <- data.frame(event = storm.new$name48,
crop = storm.new$crop.dollar,
prop = storm.new$prop.dollar)
property.melt <- melt(property, id.vars = "event", variable.name = "type", value.name = "value")
property.melt <- property.melt[order(-property.melt$value),]
g <- ggplot(property.melt, aes(x = reorder(event, -value), y = value, fill = event))
g <- g + geom_bar(stat = "identity")
g <- g + guides (fill = FALSE)
g <- g + theme(axis.text.x = element_text(angle = 90, size = 4))
g <- g + xlab("events") + ylab("total value lost")
g <- g + facet_grid(.~type)
g <- g + ggtitle("events that causes the economic damage.")
print(g)
costal flood is the leading cause of property lost. draught is the leading cause of crop damage.