=================================================================================================================== Synopsis:
The document contains the analysis of severe weather effects in the USA from 1957 to November 2011. Data was obtained from the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database. The aim of the analysis is to establish which weather conditions are most harmful to the human population and which cause the most severe economic consequences. The data download and cleaning steps are included in this document for the purpose of reproducibility. At the end of each results section, a conclusion is drawn from the data about the most harmful or consequential event.
Data download
# Location of the compressed NOAA storm data archive.
URL <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
# Fetch the archive only when it is not already on disk, so that re-running
# the analysis does not download the file again.
if (!file.exists("storm.data.csv.bz2")) {
  download.file(URL, destfile = "storm.data.csv.bz2", method = "curl")
}
Data cleaning steps
library(stringdist)
# Load the raw storm records from the downloaded archive.
rawData <- read.csv("storm.data.csv.bz2")

# Keep only events with at least one fatality, injury, or some property or
# crop damage, and only the columns used later in the analysis.
keepCols <- c("BGN_DATE", "EVTYPE", "FATALITIES", "INJURIES",
              "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP")
hasImpact <- with(
  rawData,
  FATALITIES > 0 | INJURIES > 0 | PROPDMG > 0 | CROPDMG > 0
)
# which() drops rows whose condition is NA, as subset() does.
subData <- rawData[which(hasImpact), keepCols]

# Parse the begin date (month/day/year) and normalise the event type to
# trimmed upper-case text so it can be compared with the official names.
subData$BGN_DATE <- as.Date(as.character(subData$BGN_DATE), format = "%m/%d/%Y")
subData$EVTYPE <- toupper(trimws(as.character(subData$EVTYPE)))
# Official event type names, stored trimmed and upper-cased to match the
# normalised EVTYPE values. The element order is significant: later steps
# take the first match in this order and also index the vector by position.
officialEvents <- trimws(toupper(c(
  "Astronomical Low Tide", "Avalanche", "Blizzard", "Coastal Flood",
  "Cold/Wind Chill", "Debris Flow", "Dense Fog", "Dense Smoke",
  "Drought", "Dust Devil", "Dust Storm", "Excessive Heat",
  "Extreme Cold/Wind Chill", "Freezing Fog", "Flash Flood", "Flood",
  "Frost/Freeze", "Funnel Cloud", "Lightning", "Hail",
  "Heat", "Heavy Rain", "Heavy Snow", "High Surf",
  "High Wind", "Hurricane (Typhoon)", "Ice Storm", "Lake-Effect Snow",
  "Lakeshore Flood", "Marine Hail", "Marine High Wind", "Marine Strong Wind",
  "Marine Thunderstorm Wind", "Rip Current", "Seiche", "Sleet",
  "Storm Surge/Tide", "Strong Wind", "Thunderstorm Wind", "Tornado",
  "Tropical Depression", "Tropical Storm", "Tsunami", "Volcanic Ash",
  "Waterspout", "Wildfire", "Winter Storm", "Winter Weather"
)))
# Split the records by whether EVTYPE is already an exact official name.
Event <- subData$EVTYPE
matches <- match(Event, officialEvents)
isOfficial <- !is.na(matches)
cleanMatches <- subData[isOfficial, ]
dirtyMatches <- subData[!isOfficial, ]
# Second pass: resolve unofficial event names by substring matching.
Dmatch <- dirtyMatches$EVTYPE

# Return the first official event (in the order of `events`) that either is
# contained in `name` or contains `name`; NA when there is none.
# Because containment is tested in both directions, a short name is assigned
# to the first official name that contains it.
#   name:   a single event name (character scalar).
#   events: character vector of official event names.
matchBySubstring <- function(name, events) {
  if (is.na(name)) {
    return(NA_character_)
  }
  for (official in events) {
    if (grepl(official, name, fixed = TRUE) ||
        grepl(name, official, fixed = TRUE)) {
      return(official)
    }
  }
  NA_character_
}

# Resolve each distinct name once and map the result back to every row;
# this also handles the case of no unofficial names at all.
uniqueDirty <- unique(Dmatch)
resolved <- vapply(uniqueDirty, matchBySubstring, character(1),
                   events = officialEvents, USE.NAMES = FALSE)
Dmatch <- resolved[match(Dmatch, uniqueDirty)]

# Rows still unresolved go on to the next pass; resolved rows take the
# official name.
dirtyMatches_1 <- dirtyMatches[is.na(Dmatch), ]
cleanMatches_1 <- dirtyMatches[!is.na(Dmatch), ]
cleanMatches_1$EVTYPE <- Dmatch[!is.na(Dmatch)]
# Third pass: approximate (edit-distance) matching of the remaining names
# against the official list. amatch() accepts the whole vector at once, so
# no per-row loop is needed and an empty input stays empty.
Dmatch.1 <- dirtyMatches_1$EVTYPE
nearestOfficial <- amatch(Dmatch.1, officialEvents, maxDist = 15,
                          weight = c(d = 1, i = 1, s = 1, t = 1))
# An NA position (no official name within maxDist) yields an NA name.
Dmatch.1 <- officialEvents[nearestOfficial]

# Rows still unresolved are handled manually in the next step.
dirtyMatches_2 <- dirtyMatches_1[is.na(Dmatch.1), ]
cleanMatches_2 <- dirtyMatches_1[!is.na(Dmatch.1), ]
cleanMatches_2$EVTYPE <- Dmatch.1[!is.na(Dmatch.1)]
# Final pass: the records left after fuzzy matching are assigned by hand.
# The positions refer to officialEvents: 26 = HURRICANE (TYPHOON),
# 14 = FREEZING FOG, 21 = HEAT, 25 = HIGH WIND.
# NOTE(review): the assignment is positional, so it is only valid when the
# leftover rows appear in exactly the order the mapping was written for.
manualEvents <- officialEvents[c(26, 26, 26, 14, 21, 25)]
# Fail loudly on a different row count rather than letting the replacement
# be recycled silently.
if (nrow(dirtyMatches_2) != length(manualEvents)) {
  stop("Expected ", length(manualEvents),
       " unmatched event records for manual assignment, found ",
       nrow(dirtyMatches_2), call. = FALSE)
}
dirtyMatches_2$EVTYPE <- manualEvents
cleanMatches_3 <- dirtyMatches_2

# Combine the results of all four matching passes into the analysis data set.
cleanAnalysisData <- rbind(cleanMatches, cleanMatches_1,
                           cleanMatches_2, cleanMatches_3)
2.1 Population harm, i.e. deaths or injuries
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(reshape2)
# Total fatalities and injuries per event type.
humanDamage <- cleanAnalysisData %>%
  group_by(EVTYPE) %>%
  summarise(TOTFAT = sum(FATALITIES), TOTINJ = sum(INJURIES))

# Event types with the highest fatality and injury totals.
maxFat <- with(humanDamage, EVTYPE[which.max(TOTFAT)])
maxInj <- with(humanDamage, EVTYPE[which.max(TOTINJ)])
# Bar chart of total fatalities per event type, with each total printed
# above its bar and the event names rotated to fit on the x axis.
fat <- ggplot(humanDamage, aes(EVTYPE, TOTFAT)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = TOTFAT), position = position_dodge(width = 0.9),
            size = 2, vjust = -0.25) +
  labs(title = "Total number of fatalities by Event type (1957 -2011)",
       x = "Event", y = "Number of fatalities") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
fat
# Bar chart of total injuries per event type, with each total printed
# above its bar and the event names rotated to fit on the x axis.
inj <- ggplot(humanDamage, aes(EVTYPE, TOTINJ)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = TOTINJ), position = position_dodge(width = 0.9),
            size = 2, vjust = -0.25) +
  labs(title = "Total number of injuries by Event type (1957 -2011)",
       x = "Event", y = "Number of injuries") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
inj
2.2 Economic consequences, i.e. property and crop damage
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(reshape2)
library(ggplot2)
# Keep only records with some property or crop damage, and only the columns
# needed to value that damage.
propertyDamage <- subset(
  cleanAnalysisData,
  PROPDMG > 0 | CROPDMG > 0,
  select = c(EVTYPE, PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP)
)

# Convert damage exponent codes to numeric multipliers.
#   code: vector of exponent codes (character or factor).
# Returns a numeric vector the same length as `code`:
#   "H"/"h" -> 1e2, "K"/"k" -> 1e3, "M"/"m" -> 1e6, "B"/"b" -> 1e9,
#   "0" to "10" -> 10^n, "" -> 1, anything else -> NA.
damageMultiplier <- function(code) {
  code <- toupper(as.character(code))
  letterValues <- c(H = 1e2, K = 1e3, M = 1e6, B = 1e9)
  # Codes that are not one of the letters (including "") look up as NA.
  multiplier <- unname(letterValues[code])
  isPower <- code %in% as.character(0:10)
  multiplier[isPower] <- 10^as.numeric(code[isPower])
  # A blank code means the damage figure is taken as is.
  multiplier[code %in% ""] <- 1
  multiplier
}

# Apply the same conversion to the property and crop exponent columns.
propertyDamage$PROPDMGEXP <- damageMultiplier(propertyDamage$PROPDMGEXP)
propertyDamage$CROPDMGEXP <- damageMultiplier(propertyDamage$CROPDMGEXP)
# Monetary value of each record: damage figure times its multiplier.
propertyDamage <- propertyDamage %>%
  mutate(PROPDMGVALUE = PROPDMG * PROPDMGEXP,
         CROPDMGVALUE = CROPDMG * CROPDMGEXP)

# Total property and crop damage per event type; records whose multiplier
# could not be determined (NA) are left out of the sums.
eventsDMG <- propertyDamage %>%
  group_by(EVTYPE) %>%
  summarise(PROPDAMAGEVALUE = sum(PROPDMGVALUE, na.rm = TRUE),
            CROPDAMAGEVALUE = sum(CROPDMGVALUE, na.rm = TRUE))

# Event types with the largest property and crop damage totals.
propMax <- with(eventsDMG, EVTYPE[which.max(PROPDAMAGEVALUE)])
cropMax <- with(eventsDMG, EVTYPE[which.max(CROPDAMAGEVALUE)])
# Reshape to long form: one row per event type and damage type. The
# argument is spelled out as id.vars rather than relying on partial
# matching of `id`, and the output column names are set in the same call.
dmg <- melt(eventsDMG, id.vars = "EVTYPE",
            variable.name = "DAMAGETYPE", value.name = "TOTALVALUE")
dmg$DAMAGETYPE <- as.factor(dmg$DAMAGETYPE)
# Express the totals in millions for the axis.
dmg$TOTALVALUE <- dmg$TOTALVALUE / 1000000

# Horizontal bar chart with property and crop damage side by side for each
# event type.
d <- ggplot(dmg, aes(EVTYPE, TOTALVALUE, fill = DAMAGETYPE)) +
  xlab("Event type") +
  ylab("Total value (in millions)") +
  geom_bar(stat = "identity", position = "dodge") +
  ggtitle("Damage caused by severe weather from 1957 to 2011") +
  coord_flip()
d