Reproducible Research: Peer graded assignment 2

===================================================================================================================

Synopsis:

The document contains the analysis of severe weather effects in the USA from 1957 to November 2011. Data was obtained from the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database. The aim of the analysis is to establish which weather conditions are most harmful to the human population and which cause the most severe economic consequences. The data download and cleaning steps are included in this document for the purpose of reproducibility. At the end of each results section, a conclusion is drawn from the data about the most harmful or consequential event.

Data download

# Fetch the compressed storm data set. The download is skipped when the
# archive is already present, so re-running the document does not fetch the
# file again.
URL <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
if (!file.exists("storm.data.csv.bz2")) {
  # mode = "wb" transfers the bz2 archive as binary so it is not corrupted on
  # platforms that translate line endings. The default download method is used
  # so that no external curl executable has to be installed.
  download.file(URL, destfile = "storm.data.csv.bz2", mode = "wb")
}

1. Data processing

Data cleaning steps

  • Load the data into R
  • Subset the data to include only the required variables, for events where damage or harm occurred
  • Clean the data based on the valid/official event types provided on the NOAA site
library(stringdist)
# Read the raw storm records directly from the compressed CSV.
rawData <- read.csv("storm.data.csv.bz2")

# Keep only the rows where at least one harm/damage measure is positive, and
# only the columns used later in the analysis.
keepCols <- c("BGN_DATE", "EVTYPE", "FATALITIES", "INJURIES",
              "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP")
harmful <- with(
  rawData,
  FATALITIES > 0 | INJURIES > 0 | PROPDMG > 0 | CROPDMG > 0
)
# which() drops rows whose condition is NA, as subset() does.
subData <- rawData[which(harmful), keepCols]

# Parse the begin date and normalise the event label (character, trimmed,
# upper case) so it can be compared with the official event names.
subData$BGN_DATE <- as.Date(as.character(subData$BGN_DATE), format = "%m/%d/%Y")
subData$EVTYPE <- toupper(trimws(as.character(subData$EVTYPE)))

# Official event types, normalised the same way as EVTYPE. The order matters:
# later code picks entries by position and takes the first match in list order.
officialEvents <- trimws(toupper(c(
  "Astronomical Low Tide", "Avalanche", "Blizzard", "Coastal Flood",
  "Cold/Wind Chill", "Debris Flow", "Dense Fog", "Dense Smoke", "Drought",
  "Dust Devil", "Dust Storm", "Excessive Heat", "Extreme Cold/Wind Chill",
  "Freezing Fog", "Flash Flood", "Flood", "Frost/Freeze", "Funnel Cloud",
  "Lightning", "Hail", "Heat", "Heavy Rain", "Heavy Snow", "High Surf",
  "High Wind", "Hurricane (Typhoon)", "Ice Storm", "Lake-Effect Snow",
  "Lakeshore Flood", "Marine Hail", "Marine High Wind", "Marine Strong Wind",
  "Marine Thunderstorm Wind", "Rip Current", "Seiche", "Sleet",
  "Storm Surge/Tide", "Strong Wind", "Thunderstorm Wind", "Tornado",
  "Tropical Depression", "Tropical Storm", "Tsunami", "Volcanic Ash",
  "Waterspout", "Wildfire", "Winter Storm", "Winter Weather"
)))

# Pass 1: exact matches against the official event names.
Event <- subData$EVTYPE
matches <- match(Event, officialEvents)
cleanMatches <- subData[!is.na(matches), ]
dirtyMatches <- subData[is.na(matches), ]

# Pass 2: substring matching for the labels that did not match exactly.
#
# For one label, return the first official event (in officialEvents order)
# that either occurs inside the label or contains the label as a literal
# substring. Returns NA when no official event qualifies, or when the label
# itself is NA.
firstSubstringMatch <- function(label) {
  officialInLabel <- vapply(
    officialEvents, grepl, logical(1),
    x = label, fixed = TRUE, USE.NAMES = FALSE
  )
  labelInOfficial <- grepl(label, officialEvents, fixed = TRUE)
  # which() ignores NA, so an NA label falls through to NA.
  officialEvents[which(officialInLabel | labelInOfficial)[1]]
}

# Resolve each distinct label once and map the result back to the rows. This
# also works when there are no unmatched rows at all, where a 1:length() loop
# would iterate over c(1, 0) and fail.
dirtyLabels <- unique(dirtyMatches$EVTYPE)
resolvedLabels <- vapply(
  dirtyLabels, firstSubstringMatch, character(1),
  USE.NAMES = FALSE
)
Dmatch <- resolvedLabels[match(dirtyMatches$EVTYPE, dirtyLabels)]

# Split the unmatched rows by whether substring matching found an official
# event (Dmatch is NA where it did not).
dirtyMatches_1 <- dirtyMatches[is.na(Dmatch), ]
cleanMatches_1 <- dirtyMatches[!is.na(Dmatch), ]
cleanMatches_1$EVTYPE <- Dmatch[!is.na(Dmatch)]

# Pass 3: approximate string matching for what is still unmatched. amatch()
# is vectorised over its first argument, so no index loop is needed, and an
# empty input simply yields an empty result.
Dmatch.1 <- officialEvents[amatch(
  dirtyMatches_1$EVTYPE, officialEvents,
  maxDist = 15, weight = c(d = 1, i = 1, s = 1, t = 1)
)]

dirtyMatches_2 <- dirtyMatches_1[is.na(Dmatch.1), ]
cleanMatches_2 <- dirtyMatches_1[!is.na(Dmatch.1), ]
cleanMatches_2$EVTYPE <- Dmatch.1[!is.na(Dmatch.1)]

# Pass 4: manual assignment for the remaining rows, in row order. The numbers
# are positions in officialEvents, so they depend on the order of that list.
# NOTE(review): the mapping was written for exactly six leftover rows; confirm
# it still fits whenever the input data or the matching steps change.
manualEvents <- officialEvents[c(26, 26, 26, 14, 21, 25)]
if (nrow(dirtyMatches_2) != length(manualEvents)) {
  # Fail loudly instead of letting the assignment recycle the labels or stop
  # with an unrelated length error.
  stop(
    "Manual event assignment expects ", length(manualEvents),
    " unmatched rows but found ", nrow(dirtyMatches_2), ".",
    call. = FALSE
  )
}
dirtyMatches_2$EVTYPE <- manualEvents
cleanMatches_3 <- dirtyMatches_2

# Recombine all four groups into the final analysis data set.
cleanAnalysisData <- rbind(cleanMatches, cleanMatches_1, cleanMatches_2, cleanMatches_3)

2. Results

2.1 Population harm, i.e. deaths or injuries

library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(reshape2)
# Total fatalities and injuries per event type.
grouped <- group_by(cleanAnalysisData, EVTYPE)
humanDamage <- grouped %>%
  summarise(TOTFAT = sum(FATALITIES), TOTINJ = sum(INJURIES))

# Event types with the largest totals.
maxFat <- with(humanDamage, EVTYPE[which.max(TOTFAT)])
maxInj <- with(humanDamage, EVTYPE[which.max(TOTINJ)])

# Both bar charts rotate the x-axis labels so the event names stay readable.
rotatedLabels <- theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Fatalities per event type, with the count printed above each bar.
fat <- ggplot(humanDamage, aes(EVTYPE, TOTFAT)) +
  geom_bar(stat = "identity") +
  ggtitle("Total number of fatalities by Event type (1957 -2011)") +
  rotatedLabels +
  ylab("Number of fatalities") +
  xlab("Event") +
  geom_text(
    aes(label = TOTFAT),
    position = position_dodge(width = 0.9), size = 2, vjust = -0.25
  )
fat

# Injuries per event type, laid out the same way.
inj <- ggplot(humanDamage, aes(EVTYPE, TOTINJ)) +
  geom_bar(stat = "identity") +
  ggtitle("Total number of injuries by Event type (1957 -2011)") +
  rotatedLabels +
  ylab("Number of injuries") +
  xlab("Event") +
  geom_text(
    aes(label = TOTINJ),
    position = position_dodge(width = 0.9), size = 2, vjust = -0.25
  )
inj

  • TORNADOES have caused the highest number of deaths and injuries, and are therefore the most harmful event type with respect to population health

2.2 Economic consequences, i.e. property and crop damage

  • Convert the damage exponent codes (H, K, M, B or a digit) into numeric multipliers
  • Multiply each recorded damage figure by its multiplier to obtain the actual damage value
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(reshape2)
library(ggplot2)
# Rows with any property or crop damage, limited to the columns needed to
# compute damage values.
propertyDamage <- subset(
  cleanAnalysisData,
  PROPDMG > 0 | CROPDMG > 0,
  select = c(EVTYPE, PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP)
)

# Translate damage exponent codes into numeric multipliers.
#
# code: vector of exponent codes (character or factor).
# Returns a numeric vector of the same length:
#   "H"/"h" -> 1e2, "K"/"k" -> 1e3, "M"/"m" -> 1e6, "B"/"b" -> 1e9,
#   "0".."10" -> 10^digit, "" -> 1, anything else (including NA) -> NA.
#
# One vectorised helper serves both damage columns, so the two conversions
# cannot drift apart and neither depends on variables left over from the other.
exponentToMultiplier <- function(code) {
  code <- toupper(as.character(code))
  letterMultipliers <- c(H = 1e2, K = 1e3, M = 1e6, B = 1e9)
  # Indexing by name yields NA for every code that is not H, K, M or B.
  multiplier <- unname(letterMultipliers[code])
  isPower <- code %in% as.character(0:10)
  multiplier[isPower] <- 10^as.numeric(code[isPower])
  multiplier[!is.na(code) & code == ""] <- 1
  multiplier
}

# Converting exponent codes for property damage
propertyDamage$PROPDMGEXP <- exponentToMultiplier(propertyDamage$PROPDMGEXP)
# Converting exponent codes for crop damage
propertyDamage$CROPDMGEXP <- exponentToMultiplier(propertyDamage$CROPDMGEXP)

# Actual damage value = recorded figure * multiplier.
propertyDamage$PROPDMGVALUE <- propertyDamage$PROPDMG * propertyDamage$PROPDMGEXP
propertyDamage$CROPDMGVALUE <- propertyDamage$CROPDMG * propertyDamage$CROPDMGEXP

# Total property and crop damage per event type. Rows whose multiplier could
# not be determined (NA) are left out of the sums.
groupDMG <- group_by(propertyDamage, EVTYPE)
eventsDMG <- groupDMG %>%
  summarise(
    PROPDAMAGEVALUE = sum(PROPDMGVALUE, na.rm = TRUE),
    CROPDAMAGEVALUE = sum(CROPDMGVALUE, na.rm = TRUE)
  )

# Event types with the largest totals.
propMax <- with(eventsDMG, EVTYPE[which.max(PROPDAMAGEVALUE)])
cropMax <- with(eventsDMG, EVTYPE[which.max(CROPDAMAGEVALUE)])

# Long format: one row per (event type, damage type), with values in millions.
dmg <- melt(
  eventsDMG,
  id.vars = "EVTYPE",
  variable.name = "DAMAGETYPE",
  value.name = "TOTALVALUE"
)
dmg$DAMAGETYPE <- as.factor(dmg$DAMAGETYPE)
dmg$TOTALVALUE <- dmg$TOTALVALUE / 1e6

# Side-by-side bars per event type, flipped so the event names read
# horizontally.
d <- ggplot(dmg, aes(EVTYPE, TOTALVALUE, fill = DAMAGETYPE)) +
  xlab("Event type") +
  ylab("Total value (in millions)") +
  geom_bar(stat = "identity", position = "dodge") +
  ggtitle("Damage caused by severe weather from 1957 to 2011") +
  coord_flip()
d

  • FLOODS have caused the highest property damage value, while DROUGHT is the biggest cause of crop damage. Both crop and property damage are considered contributors to economic consequences