Synopsis

Storms and other severe weather events can cause both public health and economic problems for communities and municipalities. Many severe events can result in fatalities, injuries, and property damage, and preventing such outcomes to the extent possible is a key concern.

This project involves exploring the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database to find which weather events are host harmful to population health and economic consequences in the US.

Data processing

Install and Load packages needed

library(reshape2)
library(plyr)

Downloading and reading input data

if (!file.exists("storm.data.csv")) {
    fileUrl <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
    download.file(fileUrl, destfile="storm.data.csv.bz2", method="curl")
   
}
input1 <- as.data.frame(read.csv(bzfile("storm.data.csv.bz2")))
# exporing database, its structure, column  and row names
dim(input1)
## [1] 902297     37
colnames(input1)
##  [1] "STATE__"    "BGN_DATE"   "BGN_TIME"   "TIME_ZONE"  "COUNTY"    
##  [6] "COUNTYNAME" "STATE"      "EVTYPE"     "BGN_RANGE"  "BGN_AZI"   
## [11] "BGN_LOCATI" "END_DATE"   "END_TIME"   "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE"  "END_AZI"    "END_LOCATI" "LENGTH"     "WIDTH"     
## [21] "F"          "MAG"        "FATALITIES" "INJURIES"   "PROPDMG"   
## [26] "PROPDMGEXP" "CROPDMG"    "CROPDMGEXP" "WFO"        "STATEOFFIC"
## [31] "ZONENAMES"  "LATITUDE"   "LONGITUDE"  "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS"    "REFNUM"

Question 1 - Across the United States, which types of events (as indicated in the EVTYPE variable) are most harmful with respect to population health?

For this question, we will consider the FATALITIES and INJURIES columns of the data as a representation of population health.

Question 2 - Across the United States, which types of events have the greatest economic consequences?

For this question, we will consider the CROP DAMAGE and PROPERTY DAMAGE related columns of the data as a representation of economic consequence.

We first tidy the data

#sort in descending order based on number of FATALITIES and INJURIES
group1 <- plyr::arrange(input1, desc(input1$FATALITIES), desc(input1$INJURIES))

#sort in descending order based on greatest crop damage and property damage
group2 <- plyr::arrange(input1, desc(input1$PROPDMG), desc(CROPDMG))
library(dplyr)

Remove rows that have values = 0 (Remove rows with 0 injuries and 0 fatalities , 0 crop and property damage)

#for Q1 - Injurires and fatalities
group1_nonzero <- dplyr::filter(group1, FATALITIES>0 | INJURIES>0)
group1_zero <- dplyr::filter(group1, FATALITIES==0 & INJURIES==0)

#For Q2 - crop and property damage
group2_nonzero <- dplyr::filter(group2, PROPDMG>0 | CROPDMG>0)
group2_zero <- dplyr::filter(group2, PROPDMG==0 & CROPDMG==0)

From looking at the EVTYPE column in both group1_nonzero and group2_nonzero, we can see that it is very messy. To clean it up, I used the 48 categories given in the supplementary document https://d396qusza40orc.cloudfront.net/repdata%2Fpeer2_doc%2Fpd01016005curr.pdf and created a mapping table that maps the values in the EVTYPE column to one of the official categories listed in this document

Importing this mapping file from github

if (!file.exists("Mapping.txt")) {
    fileUrl <- "https://github.com/kb472/RepRes_PA2/blob/master/Mapping.txt"
    download.file(fileUrl, destfile="Mapping.txt", method="curl")
}
map <- as.data.frame(read.table("Mapping.txt",header = TRUE,sep="\t"))

Mapping the values in the EVTYPE to values in the mapping file using a function and lapply

funcLoop <- function(x) {
    oneKey <- x #assume x is one row
    #converting to lower case to make it case in-sensitive.
    match1 <- match(tolower(oneKey), tolower(map$Name))
    oneValue <- map$Value[match1]
    return((oneValue))
}

Results

Results for Question 1

First formatting data to answer Q1

#Applying the function over the every element in list. Faster than for loop    
event.type <- lapply(group1_nonzero$EVTYPE, FUN = funcLoop)
event.type.matrix <- unlist(event.type) #making format readable
group1_nonzero_new <- {}
group1_nonzero_new <- cbind(group1_nonzero, event.type.matrix) #creating new dataframe

colnames(group1_nonzero_new)[38] <- "eventtype" #proper label

Considering both FATALITIES and INJURIES to impact population health

#Adding fatalities and injuries to form new column "total"
group1_nonzero_new["total"] <- NA # adding new column
group1_nonzero_new$total <- group1_nonzero_new$FATALITIES + group1_nonzero_new$INJURIES

#melting the data frame to get required answer to question "1.    
melt1 <- melt(data = group1_nonzero_new, id.vars = c("eventtype"), measure.vars = "total")
dcast1 <- dcast(melt1, eventtype ~ variable, sum)

dcast1.sort <- plyr::arrange(dcast1, desc(dcast1$total))
#This gives the total fatalities and injuries for each event type

head(dcast1.sort)
##           eventtype total
## 1           Tornado 97100
## 2 Thunderstorm Wind 10239
## 3    Excessive Heat  9325
## 4             Flood  7303
## 5         Lightning  6049
## 6              Heat  3096

We can see that Tornadoes cause the most fatalities and injurires in the US

Results for Question 2

We consider the following columns: CROPDMGEXP (Crop damage exponent), CROPDMG (Drop damage value), PROPDMGEXP (Property damage exponent), PROPDMG (property damage value)

#Applying the function over the every element in list. Faster than for loop    
event.type2 <- lapply(group2_nonzero$EVTYPE, FUN = funcLoop)
event.type.matrix2 <- unlist(event.type2) #making format readable
group2_nonzero_new <- cbind(group2_nonzero, event.type.matrix2) #creating new dataframe

colnames(group2_nonzero_new)[38] <- "eventtype" #proper label

When we examine the PROPDMGEXP and the CROPDMGEXP columns, we can see that it is messy

unique(group2_nonzero_new$PROPDMGEXP)
##  [1] K M 0 B 5   7 + 6 m 3 - 2 4 H h
## Levels:  - ? + 0 1 2 3 4 5 6 7 8 B h H K m M
unique(group2_nonzero_new$CROPDMGEXP)
## [1] K   M B ? 0 k m
## Levels:  ? 0 2 B k K m M

Cleaning the PROPDMGEXP and the CROPDMGEXP columns

#make copy of object
group2_nonzero_copy <- group2_nonzero_new

#make a copy of the PROPDMGEXP column
group2_nonzero_copy$PropdmgexpNew <- as.character(group2_nonzero_copy$PROPDMGEXP)

#Cleaning the exponents
group2_nonzero_copy[group2_nonzero_copy$PropdmgexpNew %in% c("K","k"), "PropdmgexpNew"] <- "3"
group2_nonzero_copy[group2_nonzero_copy$PropdmgexpNew %in% c("H","h"), "PropdmgexpNew"] <- "2"
group2_nonzero_copy[group2_nonzero_copy$PropdmgexpNew %in% c("M","m"), "PropdmgexpNew"] <- "6"
group2_nonzero_copy[(group2_nonzero_copy$PropdmgexpNew) %in% c("B","b"), "PropdmgexpNew"] <- "9"

## assume these values are 0, so that are 10^0 = 1
group2_nonzero_copy[group2_nonzero_copy$PropdmgexpNew %in% c("","-","+","?"), "PropdmgexpNew"] <-"0"  

group2_nonzero_copy["PropdmgNew"] <- NA # adding new column
group2_nonzero_copy$PropdmgNew <- group2_nonzero_copy$PROPDMG * 10^as.numeric(group2_nonzero_copy$PropdmgexpNew) #This columns now contains the cleaned Property damage value

#make a copy of the COPDMGEXP column
group2_nonzero_copy$CropdmgexpNew <- as.character(group2_nonzero_copy$CROPDMGEXP) #copy

#Cleaning the exponents
group2_nonzero_copy[group2_nonzero_copy$CropdmgexpNew %in% c("K","k"), "CropdmgexpNew"] <- "3"
group2_nonzero_copy[group2_nonzero_copy$CropdmgexpNew %in% c("H","h"), "CropdmgexpNew"] <- "2"
group2_nonzero_copy[group2_nonzero_copy$CropdmgexpNew %in% c("M","m"), "CropdmgexpNew"] <- "6"
group2_nonzero_copy[(group2_nonzero_copy$CropdmgexpNew) %in% c("B","b"), "CropdmgexpNew"] <- "9"
group2_nonzero_copy[group2_nonzero_copy$CropdmgexpNew %in% c("","-","+","?"), "CropdmgexpNew"] <-"0"  ## assume these values are 0, so that are 10^0 = 1

group2_nonzero_copy["CropdmgNew"] <- NA # adding new column
group2_nonzero_copy$CropdmgNew <- group2_nonzero_copy$CROPDMG * 10^as.numeric(group2_nonzero_copy$CropdmgexpNew) #This columns now contains the cleaned Property damage value

## So group2_nonzero_copy$CropdmgNew and group2_nonzero_copy$PropdmgNew are the cleaned columns

#divide by 1e9 to show values in billions
group2_nonzero_copy$CropdmgNew <- group2_nonzero_copy$CropdmgNew/1e9
group2_nonzero_copy$PropdmgNew <- group2_nonzero_copy$PropdmgNew/1e9

#Taking total of CropdmgNew and PropdmgNew
group2_nonzero_copy["total2"] <- NA # adding new column
group2_nonzero_copy$total2 <- group2_nonzero_copy$CropdmgNew + group2_nonzero_copy$PropdmgNew

Melting the data to get required answer to question 2

#melting the data frame to get required answer to question "1.    Across the United States, which types of events (as indicated in the EVTYPE variable) are most harmful with respect to population health?"
melt2 <- melt(data = group2_nonzero_copy, id.vars = c("eventtype"), measure.vars = "total2")
dcast2 <- dcast(melt2, eventtype ~ variable, sum)
dcast2.sort <- plyr::arrange(df = dcast2, desc(dcast2$total2))

head(dcast2.sort)
##             eventtype    total2
## 1               Flood 150.83567
## 2 Hurricane (Typhoon)  90.87253
## 3             Tornado  59.03012
## 4    Storm Surge/Tide  47.96558
## 5         Flash Flood  29.35268
## 6                Hail  19.02443

We can see that Floods are the most harmful to the US economy