Reproducible Research, Assignment Course Project 2

Introduction

Storms and other severe weather events can cause both public health and economic problems for communities and municipalities. Many severe events can result in fatalities, injuries, and property damage, and preventing such outcomes to the extent possible is a key concern.

This project involves exploring the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database. This database tracks characteristics of major storms and weather events in the United States, including when and where they occur, as well as estimates of any fatalities, injuries, and property damage.

Data: The data for this assignment come in the form of a comma-separated-value file compressed via the bzip2 algorithm to reduce its size. You can download the file from the course web site: Storm Data [47Mb]

The events in the database start in the year 1950 and end in November 2011. In the earlier years of the database there are generally fewer events recorded, most likely due to a lack of good records. More recent years should be considered more complete.

Data file: repdata-data-StormData.csv

Downloading and extracting the database. Link address: https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2

# Fetch and unpack the NOAA storm data only when the extracted CSV is missing,
# so repeated runs reuse the local copy instead of downloading again.
if (!file.exists("repdata-data-StormData.csv")) {
  # Skip the download when the compressed archive is already on disk.
  if (!file.exists("repdata-data-StormData.csv.bz2")) {
    download.file(
      url = "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2",
      destfile = "repdata-data-StormData.csv.bz2",
      method = "auto",
      # The archive is binary; request a binary transfer explicitly so the
      # file cannot be corrupted by a text-mode download on Windows.
      mode = "wb"
    )
  }

  library(R.utils)
  # Decompress the archive; the CSV tested above is the expected result.
  bunzip2("repdata-data-StormData.csv.bz2")
}
# Size in bytes of the extracted CSV file.
totalSize <- file.size("repdata-data-StormData.csv")

Total size of data file = 561637449 bytes.

Processing Data Analysis

library(plyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(RColorBrewer)

# Load the extracted storm data. read.csv() is used rather than
# read.table(sep = ",") because its defaults are CSV-safe: only double quotes
# delimit strings, "#" is not treated as a comment character, and short rows
# are padded instead of raising an error.
TotalData <- read.csv("repdata-data-StormData.csv", header = TRUE)
# Number of records (rows) that were read.
nmRecs <- nrow(TotalData)
names(TotalData)
##  [1] "STATE__"    "BGN_DATE"   "BGN_TIME"   "TIME_ZONE"  "COUNTY"    
##  [6] "COUNTYNAME" "STATE"      "EVTYPE"     "BGN_RANGE"  "BGN_AZI"   
## [11] "BGN_LOCATI" "END_DATE"   "END_TIME"   "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE"  "END_AZI"    "END_LOCATI" "LENGTH"     "WIDTH"     
## [21] "F"          "MAG"        "FATALITIES" "INJURIES"   "PROPDMG"   
## [26] "PROPDMGEXP" "CROPDMG"    "CROPDMGEXP" "WFO"        "STATEOFFIC"
## [31] "ZONENAMES"  "LATITUDE"   "LONGITUDE"  "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS"    "REFNUM"

The dataset has a total of 902297 records

Table of weather event types

library(stringr)
# Normalise the event-type labels (trim surrounding whitespace, upper-case)
# and derive a four-digit YEAR string from the BGN_DATE timestamp.
TotalData_cleaned <- TotalData %>%
  mutate(
    EVTYPE = toupper(str_trim(EVTYPE)),
    YEAR = format(
      strptime(BGN_DATE, format = "%m/%d/%Y %T"),
      format = "%Y"
    )
  )

# For every event type, total the fatalities, the injuries, and their sum.
evtype_total_casualties <- ddply(
  TotalData_cleaned,
  .(EVTYPE),
  summarize,
  totalFatalities = sum(FATALITIES),
  totalInjuries = sum(INJURIES),
  totalCasualties = sum(FATALITIES + INJURIES)
)

# Rank the event types from most to fewest casualties and show the first ten.
casualties_sorted <- evtype_total_casualties[
  order(evtype_total_casualties$totalCasualties, decreasing = TRUE),
]
print(casualties_sorted[seq_len(10), ])
##                EVTYPE totalFatalities totalInjuries totalCasualties
## 750           TORNADO            5633         91346           96979
## 108    EXCESSIVE HEAT            1903          6525            8428
## 771         TSTM WIND             504          6957            7461
## 146             FLOOD             470          6789            7259
## 410         LIGHTNING             816          5230            6046
## 235              HEAT             937          2100            3037
## 130       FLASH FLOOD             978          1777            2755
## 379         ICE STORM              89          1975            2064
## 677 THUNDERSTORM WIND             133          1488            1621
## 880      WINTER STORM             206          1321            1527

Here we can see that tornadoes result in the most casualties among weather event types, with excessive heat a distant second.

Data for fields of interest

# Keep only the fields needed for the health and economic analysis.
# Columns are picked by name rather than by position so the selection cannot
# silently grab the wrong fields if the column order differs; the former
# subset() wrapper had no condition and was a no-op.
df <- TotalData[, c("EVTYPE", "FATALITIES", "INJURIES",
                    "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP")]

Create a table to convert damage expense codes

# Lookup table for damage exponent codes: code[i] pairs with multp[i].
# Each magnitude letter appears in lower case and then upper case.
code <- c(rbind(c("h", "k", "m", "b"), c("H", "K", "M", "B")))
multp <- rep(c(1e2, 1e3, 1e6, 1e9), each = 2)

Compute damage amounts from PROPDMGEXP & CROPDMGEXP

library(dplyr)

The following objects are masked from 'package:stats': filter, lag. The following objects are masked from 'package:base': intersect, setdiff, setequal, union.

# Translate each row's damage exponent code into its numeric multiplier.
# match() finds the position of the row's own code in `code`, so the
# multiplier follows the exponent letter. Passing `multp` directly as the
# `yes` argument of ifelse() recycles the 8-element vector by row position and
# yields multipliers unrelated to the row's code.
# Codes that are not listed in `code` keep a multiplier of 0.

#create a multiplier field PROPDMGx
df$PROPDMGx <- multp[match(as.character(df$PROPDMGEXP), code)]
df$PROPDMGx[is.na(df$PROPDMGx)] <- 0

#create a multiplier field CROPDMGx
df$CROPDMGx <- multp[match(as.character(df$CROPDMGEXP), code)]
df$CROPDMGx[is.na(df$CROPDMGx)] <- 0

#Code to compute damage expenses using multiplier
df$PROPDMGEXP.0 <- df$PROPDMG * df$PROPDMGx
df$CROPDMGEXP.0 <- df$CROPDMG * df$CROPDMGx

Field name: HARM = sum of FATALITIES + INJURIES. Field name: EXPENSE = sum of PROPDMGEXP.0 + CROPDMGEXP.0.

Create summary fields

# HARM: combined human toll per record (fatalities plus injuries).
df$HARM <- df$FATALITIES + df$INJURIES

# EXPENSE: combined damage amount per record (property plus crop).
df$EXPENSE <- df$PROPDMGEXP.0 + df$CROPDMGEXP.0

Results: With the data processing completed, the impact of the weather events was analyzed using the new summary fields HARM and EXPENSE.

First, the totals were run by EVTYPE and summarized:

Code for summarizing damage totals by type of weather event

# Total HARM per event type, relabelled and ranked from highest to lowest.
totHarmByEVT <- setNames(
  aggregate(df["HARM"], by = df["EVTYPE"], FUN = sum),
  c("EventType", "TotHarm")
)
totHarmByEVT <- totHarmByEVT[order(-totHarmByEVT$TotHarm), ]

# Keep the five event types with the largest totals.
topTotHarm <- totHarmByEVT[seq_len(5), ]

# Total EXPENSE per event type, relabelled and ranked from highest to lowest.
totExpByEVT <- setNames(
  aggregate(df["EXPENSE"], by = df["EVTYPE"], FUN = sum),
  c("EventType", "TotExpense")
)
totExpByEVT <- totExpByEVT[order(-totExpByEVT$TotExpense), ]

# Keep the five event types with the largest totals.
topTotExp <- totExpByEVT[seq_len(5), ]

# Rescale the totals so they are expressed in billions.
topTotExp$TotExpense <- topTotExp$TotExpense / 1e9

Impact on Human Health (HARM) and Economic Impact (EXPENSE): plot each weather event impact and combine the two plots.

library(ggplot2)
library(scales)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
# Bar chart of combined fatalities and injuries for the selected event types.
# Bars are weighted by TotHarm and coloured per event type; the legend is
# hidden because the x axis already names each event type.
hrm <- ggplot(topTotHarm, aes(x = EventType)) +
  geom_bar(aes(weight = TotHarm, fill = EventType)) +
  scale_fill_brewer(palette = "Set3") +
  scale_y_continuous(labels = comma) +
  labs(
    title = "Population Health Impact",
    x = "Event Type",
    y = "Fatalities + Injuries"
  ) +
  theme(legend.position = "none")

# Bar chart of combined property and crop damage for the selected event types.
# Bars are weighted by TotExpense and coloured per event type; the legend is
# hidden because the x axis already names each event type.
exs <- ggplot(topTotExp, aes(x = EventType)) +
  geom_bar(aes(weight = TotExpense, fill = EventType)) +
  scale_fill_brewer(palette = "Set3") +
  scale_y_continuous(labels = comma) +
  labs(
    title = "Damage Expense Impact",
    x = "Event Type",
    y = "Property & Crop Expense ($Billions)"
  ) +
  theme(legend.position = "none")

Top 5 events by weather impact

# library() stops with a clear error when gridExtra is unavailable, whereas
# require() only returns FALSE and lets grid.arrange() fail afterwards.
library(gridExtra)
# Draw the health-impact and expense-impact charts side by side.
grid.arrange(hrm, exs, ncol = 2)

Summary

The events in the database start in the year 1950 and end in November 2011; the earlier years of the database contain fewer recorded events. The figure above summarizes the top 5 weather event types for Population Health Impact and Damage Expense Impact.

End of Report