Data Processing

We download the NOAA storm data from the URL shown below. A description of the data set can be found in the Storm Data Documentation.

## Download the data (only once) and record where and when it came from

if (!file.exists("StormData.csv.bz2")) {
  fileUrl <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
  ## Use R's default download method instead of requiring an external `curl`
  ## binary; mode = "wb" keeps the compressed archive byte-exact on Windows.
  status <- download.file(fileUrl, destfile = "StormData.csv.bz2", mode = "wb")
  ## download.file() returns 0 on success; do not log metadata for a failed download.
  if (status != 0) {
    stop("Download of StormData.csv.bz2 failed with status ", status)
  }
  library(tools)
  ## Redirect printed output into the metadata file. The `finally` clause
  ## restores console output even if one of the print calls fails.
  sink("download_metadata3.txt")
  tryCatch({
    print("Download date:")
    print(Sys.time())
    print("Download URL:")
    print(fileUrl)
    print("Downloaded file Information")
    print(file.info("StormData.csv.bz2"))
    print("Downloaded file md5 Checksum")
    print(md5sum("StormData.csv.bz2"))
  }, finally = sink())
}

## Read the data (read.csv decompresses the .bz2 archive transparently)

StormData <- read.csv(file = "StormData.csv.bz2")

## Keep columns 2-8 and 23-28 only, and give them simpler names in one step.
## NOTE(review): "begin.data" looks like a typo for "begin.date"; it is kept
## unchanged here in case other code refers to the column by this name.
StormData <- setNames(
  StormData[c(2:8, 23:28)],
  c(
    "begin.data", "begin.time", "time.zone",
    "county", "county.name", "state",
    "event.type",
    "fatalities", "injuries",
    "prop.damage", "pd.exp",
    "crop.damage", "cd.exp"
  )
)

## Narrow down data to records where at least one impact measure is > 0.
## Every column is compared explicitly: in `a | b | c | d > 0` the `> 0`
## binds only to `d`, and the other columns are merely coerced to logical.
StormData <- subset(
  StormData,
  fatalities > 0 | injuries > 0 | crop.damage > 0 | prop.damage > 0
)

Assess how much missing data there is:

## Share of all cells in the data frame that are NA, expressed as a percentage
PCTNA <- 100 * mean(is.na(StormData))

The percentage of the data set that is “NA” is 0% so no adjustments need to be made to the data.

Across the United States, which types of events (as indicated in the EVTYPE variable) are most harmful with respect to population health?

## Group Data by Event Type
  library(dplyr)

## 
## Attaching package: 'dplyr'
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

  ## Work on a data frame copy of the storm data, grouped by event type
  StormData.df <- data.frame(StormData)
  StormEvent <- StormData.df %>% group_by(event.type)

## Total fatalities per event type, and the event type with the largest total
  SumsFatal <- StormEvent %>% summarise(fatalities = sum(fatalities))
  MaxFatal <- which.max(SumsFatal[["fatalities"]])
  MaxEventFatal <- SumsFatal[MaxFatal, 1]

## Total injuries per event type, and the event type with the largest total
  SumsInjury <- StormEvent %>% summarise(injuries = sum(injuries))
  MaxInjury <- which.max(SumsInjury[["injuries"]])
  MaxEventInjury <- SumsInjury[MaxInjury, 1]

The storm event with the most fatalities across the US is TORNADO, and the storm event with the most injuries is also TORNADO.

The following plot displays the relative harmfulness to humans of the various Storm Event types.

## Combined health-impact indicator: fatalities plus injuries for each record.

StormData.df[["Death.and.Injury"]] <- with(StormData.df, fatalities + injuries)

## Sum the combined indicator per event type over the whole data set and keep
## the ten event types with the largest totals.

  ## Regroup, because the indicator column was added after the earlier grouping
  StormEvent <- StormData.df %>% group_by(event.type)
  SumsImpact <- StormEvent %>% summarise(Death.and.Injury = sum(Death.and.Injury))
  sorteddata <- SumsImpact[order(-SumsImpact[["Death.and.Injury"]]), ]
  TopTen <- head(sorteddata, n = 10)

## Fix the factor levels in ranked order so the plot keeps the descending order
  TopTen[["event.type"]] <- factor(
    TopTen[["event.type"]],
    levels = TopTen[["event.type"]]
  )

## Plot the results
  library(ggplot2)

## Bar chart of the ten event types with the largest combined deaths and injuries
g <- ggplot(
  data = TopTen,
  mapping = aes(x = factor(event.type), y = Death.and.Injury)
)
g +
  geom_bar(stat = "identity") +
  labs(
    x = "Storm Event",
    y = "Death + Injury",
    title = "Impact to Public Health"
  ) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))

Do a similar analysis for top storm events causing the highest property damage and crop damage.

## Apply the multiplier provided in the data set to calculate property and crop damage in US$.
## The "pd.exp" and "cd.exp" columns contain a symbol that represents the multiplier.
## List the multiplier symbols, build a multiplier from the ones that are valid, then
## substitute it for those symbols so the columns are numeric and can be multiplied.

  unique(StormData[["pd.exp"]])

##  [1] K M   B m + 0 5 6 4 h 2 7 3 H -
## Levels:  - ? + 0 1 2 3 4 5 6 7 8 B h H K m M

  ## Translate exponent symbols into numeric multipliers.
  ##
  ## symbols: vector (character or factor) of exponent symbols
  ## lookup:  named numeric vector mapping a symbol to its multiplier
  ## returns: unnamed numeric vector, same length as `symbols`
  ##
  ## Blank, missing, and unlisted symbols get a multiplier of 0, the same
  ## value used for the explicitly invalid symbols ("-", "?"). The previous
  ## sapply(..., switch, ...) version had no default and keyed the blank as
  ## ' ' rather than '', so those rows became NA and every per-event cost
  ## total that included such a row was NA as well.
  exp_to_multiplier <- function(symbols, lookup) {
    symbols <- trimws(as.character(symbols))
    multiplier <- unname(lookup[match(symbols, names(lookup))])
    multiplier[is.na(multiplier)] <- 0
    multiplier
  }

  ## Property-damage symbols: digits count as 10, letters as H/K/M/B
  pd_multipliers <- c(
    "0" = 10, "1" = 10, "2" = 10, "3" = 10, "4" = 10,
    "5" = 10, "6" = 10, "7" = 10, "8" = 10,
    "H" = 100, "h" = 100,
    "K" = 1000,
    "M" = 1000000, "m" = 1000000,
    "B" = 1000000000,
    "+" = 1, "-" = 0, "?" = 0
  )

  ## Crop-damage symbols.
  ## NOTE(review): "2" maps to 0 here but to 10 for property damage; the
  ## original mapping is kept, so confirm which value is intended.
  cd_multipliers <- c(
    "0" = 10,
    "K" = 1000, "k" = 1000,
    "M" = 1000000, "m" = 1000000,
    "B" = 1000000000,
    "?" = 0, "2" = 0
  )

  StormData$pd.exp <- exp_to_multiplier(StormData$pd.exp, pd_multipliers)
  StormData$prop.damage <- as.numeric(as.character(StormData$prop.damage))

  StormData$cd.exp <- exp_to_multiplier(StormData$cd.exp, cd_multipliers)
  StormData$crop.damage <- as.numeric(as.character(StormData$crop.damage))

  ## Damage in US$ = recorded amount times its multiplier
  StormData$PropCost <- StormData$pd.exp * StormData$prop.damage
  StormData$CropCost <- StormData$cd.exp * StormData$crop.damage

## Calculate overall damage in $
  StormData$Cost <- StormData$CropCost + StormData$PropCost

  ## Total cost per event type, the costliest event type, and the top ten
  StormData.df <- data.frame(StormData)
  StormEvent <- StormData.df %>% group_by(event.type)
  SumsCost <- StormEvent %>% summarise(Cost = sum(Cost))
  MaxCost <- which.max(SumsCost[["Cost"]])
  MaxEventCost <- SumsCost[MaxCost, 1]
  sorteddata <- SumsCost[order(-SumsCost[["Cost"]]), ]
  Ranking <- head(sorteddata, n = 10)
  ## Fix the factor levels in ranked order so the plot keeps the descending order
  Ranking[["event.type"]] <- factor(
    Ranking[["event.type"]],
    levels = Ranking[["event.type"]]
  )

library(ggplot2)
library(scales)
## Penalise scientific notation heavily so large dollar amounts print in fixed notation
options(scipen = 999)
g <- ggplot(
  data = Ranking,
  mapping = aes(x = factor(event.type), y = Cost)
)

## Bar chart of the ten costliest event types, with comma-formatted dollar axis
g +
  geom_bar(stat = "identity") +
  scale_y_continuous(labels = comma) +
  labs(
    x = "Storm Events",
    y = "Property plus Crop Damage ($)",
    title = "Storm Events with Highest Costs"
  ) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))

Assessment of Effects of Severe Weather Events

Synopsis:

Data Processing