Introduction

This assignment uses the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database, which can be downloaded from https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2.

The database includes various measures, including estimates of fatalities, injuries and damage to property.

Additional information on the database can be found in the National Weather Service Storm Data Documentation and the National Climatic Data Center Storm Events FAQ.

Synopsis

The analysis consists of downloading and checking the data, transforming it where necessary, and then summarising casualties and damage costs by event type.

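The two questions this analysis set out to answer are:
  1. Across the United States, which types of events (as indicated in the EVTYPE variable) are most harmful with respect to population health?
  2. Across the United States, which types of events have the greatest economic consequences?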

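A brief summary of the top-level findings: tornadoes caused the most deaths and injuries (96,979 in total), and floods caused the greatest economic damage (about 150,320 million USD in combined property and crop damage).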

Data Processing

Downloading and checking the data. Some of the script chunks have been cached to save processing time.
# Location of the compressed NOAA storm data and the local path it is saved to.
data_url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
data_archive <- "./data/StormData.csv.bz2"
if (!file.exists("data")) {
  dir.create("data")
}
# Only fetch the archive when no local copy exists, so that re-running the
# script does not download the whole file again.
if (!file.exists(data_archive)) {
  # Download under a temporary name first: an interrupted or failed transfer
  # must not leave a truncated file that later runs would mistake for the
  # complete archive.
  partial_archive <- paste0(data_archive, ".part")
  download_status <- download.file(data_url, destfile = partial_archive,
                                   method = "curl")
  # download.file() returns 0 on success and a non-zero code on failure.
  if (download_status != 0) {
    stop("Download of ", data_url, " failed with status ", download_status,
         call. = FALSE)
  }
  file.rename(partial_archive, data_archive)
}
Unzipping the downloaded file and reading in the .csv file, identifying the NA strings
library(R.utils)
# Decompress the archive to a plain CSV. skip = TRUE returns an already
# existing CSV as is; remove = TRUE deletes the .bz2 input after extraction.
bunzip2("./data/StormData.csv.bz2", "./data/StormData.csv", remove = TRUE, skip = TRUE)
## [1] "./data/StormData.csv"
## attr(,"temporary")
## [1] FALSE
# Read the CSV, treating empty strings, "?" and "-" as missing values.
# stringsAsFactors is set explicitly because its default changed to FALSE in
# R 4.0.0, and the later checks (str() reporting factors, levels()) rely on the
# text columns being read as factors.
stormdata <- read.csv("./data/StormData.csv", na.strings = c("","?","-"),
                      stringsAsFactors = TRUE)
The variables of interest for this analysis are:
  • EVTYPE
  • FATALITIES
  • INJURIES
  • PROPDMG
  • PROPDMGEXP
  • CROPDMG
  • CROPDMGEXP
For convenience, these are isolated into a target data subset:
# Keep only the columns needed for the analysis. They are selected by name
# rather than by position so the subset does not silently pick the wrong
# columns if the layout of the raw file differs.
analysis_columns <- c("EVTYPE", "FATALITIES", "INJURIES", "PROPDMG",
                      "PROPDMGEXP", "CROPDMG", "CROPDMGEXP")
stormdata_sub <- stormdata[, analysis_columns]
Checking the subset for its dimensions and its structure
# Number of rows and columns in the subset.
dim(stormdata_sub)
## [1] 902297      7
# Compact overview of every column: its type and its first few values.
str(stormdata_sub)
## 'data.frame':    902297 obs. of  7 variables:
##  $ EVTYPE    : Factor w/ 984 levels "   HIGH SURF ADVISORY",..: 833 833 833 833 833 833 833 833 833 833 ...
##  $ FATALITIES: num  0 0 0 0 0 0 0 0 1 0 ...
##  $ INJURIES  : num  15 0 2 2 2 6 1 0 14 0 ...
##  $ PROPDMG   : num  25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
##  $ PROPDMGEXP: Factor w/ 16 levels "+","0","1","2",..: 14 14 14 14 14 14 14 14 14 14 ...
##  $ CROPDMG   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CROPDMGEXP: Factor w/ 7 levels "0","2","B","k",..: NA NA NA NA NA NA NA NA NA NA ...
These show what could be errors in two of the key factor variables (PROPDMGEXP and CROPDMGEXP)
Checking the levels of these two key variables and drawing tables to get a sense of the extent of the errors in the data.
# Distinct exponent codes present in the property-damage column.
levels(stormdata_sub$PROPDMGEXP)
##  [1] "+" "0" "1" "2" "3" "4" "5" "6" "7" "8" "B" "h" "H" "K" "m" "M"
# Distinct exponent codes present in the crop-damage column.
levels(stormdata_sub$CROPDMGEXP)
## [1] "0" "2" "B" "k" "K" "m" "M"
# How many rows carry each property-damage exponent code.
table(stormdata_sub$PROPDMGEXP)
## 
##      +      0      1      2      3      4      5      6      7      8 
##      5    216     25     13      4      4     28      4      5      1 
##      B      h      H      K      m      M 
##     40      1      6 424665      7  11330
# How many rows carry each crop-damage exponent code.
table(stormdata_sub$CROPDMGEXP)
## 
##      0      2      B      k      K      m      M 
##     19      1      9     21 281832      1   1994

From the tables, it looks as if the errors make up a relatively small portion of the dataset.

I will, however, make the assumption that lowercase “b”, “m”, “h”, and “k” were meant to be capitalised. I will change these for the sake of convenience, but I will leave the other errors, such as the integers, as they are. I do not believe they will substantively affect the analysis.

# Upper-case every exponent code in both columns so that any lowercase variant
# ("b", "h", "k", "m") is merged with its capitalised form, not just the ones
# that happen to occur in a given column. as.character() turns the factor
# columns into plain text first; digits, "+" and missing values are unchanged.
stormdata_sub$PROPDMGEXP <- toupper(as.character(stormdata_sub$PROPDMGEXP))
stormdata_sub$CROPDMGEXP <- toupper(as.character(stormdata_sub$CROPDMGEXP))
In order to answer the second question, I will need to transform the CROPDMGEXP and PROPDMGEXP variables, along with their accompanying columns CROPDMG and PROPDMG, into a single numeric variable that reflects the total cost of damages:
# Text multiplier substituted for each exponent letter, in the order applied.
exp_multipliers <- c(B = "1000000000", M = "1000000", K = "1000", H = "100")

# Swap the letters for their multipliers: property column first, then crops.
for (exp_col in c("PROPDMGEXP", "CROPDMGEXP")) {
  for (exp_letter in names(exp_multipliers)) {
    stormdata_sub[[exp_col]] <- sub(exp_letter, exp_multipliers[[exp_letter]],
                                    x = stormdata_sub[[exp_col]])
  }
}

# Convert the substituted text to numbers. Codes that are already digits keep
# their own value, and anything non-numeric (such as "+") becomes NA, which is
# what triggers the coercion warning.
stormdata_sub$PROPDMGEXP <- as.numeric(stormdata_sub$PROPDMGEXP)
## Warning: NAs introduced by coercion
stormdata_sub$CROPDMGEXP <- as.numeric(stormdata_sub$CROPDMGEXP)

# Cost of each kind of damage = multiplier * reported damage figure.
stormdata_sub$PropCost <- with(stormdata_sub, PROPDMGEXP * PROPDMG)
stormdata_sub$CropCost <- with(stormdata_sub, CROPDMGEXP * CROPDMG)
Adding the CropCost and PropCost quantities (taking care of NAs):
# Total cost per event: property cost plus crop cost, with a missing cost
# contributing nothing to the sum.
damage_costs <- cbind(stormdata_sub$PropCost, stormdata_sub$CropCost)
stormdata_sub$Cost <- rowSums(damage_costs, na.rm = TRUE)
Finally, adding the injuries and fatalities:
# Combined casualty count per event: deaths plus injuries.
stormdata_sub$FAT_INJ <- with(stormdata_sub, FATALITIES + INJURIES)

Results

Question 1: Across the United States, which types of events (as indicated in the EVTYPE variable) are most harmful with respect to population health?

library(dplyr)
# Group the events by type; the grouped table is reused for the cost summary.
by_type <- group_by(stormdata_sub,EVTYPE)
# Total deaths plus injuries for each event type. The result is deliberately
# not stored in a variable called `sum`, which would mask base::sum() - the
# very function used inside this summarise() call.
harm_by_type <- summarise(by_type, DEATHS_INJURIES = sum(FAT_INJ))
# Sort from most to least harmful, then drop types with no casualties at all.
sum_ordered <- harm_by_type[order(-harm_by_type$DEATHS_INJURIES), ]
sum_ordered_sub <- sum_ordered[sum_ordered$DEATHS_INJURIES > 0, ]
head(sum_ordered_sub, 50)
## Source: local data frame [50 x 2]
## 
##               EVTYPE DEATHS_INJURIES
##               (fctr)           (dbl)
## 1            TORNADO           96979
## 2     EXCESSIVE HEAT            8428
## 3          TSTM WIND            7461
## 4              FLOOD            7259
## 5          LIGHTNING            6046
## 6               HEAT            3037
## 7        FLASH FLOOD            2755
## 8          ICE STORM            2064
## 9  THUNDERSTORM WIND            1621
## 10      WINTER STORM            1527
## ..               ...             ...

Bar plot showing the top ten:

library(ggplot2)

# Ten event types with the most deaths and injuries (the table is already
# sorted in decreasing order).
sub10 <- sum_ordered_sub[1:10,]
# Fix the factor levels in ranked order so the bars are drawn from highest to
# lowest instead of alphabetically. The labels are not stored in a variable
# called `names`, which would mask base::names().
event_labels <- as.character(sub10$EVTYPE)
STORM_TYPE <- factor(event_labels, levels = event_labels)
sub102 <- cbind(STORM_TYPE,sub10)

g1 <- ggplot(sub102,aes(STORM_TYPE))
# weight = DEATHS_INJURIES makes each bar's height the casualty total for that
# event type. The title says "totals" because the values are summed counts,
# not rates.
g1 + geom_bar(aes(weight = DEATHS_INJURIES)) + xlab("STORM TYPE") + ylab("DEATHS & INJURIES") + ggtitle("Storm Types responsible for 10 highest totals of deaths and injuries in the USA")

Question 2: Across the United States, which types of events have the greatest economic consequences?

# Total damage cost per event type, expressed in millions.
sum2 <- summarise(by_type, TOTALCOST = sum(Cost, na.rm = TRUE) / 1000000)
# Rank the event types from most to least costly.
cost_rank <- order(-sum2$TOTALCOST)
sum2_ordered <- sum2[cost_rank, ]
# Keep only the event types with a positive total cost.
has_cost <- sum2_ordered$TOTALCOST > 0
sum2_ordered_sub <- sum2_ordered[has_cost, ]
head(sum2_ordered_sub, n = 50)
## Source: local data frame [50 x 2]
## 
##               EVTYPE  TOTALCOST
##               (fctr)      (dbl)
## 1              FLOOD 150319.678
## 2  HURRICANE/TYPHOON  71913.713
## 3            TORNADO  57352.114
## 4        STORM SURGE  43323.541
## 5               HAIL  18758.222
## 6        FLASH FLOOD  17562.129
## 7            DROUGHT  15018.672
## 8          HURRICANE  14610.229
## 9        RIVER FLOOD  10148.405
## 10         ICE STORM   8967.041
## ..               ...        ...

Bar plot showing the top ten:

library(ggplot2)

# Ten costliest event types, taken from the top of the sorted cost table.
cost_sub10 <- sum2_ordered_sub[seq_len(10), ]
# Factor levels follow the ranking so the bars appear in that order.
costliest_types <- as.character(cost_sub10$EVTYPE)
STORM_TYPE <- factor(costliest_types, levels = costliest_types)
cost_sub102 <- data.frame(STORM_TYPE, cost_sub10, check.names = FALSE)

# Bar height is the total cost (in millions) of each event type.
g1 <- ggplot(cost_sub102, aes(x = STORM_TYPE))
g1 +
  geom_bar(aes(weight = TOTALCOST)) +
  labs(
    x = "STORM TYPE",
    y = "TOTAL COST IN USD (millions)",
    title = "Storm Types responsible for 10 highest costs in terms of Property and Crop Damages"
  )