Here the influence of storm events on USA is estimated using the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database. It was found that across the United States, excessive heat, tornadoes and floods affect public health most considerably. Floods and hurricanes have the greatest economic consequences.

Data Processing

library(knitr)
library(data.table)
if(!file.exists("storm.csv.bz")) {
        URL<- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
        download.file(URL, "storm.csv.bz")
        bz<- bzfile("storm.csv.bz")
        storm<- read.csv(bz)
        unlink(bz)
        dateDownloaded<- date()
        }

The file was downloaded at Sat Mar 14 13:13:32 2015. In the code below, the variables BGN_DATE, EVTYPE, FATALITIES:CROPGMGEXP, and REFNUM are selected for the analysis. Other variables aren’t needed for the questions studied. Main damage entries are enough for the goal, so I exclude the remark column.

DT<- as.data.table(storm)
names(DT)
##  [1] "STATE__"    "BGN_DATE"   "BGN_TIME"   "TIME_ZONE"  "COUNTY"    
##  [6] "COUNTYNAME" "STATE"      "EVTYPE"     "BGN_RANGE"  "BGN_AZI"   
## [11] "BGN_LOCATI" "END_DATE"   "END_TIME"   "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE"  "END_AZI"    "END_LOCATI" "LENGTH"     "WIDTH"     
## [21] "F"          "MAG"        "FATALITIES" "INJURIES"   "PROPDMG"   
## [26] "PROPDMGEXP" "CROPDMG"    "CROPDMGEXP" "WFO"        "STATEOFFIC"
## [31] "ZONENAMES"  "LATITUDE"   "LONGITUDE"  "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS"    "REFNUM"
DT<- DT[,.(BGN_DATE,EVTYPE,FATALITIES,INJURIES,PROPDMG,PROPDMGEXP,CROPDMG,CROPDMGEXP,REFNUM)]

Next is data cleaning. Fistly, variable names are converted to more convenient form.

suppressWarnings(names(DT)<- c("bg.date", "type", "fatalities", "injuries", "prop.damage", "prop.damage.exp", "crop.damage", "crop.damage.exp", "ref.num"))

Then pre-1996 rows are filtered out since they contain many poor and invalid entries.

DT<- DT[, bg.date := as.character(bg.date)]
DT<- DT[, bg.date := sapply(strsplit(bg.date, " "), "[", 1)]
DT<- DT[, bg.date := as.Date(bg.date, "%M/%d/%Y")]
DT<- DT<- DT[bg.date>as.Date("1996-01-01")]
str(DT)
## Classes 'data.table' and 'data.frame':   653530 obs. of  9 variables:
##  $ bg.date        : Date, format: "1996-03-06" "1996-03-11" ...
##  $ type           : Factor w/ 985 levels "   HIGH SURF ADVISORY",..: 972 834 856 856 856 244 359 856 856 856 ...
##  $ fatalities     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ injuries       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ prop.damage    : num  380 100 3 5 2 0 400 12 8 12 ...
##  $ prop.damage.exp: Factor w/ 19 levels "","-","?","+",..: 17 17 17 17 17 1 17 17 17 17 ...
##  $ crop.damage    : num  38 0 0 0 0 0 0 0 0 0 ...
##  $ crop.damage.exp: Factor w/ 9 levels "","?","0","2",..: 7 1 1 1 1 1 1 1 1 1 ...
##  $ ref.num        : num  248768 248769 248770 248771 248772 ...
##  - attr(*, ".internal.selfref")=<externalptr>

There’s over 900 invalid types of events and many invalid exponents.

unique(DT$prop.damage.exp)
## [1] K   M B 0
## Levels:  - ? + 0 1 2 3 4 5 6 7 8 B h H K m M
# Clean up this column. Only "", "B", "K", and "M" exponents are valid according to the codebook.
DT<- DT[, prop.damage.exp := as.character(prop.damage.exp)]
DT<- DT[prop.damage.exp %in% c("", "B", "K", "M")]
# `crop.damage.exp` too.
DT<- DT[, crop.damage.exp := as.character(crop.damage.exp)]
DT<- DT[crop.damage.exp %in% c("", "B", "K", "M")]

Exponent columns are needed to be transformed into computable form.

DT$prop.damage.exp[DT$prop.damage.exp==""] <- 1
DT$prop.damage.exp[DT$prop.damage.exp=="K"] <- 1000
DT$prop.damage.exp[DT$prop.damage.exp=="M"] <- 1000000
DT$prop.damage.exp[DT$prop.damage.exp=="B"] <- 1000000000
DT<- DT[, prop.damage.exp := as.numeric(prop.damage.exp)]

DT$crop.damage.exp[DT$crop.damage.exp==""] <- 1
DT$crop.damage.exp[DT$crop.damage.exp=="K"] <- 1000
DT$crop.damage.exp[DT$crop.damage.exp=="M"] <- 1000000
DT$crop.damage.exp[DT$crop.damage.exp=="B"] <- 1000000000
DT<- DT[, crop.damage.exp := as.numeric(crop.damage.exp)]

Overall damage is computed below.

DT$prop<- DT$prop.damage*DT$prop.damage.exp
DT$crop<- DT$crop.damage*DT$crop.damage.exp
DT$overall.damage<- DT$prop+DT$crop

Select the variables again

DT<- DT[,.(type, fatalities, injuries, ref.num, overall.damage)]

Filter out events without any damage

DT$any<- DT$fatalities+DT$injuries+DT$overall.damage
DT<- DT[any!=0]
DT$any<- NULL
DT$type<- tolower(as.character(DT$type))

Cleaning of the main misspellings

DT$type<- gsub("tstm", "thunderstorm", DT$type)
DT$type<- gsub("[ ][(][a-z][0-9][0-9][)]", "", DT$type)
DT$type<- gsub("[ ][(][0-9][0-9][)]|[ ][0-9][0-9]", "", DT$type)
DT$type<- gsub("[ ]$", "", DT$type)
DT$type<- gsub("small hail", "hail", DT$type)
DT$type<- gsub("winds", "wind", DT$type)
DT$type<- gsub("cold temperature", "cold", DT$type)
DT$type<- gsub("cold weather", "cold", DT$type)
DT$type<- gsub("rip currents", "rip current", DT$type)
nrow(DT)
## [1] 201318
length(unique(DT$type))
## [1] 166

Number of observations has reduced considerably, and number of event types either. A bulk of event types belongs to proper categories (run table(DT$type) to make sure).

Results

  1. Influence of storm events on public health
dt_f <- DT[,. (f_sum=sum(fatalities)), by=type][order(-f_sum)][1:10]
head(dt_f, n=5)
##              type f_sum
## 1: excessive heat  1797
## 2:        tornado  1511
## 3:    flash flood   887
## 4:      lightning   651
## 5:    rip current   542
dotchart(dt_f$f_sum, labels=dt_f$type, pch=19, main="Top 10 event types with highest fatality rates")

plot of chunk plot1_fatalities

dt_i <- DT[,. (f_sum=sum(injuries)), by=type][order(-f_sum)][1:10]
head(dt_i, n=5)
##                 type f_sum
## 1:           tornado 20667
## 2:             flood  6758
## 3:    excessive heat  6391
## 4: thunderstorm wind  5033
## 5:         lightning  4141
dotchart(dt_i$f_sum, labels=dt_i$type, pch=19, main="Top 10 event types with highest injury rates")

plot of chunk plot2_injuries

  1. Across the United States, which types of events have the greatest economic consequences?
dt_d <- DT[,. (f_sum=sum(overall.damage)), by=type][order(-f_sum)][1:10]
head(dt_d, n=5)
##                 type        f_sum
## 1:             flood 148919611950
## 2: hurricane/typhoon  71913712800
## 3:       storm surge  43193541000
## 4:           tornado  24900370720
## 5:              hail  17092035870
dotchart(dt_d$f_sum, labels=dt_d$type, pch=19, main="Top 10 event types with highest property and crop damage rates")

plot of chunk plot3_damage