Determining Most Hazardous Natural Events from Historic Severe Weather Event Data

library(sqldf); library(stringdist); library(tcltk);

## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
## Loading required package: DBI

library("gridBase"); library("grid")
set.seed(1776)
SamplingPercent = 15
setwd("C:\\Users\\Public\\proj2")
writeASample = function(N, data, file) {
  tmp <- data[sort(sample(1:dim(data)[1], N)), ]
  names(tmp) <- names(data)
  write.csv(tmp, file)
}
urlpart1 = "https://d396qusza40orc.cloudfront.net/";
urlpart2 = "repdata%2Fdata%2FStormData.csv.bz2";
url <- sprintf("%s%s", urlpart1, urlpart2);
url <- gsub("https", "http", gsub("%2F", "/", url))
TOPN = 10
colors = c(rainbow(TOPN), "gray")
mypretty = function (x) {sprintf("%10.2s", x)}
myplot = function(data, MTITLE, YLAB) {
  counts <- data$tot
  names(counts) <- data$Event
  midpts <- barplot(counts, col=colors, main=MTITLE, 
                    xlab="", ylab=YLAB, names.arg="", xaxt= 'n',
                    cex.lab=.8, cex.axis=.8, cex.main=.8, cex.sub=.8)
  vps <- baseViewports()
  pushViewport(vps$inner, vps$figure, vps$plot)
  grid.text(names(counts),
            x = unit(midpts, "native"), y=unit(-1, "lines"),
            just="right", rot=30, gp=gpar(col=colors, fontsize=8))
  popViewport(3)
}
postStatus = function(message) {
   print(paste("Status update: ", message))
}

Synopsis

The National Storm Data is available for download at this data location. The data covers losses due to severe weather events from 1950 to 2011. Our goal is to analyse the data and determine:

Across the US, which kinds of events are most harmful with respect to the population health?
Across the US, which types of events have the greatest economic consequences.

The analysis uses the fields FATALITIES, INJURIES, PROPDMG, and CROPDMG to answer these questions. During the course of the analysis we merge some event types that are only superficially or marginally different from each other (e.g. singular and plural forms). There are over 329 unique event types. We report only top 10 events types. The analysis can easily be modified to show more events if needed. This entire report is produced using the code included in this document.

Data Processing

Creating samples of manageable size: Our two questions can be answered by working with a random sample of the data and scaling the resulting numbers for the full population. The program takes a variable SamplingPercent (current value = 15) to create such a random sample. This variable can be set to 100 if numbers for teh full population (without estimation) are required.

Identifying the most important variables: By studying the description of the variables we found the following variables in the data of value for this study and the data was condensed to only these variables: EVTYPE, FATALITIES, INJURIES, PROPDMG, PROPDMGEXP, CROPDMG, and CROPDMGEXP.

#knit('SevereWeatherEventAnalysis.Rmd', tangle=TRUE)
#source('SevereWeatherEventAnalysis.R')


postStatus("Getting zipped data.")

## [1] "Status update:  Getting zipped data."

if (!file.exists("./StormData.csv.bz2")) {
    download.file(url, "StormData.csv.bz2")
    postStatus("Zipped data downloded and stored locally.")
} else {
  postStatus("Zipped data found locally.")
}

## [1] "Status update:  Zipped data found locally."

if (!file.exists("./StormData.csv")) {
    tmp <- read.csv(bzfile("StormData.csv.bz2"))
    write.csv(tmp, "StormData.csv")
    write.csv(names(tmp), "allCols.csv")
    write.csv(dim(tmp), "dimofdata.csv")
    postStatus("Data Unzipped and stored locally.")
} else {
  postStatus("unzipped data found locally.")
}

## [1] "Status update:  unzipped data found locally."

if (!file.exists("FullDataUsefulCols.csv")) {
    usefulcols = c("EVTYPE", "FATALITIES", "INJURIES", "PROPDMG",  "PROPDMGEXP", "CROPDMG", "CROPDMGEXP")
    tmp <- read.csv("StormData.csv")
    tmp <- tmp[usefulcols]
    write.csv(tmp, "FullDataUsefulCols.csv")
    writeASample(floor(dim(tmp)[1]*.01*SamplingPercent), tmp, 
                 paste(SamplingPercent, "percent_Sampl.csv", sep="_"))
    writeASample(floor(dim(tmp)[1]*.01*5), tmp, 
                 paste(5, "percent_Sampl.csv", sep="_"))
    writeASample(floor(dim(tmp)[1]*.01*10), tmp,
                 paste(10, "percent_Sampl.csv", sep="_"))
    writeASample(floor(dim(tmp)[1]*.01*15), tmp,
                 paste(15, "percent_Sampl.csv", sep="_"))
    writeASample(floor(dim(tmp)[1]*.01*20), tmp,
                 paste(20, "percent_Sampl.csv", sep="_"))
    postStatus("Data Unzipped and stored locally.")
} else {
  postStatus("unzipped data found locally.")
}

## [1] "Status update:  unzipped data found locally."

workingFile = paste(SamplingPercent, "percent_Sampl.csv", sep="_")
if (!file.exists(workingFile)) {
    postStatus("Reading unzipped data and creating samples.")
    tmp <- read.csv("FullDataUsefulCols.csv")   
    writeASample(floor(dim(tmp)[1]*.01*SamplingPercent), tmp, workingFile)
} else {
  postStatus("Sample found locally.")
}

## [1] "Status update:  Sample found locally."

postStatus(paste("Reading workig file.", workingFile))

## [1] "Status update:  Reading workig file. 15_percent_Sampl.csv"

sampl <-  read.csv(workingFile)

So we download the file if it is not already downloaded, and create some sample out of this file to be used in the future analysis. If the samples already exist we just read them.

Combining Similar Events: Many event types seem to be very similar to each other. We used R’s stringdist function as shown below to identify the event types that are very similar to each other and replaced them by their canonical form.

postStatus("Doing SQL-esque queries.")

## [1] "Status update:  Doing SQL-esque queries."

v = unique(sampl$EVTYPE)
u <- v
before = ""; after = ""
for (i in 1:(length(u)-1)) {
    for (j in (i+1):length(u)) {
      if ((u[i] != u[j]) && (stringdist(u[i], u[j], method='jaccard', q=2) < .25))
      {
        if (before == "") {
            before = u[j]; after = u[i]
        }
        u[j] = u[i]
      } 
  }
}
envtmp = new.env();
for (i in 1:(length(u))) {
    assign(as.character(v[i]), as.character(u[i]), envir = envtmp)
}

newcol = sampl$EVTYPE
for (i in 1:length(sampl$EVTYPE)) {
    if (newcol[i] != get(as.character(newcol[i]), envtmp)) {
        newcol[i] = get(as.character(newcol[i]), envtmp)
    }
}
length(unique(newcol))

## [1] 315

sampl <- cbind(sampl, newcol)

As a result of the above operation, 383 event types reduced to 315 canonical types. By the way of an example event type [TSTM WINDS] was replaced by its canonical form [TSTM WIND].

We will work with only the following most useful columns:

print(names(sampl))

##  [1] "X.1"        "X"          "EVTYPE"     "FATALITIES" "INJURIES"  
##  [6] "PROPDMG"    "PROPDMGEXP" "CROPDMG"    "CROPDMGEXP" "newcol"

Finding Events with most severe events:

fatalities <- sqldf(
     paste("select newcol as Event, sum(FATALITIES) as tot from sampl group by newcol order by tot desc limit", TOPN))
sigmaallfat <- sqldf("select sum(FATALITIES) from sampl")
sigmarestfat <- sigmaallfat - sum(fatalities$tot)
fatalities[TOPN+1, 1] <- "all others "
fatalities[TOPN+1, 2] <- sigmarestfat

injuries <- sqldf(
  paste("select newcol as Event, sum(INJURIES) as tot from sampl group by newcol order by tot desc limit", TOPN))
sigmaallinj <- sqldf("select sum(INJURIES) from sampl")
sigmarestinj <- sigmaallinj - sum(injuries$tot)
injuries[TOPN+1, 1] <- "all others "
injuries[TOPN+1, 2] <- sigmarestinj

proploss <- sqldf(
  paste("select newcol as Event, sum(PROPDMG) as tot from sampl group by newcol order by tot desc limit", TOPN))
sigmaallpro <- sqldf("select sum(PROPDMG) from sampl")
sigmarestpro <- sigmaallpro - sum(proploss$tot)
proploss[TOPN+1, 1] <- "all others "
proploss[TOPN+1, 2] <- sigmarestpro

croploss <- sqldf(
  paste("select newcol as Event, sum(CROPDMG) as tot from sampl group by newcol order by tot desc limit", TOPN))
sigmaallcro <- sqldf("select sum(CROPDMG) from sampl")
sigmarestcro <- sigmaallcro - sum(croploss$tot)
croploss[TOPN+1, 1] <- "all others "
croploss[TOPN+1, 2] <- sigmarestcro
postStatus("Data ready for plotting.")

## [1] "Status update:  Data ready for plotting."

fatalities[,2] <- fatalities[,2] * (100/SamplingPercent)
injuries[,2] <- injuries[,2] * (100/SamplingPercent)
proploss[,2] <- proploss[,2] * (100/SamplingPercent)
croploss[,2] <- croploss[,2] * (100/SamplingPercent)
#create full data set estimates

Results

The sections show the events that cause the most life and property damages.

Health damage due to severe weather events

10 sources of fatalities

Event	Total Damage
TORNADO	5853.3333333
EXCESSIVE HEAT	2073.3333333
FLASH FLOOD	1086.6666667
LIGHTNING	880
RIP CURRENT	500
TSTM WIND	446.6666667
FLOOD	380
THUNDERSTORM WINDS	273.3333333
HEAT	253.3333333
HEAT WAVE	253.3333333
all others	2680

10 sources of injuries

Event	Total Damage
TORNADO	9.240666710^{4}
TSTM WIND	7533.3333333
LIGHTNING	5493.3333333
BLIZZARD	4100
FLOOD	4020
THUNDERSTORM WINDS	3373.3333333
EXCESSIVE HEAT	3366.6666667
HEAT	2253.3333333
FLASH FLOOD	1766.6666667
WILDFIRE	1346.6666667
all others	1.130666710^{4}

par(mfrow=c(1,2))

myplot(fatalities, "Total Fatalities (severest events)", "Total Fatalities")
myplot(injuries, "Total Injuries (severest events)", "Total Injuries")

Economic damage due to severe weather events

10 sources of proploss

Event	Total Damage
TORNADO	32
FLASH FLOOD	14
TSTM WIND	13
THUNDERSTORM WINDS	12
FLOOD	87
HAIL	68
LIGHTNING	57
HIGH WINDS	36
WINTER STORM	14
WILDFIRE	99
all others	77

10 sources of croploss

Event	Total Damage
HAIL	59
FLOOD	18
FLASH FLOOD	15
TSTM WIND	11
TORNADO	10
THUNDERSTORM WINDS	64
DROUGHT	37
HIGH WINDS	14
HEAVY RAINS	78
WILDFIRE	70
all others	58

par(mfrow=c(1,2))
myplot(proploss, "Total Prop Loss (severest events)", "Total Property Loss (USD)")
myplot(croploss, "Total Crop Loss (severest events)", "Total Crop Loss (USD)")

Further research recommendation

Adjust property and crop loss estimates for inflation.
Examine recent years more closely. Due to improvement in technology and preventive mesaures, the estimates of bodily and property injuries and harms may have significantly changed in the recent years.

Citations

The National Storm Data is available for download at this data location.
The above data is described in a document that is avilable at the weather service website.