library(sqldf); library(stringdist); library(tcltk); 
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
## Loading required package: DBI
library("gridBase"); library("grid")
set.seed(1776)
SamplingPercent = 15
setwd("C:\\Users\\Public\\proj2")
writeASample = function(N, data, file) {
  tmp <- data[sort(sample(1:dim(data)[1], N)), ]
  names(tmp) <- names(data)
  write.csv(tmp, file)
}
urlpart1 = "https://d396qusza40orc.cloudfront.net/";
urlpart2 = "repdata%2Fdata%2FStormData.csv.bz2";
url <- sprintf("%s%s", urlpart1, urlpart2);
url <- gsub("https", "http", gsub("%2F", "/", url))
TOPN = 10
colors = c(rainbow(TOPN), "gray")
mypretty = function (x) {sprintf("%10.2s", x)}
myplot = function(data, MTITLE, YLAB) {
  counts <- data$tot
  names(counts) <- data$Event
  midpts <- barplot(counts, col=colors, main=MTITLE, 
                    xlab="", ylab=YLAB, names.arg="", xaxt= 'n',
                    cex.lab=.8, cex.axis=.8, cex.main=.8, cex.sub=.8)
  vps <- baseViewports()
  pushViewport(vps$inner, vps$figure, vps$plot)
  grid.text(names(counts),
            x = unit(midpts, "native"), y=unit(-1, "lines"),
            just="right", rot=30, gp=gpar(col=colors, fontsize=8))
  popViewport(3)
}
postStatus = function(message) {
   print(paste("Status update: ", message))
}

Synopsis

The National Storm Data is available for download at this data location. The data covers losses due to severe weather events from 1950 to 2011. Our goal is to analyse the data and determine:

  1. Across the US, which kinds of events are most harmful with respect to the population health?

  2. Across the US, which types of events have the greatest economic consequences.

The analysis uses the fields FATALITIES, INJURIES, PROPDMG, and CROPDMG to answer these questions. During the course of the analysis we merge some event types that are only superficially or marginally different from each other (e.g. singular and plural forms). There are over 329 unique event types. We report only top 10 events types. The analysis can easily be modified to show more events if needed. This entire report is produced using the code included in this document.

Data Processing

Creating samples of manageable size: Our two questions can be answered by working with a random sample of the data and scaling the resulting numbers for the full population. The program takes a variable SamplingPercent (current value = 15) to create such a random sample. This variable can be set to 100 if numbers for teh full population (without estimation) are required.

Identifying the most important variables: By studying the description of the variables we found the following variables in the data of value for this study and the data was condensed to only these variables: EVTYPE, FATALITIES, INJURIES, PROPDMG, PROPDMGEXP, CROPDMG, and CROPDMGEXP.

#knit('SevereWeatherEventAnalysis.Rmd', tangle=TRUE)
#source('SevereWeatherEventAnalysis.R')


postStatus("Getting zipped data.")
## [1] "Status update:  Getting zipped data."
if (!file.exists("./StormData.csv.bz2")) {
    download.file(url, "StormData.csv.bz2")
    postStatus("Zipped data downloded and stored locally.")
} else {
  postStatus("Zipped data found locally.")
}
## [1] "Status update:  Zipped data found locally."
if (!file.exists("./StormData.csv")) {
    tmp <- read.csv(bzfile("StormData.csv.bz2"))
    write.csv(tmp, "StormData.csv")
    write.csv(names(tmp), "allCols.csv")
    write.csv(dim(tmp), "dimofdata.csv")
    postStatus("Data Unzipped and stored locally.")
} else {
  postStatus("unzipped data found locally.")
}
## [1] "Status update:  unzipped data found locally."
if (!file.exists("FullDataUsefulCols.csv")) {
    usefulcols = c("EVTYPE", "FATALITIES", "INJURIES", "PROPDMG",  "PROPDMGEXP", "CROPDMG", "CROPDMGEXP")
    tmp <- read.csv("StormData.csv")
    tmp <- tmp[usefulcols]
    write.csv(tmp, "FullDataUsefulCols.csv")
    writeASample(floor(dim(tmp)[1]*.01*SamplingPercent), tmp, 
                 paste(SamplingPercent, "percent_Sampl.csv", sep="_"))
    writeASample(floor(dim(tmp)[1]*.01*5), tmp, 
                 paste(5, "percent_Sampl.csv", sep="_"))
    writeASample(floor(dim(tmp)[1]*.01*10), tmp,
                 paste(10, "percent_Sampl.csv", sep="_"))
    writeASample(floor(dim(tmp)[1]*.01*15), tmp,
                 paste(15, "percent_Sampl.csv", sep="_"))
    writeASample(floor(dim(tmp)[1]*.01*20), tmp,
                 paste(20, "percent_Sampl.csv", sep="_"))
    postStatus("Data Unzipped and stored locally.")
} else {
  postStatus("unzipped data found locally.")
}
## [1] "Status update:  unzipped data found locally."
workingFile = paste(SamplingPercent, "percent_Sampl.csv", sep="_")
if (!file.exists(workingFile)) {
    postStatus("Reading unzipped data and creating samples.")
    tmp <- read.csv("FullDataUsefulCols.csv")   
    writeASample(floor(dim(tmp)[1]*.01*SamplingPercent), tmp, workingFile)
} else {
  postStatus("Sample found locally.")
}
## [1] "Status update:  Sample found locally."
postStatus(paste("Reading workig file.", workingFile))
## [1] "Status update:  Reading workig file. 15_percent_Sampl.csv"
sampl <-  read.csv(workingFile)  

So we download the file if it is not already downloaded, and create some sample out of this file to be used in the future analysis. If the samples already exist we just read them.

Combining Similar Events: Many event types seem to be very similar to each other. We used R’s stringdist function as shown below to identify the event types that are very similar to each other and replaced them by their canonical form.

postStatus("Doing SQL-esque queries.")
## [1] "Status update:  Doing SQL-esque queries."
v = unique(sampl$EVTYPE)
u <- v
before = ""; after = ""
for (i in 1:(length(u)-1)) {
    for (j in (i+1):length(u)) {
      if ((u[i] != u[j]) && (stringdist(u[i], u[j], method='jaccard', q=2) < .25))
      {
        if (before == "") {
            before = u[j]; after = u[i]
        }
        u[j] = u[i]
      } 
  }
}
envtmp = new.env();
for (i in 1:(length(u))) {
    assign(as.character(v[i]), as.character(u[i]), envir = envtmp)
}

newcol = sampl$EVTYPE
for (i in 1:length(sampl$EVTYPE)) {
    if (newcol[i] != get(as.character(newcol[i]), envtmp)) {
        newcol[i] = get(as.character(newcol[i]), envtmp)
    }
}
length(unique(newcol))
## [1] 315
sampl <- cbind(sampl, newcol)

As a result of the above operation, 383 event types reduced to 315 canonical types. By the way of an example event type [TSTM WINDS] was replaced by its canonical form [TSTM WIND].

We will work with only the following most useful columns:

print(names(sampl)) 
##  [1] "X.1"        "X"          "EVTYPE"     "FATALITIES" "INJURIES"  
##  [6] "PROPDMG"    "PROPDMGEXP" "CROPDMG"    "CROPDMGEXP" "newcol"

Finding Events with most severe events:

fatalities <- sqldf(
     paste("select newcol as Event, sum(FATALITIES) as tot from sampl group by newcol order by tot desc limit", TOPN))
sigmaallfat <- sqldf("select sum(FATALITIES) from sampl")
sigmarestfat <- sigmaallfat - sum(fatalities$tot)
fatalities[TOPN+1, 1] <- "all others "
fatalities[TOPN+1, 2] <- sigmarestfat

injuries <- sqldf(
  paste("select newcol as Event, sum(INJURIES) as tot from sampl group by newcol order by tot desc limit", TOPN))
sigmaallinj <- sqldf("select sum(INJURIES) from sampl")
sigmarestinj <- sigmaallinj - sum(injuries$tot)
injuries[TOPN+1, 1] <- "all others "
injuries[TOPN+1, 2] <- sigmarestinj

proploss <- sqldf(
  paste("select newcol as Event, sum(PROPDMG) as tot from sampl group by newcol order by tot desc limit", TOPN))
sigmaallpro <- sqldf("select sum(PROPDMG) from sampl")
sigmarestpro <- sigmaallpro - sum(proploss$tot)
proploss[TOPN+1, 1] <- "all others "
proploss[TOPN+1, 2] <- sigmarestpro

croploss <- sqldf(
  paste("select newcol as Event, sum(CROPDMG) as tot from sampl group by newcol order by tot desc limit", TOPN))
sigmaallcro <- sqldf("select sum(CROPDMG) from sampl")
sigmarestcro <- sigmaallcro - sum(croploss$tot)
croploss[TOPN+1, 1] <- "all others "
croploss[TOPN+1, 2] <- sigmarestcro
postStatus("Data ready for plotting.")
## [1] "Status update:  Data ready for plotting."
fatalities[,2] <- fatalities[,2] * (100/SamplingPercent)
injuries[,2] <- injuries[,2] * (100/SamplingPercent)
proploss[,2] <- proploss[,2] * (100/SamplingPercent)
croploss[,2] <- croploss[,2] * (100/SamplingPercent)
#create full data set estimates

Results

The sections show the events that cause the most life and property damages.

Health damage due to severe weather events

10 sources of fatalities

Event Total Damage
TORNADO 5853.3333333
EXCESSIVE HEAT 2073.3333333
FLASH FLOOD 1086.6666667
LIGHTNING 880
RIP CURRENT 500
TSTM WIND 446.6666667
FLOOD 380
THUNDERSTORM WINDS 273.3333333
HEAT 253.3333333
HEAT WAVE 253.3333333
all others 2680

10 sources of injuries

Event Total Damage
TORNADO 9.240666710^{4}
TSTM WIND 7533.3333333
LIGHTNING 5493.3333333
BLIZZARD 4100
FLOOD 4020
THUNDERSTORM WINDS 3373.3333333
EXCESSIVE HEAT 3366.6666667
HEAT 2253.3333333
FLASH FLOOD 1766.6666667
WILDFIRE 1346.6666667
all others 1.130666710^{4}
par(mfrow=c(1,2))

myplot(fatalities, "Total Fatalities (severest events)", "Total Fatalities")
myplot(injuries, "Total Injuries (severest events)", "Total Injuries")

Economic damage due to severe weather events

10 sources of proploss

Event Total Damage
TORNADO 32
FLASH FLOOD 14
TSTM WIND 13
THUNDERSTORM WINDS 12
FLOOD 87
HAIL 68
LIGHTNING 57
HIGH WINDS 36
WINTER STORM 14
WILDFIRE 99
all others 77

10 sources of croploss

Event Total Damage
HAIL 59
FLOOD 18
FLASH FLOOD 15
TSTM WIND 11
TORNADO 10
THUNDERSTORM WINDS 64
DROUGHT 37
HIGH WINDS 14
HEAVY RAINS 78
WILDFIRE 70
all others 58
par(mfrow=c(1,2))
myplot(proploss, "Total Prop Loss (severest events)", "Total Property Loss (USD)")
myplot(croploss, "Total Crop Loss (severest events)", "Total Crop Loss (USD)")

Further research recommendation

  1. Adjust property and crop loss estimates for inflation.

  2. Examine recent years more closely. Due to improvement in technology and preventive mesaures, the estimates of bodily and property injuries and harms may have significantly changed in the recent years.

Citations

  1. The National Storm Data is available for download at this data location.

  2. The above data is described in a document that is avilable at the weather service website.