Synopsis

The goal of this analysis is to determine which type of weather events across the US are the most harmful to public health and which have the greatest economic consequences. I used the NOAA Storm Database which tracked weather events from 1950 until 2011. I determined that tornados are the most significant storm event with respect to harming people and causing economic damage.

Data Processing

Reading and loading data

 if( !file.exists('data')){
    dir.create('data')
    fileUrl <- 'http://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2'
    download.file(fileUrl, destfile = 'data/sampledata.csv.bz2')  
 }
 data <- read.csv('data/sampledata.csv.bz2', header=TRUE)
 #head(data, 5) # display the first 5 rows

Type Checking

I am interested in 5 columns from the whole data set:

  • EVTYPE : Weather Event type

  • FATALITIES : no. of deaths

  • INJURIES : no. of injured

  • CROPDMG : no. of crop damage

  • PROPDMG : no. of property damage

 sapply(data, class) # Retrieve name and class of all columns in the data.frame called "data"
##    STATE__   BGN_DATE   BGN_TIME  TIME_ZONE     COUNTY COUNTYNAME 
##  "numeric"   "factor"   "factor"   "factor"  "numeric"   "factor" 
##      STATE     EVTYPE  BGN_RANGE    BGN_AZI BGN_LOCATI   END_DATE 
##   "factor"   "factor"  "numeric"   "factor"   "factor"   "factor" 
##   END_TIME COUNTY_END COUNTYENDN  END_RANGE    END_AZI END_LOCATI 
##   "factor"  "numeric"  "logical"  "numeric"   "factor"   "factor" 
##     LENGTH      WIDTH          F        MAG FATALITIES   INJURIES 
##  "numeric"  "numeric"  "integer"  "numeric"  "numeric"  "numeric" 
##    PROPDMG PROPDMGEXP    CROPDMG CROPDMGEXP        WFO STATEOFFIC 
##  "numeric"   "factor"  "numeric"   "factor"   "factor"   "factor" 
##  ZONENAMES   LATITUDE  LONGITUDE LATITUDE_E LONGITUDE_    REMARKS 
##   "factor"  "numeric"  "numeric"  "numeric"  "numeric"   "factor" 
##     REFNUM 
##  "numeric"

Apparently, required data are technically in the correct form.

Check Data Consistency

  • Check NA and INF for numerical data
  is.special <- function(x){
     if(is.numeric(x)) !is.finite(x)  else { is.na(x) || is.null(x)}
  }
  test <- sapply(data$FATALITIES, is.special)
  test <- factor(test) # coerce test to factor
  levels(test) # return FALSE only indicating no NA, INF, NaN or NULL values exist
## [1] "FALSE"
  • Check data levels for categorical data
# levels(data$EVTYPE) # Lots of inconsistent levels appear 
  • Fixing data for input inconsistency
data$EVTYPE <- toupper(data$EVTYPE)
data$EVTYPE <- gsub('.*TSTM.*', 'THUNDERSTORM', data$EVTYPE)
data$EVTYPE <- gsub('.*THUNDERSTORM.*','THUNDERSTORM', data$EVTYPE)
data$EVTYPE <- gsub('.*HEAT.*', 'HEAT', data$EVTYPE)
data$EVTYPE <- gsub('.*FLOOD.*', 'FLOOD', data$EVTYPE)
data$EVTYPE <- gsub('.*FLD.*', 'FLOOD', data$EVTYPE)
data$EVTYPE <- gsub('.*LIGHTNING.*', 'LIGHTNING', data$EVTYPE)
data$EVTYPE <- gsub('.*TROPICAL STORM.*', 'TROPICAL STORM', data$EVTYPE)
data$EVTYPE <- gsub('.*HURRICANE.*', 'TROPICAL STORM', data$EVTYPE)
data$EVTYPE <- gsub('.*CYCLONE.*', 'TROPICAL STORM', data$EVTYPE)
data$EVTYPE <- gsub('.*TROPICAL DEPRESSION.*', 'TROPICAL STORM', data$EVTYPE)
data$EVTYPE <- gsub('.*TYPHOON.*', 'TROPICAL STORM', data$EVTYPE)
data$EVTYPE <- gsub('.*STORM SURGE.*', 'TROPICAL STORM', data$EVTYPE)
data$EVTYPE <- gsub('.*TORNADO.*', 'TORNADO', data$EVTYPE)
data$EVTYPE <- gsub('.*SNOW.*', 'SNOW', data$EVTYPE)
data$EVTYPE <- gsub('.*WINTER.*', 'WINTER', data$EVTYPE)
data$EVTYPE <- gsub('.*ICE.*', 'ICE', data$EVTYPE)
data$EVTYPE <- gsub('.*BLIZZARD.*', 'BLIZZARD', data$EVTYPE)
data$EVTYPE <- gsub('.*COLD.*', 'COLD', data$EVTYPE)
data$EVTYPE <- gsub('.*ICY.*', 'ICE', data$EVTYPE)
data$EVTYPE <- gsub('.*FREEZE.*', 'FREEZE', data$EVTYPE)
data$EVTYPE <- gsub('.*WIND.*', 'WINDS', data$EVTYPE)
data$EVTYPE <- gsub('.*RAIN.*', 'RAIN', data$EVTYPE)
data$EVTYPE <- gsub('.*SHOWER.*', 'HEAVY RAIN', data$EVTYPE)
data$EVTYPE <- gsub('.*FIRE.*', 'FIRE', data$EVTYPE)
data$EVTYPE <- gsub('.*SLIDE.*', 'LANDSLIDE', data$EVTYPE)
data$EVTYPE <- gsub('.*FOG.*', 'FOG', data$EVTYPE)
data$EVTYPE <- gsub('.*HAIL.*', 'HAIL', data$EVTYPE)

Now, data is ready for analysis…

Results

Q.1. Across the United States, which types of events (as indicated in the EVTYPE variable) are most harmful with respect to population health?

  • Proposed Hypothesis: \[ Human Harm = Deaths + Injuries \]
  library(data.table)
 # select required data
  health <- data.table(subset(data, select = c('EVTYPE', 'FATALITIES', 'INJURIES'))) 
 # calculate sum of dead and injured people grouped by EVTYPE  
  health.totals <- health[,list(FATALITIES=sum(FATALITIES), INJURIES=sum(INJURIES)), by='EVTYPE']
 # add a new column called HUMAN_HARM that stores the sum of dead and injured people  
  health.totals$HUMAN_HARM <- health.totals$FATALITIES + health.totals$INJURIES
 # sort the data descendingly and store the top 5 rows
  health.top <- head(subset(health.totals[order(health.totals$HUMAN_HARM, decreasing=T)]), 5)

Now, we plot the sorted data (health.top)

 # convert data.table to data.frame
 health.top <- data.frame(health.top)
  
 # plot bar chart

 library(ggplot2)
  
 ggplot(data=health.top, aes(x=EVTYPE, y = HUMAN_HARM)) + 
        geom_bar(color="black", fill="red", width=0.7, stat="identity") + 
        xlab("Event Type") +  ylab("Human Harm")  + 
        ggtitle("Human Harm vs. Event Type")

plot of chunk unnamed-chunk-7

 # store the worst event
 event <- toString(with(health.top, EVTYPE[1])) 
 event
## [1] "TORNADO"

Apparently, TORNADO is the worst event. Now, the break down of the human harm needs to be checked…

  ## draw a bar chart for 3 quantitative data
  library(data.table)  
  names(health.top) <- c("Event", "Deaths", "Injuries", "Human_Harm")
  library(reshape2)
  melted  <- melt(health.top ,id.vars = 'Event')
  ggplot(melted, aes(x = Event , y = value)) + 
        geom_bar(aes(fill = variable ),position = "dodge",stat="identity", width=0.7) + 
        labs(x ='EVENT TYPE' , y = 'TOTAL',title = 'TOP 5 Fatalities/Injuries Count by Event Type')

plot of chunk unnamed-chunk-8

From the above plots, we may conclude that the most harmful event is TORNADO because it causes the maximum number of injuries is 9.1407 × 104 and the maximum number of death is 5636. The sum of all the aforementioned human harms is 9.7043 × 104.

Q.2. Across the United States, which types of events have the greatest economic consequences?

  • Proposed Hypothesis: \[ Economic Harm = Crop Damage + Property Damage \]
# select required data
library(data.table)
property <- data.table(subset(data, select=c('EVTYPE', 'CROPDMG', 'PROPDMG')))
# calculate sum of crop and property damages grouped by EVTYPE
property.totals <- property[,list(CROPDMG=sum(CROPDMG), PROPDMG=sum(PROPDMG)), by='EVTYPE']
# add a new column called HUMAN_HARM that stores the sum of crop and property damages
property.totals$ECONOMIC_HARM <-  property.totals$CROPDMG + property.totals$PROPDMG
# sort the data descendingly and store the top 5 rows
property.top <- head(subset(property.totals[order(property.totals$ECONOMIC_HARM, decreasing=T)]), 5)

Now, we plot the sorted data (property.top)

 # convert data.table to data.frame
 property.top <- data.frame(property.top)
 
 # plot bar chart
 library(ggplot2)
 names(property.top) <- c("Event", "Crop_Damage", "Property_Damage", "Economic_Harm")
 library(reshape2)
 melted  <- melt(property.top ,id.vars = 'Event')
 # plot bar chart (Note: divide y by 10^6 to scale to millions)
 ggplot(melted, aes(x = Event , y = value/10**6)) + 
        geom_bar(aes(fill = variable ),position = "dodge",stat="identity", width=0.7) + 
        labs(x ='EVENT TYPE' , y = 'TOTAL Dollars Lost (millions)', title = 'TOP 5 Crop/Prop Damages by Event Type')

plot of chunk unnamed-chunk-10

 # store the worst event
 event1 <- toString(with(property.top, Event[1])) 
 event1
## [1] "TORNADO"

From the above plot, we may conclude that the most harmful event is TORNADO because it causes the maximum economical loss : 3.3158 million dollars..

Hardware and Software Profile for Conducting This Analysis

sessionInfo()
## R version 3.1.1 (2014-07-10)
## Platform: x86_64-apple-darwin13.1.0 (64-bit)
## 
## locale:
## [1] en_CA.UTF-8/en_CA.UTF-8/en_CA.UTF-8/C/en_CA.UTF-8/en_CA.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## loaded via a namespace (and not attached):
## [1] digest_0.6.4     evaluate_0.5.5   formatR_1.0      htmltools_0.2.4 
## [5] knitr_1.6        rmarkdown_0.2.68 stringr_0.6.2    tools_3.1.1     
## [9] yaml_2.1.13