Data Processing

The file is downloaded and read in to a data frame using read.csv

if(!"RCurl" %in% rownames(installed.packages())) install.packages("RCurl")
library(RCurl)
## Loading required package: bitops
fileUrl <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
download.file(fileUrl, destfile = "./StormData.csv.bz2", method="curl")
OriginalData <- read.csv("./StormData.csv.bz2")

Look at the data

names(OriginalData)
##  [1] "STATE__"    "BGN_DATE"   "BGN_TIME"   "TIME_ZONE"  "COUNTY"    
##  [6] "COUNTYNAME" "STATE"      "EVTYPE"     "BGN_RANGE"  "BGN_AZI"   
## [11] "BGN_LOCATI" "END_DATE"   "END_TIME"   "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE"  "END_AZI"    "END_LOCATI" "LENGTH"     "WIDTH"     
## [21] "F"          "MAG"        "FATALITIES" "INJURIES"   "PROPDMG"   
## [26] "PROPDMGEXP" "CROPDMG"    "CROPDMGEXP" "WFO"        "STATEOFFIC"
## [31] "ZONENAMES"  "LATITUDE"   "LONGITUDE"  "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS"    "REFNUM"

Select only the column that matter to our anlysis. Also, since the data before Jan 1996 does not have all event types records, those missing data can make comparing different type of events unfair. Therefore, data before Jan 1966 is neglected.

UsefulData <- subset(OriginalData, as.Date(BGN_DATE,"%m/%d/%Y") < as.Date("1/1/1996", "%m/%d/%Y"), select = c(EVTYPE,INJURIES,FATALITIES,PROPDMG,PROPDMGEXP,CROPDMG,CROPDMGEXP))

Now it is left with

summary(UsefulData)
##                 EVTYPE         INJURIES           FATALITIES      
##  TSTM WIND         :91278   Min.   :   0.0000   Min.   :  0.0000  
##  HAIL              :80946   1st Qu.:   0.0000   1st Qu.:  0.0000  
##  TORNADO           :37498   Median :   0.0000   Median :  0.0000  
##  THUNDERSTORM WINDS:20843   Mean   :   0.3318   Mean   :  0.0258  
##  FLASH FLOOD       : 3278   3rd Qu.:   0.0000   3rd Qu.:  0.0000  
##  LIGHTNING         : 2551   Max.   :1700.0000   Max.   :583.0000  
##  (Other)           :12373                                         
##     PROPDMG         PROPDMGEXP        CROPDMG           CROPDMGEXP    
##  Min.   :  0.00          :189749   Min.   :  0.0000          :245344  
##  1st Qu.:  0.00   K      : 54727   1st Qu.:  0.0000   K      :  3146  
##  Median :  0.00   M      :  3956   Median :  0.0000   M      :   223  
##  Mean   : 13.04   0      :   215   Mean   :  0.7074   k      :    21  
##  3rd Qu.:  0.00   5      :    28   3rd Qu.:  0.0000   0      :    19  
##  Max.   :900.00   1      :    25   Max.   :750.0000   ?      :     7  
##                   (Other):    67                      (Other):     7

Look at the type of events (EVTPE)

head(unique(UsefulData$EVTYPE),n=20)
##  [1] TORNADO                   TSTM WIND                
##  [3] HAIL                      FREEZING RAIN            
##  [5] SNOW                      ICE STORM/FLASH FLOOD    
##  [7] SNOW/ICE                  WINTER STORM             
##  [9] HURRICANE OPAL/HIGH WINDS THUNDERSTORM WINDS       
## [11] RECORD COLD               HURRICANE ERIN           
## [13] HURRICANE OPAL            HEAVY RAIN               
## [15] LIGHTNING                 THUNDERSTORM WIND        
## [17] DENSE FOG                 RIP CURRENT              
## [19] THUNDERSTORM WINS         FLASH FLOOD              
## 985 Levels:    HIGH SURF ADVISORY  COASTAL FLOOD ... WND

According to the official document there are only 48 types, same type of events are labeled slightly differently

Load the official event type list from the documentation pdf file

if(!"pdftools" %in% rownames(installed.packages())) install.packages("pdftools")
library(pdftools)

txt <- pdf_text("https://d396qusza40orc.cloudfront.net/repdata%2Fpeer2_doc%2Fpd01016005curr.pdf")
page <- sub(".*Storm Data Event Table *(.*?) *Legend.*", "\\1", txt[6])

conn <- textConnection(page)
NameList <- do.call(rbind, lapply(readLines(conn), function(u) strsplit(u,'\\s{2,}')[[1]]))
NameList <- NameList[-1,c(1,3)]
NameListS <- c(NameList[,1],NameList[,2])
NameListS <- toupper(NameListS)

Approximate matching EVTYPE from out data to the official list, choosing the closest official event as the event type

if(!"stringdist" %in% rownames(installed.packages())) install.packages("stringdist")
library(stringdist)

chooseType <- function(x) {NameListS[amatch(x,NameListS,maxDist=25)]}

EventType <- sapply(UsefulData$EVTYPE,chooseType)

For the crop damage and the property damage, the column CROPDMG and CROPDMGEXP are used to calculate the ecrop damage. Likewise for property damage.

According to the supplementary article: https://rstudio-pubs-static.s3.amazonaws.com/58957_37b6723ee52b455990e149edde45e5b6.html

B or b = Billion, M or m = Million, K or k = Thousand, H or h = Hundred. The number from one to ten represent the multiplier of ten then adding the number. The symbols “-”, “+” and “?” can be ignored.

getValue <- function(dmg,exp) {
  dmg <- as.numeric(dmg)
  exp <- as.character(exp)
  if ((exp %in% "B") | (exp %in% "b")) dmg*10^9
  else if ((exp %in% "M") | (exp %in% "m")) dmg*10^6
  else if ((exp %in% "K") | (exp %in% "k")) dmg*10^3
  else if ((0 <= exp) & (exp <= 9)) dmg*10+as.numeric(exp)
  else dmg
}

recordCount <- length(UsefulData$CROPDMG)

EconDamage <- vector(length = recordCount)

for (i in (1:recordCount)){
  CropDamage <- getValue(UsefulData$CROPDMG[i],UsefulData$CROPDMGEXP[i])
  PropDamage <- getValue(UsefulData$PROPDMG[i],UsefulData$PROPDMGEXP[i])
  EconDamage[i] <- CropDamage + PropDamage
  }

Now everything can be put together and aggregate by event type.

cleanData <- cbind(EventType,sum(UsefulData$INJURIES,UsefulData$FATALITIES),EconDamage)
cleanData <- as.data.frame(cleanData)
names(cleanData) <- c("EventType","Injuries_Fatalities","EconDamage")
cleanData <- aggregate(cbind(Injuries_Fatalities,EconDamage)~EventType,data=cleanData,sum)
head(cleanData,n=10)
##          EventType Injuries_Fatalities EconDamage
## 1        AVALANCHE                  11        940
## 2         BLIZZARD                  90       6806
## 3    COASTAL FLOOD                 199      16623
## 4  COLD/WIND CHILL                  30        966
## 5      DEBRIS FLOW                  41       1699
## 6        DENSE FOG                 106       3168
## 7      DENSE SMOKE                   6        298
## 8          DROUGHT                  87       3071
## 9       DUST DEVIL                  15       2974
## 10      DUST STORM                  18       1363

Results

Across United Satae, which events are most harmful with respect to population health?

with(cleanData,barplot(height=Injuries_Fatalities,main="Public Heath Harm of different events",xlab="events",ylab="number of incidents"))

Only 8 of them have a significance in popultaion health, finding the top 8 events causing the most injuries in decending order.

if(!"dplyr" %in% rownames(installed.packages())) install.packages("dplyr")
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
arrData <- arrange(cleanData,desc(Injuries_Fatalities))
head(arrData[,-3],n=8)
##           EventType Injuries_Fatalities
## 1         HIGH WIND               93204
## 2              HAIL               81061
## 3           TORNADO               37537
## 4 THUNDERSTORM WIND               22303
## 5       FLASH FLOOD                5137
## 6         LIGHTNING                2563
## 7        HEAVY SNOW                1868
## 8             FLOOD                1573

Across United Satae, which events have the greatest economic consequences?

with(cleanData,barplot(height=EconDamage,main="Economic Damage of different events",xlab="events",ylab="amount of damgae (in US dollars)"))

Looking at the top 8 of the events causing greatest economic consequences in orders.

arrData <- arrange(cleanData,desc(EconDamage))
head(arrData[,-2],n=8)
##           EventType EconDamage
## 1           TORNADO    4663031
## 2 THUNDERSTORM WIND    3124983
## 3              HAIL     863250
## 4       FLASH FLOOD     695801
## 5         LIGHTNING     407261
## 6         HIGH WIND     328380
## 7             FLOOD     204386
## 8        HEAVY SNOW     101394

Compare to the previous table, the top 8 events are the same but in different orders.

In conclution, these are the 8 events which has the biggest impact on puplic health and economical damage.