Coursera repdata-016: Project2

Synopsis

Storms and other severe weather events can cause both public health and economic problems for communities and municipalities. Many severe events can result in fatalities, injuries, and property damage, and preventing such outcomes to the extent possible is a key concern.

This project involves exploring the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database. This database tracks characteristics of major storms and weather events in the United States, including when and where they occur, as well as estimates of any fatalities, injuries, and property damage.

(Quoted from the Coursera course project instructions.)

R sessionInfo

# Record the R version, platform, locale, and loaded packages for reproducibility.
sessionInfo()

## R version 3.1.2 (2014-10-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## 
## locale:
##  [1] LC_CTYPE=en_HK.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_HK.UTF-8        LC_COLLATE=en_HK.UTF-8    
##  [5] LC_MONETARY=en_HK.UTF-8    LC_MESSAGES=en_HK.UTF-8   
##  [7] LC_PAPER=en_HK.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_HK.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## loaded via a namespace (and not attached):
## [1] digest_0.6.4     evaluate_0.5.5   formatR_1.0      htmltools_0.2.6 
## [5] knitr_1.8        rmarkdown_0.3.10 stringr_0.6.2    tools_3.1.2     
## [9] yaml_2.1.13

Data Processing

download and load the raw data

# download data
library(RCurl)

## Loading required package: bitops

# Fetch the bzip2-compressed storm data set using the curl download method.
download.file(
  url = "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2",
  destfile = "data.csv.bz2",
  method = "curl"
)
# Explicit extraction to a plain CSV is not required (kept for reference):
# library(R.utils)
# bunzip2("data.csv.bz2", destname = 'data.csv', overwrite = TRUE, remove = F)
# Read the comma-separated table straight from the compressed archive.
data <- read.table(bzfile("data.csv.bz2"), header = TRUE, sep = ",")
# Report the number of rows and columns that were loaded.
dim(data)

## [1] 902297     37

# List the column names of the loaded storm data.
names(data)

##  [1] "STATE__"    "BGN_DATE"   "BGN_TIME"   "TIME_ZONE"  "COUNTY"    
##  [6] "COUNTYNAME" "STATE"      "EVTYPE"     "BGN_RANGE"  "BGN_AZI"   
## [11] "BGN_LOCATI" "END_DATE"   "END_TIME"   "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE"  "END_AZI"    "END_LOCATI" "LENGTH"     "WIDTH"     
## [21] "F"          "MAG"        "FATALITIES" "INJURIES"   "PROPDMG"   
## [26] "PROPDMGEXP" "CROPDMG"    "CROPDMGEXP" "WFO"        "STATEOFFIC"
## [31] "ZONENAMES"  "LATITUDE"   "LONGITUDE"  "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS"    "REFNUM"

process the data

population health
Use `data$FATALITIES + data$INJURIES` (fatalities plus injuries) as the measure of population health.

# Per-record casualty count: fatalities plus injuries.
data$health <- data$FATALITIES + data$INJURIES
# Total the casualties for every event type.
health <- aggregate(
  data$health,
  by = list(data$EVTYPE),
  FUN = sum,
  na.rm = TRUE
)
names(health) <- c("type", "num")
# Rank the event types from most to least harmful and keep the first ten.
health <- health[order(health$num, decreasing = TRUE), ][1:10, ]
print(health)

##                  type   num
## 830           TORNADO 96979
## 123    EXCESSIVE HEAT  8428
## 854         TSTM WIND  7461
## 164             FLOOD  7259
## 452         LIGHTNING  6046
## 269              HEAT  3037
## 147       FLASH FLOOD  2755
## 424         ICE STORM  2064
## 759 THUNDERSTORM WIND  1621
## 972      WINTER STORM  1527

economic
Use PROPDMG and CROPDMG, scaled by their exponent columns PROPDMGEXP and CROPDMGEXP, as the measure of economic loss.

# Inspect the raw exponent codes in PROPDMGEXP before converting them
# (CROPDMGEXP gets the same treatment).
unique(data$PROPDMGEXP)

##  [1] K M   B m + 0 5 6 ? 4 2 3 h 7 H - 1 8
## Levels:  - ? + 0 1 2 3 4 5 6 7 8 B h H K m M

# Inspect the raw exponent codes in CROPDMGEXP.
unique(data$CROPDMGEXP)

## [1]   M K m B ? 0 k 2
## Levels:  ? 0 2 B k K m M

library(plyr)
# Translate the damage-exponent codes into powers of ten:
# B = 9, M/m = 6, K/k = 3, H/h = 2. Digit codes are converted to their own
# numeric value by as.numeric(); the symbols "-", "+", "?" and the empty code
# become NA, i.e. the magnitude of that damage figure is unknown.
data$propexpnum <- as.numeric(as.character((revalue(data$PROPDMGEXP, c("B"=9, "M" = 6, "m" = 6,"K" = 3,"H" = 2, "h" = 2,"-"= NA, "+" =NA, "?" = NA)))))
data$croexpnum <- as.numeric(as.character(revalue(data$CROPDMGEXP, c("B"=9, "M" = 6, "m" = 6,"K" = 3,"k" = 3, "?" =NA))))
# Compute property and crop losses separately. Adding the two raw products
# directly would turn the whole row into NA whenever only ONE exponent is
# unknown, and na.rm would then silently discard the other, known component.
# An unknown component therefore contributes zero instead of erasing the row.
prop_loss <- data$PROPDMG * 10^data$propexpnum
crop_loss <- data$CROPDMG * 10^data$croexpnum
prop_loss[is.na(prop_loss)] <- 0
crop_loss[is.na(crop_loss)] <- 0
data$econ <- prop_loss + crop_loss
# Total the economic loss for every event type.
econ <- aggregate(data$econ, by = list(data$EVTYPE), FUN = sum, na.rm = TRUE)
names(econ) <- c("type", "num")
# Rank the event types by total loss and keep the ten largest.
econ <- econ[order(econ$num, decreasing = TRUE), ]
econ <- econ[1:10, ]
print(econ)

##                  type          num
## 164             FLOOD 138007444500
## 406 HURRICANE/TYPHOON  29348167800
## 830           TORNADO  16570326363
## 397         HURRICANE  12405268000
## 586       RIVER FLOOD  10108369000
## 238              HAIL  10048596590
## 147       FLASH FLOOD   8716525177
## 424         ICE STORM   5925150850
## 667  STORM SURGE/TIDE   4641493000
## 759 THUNDERSTORM WIND   3813647990

Results

# Enlarge the bottom margin so the rotated event-type labels fit.
par(mar = c(12, 4, 4, 2))
# Top ten event types by combined fatalities and injuries.
barplot(
  height = health$num,
  names.arg = health$type,
  main = "population health",
  las = 2
)

# Top ten event types by combined property and crop damage.
barplot(
  height = econ$num,
  names.arg = econ$type,
  main = "economic",
  las = 2
)

So, the most harmful event types are:

- population health: tornado
- economic consequences: flood