Storms and other severe weather events can cause both public health and economic problems for communities and municipalities. Many severe events can result in fatalities, injuries, and property damage, and preventing such outcomes to the extent possible is a key concern.
This project involves exploring the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database. This database tracks characteristics of major storms and weather events in the United States, including when and where they occur, as well as estimates of any fatalities, injuries, and property damage.
(Quoted from the Coursera course project instructions.)
# Record the R version, platform, locale and loaded packages used for this run.
sessionInfo()
## R version 3.1.2 (2014-10-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
##
## locale:
## [1] LC_CTYPE=en_HK.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_HK.UTF-8 LC_COLLATE=en_HK.UTF-8
## [5] LC_MONETARY=en_HK.UTF-8 LC_MESSAGES=en_HK.UTF-8
## [7] LC_PAPER=en_HK.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_HK.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## loaded via a namespace (and not attached):
## [1] digest_0.6.4 evaluate_0.5.5 formatR_1.0 htmltools_0.2.6
## [5] knitr_1.8 rmarkdown_0.3.10 stringr_0.6.2 tools_3.1.2
## [9] yaml_2.1.13
Download and load the raw data
# Download the compressed storm data set. The transfer is skipped when the
# archive is already in the working directory, so re-running the analysis does
# not fetch the file again; delete data.csv.bz2 to force a fresh download.
library(RCurl)
## Loading required package: bitops
archive_path <- "data.csv.bz2"
if (!file.exists(archive_path)) {
  download.file("https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2", destfile = archive_path, method = "curl")
}
# extract csv data
#library(R.utils)
#bunzip2("data.csv.bz2",destname = 'data.csv', overwrite=TRUE, remove=F)
# Read the comma-separated data directly through a bzip2 connection, so the
# manual extraction step above is not needed.
data <- read.table(bzfile(archive_path), header = TRUE, sep = ",")
# Quick sanity check of the loaded table: dimensions and column names.
dim(data)
## [1] 902297 37
names(data)
## [1] "STATE__" "BGN_DATE" "BGN_TIME" "TIME_ZONE" "COUNTY"
## [6] "COUNTYNAME" "STATE" "EVTYPE" "BGN_RANGE" "BGN_AZI"
## [11] "BGN_LOCATI" "END_DATE" "END_TIME" "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE" "END_AZI" "END_LOCATI" "LENGTH" "WIDTH"
## [21] "F" "MAG" "FATALITIES" "INJURIES" "PROPDMG"
## [26] "PROPDMGEXP" "CROPDMG" "CROPDMGEXP" "WFO" "STATEOFFIC"
## [31] "ZONENAMES" "LATITUDE" "LONGITUDE" "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS" "REFNUM"
Process the data
# Casualties per event: fatalities plus injuries.
data$health <- data$FATALITIES + data$INJURIES
# Sum the casualties within each event type.
health <- aggregate(data$health, by = list(data$EVTYPE), FUN = sum, na.rm = TRUE)
names(health) <- c("type", "num")
# Keep the ten event types with the largest totals, largest first.
health <- health[order(health$num, decreasing = TRUE)[1:10], ]
print(health)
## type num
## 830 TORNADO 96979
## 123 EXCESSIVE HEAT 8428
## 854 TSTM WIND 7461
## 164 FLOOD 7259
## 452 LIGHTNING 6046
## 269 HEAT 3037
## 147 FLASH FLOOD 2755
## 424 ICE STORM 2064
## 759 THUNDERSTORM WIND 1621
## 972 WINTER STORM 1527
# process the raw PROPDMGEXP and CROPDMGEXP
# List the exponent codes that occur in each column.
unique(data$PROPDMGEXP)
## [1] K M B m + 0 5 6 ? 4 2 3 h 7 H - 1 8
## Levels: - ? + 0 1 2 3 4 5 6 7 8 B h H K m M
unique(data$CROPDMGEXP)
## [1] M K m B ? 0 k 2
## Levels: ? 0 2 B k K m M
library(plyr)
# Translate the letter codes into powers of ten (B = 9, M/m = 6, K/k = 3,
# H/h = 2). Digit codes are left untouched by revalue() and become the
# exponent itself through as.numeric(); "-", "+" and "?" are mapped to NA, and
# any other non-numeric code (e.g. an empty one) also ends up as NA.
prop_exponent_map <- c("B" = 9, "M" = 6, "m" = 6, "K" = 3, "H" = 2, "h" = 2, "-" = NA, "+" = NA, "?" = NA)
crop_exponent_map <- c("B" = 9, "M" = 6, "m" = 6, "K" = 3, "k" = 3, "?" = NA)
data$propexpnum <- as.numeric(as.character(revalue(data$PROPDMGEXP, prop_exponent_map)))
data$croexpnum <- as.numeric(as.character(revalue(data$CROPDMGEXP, crop_exponent_map)))
# Compute property and crop damage separately and zero-fill each one before
# adding them. Summing the raw products directly makes the whole row NA as
# soon as ONE exponent is NA (even 0 * 10^NA is NA), so an event with valid
# property damage but no usable crop exponent would be discarded by the later
# aggregate(..., na.rm = TRUE) and the totals would be under-counted.
# NOTE(review): a component whose exponent is NA contributes 0 here; confirm
# whether an empty exponent should instead mean "no multiplier" (10^0).
prop_damage <- data$PROPDMG * 10^data$propexpnum
crop_damage <- data$CROPDMG * 10^data$croexpnum
prop_damage[is.na(prop_damage)] <- 0
crop_damage[is.na(crop_damage)] <- 0
data$econ <- prop_damage + crop_damage
# Sum the economic damage within each event type.
econ <- aggregate(data$econ, by = list(data$EVTYPE), FUN = sum, na.rm = TRUE)
names(econ) <- c("type", "num")
# Keep the ten event types with the largest totals, largest first.
econ <- econ[order(econ$num, decreasing = TRUE)[1:10], ]
print(econ)
## type num
## 164 FLOOD 138007444500
## 406 HURRICANE/TYPHOON 29348167800
## 830 TORNADO 16570326363
## 397 HURRICANE 12405268000
## 586 RIVER FLOOD 10108369000
## 238 HAIL 10048596590
## 147 FLASH FLOOD 8716525177
## 424 ICE STORM 5925150850
## 667 STORM SURGE/TIDE 4641493000
## 759 THUNDERSTORM WIND 3813647990
# Use a 12-line bottom margin; las = 2 draws the event-type labels
# perpendicular to the axis, so they need the extra room.
par(mar = c(12, 4, 4, 2))
# One bar chart per ranking: casualties first, then economic damage.
barplot(height = health$num, names.arg = health$type, main = "population health", las = 2)
barplot(height = econ$num, names.arg = econ$type, main = "economic", las = 2)
In summary, across the United States tornadoes are the most harmful event type for population health, and floods have the greatest economic consequences.