# data loading
if(file.exists('rawdata.csv.bz2')){
url = 'https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2'
download.file(url,destfile='rawdata.csv.bz2',method='curl')
}
data<-read.csv(bzfile('rawdata.csv.bz2'))
# a quick summary of the data
evtype <- levels(data$EVTYPE)
# cleanning data
tempstr = data$EVTYPE
newlevel<-c('-AVALANCHE','-ASTRONOMICAL HIGH TIDE','-ASTRONOMICAL LOW TIDE','-FROST/FREEZE','-RIP CURRENT','-HEAVY SNW','-FLD','-FLASH-FLD','-STORM SURGE','-DUST STORM','-DROUGHT','-MICROBURST','-DENSE SMOKE','-DENSE FOG','-FREEZING FOG','-ICE-JAM','-FRZ FOG','-SLEET','-DEBRIS FLOW','-DAM BREAK','-COLD/WIND CHILL','-EXT COLD','-COASTAL-FLD','-LAKESHORE-FLD','-LAKE-EFFECT SNW','-OTHER','-BLIZZARD','-HAIL','-ICE STORM','-FUNNEL CLOUD','-DUST DEVIL','-WATERSPOUT','-TROPICAL STORM','-TROPICAL DEPRESSION','-WILDFIRE','-LIGHTNING','-MARINE-StrW','-MARINE-HW','-MARINE-HAL','-MARINE-OTHER','-THUNDERSTORM WIND','-MARINE-THUSTRM','-HURRICANE','-VOLCANIC ASH','-TORNADO','-WINTRY MIX','-WINTER STORM','-TSUNAMI','-HIGH SURF','-SEICHE','-VOG','-WET','-EXT HEAT','-HEAT','-HEAVY RAIN','-HEAVY WIND')
tempstr <- factor(tempstr, levels = c(levels(tempstr), newlevel))
# this is going to be messy, and I will do it manually. and start from the event sounds seriously and unique, e.g. Tonado, "Volcanic", ...
ind <- grep("(TORNDAO|TORNADO|Whirlwind|ROTATING WALL CLOUD|WALL CLOUD)",data$EVTYPE,ignore.case=TRUE)
tempstr[ind]='-TORNADO'
ind <- grep("(VOLCANIC)",tempstr,ignore.case=TRUE)
tempstr[ind]='-VOLCANIC ASH'
ind <- grep("(HURRICANE|TYPHOON)",tempstr,ignore.case=TRUE)
tempstr[ind]='-HURRICANE'
ind <- grep("(MARINE TSTM)|(MARINE THU)",tempstr,ignore.case=TRUE)
tempstr[ind] <- '-MARINE-THUSTRM'
ind <- grep("(TSTM|THUNDERSTORM|TUNDERSTORM|THUNDEERSTORM|THUDERSTORM|THUNDESTROM|THUNDERTORM|THUNDERESTORM|THUNDERSTROM|THUNDERTSORM|THUNERSTORM|THUNDESTORM|Downburst|gustnadoes)",tempstr,ignore.case=TRUE)
tempstr[ind] <- '-THUNDERSTORM WIND'
ind <- grep("MARINE Accident|MARINE MISHAP",tempstr,ignore.case=TRUE)
tempstr[ind] <- '-MARINE-OTHER'
ind <- grep("MARINE HAIL",tempstr,ignore.case=TRUE)
tempstr[ind] <- '-MARINE-HAL'
ind <- grep("MARINE HIGH",tempstr,ignore.case=TRUE)
tempstr[ind] <- '-MARINE-HW'
ind <- grep("MARINE STRONG",tempstr,ignore.case=TRUE)
tempstr[ind] <- '-MARINE-StrW'
ind <- grep("lightning|lighting|ligntning",tempstr,ignore.case=TRUE)
tempstr[ind] <- '-LIGHTNING'
ind <- grep("fire",tempstr,ignore.case=TRUE)
tempstr[ind] <- '-WILDFIRE'
ind <- grep("tropical storm",tempstr,ignore.case=TRUE)
tempstr[ind] <- '-TROPICAL STORM'
ind <- grep("tropical depression",tempstr,ignore.case=TRUE)
tempstr[ind] <- '-TROPICAL DEPRESSION'
ind <- grep("water s|waters|WAYTERSPOUT",tempstr,ignore.case=TRUE)
tempstr[ind] <- '-WATERSPOUT'
ind <- grep("dust devil|dust devel|landspout|duststorm",tempstr,ignore.case=TRUE)
tempstr[ind] <- '-DUST DEVIL'
ind <- grep("avalance|avalanche",tempstr,ignore.case=TRUE)
tempstr[ind] <- '-AVALANCHE'
ind <- grep("fun",tempstr,ignore.case=TRUE)
tempstr[ind] <- '-FUNNEL CLOUD'
ind <- grep("glaze/ice storm",tempstr,ignore.case=TRUE)
tempstr[ind] <- '-ICE STORM'
ind <- grep("sleet|pellets|black ice|glaze",tempstr,ignore.case=TRUE)
tempstr[ind] <- '-SLEET'
ind <- grep("hail",tempstr,ignore.case=TRUE)
tempstr[ind] <- '-HAIL'
ind <- grep('bli',tempstr,ignore.case=TRUE)
tempstr[ind] <- '-BLIZZARD'
ind <- grep('summary|other|\\?|none',tempstr,ignore.case=TRUE)
tempstr[ind] <- '-OTHER'
ind <- grep('lake effect|lake snow|lake-effect',tempstr,ignore.case=TRUE)
tempstr[ind] <- '-LAKE-EFFECT SNW'
ind <- grep('lake flood|lakeshore flood|lakeshre flood',tempstr,ignore.case=TRUE)
tempstr[ind] <- '-LAKESHORE-FLD'
ind <- grep('coast|Cstl Flood|beach flood|BEACH',tempstr,ignore.case=TRUE)
tempstr[ind] <- '-COASTAL-FLD'
ind <- which( (grepl('(chill|cold)',tempstr,ignore.case=TRUE) &
grepl('wind',tempstr,ignore.case=TRUE)) &
!(grepl('extr|prolon|record|UNSEASONAB|UNUSUAL|severe',tempstr,ignore.case=TRUE)))
tempstr[ind] <- '-COLD/WIND CHILL'
ind <- grep('dam ',tempstr,ignore.case=TRUE)
tempstr[ind] <- '-DAM BREAK'
ind <- grep('mud|slide|landslump',tempstr,ignore.case=TRUE)
tempstr[ind] <- '-DEBRIS FLOW'
ind <- grep("road",tempstr,ignore.case=TRUE)
tempstr[ind] <- '-SLEET'
ind <- grep('freezing fog|ice fog|freezing drizzle|freezing spray',tempstr,ignore.case=TRUE)
tempstr[ind] <- '-FRZ FOG'
ind <- grep('ice patchy|patchy ice|ice jam|ice floes',tempstr,ignore.case=TRUE)
tempstr[ind] <- '-ICE-JAM'
ind <- intersect(grep('(ice)',tempstr,ignore.case=TRUE),
grep('snow|storm',tempstr,ignore.case=TRUE))
tempstr[ind] <- '-ICE STORM'
ind <- intersect(grep('(ice)',tempstr,ignore.case=TRUE),
grep('winds',tempstr,ignore.case=TRUE))
tempstr[ind] <- '-ICE STORM'
ind <- which(tempstr == 'ICE')
tempstr[ind] <- "-ICE STORM"
ind <- grep('freezing fog|fog and cold',tempstr,ignore.case=TRUE)
tempstr[ind] <- '-FREEZING FOG'
ind <- grep('dense fog',tempstr,ignore.case=TRUE)
tempstr[ind] <- '-DENSE FOG'
ind <- which(tempstr == 'FOG')
tempstr[ind] <- '-DENSE FOG'
ind <- grep('smoke',tempstr,ignore.case=TRUE)
tempstr[ind] <- '-DENSE SMOKE'
ind <- grep('microburst|mircoburst|micoburst',tempstr,ignore.case=TRUE)
tempstr[ind] <- '-MICROBURST'
ind <- grep('dro|dry|driest',tempstr,ignore.case=TRUE)
tempstr[ind] <- '-DROUGHT'
ind <- grep('dust storm|blowing dust|dust storm|saharan',tempstr,ignore.case=TRUE)
tempstr[ind] <- '-DUST STORM'
ind <- grep(' sea|wave|swell|surge|blow-out tide|high wind and high tide|high tides|tidal',tempstr,ignore.case=TRUE)
tempstr[ind] <- '-STORM SURGE'
ind <- grep('flash',tempstr,ignore.case=TRUE)
tempstr[ind] <- '-FLASH-FLD'
ind <- grep('flood| fld',tempstr,ignore.case=TRUE)
tempstr[ind] <- '-FLD'
ind <- grep('heavy snow',tempstr,ignore.case=TRUE)
tempstr[ind] <- '-HEAVY SNW'
ind <- grep('rip|current',tempstr,ignore.case=TRUE)
tempstr[ind] <- '-RIP CURRENT'
ind <- grep('frost|freeze',tempstr,ignore.case=TRUE)
tempstr[ind] <- '-FROST/FREEZE'
ind <- grep('winter mix|winter weather|wintery mix|wintry',tempstr,ignore.case=TRUE)
tempstr[ind] <- '-WINTRY MIX'
ind <- grep('winter',tempstr,ignore.case=TRUE)
tempstr[ind] <- '-WINTER STORM'
ind <- grep('surf',tempstr,ignore.case=TRUE)
tempstr[ind] <- '-HIGH SURF'
ind <- grep('tsunami',tempstr,ignore.case=TRUE)
tempstr[ind] <- '-TSUNAMI'
ind <- grep('lack of snow',tempstr,ignore.case=TRUE)
tempstr[ind] <- '-OTHER'
ind <- intersect(grep('RECORD|heavy|excessive',tempstr,ignore.case=TRUE),grep('snow',tempstr,ignore.case=TRUE))
tempstr[ind] <- '-HEAVY SNW'
ind <- grep('snow',tempstr,ignore.case=TRUE)
tempstr[ind] <- '-WINTER STORM'
ind <- which( (grepl('cold',tempstr,ignore.case=TRUE)&grepl('excessive|ext|record|prolong|severe|unseas|unusually',tempstr,ignore.case=TRUE)) & ! grepl('-',tempstr,ignore.case=TRUE))
tempstr[ind] <- '-EXT COLD'
ind <- which( (grepl('chill',tempstr,ignore.case=TRUE)&grepl('extreme',tempstr,ignore.case=TRUE)) & ! grepl('-',tempstr,ignore.case=TRUE))
tempstr[ind] <- '-EXT COLD'
ind <- which( (grepl('cold',tempstr,ignore.case=TRUE))&!(grepl('-',tempstr,ignore.case=TRUE)))
tempstr[ind] <- '-COLD/WIND CHILL'
ind <- grep('SEICHE',tempstr,ignore.case=TRUE)
tempstr[ind] <- '-SEICHE'
ind <- which(grepl('wet',tempstr,ignore.case=TRUE))
tempstr[ind] <- '-WET'
ind <- which(grepl('vog',tempstr,ignore.case=TRUE))
tempstr[ind] <- '-VOG'
ind <- which( (grepl('warm|heat|hot|temperature',tempstr,ignore.case=TRUE) &
grepl('abnormal|very|unusual|spell|exc|ext|rec|unseason|burst',tempstr,ignore.case=TRUE))
&!(grepl('-|low',tempstr,ignore.case=TRUE)))
tempstr[ind] <- '-EXT HEAT'
ind <- which( (grepl('warm|heat|hot|temperature',tempstr,ignore.case=TRUE) )
&!(grepl('-|low',tempstr,ignore.case=TRUE)))
tempstr[ind] <- '-HEAT'
ind <- which( (grepl('cold|temp|cool spell',tempstr,ignore.case=TRUE) )
&!(grepl('-',tempstr,ignore.case=TRUE)))
tempstr[ind] <- '-EXT COLD'
ind <- which(grepl('thermia',tempstr,ignore.case=TRUE))
tempstr[ind] <- '-EXT HEAT'
ind <- which(grepl('ASTRONOMICAL HIGH TIDE',tempstr,ignore.case=TRUE))
tempstr[ind] <- '-ASTRONOMICAL HIGH TIDE'
ind <- which(grepl('ASTRONOMICAL LOW TIDE',tempstr,ignore.case=TRUE))
tempstr[ind] <- '-ASTRONOMICAL LOW TIDE'
ind <- which( (grepl('rain|water|perci|stream|shower|precip',tempstr,ignore.case=TRUE) )
&!(grepl('-',tempstr,ignore.case=TRUE)))
tempstr[ind] <- '-HEAVY RAIN'
ind <- which( (grepl('wnd|wind|storm',tempstr,ignore.case=TRUE) )
&!(grepl('-',tempstr,ignore.case=TRUE)))
tempstr[ind] <- '-HEAVY WIND'
ind <- which(grepl('NON-SEVERE WIND',tempstr,ignore.case=TRUE))
tempstr[ind] <- '-OTHER'
ind <- which(!grepl('-',tempstr,ignore.case=TRUE))
tempstr[ind] <- '-OTHER'
#store the standardized EVTYPE to data$altEVTYPE, and print out it's levels.
data$altEVTYPE <- factor(tempstr)
levels(data$altEVTYPE)
## [1] "-AVALANCHE" "-ASTRONOMICAL HIGH TIDE"
## [3] "-ASTRONOMICAL LOW TIDE" "-FROST/FREEZE"
## [5] "-RIP CURRENT" "-HEAVY SNW"
## [7] "-FLD" "-FLASH-FLD"
## [9] "-STORM SURGE" "-DUST STORM"
## [11] "-DROUGHT" "-MICROBURST"
## [13] "-DENSE SMOKE" "-DENSE FOG"
## [15] "-FREEZING FOG" "-ICE-JAM"
## [17] "-FRZ FOG" "-SLEET"
## [19] "-DEBRIS FLOW" "-DAM BREAK"
## [21] "-COLD/WIND CHILL" "-EXT COLD"
## [23] "-COASTAL-FLD" "-LAKESHORE-FLD"
## [25] "-LAKE-EFFECT SNW" "-OTHER"
## [27] "-BLIZZARD" "-HAIL"
## [29] "-ICE STORM" "-FUNNEL CLOUD"
## [31] "-DUST DEVIL" "-WATERSPOUT"
## [33] "-TROPICAL STORM" "-TROPICAL DEPRESSION"
## [35] "-WILDFIRE" "-LIGHTNING"
## [37] "-MARINE-StrW" "-MARINE-HW"
## [39] "-MARINE-HAL" "-THUNDERSTORM WIND"
## [41] "-MARINE-THUSTRM" "-HURRICANE"
## [43] "-VOLCANIC ASH" "-TORNADO"
## [45] "-WINTRY MIX" "-WINTER STORM"
## [47] "-TSUNAMI" "-HIGH SURF"
## [49] "-SEICHE" "-VOG"
## [51] "-WET" "-EXT HEAT"
## [53] "-HEAT" "-HEAVY RAIN"
## [55] "-HEAVY WIND"
damage.human <- aggregate(cbind(FATALITIES,INJURIES)~altEVTYPE,data,FUN=sum)
damage.human$SUM <- damage.human$FATALITIES+damage.human$INJURIES
names(damage.human) = c('EVTYPE','Fatalities','Injuries','Total')
indHuman = sort(damage.human$Total,decreasing=TRUE,index.return=TRUE)
sdamage.human=damage.human[indHuman$ix,]
sdamage.human$index = 1:55
head(sdamage.human,10)
## EVTYPE Fatalities Injuries Total index
## 44 -TORNADO 5662 91407 97069 1
## 40 -THUNDERSTORM WIND 711 9510 10221 2
## 52 -EXT HEAT 2030 6747 8777 3
## 7 -FLD 512 6873 7385 4
## 36 -LIGHTNING 817 5231 6048 5
## 53 -HEAT 937 2102 3039 6
## 8 -FLASH-FLD 1035 1800 2835 7
## 55 -HEAVY WIND 428 1840 2268 8
## 29 -ICE STORM 100 2141 2241 9
## 46 -WINTER STORM 252 1480 1732 10
#unify the unit.
index=c('h','k','m','B')
factor=c(1/10000.,1/1000.,1.,1000.)
data$NormPROPDMG = data$PROPDMG * 0.0
data$NormCROPDMG = data$CROPDMG * 0.0
for (i in 1:4){
ind = which(grepl(index[i],data$PROPDMGEXP,ignore.case=TRUE))
data$NormPROPDMG[ind] = data$PROPDMG[ind]*factor[i]
ind = which(grepl(index[i],data$CROPDMGEXP,ignore.case=TRUE))
data$NormCROPDMG[ind] = data$CROPDMG[ind]*factor[i]
}
#summing up
damage.property<- aggregate(cbind(NormPROPDMG,NormCROPDMG)~altEVTYPE,data,FUN=sum)
damage.property$SUM<-damage.property$NormPROPDMG+damage.property$NormCROPDMG
names(damage.property) = c('EVTYPE','Property','Crop','Total')
indProp = sort(damage.property$Total,decreasing=TRUE,index.return=TRUE)
sdamage.property=damage.property[indProp$ix,]
sdamage.property$index = 1:55
head(sdamage.property,10)
## EVTYPE Property Crop Total index
## 7 -FLD 150218 10851.314 161069 1
## 42 -HURRICANE 85356 5516.118 90873 2
## 44 -TORNADO 58593 417.461 59011 3
## 9 -STORM SURGE 47976 6.405 47983 4
## 28 -HAIL 15975 3046.888 19021 5
## 8 -FLASH-FLD 16907 1532.197 18439 6
## 11 -DROUGHT 1046 13972.622 15019 7
## 40 -THUNDERSTORM WIND 10971 1271.664 12243 8
## 29 -ICE STORM 3967 5022.114 8989 9
## 35 -WILDFIRE 8497 403.282 8900 10
library(ggplot2)
plotDamageHuman=data.frame(index=rep(1:55,2),EVTYPE=rep(sdamage.human$EVTYPE,2),y=c(sdamage.human$Fatalities,sdamage.human$Total), case=rep(c('Falalties','Total'),each=55))
qplot(index, y, data=plotDamageHuman,log='y',geom=c('point','line'),color=case,ylab='number of human')
plotDamageProperty=data.frame(index=rep(1:55,3),EVTYPE=rep(sdamage.property$EVTYPE,3),y=c(sdamage.property$Property,sdamage.property$Crop,sdamage.property$Total), case=rep(c('Property','Crop','Total'),each=55))
qplot(index, y, data=plotDamageProperty,log='y',geom=c('point','line'),color=case,ylab='damage (mUSD)')
[1] Storm Data FAQ https://d396qusza40orc.cloudfront.net/repdata%2Fpeer2_doc%2FNCDC%20Storm%20Events-FAQ%20Page.pdf
[2] Storm Data Documentation: https://d396qusza40orc.cloudfront.net/repdata%2Fpeer2_doc%2Fpd01016005curr.pdf
[3] State Code from Wikipedia: http://en.wikipedia.org/wiki/List_of_U.S._state_abbreviations