R Environment Setting

library(knitr)
library(plyr)
library(ggplot2)
Sys.setlocale(category="LC_ALL", locale="English") #this can avoid a strange problem
## [1] "LC_COLLATE=English_United States.1252;LC_CTYPE=English_United States.1252;LC_MONETARY=English_United States.1252;LC_NUMERIC=C;LC_TIME=English_United States.1252"
sessionInfo()
## R version 3.1.1 (2014-07-10)
## Platform: i386-w64-mingw32/i386 (32-bit)
## 
## locale:
## [1] LC_COLLATE=English_United States.1252 
## [2] LC_CTYPE=English_United States.1252   
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.1252    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] ggplot2_1.0.0 plyr_1.8.1    knitr_1.6    
## 
## loaded via a namespace (and not attached):
##  [1] colorspace_1.2-4 digest_0.6.4     evaluate_0.5.5   formatR_1.0     
##  [5] grid_3.1.1       gtable_0.1.2     htmltools_0.2.6  MASS_7.3-33     
##  [9] munsell_0.4.2    proto_0.3-10     Rcpp_0.11.2      reshape2_1.4    
## [13] rmarkdown_0.2.68 scales_0.2.4     stringr_0.6.2    tools_3.1.1     
## [17] yaml_2.1.13

Reading and Clearing Data

#download("https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2","StormData.csv.bz2")
theData <- read.csv('StormData.csv.bz2', stringsAsFactors=F)
str(theData)
## 'data.frame':    902297 obs. of  37 variables:
##  $ STATE__   : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ BGN_DATE  : chr  "4/18/1950 0:00:00" "4/18/1950 0:00:00" "2/20/1951 0:00:00" "6/8/1951 0:00:00" ...
##  $ BGN_TIME  : chr  "0130" "0145" "1600" "0900" ...
##  $ TIME_ZONE : chr  "CST" "CST" "CST" "CST" ...
##  $ COUNTY    : num  97 3 57 89 43 77 9 123 125 57 ...
##  $ COUNTYNAME: chr  "MOBILE" "BALDWIN" "FAYETTE" "MADISON" ...
##  $ STATE     : chr  "AL" "AL" "AL" "AL" ...
##  $ EVTYPE    : chr  "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
##  $ BGN_RANGE : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ BGN_AZI   : chr  "" "" "" "" ...
##  $ BGN_LOCATI: chr  "" "" "" "" ...
##  $ END_DATE  : chr  "" "" "" "" ...
##  $ END_TIME  : chr  "" "" "" "" ...
##  $ COUNTY_END: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ COUNTYENDN: logi  NA NA NA NA NA NA ...
##  $ END_RANGE : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ END_AZI   : chr  "" "" "" "" ...
##  $ END_LOCATI: chr  "" "" "" "" ...
##  $ LENGTH    : num  14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
##  $ WIDTH     : num  100 150 123 100 150 177 33 33 100 100 ...
##  $ F         : int  3 2 2 2 2 2 2 1 3 3 ...
##  $ MAG       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ FATALITIES: num  0 0 0 0 0 0 0 0 1 0 ...
##  $ INJURIES  : num  15 0 2 2 2 6 1 0 14 0 ...
##  $ PROPDMG   : num  25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
##  $ PROPDMGEXP: chr  "K" "K" "K" "K" ...
##  $ CROPDMG   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CROPDMGEXP: chr  "" "" "" "" ...
##  $ WFO       : chr  "" "" "" "" ...
##  $ STATEOFFIC: chr  "" "" "" "" ...
##  $ ZONENAMES : chr  "" "" "" "" ...
##  $ LATITUDE  : num  3040 3042 3340 3458 3412 ...
##  $ LONGITUDE : num  8812 8755 8742 8626 8642 ...
##  $ LATITUDE_E: num  3051 0 0 0 0 ...
##  $ LONGITUDE_: num  8806 0 0 0 0 ...
##  $ REMARKS   : chr  "" "" "" "" ...
##  $ REFNUM    : num  1 2 3 4 5 6 7 8 9 10 ...

I substract some fields that I think useful and droped others.

theData <- theData[, c("EVTYPE", "FATALITIES", "INJURIES", "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP")]

Fields “xxEXP” means the exponents, so I use a number to replace them.

theData$PROPDMGEXP <- tolower(theData$PROPDMGEXP)
theData$CROPDMGEXP <- tolower(theData$CROPDMGEXP)
replaceEXP <- function(x){
  if(x %in% c("","-","?","+","0")){
    0
  }else if(x == "h"){
    100
  }else if(x == "k"){
    10^3
  }else if(x == "m"){
    10^6
  }else if(x == "b"){
    10^9
  }else{
    10^as.numeric(x)
  }
}
theData$PROPDMG <- theData$PROPDMG*sapply(theData$PROPDMGEXP,replaceEXP)
theData$CROPDMG <- theData$CROPDMG*sapply(theData$CROPDMGEXP,replaceEXP)
theData <- theData[c("EVTYPE", "FATALITIES", "INJURIES", "PROPDMG", "CROPDMG")]

Data Analyzing

Figure out influence of each evtype.

fatalities <- aggregate(FATALITIES ~ EVTYPE,theData,sum)
injuries <- aggregate(INJURIES ~ EVTYPE,theData,sum)
propdamage <- aggregate(PROPDMG ~ EVTYPE,theData,sum)
cropdamage <- aggregate(CROPDMG ~ EVTYPE,theData,sum)

Then sort them.

fatalities <- fatalities[order(fatalities$FATALITIES,decreasing=T),]
injuries <- injuries[order(injuries$INJURIES,decreasing=T),]
propdamage <- propdamage[order(propdamage$PROPDMG,decreasing=T),]
cropdamage <- cropdamage[order(cropdamage$CROPDMG,decreasing=T),]

OK, now we can plot them and see which influence most.

Result

Top 10 events that are harmful for health

par(mfrow=c(1,2),mar=c(8,4,2,2))
barplot(injuries$INJURIES[1:10],names.arg=injuries$EVTYPE[1:10],las=2,main="injuries")
barplot(fatalities$FATALITIES[1:10],names.arg=fatalities$EVTYPE[1:10],las=2,main="fatalities")

plot of chunk unnamed-chunk-7 ### Top 10 events that are harmful for economy

par(mfrow=c(1,2),mar=c(8,4,2,2))
barplot(cropdamage$CROPDMG[1:10],names.arg=cropdamage$EVTYPE[1:10],las=2,main="crop damage")
barplot(propdamage$PROPDMG[1:10],names.arg=cropdamage$EVTYPE[1:10],las=2,main="prop damage")

plot of chunk unnamed-chunk-8