library(knitr)
library(plyr)
library(ggplot2)
Sys.setlocale(category="LC_ALL", locale="English") #this can avoid a strange problem
## [1] "LC_COLLATE=English_United States.1252;LC_CTYPE=English_United States.1252;LC_MONETARY=English_United States.1252;LC_NUMERIC=C;LC_TIME=English_United States.1252"
sessionInfo()
## R version 3.1.1 (2014-07-10)
## Platform: i386-w64-mingw32/i386 (32-bit)
##
## locale:
## [1] LC_COLLATE=English_United States.1252
## [2] LC_CTYPE=English_United States.1252
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.1252
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] ggplot2_1.0.0 plyr_1.8.1 knitr_1.6
##
## loaded via a namespace (and not attached):
## [1] colorspace_1.2-4 digest_0.6.4 evaluate_0.5.5 formatR_1.0
## [5] grid_3.1.1 gtable_0.1.2 htmltools_0.2.6 MASS_7.3-33
## [9] munsell_0.4.2 proto_0.3-10 Rcpp_0.11.2 reshape2_1.4
## [13] rmarkdown_0.2.68 scales_0.2.4 stringr_0.6.2 tools_3.1.1
## [17] yaml_2.1.13
#download("https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2","StormData.csv.bz2")
theData <- read.csv('StormData.csv.bz2', stringsAsFactors=F)
str(theData)
## 'data.frame': 902297 obs. of 37 variables:
## $ STATE__ : num 1 1 1 1 1 1 1 1 1 1 ...
## $ BGN_DATE : chr "4/18/1950 0:00:00" "4/18/1950 0:00:00" "2/20/1951 0:00:00" "6/8/1951 0:00:00" ...
## $ BGN_TIME : chr "0130" "0145" "1600" "0900" ...
## $ TIME_ZONE : chr "CST" "CST" "CST" "CST" ...
## $ COUNTY : num 97 3 57 89 43 77 9 123 125 57 ...
## $ COUNTYNAME: chr "MOBILE" "BALDWIN" "FAYETTE" "MADISON" ...
## $ STATE : chr "AL" "AL" "AL" "AL" ...
## $ EVTYPE : chr "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
## $ BGN_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ BGN_AZI : chr "" "" "" "" ...
## $ BGN_LOCATI: chr "" "" "" "" ...
## $ END_DATE : chr "" "" "" "" ...
## $ END_TIME : chr "" "" "" "" ...
## $ COUNTY_END: num 0 0 0 0 0 0 0 0 0 0 ...
## $ COUNTYENDN: logi NA NA NA NA NA NA ...
## $ END_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ END_AZI : chr "" "" "" "" ...
## $ END_LOCATI: chr "" "" "" "" ...
## $ LENGTH : num 14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
## $ WIDTH : num 100 150 123 100 150 177 33 33 100 100 ...
## $ F : int 3 2 2 2 2 2 2 1 3 3 ...
## $ MAG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ FATALITIES: num 0 0 0 0 0 0 0 0 1 0 ...
## $ INJURIES : num 15 0 2 2 2 6 1 0 14 0 ...
## $ PROPDMG : num 25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
## $ PROPDMGEXP: chr "K" "K" "K" "K" ...
## $ CROPDMG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CROPDMGEXP: chr "" "" "" "" ...
## $ WFO : chr "" "" "" "" ...
## $ STATEOFFIC: chr "" "" "" "" ...
## $ ZONENAMES : chr "" "" "" "" ...
## $ LATITUDE : num 3040 3042 3340 3458 3412 ...
## $ LONGITUDE : num 8812 8755 8742 8626 8642 ...
## $ LATITUDE_E: num 3051 0 0 0 0 ...
## $ LONGITUDE_: num 8806 0 0 0 0 ...
## $ REMARKS : chr "" "" "" "" ...
## $ REFNUM : num 1 2 3 4 5 6 7 8 9 10 ...
I substract some fields that I think useful and droped others.
theData <- theData[, c("EVTYPE", "FATALITIES", "INJURIES", "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP")]
Fields “xxEXP” means the exponents, so I use a number to replace them.
theData$PROPDMGEXP <- tolower(theData$PROPDMGEXP)
theData$CROPDMGEXP <- tolower(theData$CROPDMGEXP)
replaceEXP <- function(x){
if(x %in% c("","-","?","+","0")){
0
}else if(x == "h"){
100
}else if(x == "k"){
10^3
}else if(x == "m"){
10^6
}else if(x == "b"){
10^9
}else{
10^as.numeric(x)
}
}
theData$PROPDMG <- theData$PROPDMG*sapply(theData$PROPDMGEXP,replaceEXP)
theData$CROPDMG <- theData$CROPDMG*sapply(theData$CROPDMGEXP,replaceEXP)
theData <- theData[c("EVTYPE", "FATALITIES", "INJURIES", "PROPDMG", "CROPDMG")]
Figure out influence of each evtype.
fatalities <- aggregate(FATALITIES ~ EVTYPE,theData,sum)
injuries <- aggregate(INJURIES ~ EVTYPE,theData,sum)
propdamage <- aggregate(PROPDMG ~ EVTYPE,theData,sum)
cropdamage <- aggregate(CROPDMG ~ EVTYPE,theData,sum)
Then sort them.
fatalities <- fatalities[order(fatalities$FATALITIES,decreasing=T),]
injuries <- injuries[order(injuries$INJURIES,decreasing=T),]
propdamage <- propdamage[order(propdamage$PROPDMG,decreasing=T),]
cropdamage <- cropdamage[order(cropdamage$CROPDMG,decreasing=T),]
OK, now we can plot them and see which influence most.
par(mfrow=c(1,2),mar=c(8,4,2,2))
barplot(injuries$INJURIES[1:10],names.arg=injuries$EVTYPE[1:10],las=2,main="injuries")
barplot(fatalities$FATALITIES[1:10],names.arg=fatalities$EVTYPE[1:10],las=2,main="fatalities")
### Top 10 events that are harmful for economy
par(mfrow=c(1,2),mar=c(8,4,2,2))
barplot(cropdamage$CROPDMG[1:10],names.arg=cropdamage$EVTYPE[1:10],las=2,main="crop damage")
barplot(propdamage$PROPDMG[1:10],names.arg=cropdamage$EVTYPE[1:10],las=2,main="prop damage")