## Synopsis

In this document we present an analysis of the storms that occurred in the United States over the last 60 years. We highlight the storm types that have the highest human and economic impact.
library(ggplot2)
library(reshape2)
# Load the raw storm data set.
# The path is relative so the analysis can be re-run on any machine: run (or
# knit) this document from the project root, i.e. the directory that contains
# the `data/` folder. No setwd() to a user-specific absolute path is needed.
dataFile <- file.path("data", "repdata-data-StormData.csv.bz2")
if (!file.exists(dataFile)) {
  stop(
    "Storm data not found at '", dataFile,
    "' (working directory: ", getwd(), ")",
    call. = FALSE
  )
}
# read.csv() opens the file through file(), which detects bzip2 compression
# on its own, so no connection has to be opened and closed by hand (and none
# is left open if parsing fails).
csv <- read.csv(dataFile)
Cleaning data:
# Lower-case the event labels so that labels differing only by case are
# aggregated together.
csv$EVTYPE <- tolower(csv$EVTYPE)

# Total injuries and fatalities per event type (about 10 s on the full data).
typeScores <- aggregate(
  cbind(INJURIES, FATALITIES) ~ EVTYPE,
  data = csv,
  FUN = sum
)

# Drop event types that caused neither injuries nor fatalities.
fullZero <- typeScores$INJURIES == 0 & typeScores$FATALITIES == 0
cleanTypeScores <- typeScores[!fullZero, ]
names(cleanTypeScores) <- c("type", "injuries", "fatalities")

# Row orderings, from most to least harmful, for each casualty measure.
rankInjuries <- order(cleanTypeScores$injuries, decreasing = TRUE)
rankFatalities <- order(cleanTypeScores$fatalities, decreasing = TRUE)

# Keep the 20 event types with the most fatalities; head() does not pad with
# NA rows if fewer than 20 types remain.
toPlot <- head(cleanTypeScores[rankFatalities, ], 20)
# Long format: one row per (type, measure) pair, as needed for dodged bars.
toPlot <- melt(toPlot, id.vars = "type")

# The y values are pre-computed totals, so they must be drawn as-is
# (stat = "identity") instead of being binned and counted.
ggplot(toPlot, aes(x = type, y = value, fill = variable)) +
  geom_bar(stat = "identity", position = "dodge") +
  theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
  labs(title = "20 deadliest storm types in US - 1950 to 2011")
## Mapping a variable to y and also using stat="bin".
## With stat="bin", it will attempt to set the y value to the count of cases in each group.
## This can result in unexpected behavior and will not be allowed in a future version of ggplot2.
## If you want y to represent counts of cases, use stat="bin" and don't map a variable to y.
## If you want y to represent values in the data, use stat="identity".
## See ?geom_bar for examples. (Deprecated; last used in version 0.9.2)
# TODO: order the factor levels
# TODO: check whether this has changed over time?
We focus on event types that caused billions of dollars of property or crop damage.
The list of these event types is given below:
# Event types having at least one record whose damage exponent is "B"
# (billions of dollars, per the accompanying text), for property damage and
# crop damage respectively. %in% is used so that missing exponents simply
# evaluate to FALSE instead of producing NA entries.
eventsProp <- sort(unique(csv$EVTYPE[csv$PROPDMGEXP %in% "B"]))
eventsCrop <- sort(unique(csv$EVTYPE[csv$CROPDMGEXP %in% "B"]))

# union() merges the two vectors element-wise and removes duplicates; it also
# works when the two vectors have different lengths.
events <- sort(union(eventsProp, eventsCrop))
print(events)
## [1] "flash flood" "flood"
## [3] "hail" "heavy rain/severe weather"
## [5] "high wind" "hurricane"
## [7] "hurricane opal" "hurricane opal/high winds"
## [9] "hurricane/typhoon" "river flood"
## [11] "severe thunderstorm" "storm surge"
## [13] "storm surge/tide" "tornado"
## [15] "tornadoes, tstm wind, hail" "tropical storm"
## [17] "wild/forest fire" "wildfire"
## [19] "winter storm"
The analysis sources can be found here: https://github.com/jzy3d/datasciencecoursera-represearch-a2
The data was provided by Coursera here: https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2
The analysis was produced with the following computer configuration:
# Print the R version, platform, locale and loaded packages used for this run.
sessionInfo()
## R version 3.0.1 (2013-05-16)
## Platform: x86_64-apple-darwin10.8.0 (64-bit)
##
## locale:
## [1] fr_FR.UTF-8/fr_FR.UTF-8/fr_FR.UTF-8/C/fr_FR.UTF-8/fr_FR.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] reshape2_1.2.2 ggplot2_0.9.3.1 knitr_1.5
##
## loaded via a namespace (and not attached):
## [1] colorspace_1.2-4 digest_0.6.4 evaluate_0.5.5 formatR_0.10
## [5] grid_3.0.1 gtable_0.1.2 labeling_0.2 MASS_7.3-31
## [9] munsell_0.4.2 plyr_1.8.1 proto_0.3-10 Rcpp_0.11.1
## [13] scales_0.2.4 stringr_0.6.2 tools_3.0.1