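Synopsis: The goal of this project is to analyze the NOAA Storm Database. The data cover the period from 1950 to November 2011. Two questions are addressed in this analysis: (1) which types of events are most harmful with respect to population health, and (2) which types of events have the greatest economic consequences.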


Data Processing


Let’s download the data and take a quick look at it.

# Fetch the compressed storm archive only when no local copy is present,
# then load it into `datas` and preview the first rows.
stormArchive <- "StormData.csv.bz2"
stormUrl <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"

archiveMissing <- !file.exists(stormArchive)
if (archiveMissing) {
  download.file(url = stormUrl, destfile = stormArchive)
}

# read.csv is given the .bz2 archive itself; no separate extraction step.
datas <- read.csv(stormArchive)
head(datas)
##   STATE__           BGN_DATE BGN_TIME TIME_ZONE COUNTY COUNTYNAME STATE
## 1       1  4/18/1950 0:00:00     0130       CST     97     MOBILE    AL
## 2       1  4/18/1950 0:00:00     0145       CST      3    BALDWIN    AL
## 3       1  2/20/1951 0:00:00     1600       CST     57    FAYETTE    AL
## 4       1   6/8/1951 0:00:00     0900       CST     89    MADISON    AL
## 5       1 11/15/1951 0:00:00     1500       CST     43    CULLMAN    AL
## 6       1 11/15/1951 0:00:00     2000       CST     77 LAUDERDALE    AL
##    EVTYPE BGN_RANGE BGN_AZI BGN_LOCATI END_DATE END_TIME COUNTY_END
## 1 TORNADO         0                                               0
## 2 TORNADO         0                                               0
## 3 TORNADO         0                                               0
## 4 TORNADO         0                                               0
## 5 TORNADO         0                                               0
## 6 TORNADO         0                                               0
##   COUNTYENDN END_RANGE END_AZI END_LOCATI LENGTH WIDTH F MAG FATALITIES
## 1         NA         0                      14.0   100 3   0          0
## 2         NA         0                       2.0   150 2   0          0
## 3         NA         0                       0.1   123 2   0          0
## 4         NA         0                       0.0   100 2   0          0
## 5         NA         0                       0.0   150 2   0          0
## 6         NA         0                       1.5   177 2   0          0
##   INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP WFO STATEOFFIC ZONENAMES
## 1       15    25.0          K       0                                    
## 2        0     2.5          K       0                                    
## 3        2    25.0          K       0                                    
## 4        2     2.5          K       0                                    
## 5        2     2.5          K       0                                    
## 6        6     2.5          K       0                                    
##   LATITUDE LONGITUDE LATITUDE_E LONGITUDE_ REMARKS REFNUM
## 1     3040      8812       3051       8806              1
## 2     3042      8755          0          0              2
## 3     3340      8742          0          0              3
## 4     3458      8626          0          0              4
## 5     3412      8642          0          0              5
## 6     3450      8748          0          0              6
# Report the size of the loaded data set as (rows, columns).
c(nrow(datas), ncol(datas))
## [1] 902297     37

Let’s process and prepare the data for the first question, about the impact on population health.

# Population health impact per event type.
# POPHEALTHIMPACT holds, for each record, the sum of fatalities and injuries.
# The per-record totals are summed by EVTYPE into a dedicated data frame,
# sorted from the most to the least harmful event type, and limited to the
# ten most harmful event types.

datas$POPHEALTHIMPACT <- datas$FATALITIES + datas$INJURIES

populationHealthImpactDatas <- aggregate(
  datas$POPHEALTHIMPACT,
  by = list(datas$EVTYPE),
  FUN = sum
)
colnames(populationHealthImpactDatas) <- c("EVTYPE", "SUMFATINJ")

# Negating the totals makes order() rank the largest total first.
healthRanking <- order(-populationHealthImpactDatas$SUMFATINJ)
populationHealthImpactDatas <- populationHealthImpactDatas[healthRanking, ]

# head() returns at most ten rows; indexing with 1:10 would pad the result
# with all-NA rows if there were fewer than ten event types.
populationHealthImpactDatas <- head(populationHealthImpactDatas, 10)

Let’s process and prepare the data for the second question, about economic consequences.

# Economic consequences per event type.
# PROPDMG and CROPDMG only hold the leading figures of each damage estimate;
# PROPDMGEXP and CROPDMGEXP hold the magnitude code that scales them. The
# Storm Data documentation defines "K" (thousands), "M" (millions) and
# "B" (billions). Adding the raw figures without scaling would mix
# thousands, millions and billions of dollars in one sum.
# TOTALDMG holds, for each record, property plus crop damage in dollars.
# The per-record totals are summed by EVTYPE into a dedicated data frame,
# sorted from the most to the least costly event type, and limited to the
# ten most costly event types.

# Translate a vector of magnitude codes into numeric multipliers.
# Codes are matched case-insensitively after trimming whitespace.
# NOTE(review): codes other than K/M/B (blank, digits, "H", "+", "-", "?")
# are not defined in the documentation; they are given a multiplier of 1,
# i.e. the figure is taken as plain dollars. Confirm this is the intended
# treatment.
damageMultiplier <- function(magnitudeCode) {
  knownMultipliers <- c(K = 1e3, M = 1e6, B = 1e9)
  normalizedCode <- toupper(trimws(as.character(magnitudeCode)))
  multiplier <- unname(knownMultipliers[normalizedCode])
  # Unknown, blank and missing codes do not match any name and come back NA.
  multiplier[is.na(multiplier)] <- 1
  multiplier
}

datas$TOTALDMG <- datas$PROPDMG * damageMultiplier(datas$PROPDMGEXP) +
  datas$CROPDMG * damageMultiplier(datas$CROPDMGEXP)

economicConsequencesImpactDatas <- aggregate(
  datas$TOTALDMG,
  by = list(datas$EVTYPE),
  FUN = sum
)
colnames(economicConsequencesImpactDatas) <- c("EVTYPE", "SUMECODMG")

# Negating the totals makes order() rank the largest total first.
damageRanking <- order(-economicConsequencesImpactDatas$SUMECODMG)
economicConsequencesImpactDatas <- economicConsequencesImpactDatas[damageRanking, ]

# head() returns at most ten rows; indexing with 1:10 would pad the result
# with all-NA rows if there were fewer than ten event types.
economicConsequencesImpactDatas <- head(economicConsequencesImpactDatas, 10)

Results

We create the bar plot for fatalities and injuries

# Bar chart of the ten event types with the highest combined count of
# fatalities and injuries. The enlarged margins and the shifted axis-title
# line (mgp) make room for the rotated labels requested with las = 2.
par(mgp = c(5, 1, 0), mar = c(12, 10, 3, 7))
barplot(
  height = populationHealthImpactDatas$SUMFATINJ,
  names.arg = populationHealthImpactDatas$EVTYPE,
  main = "10 types of events most harmful with respect to population health",
  ylab = "Sum of injuries and fatalities",
  col = "red",
  las = 2
)

We create the bar plot for the economic consequences (property and crop damage)

# Bar chart of the ten event types with the highest summed property and
# crop damage. The enlarged margins and the shifted axis-title line (mgp)
# make room for the rotated labels requested with las = 2.
par(mgp = c(5, 1, 0), mar = c(12, 8, 3, 7))
barplot(
  height = economicConsequencesImpactDatas$SUMECODMG,
  names.arg = economicConsequencesImpactDatas$EVTYPE,
  main = "10 types of events with the greatest economic consequences",
  ylab = "Sum of properties and crops consequences",
  col = "blue",
  las = 2
)