#Reproducible Research - Course Project 2 : NOAA Storm Database analysis / Population health and economic impact of weather events across the United States.

##Synopsis: The aim of this project is to analyze the NOAA Storm Database. The data cover the period from 1950 to November 2011. Two questions are addressed in this analysis.

Across the United States, which types of events (as indicated in the EVTYPE variable) are most harmful with respect to population health?

Across the United States, which types of events have the greatest economic consequences? In the first part, Data Processing, we go through the steps used to load and transform the data. In the second part, Results, we answer the two questions above.

Data Processing

# Read the raw NOAA storm data CSV into a data frame.
dataset <- read.csv(file = "c:/datascience/05/project2/repdata_data_StormData.csv")

# Show the column names of the loaded data.
colnames(dataset)
##  [1] "STATE__"    "BGN_DATE"   "BGN_TIME"   "TIME_ZONE"  "COUNTY"    
##  [6] "COUNTYNAME" "STATE"      "EVTYPE"     "BGN_RANGE"  "BGN_AZI"   
## [11] "BGN_LOCATI" "END_DATE"   "END_TIME"   "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE"  "END_AZI"    "END_LOCATI" "LENGTH"     "WIDTH"     
## [21] "F"          "MAG"        "FATALITIES" "INJURIES"   "PROPDMG"   
## [26] "PROPDMGEXP" "CROPDMG"    "CROPDMGEXP" "WFO"        "STATEOFFIC"
## [31] "ZONENAMES"  "LATITUDE"   "LONGITUDE"  "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS"    "REFNUM"

Using dplyr to aggregate the data and ggplot2 to plot the results

library("dplyr")
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library("ggplot2")

# Treat the event type as a categorical variable before grouping on it.
dataset[["EVTYPE"]] <- as.factor(dataset[["EVTYPE"]])

# Per event type, add up fatalities and injuries, sort the totals from
# largest to smallest and keep the first ten rows.
df_top10_fatalities <- dataset %>%
  group_by(EVTYPE) %>%
  summarise(PeopleAffected = sum(FATALITIES) + sum(INJURIES)) %>%
  arrange(desc(PeopleAffected)) %>%
  head(n = 10)

# Display the resulting table.
df_top10_fatalities
## # A tibble: 10 x 2
##    EVTYPE            PeopleAffected
##    <fct>                      <dbl>
##  1 TORNADO                    96979
##  2 EXCESSIVE HEAT              8428
##  3 TSTM WIND                   7461
##  4 FLOOD                       7259
##  5 LIGHTNING                   6046
##  6 HEAT                        3037
##  7 FLASH FLOOD                 2755
##  8 ICE STORM                   2064
##  9 THUNDERSTORM WIND           1621
## 10 WINTER STORM                1527

Results

# Bar chart of the ten event types in df_top10_fatalities, ordered from the
# highest to the lowest PeopleAffected total.
# The fill colour is a constant, so it is set on the layer instead of being
# mapped inside aes(): mapping the string "red" makes ggplot2 treat it as data,
# which draws the bars in the default palette colour and adds a meaningless
# legend.
ggplot(
  data = df_top10_fatalities,
  aes(x = reorder(EVTYPE, -PeopleAffected), y = PeopleAffected)
) +
  geom_bar(stat = "identity", fill = "red") +
  # Readable axis titles instead of the raw reorder(...) expression.
  labs(x = "Event type", y = "Fatalities + injuries") +
  ggtitle("Event type most harmful w.r.t population health") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

# Convert a damage magnitude code (PROPDMGEXP / CROPDMGEXP) into a numeric
# multiplier.
#
# exp_code: character or factor vector of magnitude codes.
# Returns:  numeric vector of the same length as exp_code.
#
# K, M and B stand for thousands, millions and billions; matching is
# case-insensitive. H is treated as hundreds (common convention for this data
# set -- TODO confirm). Any other code (empty, digits, "+", "-", "?") is not
# interpreted and leaves the amount unscaled (multiplier 1).
damage_multiplier <- function(exp_code) {
  known <- c(H = 1e2, K = 1e3, M = 1e6, B = 1e9)
  multiplier <- unname(known[toupper(as.character(exp_code))])
  # Codes that are missing or not in `known` index to NA; fall back to 1.
  multiplier[is.na(multiplier)] <- 1
  multiplier
}

# Per event type, total economic damage in US dollars (property + crop),
# sorted from largest to smallest, keeping the ten largest.
# PROPDMG / CROPDMG only hold the mantissa of each amount; they must be scaled
# by their magnitude code before summing, otherwise thousands and billions are
# added together as if they were the same unit.
# NOTE: any table printed after this chunk in an already-rendered report was
# produced by the unscaled computation and must be regenerated.
df_top10_damage <- dataset %>%
  group_by(EVTYPE) %>%
  summarize(
    EconomicDamage =
      sum(PROPDMG * damage_multiplier(PROPDMGEXP)) +
      sum(CROPDMG * damage_multiplier(CROPDMGEXP))
  ) %>%
  arrange(desc(EconomicDamage)) %>%
  head(10)

df_top10_damage
## # A tibble: 10 x 2
##    EVTYPE             EconomicDamage
##    <fct>                       <dbl>
##  1 TORNADO                  3312277.
##  2 FLASH FLOOD              1599325.
##  3 TSTM WIND                1445168.
##  4 HAIL                     1268290.
##  5 FLOOD                    1067976.
##  6 THUNDERSTORM WIND         943636.
##  7 LIGHTNING                 606932.
##  8 THUNDERSTORM WINDS        464978.
##  9 HIGH WIND                 342015.
## 10 WINTER STORM              134700.
# Bar chart of the ten event types in df_top10_damage, ordered from the
# highest to the lowest EconomicDamage total.
# The fill colour is a constant, so it is set on the layer instead of being
# mapped inside aes(): mapping the number 3 makes ggplot2 treat it as a
# continuous variable, which adds a colour-bar legend and ignores the intended
# colour. "green3" is used on the assumption that the original `3` meant the
# third colour of R's classic palette -- TODO confirm the intended colour.
ggplot(
  data = df_top10_damage,
  aes(x = reorder(EVTYPE, -EconomicDamage), y = EconomicDamage)
) +
  geom_bar(stat = "identity", fill = "green3") +
  # Readable axis titles instead of the raw reorder(...) expression.
  labs(x = "Event type", y = "Property + crop damage") +
  ggtitle("Event type most harmful w.r.t economic damage") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))