Synopsis

This project involves exploring the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database. This database tracks characteristics of major storms and weather events in the United States, including when and where they occur, as well as estimates of any fatalities, injuries, and property damage. The events in the database start in the year 1950 and end in November 2011. In the earlier years of the database there are generally fewer events recorded, most likely due to a lack of good records. More recent years should be considered more complete.

A) DATA PROCESSING

Data are in a csv compress file, for 48MB total in the raw format. They are obtained from the course website and are automatically downloaded from the course website with the download.file() function while read_csv() unzips and read the data.

## Parsed with column specification:
## cols(
##   .default = col_character(),
##   STATE__ = col_double(),
##   COUNTY = col_double(),
##   BGN_RANGE = col_double(),
##   COUNTY_END = col_double(),
##   END_RANGE = col_double(),
##   LENGTH = col_double(),
##   WIDTH = col_double(),
##   F = col_integer(),
##   MAG = col_double(),
##   FATALITIES = col_double(),
##   INJURIES = col_double(),
##   PROPDMG = col_double(),
##   CROPDMG = col_double(),
##   LATITUDE = col_double(),
##   LONGITUDE = col_double(),
##   LATITUDE_E = col_double(),
##   LONGITUDE_ = col_double(),
##   REFNUM = col_double()
## )
## See spec(...) for full column specifications.
##  [1] "STATE__"    "BGN_DATE"   "BGN_TIME"   "TIME_ZONE"  "COUNTY"    
##  [6] "COUNTYNAME" "STATE"      "EVTYPE"     "BGN_RANGE"  "BGN_AZI"   
## [11] "BGN_LOCATI" "END_DATE"   "END_TIME"   "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE"  "END_AZI"    "END_LOCATI" "LENGTH"     "WIDTH"     
## [21] "F"          "MAG"        "FATALITIES" "INJURIES"   "PROPDMG"   
## [26] "PROPDMGEXP" "CROPDMG"    "CROPDMGEXP" "WFO"        "STATEOFFIC"
## [31] "ZONENAMES"  "LATITUDE"   "LONGITUDE"  "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS"    "REFNUM"
## # A tibble: 6 x 37
##   STATE__ BGN_DATE BGN_TIME TIME_ZONE COUNTY COUNTYNAME STATE EVTYPE
##     <dbl> <chr>    <chr>    <chr>      <dbl> <chr>      <chr> <chr> 
## 1       1 4/18/19~ 0130     CST           97 MOBILE     AL    TORNA~
## 2       1 4/18/19~ 0145     CST            3 BALDWIN    AL    TORNA~
## 3       1 2/20/19~ 1600     CST           57 FAYETTE    AL    TORNA~
## 4       1 6/8/195~ 0900     CST           89 MADISON    AL    TORNA~
## 5       1 11/15/1~ 1500     CST           43 CULLMAN    AL    TORNA~
## 6       1 11/15/1~ 2000     CST           77 LAUDERDALE AL    TORNA~
## # ... with 29 more variables: BGN_RANGE <dbl>, BGN_AZI <chr>,
## #   BGN_LOCATI <chr>, END_DATE <chr>, END_TIME <chr>, COUNTY_END <dbl>,
## #   COUNTYENDN <chr>, END_RANGE <dbl>, END_AZI <chr>, END_LOCATI <chr>,
## #   LENGTH <dbl>, WIDTH <dbl>, F <int>, MAG <dbl>, FATALITIES <dbl>,
## #   INJURIES <dbl>, PROPDMG <dbl>, PROPDMGEXP <chr>, CROPDMG <dbl>,
## #   CROPDMGEXP <chr>, WFO <chr>, STATEOFFIC <chr>, ZONENAMES <chr>,
## #   LATITUDE <dbl>, LONGITUDE <dbl>, LATITUDE_E <dbl>, LONGITUDE_ <dbl>,
## #   REMARKS <chr>, REFNUM <dbl>

The NOAA dataset contains 37 fields. Most relevant for our analysis are the fatalities, injuries, propdmg, propdmgexp, cropdmg, cropdmgexp, evtype fields

There is an inconsistency in lower/uppercase use that could be problematic. We select relevant fields and convert all the character variable to uppercase values (k->K,h->H etc).

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# select relevant fields and convert character variable to uppercase
NOAA <- myData %>% select(EVTYPE, FATALITIES, INJURIES, PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP) %>% mutate_if(is.character, toupper) 

Q1)Across the United States, which types of events are most harmful with respect to population health?

library(ggplot2)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
NOAA_by_EVTYPE<-NOAA %>% 
        select(EVTYPE, FATALITIES, INJURIES) %>% 
        group_by(EVTYPE) %>% 
        summarise_all(sum) %>% 
        arrange(desc(FATALITIES))
tmp<- NOAA_by_EVTYPE%>% select(EVTYPE, FATALITIES)

top_fatalities<-head(tmp, n=5)
p1<-top_fatalities %>% 
  ggplot(aes(x=EVTYPE, y=FATALITIES))+
  coord_flip() +
  geom_bar(stat = "identity") + 
  theme_bw()+
  labs(title="Fatalities by Event Type", caption = "Source: NOAA database")

tmp<- NOAA_by_EVTYPE%>% select(EVTYPE, INJURIES)
top_injuries<-head(tmp, n=5)

p2<-top_injuries %>% 
  ggplot(aes(x=EVTYPE, y=INJURIES))+ 
  geom_bar(stat = "identity")+ 
  coord_flip() +
  theme_bw()+
  labs(title="Injuries by Event Type", caption = "Source: NOAA database")

grid.arrange(p1,p2, ncol=2, top="Top 5 most harmful events")

Tornados are by far the most harmful events, followed by excessive heat.

Q2) Across the United States, which types of events have the greatest economic consequences?

To calculate the economic consequences, we’ll need to convert propdmg, cropdmg using the propdmgexp, cropdmgexcodes (h = hundred, k = thousand, m = million, b = billion),

unique(NOAA$CROPDMGEXP)
## [1] NA  "M" "K" "B" "?" "0" "2"
unique(NOAA$PROPDMGEXP)
##  [1] "K" "M" NA  "B" "+" "0" "5" "6" "?" "4" "2" "3" "H" "7" "-" "1" "8"
# (h = hundred, k = thousand, m = million, b = billion)
NOAA<-NOAA %>% mutate(PROPDMGEXP2=0,CROPDMGEXP2=0 )


H=100
NOAA$PROPDMGEXP2[which(NOAA$PROPDMGEXP =="H")]= H
NOAA$CROPDMGEXP2[which(NOAA$CROPDMGEXP =="H")]= H

K=1000
NOAA$PROPDMGEXP2[which(NOAA$PROPDMGEXP =="K")] = K
NOAA$CROPDMGEXP2[which(NOAA$CROPDMGEXP =="K")] = K

M=1000000
NOAA$PROPDMGEXP2[which(NOAA$PROPDMGEXP =="M")] = M
NOAA$CROPDMGEXP2[which(NOAA$CROPDMGEXP =="M")] = M

B=1000000000
NOAA$PROPDMGEXP2[which(NOAA$PROPDMGEXP =="B")] = B
NOAA$CROPDMGEXP2[which(NOAA$CROPDMGEXP =="B")] = B



NOAA_damages<-NOAA %>% 
        select(EVTYPE,PROPDMG,CROPDMG,PROPDMGEXP2, CROPDMGEXP2) %>% 
        filter(PROPDMGEXP2>0 | CROPDMGEXP2>0) %>% 
        mutate(PROPDMG2=PROPDMG*PROPDMGEXP2, CROPDMG2=CROPDMG*CROPDMGEXP2)


NOAA_damages_evtype<- NOAA_damages %>% 
  group_by(EVTYPE) %>% 
  summarise(PROPDMG2 = sum(PROPDMG2), CROPDMG2=sum(CROPDMG2)) %>% 
  mutate(TOTDAMAGE=PROPDMG2+CROPDMG2)
# Top n total damages
NOAA_damages_evtype_topT<-NOAA_damages_evtype %>% 
  group_by(EVTYPE) %>% 
  top_n(n=10,TOTDAMAGE) %>% 
  arrange(desc(TOTDAMAGE)) 
top_damages<-head(NOAA_damages_evtype_topT, n=5)

B) RESULTS

We can recap the Health and Economic Damages as follows:

library(DT)
library(reshape2)

datatable(top_fatalities, caption = "Top Fatalities",options = list(pageLength = 5))
datatable(top_injuries, caption = "Top Injuries",options = list(pageLength = 5))
datatable(top_damages, caption= "Top Damages")
top_damages_reshape<-melt(top_damages[,c("EVTYPE", "PROPDMG2", "CROPDMG2","TOTDAMAGE")],id.vars = 1)

Final Plot

Economic damages can be also plotted, with detail of Properties and Crop Damages. The numbers are terrifying:over 1,500 U$ Billion of damages from the floods only, plus the related fatalities and injuries.

P3<-top_damages_reshape %>% 
  ggplot(aes(x=EVTYPE, y=value))+ 
  geom_bar(aes(fill=variable), stat = "identity", position="dodge")+ 
  theme_bw()+ 
  coord_flip() +
  labs(title="Total Damages by Event Type, Top 5", 
       caption = "Source: NOAA database") +
       xlab("Event Type") +
      ylab("US$ Bilion")
P3

Marcello Del Bono, Data Analysis and Data Mining