This project involves exploring the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database. This database tracks characteristics of major storms and weather events in the United States, including when and where they occur, as well as estimates of any fatalities, injuries, and property damage. The events in the database start in the year 1950 and end in November 2011. In the earlier years of the database there are generally fewer events recorded, most likely due to a lack of good records. More recent years should be considered more complete.
Data are in a csv compress file, for 48MB total in the raw format. They are obtained from the course website and are automatically downloaded from the course website with the download.file() function while read_csv() unzips and read the data.
## Parsed with column specification:
## cols(
## .default = col_character(),
## STATE__ = col_double(),
## COUNTY = col_double(),
## BGN_RANGE = col_double(),
## COUNTY_END = col_double(),
## END_RANGE = col_double(),
## LENGTH = col_double(),
## WIDTH = col_double(),
## F = col_integer(),
## MAG = col_double(),
## FATALITIES = col_double(),
## INJURIES = col_double(),
## PROPDMG = col_double(),
## CROPDMG = col_double(),
## LATITUDE = col_double(),
## LONGITUDE = col_double(),
## LATITUDE_E = col_double(),
## LONGITUDE_ = col_double(),
## REFNUM = col_double()
## )
## See spec(...) for full column specifications.
## [1] "STATE__" "BGN_DATE" "BGN_TIME" "TIME_ZONE" "COUNTY"
## [6] "COUNTYNAME" "STATE" "EVTYPE" "BGN_RANGE" "BGN_AZI"
## [11] "BGN_LOCATI" "END_DATE" "END_TIME" "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE" "END_AZI" "END_LOCATI" "LENGTH" "WIDTH"
## [21] "F" "MAG" "FATALITIES" "INJURIES" "PROPDMG"
## [26] "PROPDMGEXP" "CROPDMG" "CROPDMGEXP" "WFO" "STATEOFFIC"
## [31] "ZONENAMES" "LATITUDE" "LONGITUDE" "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS" "REFNUM"
## # A tibble: 6 x 37
## STATE__ BGN_DATE BGN_TIME TIME_ZONE COUNTY COUNTYNAME STATE EVTYPE
## <dbl> <chr> <chr> <chr> <dbl> <chr> <chr> <chr>
## 1 1 4/18/19~ 0130 CST 97 MOBILE AL TORNA~
## 2 1 4/18/19~ 0145 CST 3 BALDWIN AL TORNA~
## 3 1 2/20/19~ 1600 CST 57 FAYETTE AL TORNA~
## 4 1 6/8/195~ 0900 CST 89 MADISON AL TORNA~
## 5 1 11/15/1~ 1500 CST 43 CULLMAN AL TORNA~
## 6 1 11/15/1~ 2000 CST 77 LAUDERDALE AL TORNA~
## # ... with 29 more variables: BGN_RANGE <dbl>, BGN_AZI <chr>,
## # BGN_LOCATI <chr>, END_DATE <chr>, END_TIME <chr>, COUNTY_END <dbl>,
## # COUNTYENDN <chr>, END_RANGE <dbl>, END_AZI <chr>, END_LOCATI <chr>,
## # LENGTH <dbl>, WIDTH <dbl>, F <int>, MAG <dbl>, FATALITIES <dbl>,
## # INJURIES <dbl>, PROPDMG <dbl>, PROPDMGEXP <chr>, CROPDMG <dbl>,
## # CROPDMGEXP <chr>, WFO <chr>, STATEOFFIC <chr>, ZONENAMES <chr>,
## # LATITUDE <dbl>, LONGITUDE <dbl>, LATITUDE_E <dbl>, LONGITUDE_ <dbl>,
## # REMARKS <chr>, REFNUM <dbl>
The NOAA dataset contains 37 fields. Most relevant for our analysis are the fatalities, injuries, propdmg, propdmgexp, cropdmg, cropdmgexp, evtype fields
There is an inconsistency in lower/uppercase use that could be problematic. We select relevant fields and convert all the character variable to uppercase values (k->K,h->H etc).
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# select relevant fields and convert character variable to uppercase
NOAA <- myData %>% select(EVTYPE, FATALITIES, INJURIES, PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP) %>% mutate_if(is.character, toupper)
library(ggplot2)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
NOAA_by_EVTYPE<-NOAA %>%
select(EVTYPE, FATALITIES, INJURIES) %>%
group_by(EVTYPE) %>%
summarise_all(sum) %>%
arrange(desc(FATALITIES))
tmp<- NOAA_by_EVTYPE%>% select(EVTYPE, FATALITIES)
top_fatalities<-head(tmp, n=5)
p1<-top_fatalities %>%
ggplot(aes(x=EVTYPE, y=FATALITIES))+
coord_flip() +
geom_bar(stat = "identity") +
theme_bw()+
labs(title="Fatalities by Event Type", caption = "Source: NOAA database")
tmp<- NOAA_by_EVTYPE%>% select(EVTYPE, INJURIES)
top_injuries<-head(tmp, n=5)
p2<-top_injuries %>%
ggplot(aes(x=EVTYPE, y=INJURIES))+
geom_bar(stat = "identity")+
coord_flip() +
theme_bw()+
labs(title="Injuries by Event Type", caption = "Source: NOAA database")
grid.arrange(p1,p2, ncol=2, top="Top 5 most harmful events")
Tornados are by far the most harmful events, followed by excessive heat.
To calculate the economic consequences, we’ll need to convert propdmg, cropdmg using the propdmgexp, cropdmgexcodes (h = hundred, k = thousand, m = million, b = billion),
unique(NOAA$CROPDMGEXP)
## [1] NA "M" "K" "B" "?" "0" "2"
unique(NOAA$PROPDMGEXP)
## [1] "K" "M" NA "B" "+" "0" "5" "6" "?" "4" "2" "3" "H" "7" "-" "1" "8"
# (h = hundred, k = thousand, m = million, b = billion)
NOAA<-NOAA %>% mutate(PROPDMGEXP2=0,CROPDMGEXP2=0 )
H=100
NOAA$PROPDMGEXP2[which(NOAA$PROPDMGEXP =="H")]= H
NOAA$CROPDMGEXP2[which(NOAA$CROPDMGEXP =="H")]= H
K=1000
NOAA$PROPDMGEXP2[which(NOAA$PROPDMGEXP =="K")] = K
NOAA$CROPDMGEXP2[which(NOAA$CROPDMGEXP =="K")] = K
M=1000000
NOAA$PROPDMGEXP2[which(NOAA$PROPDMGEXP =="M")] = M
NOAA$CROPDMGEXP2[which(NOAA$CROPDMGEXP =="M")] = M
B=1000000000
NOAA$PROPDMGEXP2[which(NOAA$PROPDMGEXP =="B")] = B
NOAA$CROPDMGEXP2[which(NOAA$CROPDMGEXP =="B")] = B
NOAA_damages<-NOAA %>%
select(EVTYPE,PROPDMG,CROPDMG,PROPDMGEXP2, CROPDMGEXP2) %>%
filter(PROPDMGEXP2>0 | CROPDMGEXP2>0) %>%
mutate(PROPDMG2=PROPDMG*PROPDMGEXP2, CROPDMG2=CROPDMG*CROPDMGEXP2)
NOAA_damages_evtype<- NOAA_damages %>%
group_by(EVTYPE) %>%
summarise(PROPDMG2 = sum(PROPDMG2), CROPDMG2=sum(CROPDMG2)) %>%
mutate(TOTDAMAGE=PROPDMG2+CROPDMG2)
# Top n total damages
NOAA_damages_evtype_topT<-NOAA_damages_evtype %>%
group_by(EVTYPE) %>%
top_n(n=10,TOTDAMAGE) %>%
arrange(desc(TOTDAMAGE))
top_damages<-head(NOAA_damages_evtype_topT, n=5)
We can recap the Health and Economic Damages as follows:
library(DT)
library(reshape2)
datatable(top_fatalities, caption = "Top Fatalities",options = list(pageLength = 5))
datatable(top_injuries, caption = "Top Injuries",options = list(pageLength = 5))
datatable(top_damages, caption= "Top Damages")
top_damages_reshape<-melt(top_damages[,c("EVTYPE", "PROPDMG2", "CROPDMG2","TOTDAMAGE")],id.vars = 1)
Economic damages can be also plotted, with detail of Properties and Crop Damages. The numbers are terrifying:over 1,500 U$ Billion of damages from the floods only, plus the related fatalities and injuries.
P3<-top_damages_reshape %>%
ggplot(aes(x=EVTYPE, y=value))+
geom_bar(aes(fill=variable), stat = "identity", position="dodge")+
theme_bw()+
coord_flip() +
labs(title="Total Damages by Event Type, Top 5",
caption = "Source: NOAA database") +
xlab("Event Type") +
ylab("US$ Bilion")
P3