This project explores the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database. The data was downloaded directly from the source link using R, analyzed with the packages “dplyr”, “tidyr”, “data.table”, and “mgsub”, and plotted with “ggplot2”. The plots show that tornadoes caused the most harm to people’s health, and floods caused the most economic loss in the US from 1950 to November 2011.
# Set the default chunk option so each chunk's source code is echoed in the
# knitted report.
knitr::opts_chunk$set(echo = TRUE)
library(data.table)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
# Fetch the compressed NOAA storm data once and cache it next to the report,
# then read it into a data.table.
destfile <- "storm.csv.bz2"
storm_url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
if (!file.exists(destfile)) {
  # mode = "wb" keeps the bzip2 archive byte-exact on Windows. The default
  # download method is used so no external curl binary is required.
  # The archive is a compressed CSV, not a saved R workspace, so it must not
  # be passed to load(); fread() below does the actual reading.
  download.file(storm_url, destfile, mode = "wb")
}
# fread() reads the .bz2 file directly (this presumably needs the R.utils
# package installed -- confirm on a fresh machine).
storm <- fread(destfile)
dim(storm)
## [1] 902297 37
head(storm, 2)
## STATE__ BGN_DATE BGN_TIME TIME_ZONE COUNTY COUNTYNAME STATE EVTYPE
## 1: 1 4/18/1950 0:00:00 0130 CST 97 MOBILE AL TORNADO
## 2: 1 4/18/1950 0:00:00 0145 CST 3 BALDWIN AL TORNADO
## BGN_RANGE BGN_AZI BGN_LOCATI END_DATE END_TIME COUNTY_END COUNTYENDN
## 1: 0 0 NA
## 2: 0 0 NA
## END_RANGE END_AZI END_LOCATI LENGTH WIDTH F MAG FATALITIES INJURIES PROPDMG
## 1: 0 14 100 3 0 0 15 25.0
## 2: 0 2 150 2 0 0 0 2.5
## PROPDMGEXP CROPDMG CROPDMGEXP WFO STATEOFFIC ZONENAMES LATITUDE LONGITUDE
## 1: K 0 3040 8812
## 2: K 0 3042 8755
## LATITUDE_E LONGITUDE_ REMARKS REFNUM
## 1: 3051 8806 1
## 2: 0 0 2
# Keep only the columns needed to measure harm to population health.
health_damage <- select(storm, EVTYPE, FATALITIES, INJURIES)
dim(health_damage)
## [1] 902297 3
# Count missing values in each casualty column before summing them.
sum(is.na(health_damage[["FATALITIES"]]))
## [1] 0
sum(is.na(health_damage[["INJURIES"]]))
## [1] 0
# Per event type: total fatalities, total injuries and their combined count,
# ordered from the most harmful event type downwards.
health_damage_sum <- arrange(
  summarise(
    group_by(health_damage, EVTYPE),
    totalF = sum(FATALITIES),
    totalI = sum(INJURIES),
    total = sum(FATALITIES + INJURIES)
  ),
  desc(total)
)
## `summarise()` ungrouping output (override with `.groups` argument)
dim(health_damage_sum)
## [1] 985 4
# The five event types with the highest combined casualty count.
health_damage_sum_top5 <- head(health_damage_sum, n = 5)
# Bar chart of the five event types with the most combined fatalities and
# injuries, tallest bar first. Each bar gets its own fill colour; the legend
# is hidden because the x axis already names the events.
ggplot(
  data = health_damage_sum_top5,
  aes(x = reorder(EVTYPE, -total), y = total, fill = EVTYPE)
) +
  geom_col() +
  labs(
    title = "Top 5 Severe Weather Events Harming Population Health in US",
    x = "Events",
    y = "No. of death and injuries"
  ) +
  theme(
    legend.position = "none",
    plot.title = element_text(hjust = 0.5) # centre the title
  )
# Keep the event type plus the property/crop damage amounts and the
# magnitude codes that accompany them.
economic_damage <- select(
  storm,
  EVTYPE, PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP
)
# Count missing values per column. Blank magnitude codes appear to be read as
# empty strings rather than NA, so a zero here does not mean every code is
# filled in.
sum(is.na(economic_damage[["PROPDMG"]]))
## [1] 0
sum(is.na(economic_damage[["PROPDMGEXP"]]))
## [1] 0
sum(is.na(economic_damage[["CROPDMG"]]))
## [1] 0
sum(is.na(economic_damage[["CROPDMGEXP"]]))
## [1] 0
dim(economic_damage)
## [1] 902297 5
head(economic_damage)
## EVTYPE PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP
## 1: TORNADO 25.0 K 0
## 2: TORNADO 2.5 K 0
## 3: TORNADO 25.0 K 0
## 4: TORNADO 2.5 K 0
## 5: TORNADO 2.5 K 0
## 6: TORNADO 2.5 K 0
library(mgsub)
# Translate damage-magnitude codes into numeric multipliers.
#
# code: character vector of magnitude codes.
# Returns a numeric vector of the same length:
#   * "K", "M", "B" in either case -> 1e3, 1e6, 1e9, so lower-case codes are
#     no longer silently dropped;
#   * codes made only of digits -> their own numeric value (as before);
#   * anything else (blank, "+", "?", ...) -> NA, which the later
#     NA-replacement step turns into 0.
# No coercion warnings are raised because only digit-only codes reach
# as.numeric().
exp_to_multiplier <- function(code) {
  suffix <- c(K = 1e3, M = 1e6, B = 1e9)
  code <- toupper(code)
  multiplier <- unname(suffix[code])
  digits_only <- is.na(multiplier) & grepl("^[0-9]+$", code)
  multiplier[digits_only] <- as.numeric(code[digits_only])
  multiplier
}

# NOTE(review): totals in output recorded before this change may shift
# slightly once lower-case codes are counted -- re-knit to refresh them.
economic_damage$PROPDMGEXP <- exp_to_multiplier(economic_damage$PROPDMGEXP)
economic_damage$CROPDMGEXP <- exp_to_multiplier(economic_damage$CROPDMGEXP)
str(economic_damage)
## Classes 'data.table' and 'data.frame': 902297 obs. of 5 variables:
## $ EVTYPE : chr "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
## $ PROPDMG : num 25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
## $ PROPDMGEXP: num 1000 1000 1000 1000 1000 1000 1000 1000 1000 1000 ...
## $ CROPDMG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CROPDMGEXP: num NA NA NA NA NA NA NA NA NA NA ...
## - attr(*, ".internal.selfref")=<externalptr>
library(tidyr)
# Magnitude codes that could not be turned into a multiplier are NA; count
# them (and any missing amounts) as zero. Only the four numeric columns are
# touched: applying a numeric replacement to the character column EVTYPE is
# rejected by type-strict versions of replace_na().
# Damage_value is the property plus crop damage in dollars.
economic_damage <- economic_damage %>%
  mutate(
    across(
      c(PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP),
      ~ replace_na(.x, 0)
    ),
    Damage_value = PROPDMG * PROPDMGEXP + CROPDMG * CROPDMGEXP
  )
head(economic_damage)
## EVTYPE PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP Damage_value
## 1: TORNADO 25.0 1000 0 0 25000
## 2: TORNADO 2.5 1000 0 0 2500
## 3: TORNADO 25.0 1000 0 0 25000
## 4: TORNADO 2.5 1000 0 0 2500
## 5: TORNADO 2.5 1000 0 0 2500
## 6: TORNADO 2.5 1000 0 0 2500
# Sum the damage value per event type, order from costliest downwards and
# keep the first five rows.
economic_damage_top5 <- head(
  arrange(
    summarise(
      group_by(economic_damage, EVTYPE),
      E_D_value = sum(Damage_value)
    ),
    desc(E_D_value)
  ),
  5
)
## `summarise()` ungrouping output (override with `.groups` argument)
str(economic_damage_top5)
## tibble [5 x 2] (S3: tbl_df/tbl/data.frame)
## $ EVTYPE : chr [1:5] "FLOOD" "HURRICANE/TYPHOON" "TORNADO" "STORM SURGE" ...
## $ E_D_value: num [1:5] 1.50e+11 7.19e+10 5.73e+10 4.33e+10 1.88e+10
head(economic_damage_top5)
## # A tibble: 5 x 2
## EVTYPE E_D_value
## <chr> <dbl>
## 1 FLOOD 150319678250
## 2 HURRICANE/TYPHOON 71913712800
## 3 TORNADO 57340614101
## 4 STORM SURGE 43323541000
## 5 HAIL 18752904320
# Bar chart of the five event types with the largest total economic damage,
# tallest bar first. Each bar gets its own fill colour; the legend is hidden
# because the x axis already names the events.
ggplot(
  data = economic_damage_top5,
  aes(x = reorder(EVTYPE, -E_D_value), y = E_D_value, fill = EVTYPE)
) +
  geom_col() +
  labs(
    title = "Top 5 Severe Weather Events Harming Economics in US",
    x = "Events",
    y = "Economic Damage Value"
  ) +
  theme(
    legend.position = "none",
    plot.title = element_text(hjust = 0.5) # centre the title
  )