The dataset for this projects comes from U.S. National Oceanic and Atmospheric Administration’s(NOAA).
Here can read full documentation.
Here is the dataset(47Mb)
The events in the database start in the year 1950 and end in November 2011.
The goal of this analysis is to answer the following question.
Across the US, which types of events are most harmful with respect to population health.
Across the US, which types of events have the greatest economic consequences?
In summary, Torando causes greatest fatalities and injuries and flood, drought cause highest impact upon economy. The followings are how did I get those result using with R codes.
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.3 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 2.0.1 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggplot2)
library(fuzzyjoin)
url <- 'https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2'
download.file(url, destfile = 'data.csv.bz2')
data <- data.table::fread('data.csv.bz2')
From the documentation of the dataset, the official events type are 48 and there were about 1000 events in the dataset at EVTYPE column! All that is just typo. So, first clean the EVTYPE column.
The official events are as follows. So, I make a data frame and try to match with EVTYPE column.
events <- c('ASTRONOMICAL LOW TIDE', 'AVALANCHE', 'BLIZZARD', 'COASTAL FLOOD', 'COLD/WIND CHILL', 'DEBRIS FLOW', 'DENSE FOG', 'DENSE SMOKE', 'DROUGHT', 'DUST DEVIL', 'DUST STORM', 'EXCESSIVE HEAT', 'EXTREME COLD/WIND CHILL', 'FLASH FLOOD', 'FLOOD', 'FROST/FREEZE', 'FUNNEL CLOUD', 'HAIL', 'HEAT', 'HEAVY RAIN', 'HEAVY SNOW', 'HIGH SURF', 'HIGH WIND', 'HURRICANE(TYPHOON)', 'ICE STORM', 'LAKE-EFFECT SNOW', 'LAKESHORE FLOOD', 'LIGHTNING', 'MARINE HAIL', 'MARINE HIGH WIND', 'MARINE STRONG WIND', 'MARINE THUNDERSTORM WIND', 'RIP CURRENT', 'SEICHE', 'SLEET', 'STORM SURGE/TIDE', 'STORM WIND', 'THUNDERSTORM WIND', 'TORNADO', 'TROPICAL DEPRESSION', 'TROPICAL STORM', 'TSUNAMI', 'VOLCANIC ASH', 'WATERSPOUT', 'WILDFIRE', 'WINTER STROM', 'WINTER WEATHER')
event_df <- data.frame(EVTYPE = events)
# to mark the original sequence
data$ID <- seq.int(nrow(data))
# separate from the original dataset as it is too big.
storm_events <- data %>% select(c( 'ID', 'EVTYPE'))
# before that code, I tried to clean the dataset with 1000 sample data for faster and I saw that most words 'THUNDERSTORM' are written as 'TSTM' so I first replace them.
storm_events$EVTYPE <- str_replace_all(storm_events$EVTYPE, 'TSTM', 'THUNDERSTORM')
# match with official names.
storm_events_test <- stringdist_left_join(storm_events, event_df, by='EVTYPE', method='dl')
# after these two steps there were only 21361 rows left to clean. If I tried to increase the max_dist, then most of the words with length less than 6 are changed. for example, HAIL is true, but it changed to HEAT, etc. So, I try to subset with length more than 7.
storm_events_test_2 <- storm_events_test %>% filter(is.na(EVTYPE.y) & str_count(EVTYPE.x) > 7)
# match with official names, this time, max string distance with 6.
storm_events_test_2 <- stringdist_left_join(storm_events_test_2, event_df, by=c('EVTYPE.x' = 'EVTYPE'), method='lcs', max_dist=6)
# select only required columns.
storm_events_test_2 <- storm_events_test_2 %>% select(c("ID", EVTYPE.y = 'EVTYPE'))
# combine them all
clean_storm_words <- storm_events_test %>%
select(c('ID', EVTYPE = 'EVTYPE.y')) %>%
full_join(storm_events_test_2, by='ID') %>%
mutate(EVTYPE = ifelse(is.na(EVTYPE.y),EVTYPE, EVTYPE.y)) %>%
select(c('ID', 'EVTYPE'))
# combine with original data frame
data <- data %>% inner_join(clean_storm_words, by='ID')
data <- data %>% filter(!is.na(EVTYPE.y))
dim(data)
## [1] 890274 39
# only loss about 12000 rows from original data frame of over 900000 rows and with 46 unique names.
unique(data$EVTYPE.y)
## [1] "TORNADO" "THUNDERSTORM WIND"
## [3] "HAIL" "SEICHE"
## [5] "WINTER STROM" "HEAVY RAIN"
## [7] "LIGHTNING" "DENSE FOG"
## [9] "RIP CURRENT" "FLASH FLOOD"
## [11] "HIGH WIND" "FUNNEL CLOUD"
## [13] "HEAT" "FLOOD"
## [15] "WATERSPOUT" "BLIZZARD"
## [17] "COLD/WIND CHILL" "HEAVY SNOW"
## [19] "COASTAL FLOOD" "ICE STORM"
## [21] "AVALANCHE" "MARINE HAIL"
## [23] "HIGH SURF" "DUST STORM"
## [25] "SLEET" "DUST DEVIL"
## [27] "EXCESSIVE HEAT" "STORM WIND"
## [29] "WILDFIRE" "WINTER WEATHER"
## [31] "DROUGHT" "STORM SURGE/TIDE"
## [33] "TROPICAL STORM" "EXTREME COLD/WIND CHILL"
## [35] "LAKESHORE FLOOD" "LAKE-EFFECT SNOW"
## [37] "FROST/FREEZE" "VOLCANIC ASH"
## [39] "TROPICAL DEPRESSION" "MARINE THUNDERSTORM WIND"
## [41] "HURRICANE(TYPHOON)" "MARINE HIGH WIND"
## [43] "TSUNAMI" "DENSE SMOKE"
## [45] "MARINE STRONG WIND" "ASTRONOMICAL LOW TIDE"
It is ready to answer the question 1.
# Selected the required columns.
data_select <- data %>% select(c('FATALITIES', 'INJURIES', 'PROPDMG', 'PROPDMGEXP', 'CROPDMG', 'CROPDMGEXP', EVENT = 'EVTYPE.y'))
# group by event and them sum all the fatalities, and all the injuries and select top 10 of each type.
health_impact <- data_select %>%
group_by(EVENT) %>%
summarise(fatalities = sum(FATALITIES),
injuries = sum(INJURIES)) %>%
gather(key='health_impact', value='total',-EVENT) %>%
group_by(health_impact) %>%
top_n(10,total)
ggplot(health_impact, aes(EVENT, total)) +
geom_col(fill='darkblue') +
facet_wrap(~ health_impact, scales='free') +
theme_bw() +
theme(axis.text.x = element_text(angle=90)) +
labs(x = 'EVENTS', y = 'TOTAL IMPACTS', title = 'TOP 10 HEALTH IMPACTS BY NATURAL DISASTERS')
First, need to calculate the Economic consequences. In the dataset it is not given as numbers and just exponent columns and number column. So, change to numbers and calculate the total cost.
unique(data_select$PROPDMGEXP)
## [1] "K" "M" "" "B" "+" "0" "5" "m" "6" "?" "4" "2" "3" "h" "7" "H" "-" "1" "8"
unique(data_select$CROPDMGEXP)
## [1] "" "K" "M" "B" "?" "0" "k" "2"
#changes these letter ... H,h = hundreds, K,k=thousands, M,m=Millions, B,b=billions, + = 1, - = 0, ? = 0, blank = 0, numeric, 0...8 = 10
data_select <- data_select %>%
mutate(PROPDMGNUM = case_when(
PROPDMGEXP == 'K' ~ 1000,
PROPDMGEXP == 'M' | PROPDMGEXP == 'm' ~ 1000000,
PROPDMGEXP == 'B' ~ 1000000000,
PROPDMGEXP == 'h' | PROPDMGEXP == 'H' ~ 100,
PROPDMGEXP == '+' ~ 1,
PROPDMGEXP == '-' | PROPDMGEXP == '?' | PROPDMGEXP == "" ~ 0,
TRUE ~ 10),
CROPDMGNUM = case_when(
CROPDMGEXP == 'K' | CROPDMGEXP == 'k' ~ 1000,
CROPDMGEXP == 'M' ~ 1000000,
CROPDMGEXP == 'B' ~ 1000000000,
CROPDMGEXP == '?' | CROPDMGEXP == "" ~ 0,
TRUE ~ 10),
PROPDMG_TOTAL = PROPDMG * PROPDMGNUM,
CROPDMG_TOTAL = CROPDMG * CROPDMGNUM)
economic_impact_byevent <- data_select %>%
group_by(EVENT) %>%
summarize(CROP_LOSS = sum(CROPDMG_TOTAL),
PROPERTY_LOSS = sum(PROPDMG_TOTAL)) %>%
gather(key='damage', value='total_loss', -'EVENT') %>%
group_by(damage) %>%
top_n(10, total_loss)
ggplot(economic_impact_byevent, aes(fct_rev(fct_reorder(EVENT,total_loss)), total_loss/1000000000)) +
geom_col(color='blue',fill='darkblue') +
labs(x='EVENTS', y='Total Economic Loss in Billions', title='TOP 10 ECONOMIC LOSS BY NATURAL DISASTERS') +
facet_wrap(~ damage, scales='free') +
theme_bw() +
theme(axis.text.x = element_text(angle=90)) +
scale_y_continuous(labels= function(x){format(x, scientific=FALSE)})
There are many things left to explore from this dataset. For this assignment, I will stop here. Thanks a lot for your time.