I will describe which types of events are most harmful with the fatalities and the economical amount, based on the simple histograms to show the ranking.
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.4.1
## Warning: package 'ggplot2' was built under R version 4.4.1
## Warning: package 'tibble' was built under R version 4.4.1
## Warning: package 'tidyr' was built under R version 4.4.1
## Warning: package 'readr' was built under R version 4.4.1
## Warning: package 'purrr' was built under R version 4.4.1
## Warning: package 'dplyr' was built under R version 4.4.1
## Warning: package 'stringr' was built under R version 4.4.1
## Warning: package 'forcats' was built under R version 4.4.1
## Warning: package 'lubridate' was built under R version 4.4.1
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(skimr)
## Warning: package 'skimr' was built under R version 4.4.1
library(data.table)
## Warning: package 'data.table' was built under R version 4.4.1
##
## Attaching package: 'data.table'
##
## The following objects are masked from 'package:lubridate':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
##
## The following objects are masked from 'package:dplyr':
##
## between, first, last
##
## The following object is masked from 'package:purrr':
##
## transpose
df <- fread("https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2")
head(df)
## STATE__ BGN_DATE BGN_TIME TIME_ZONE COUNTY COUNTYNAME STATE
## <num> <char> <char> <char> <num> <char> <char>
## 1: 1 4/18/1950 0:00:00 0130 CST 97 MOBILE AL
## 2: 1 4/18/1950 0:00:00 0145 CST 3 BALDWIN AL
## 3: 1 2/20/1951 0:00:00 1600 CST 57 FAYETTE AL
## 4: 1 6/8/1951 0:00:00 0900 CST 89 MADISON AL
## 5: 1 11/15/1951 0:00:00 1500 CST 43 CULLMAN AL
## 6: 1 11/15/1951 0:00:00 2000 CST 77 LAUDERDALE AL
## EVTYPE BGN_RANGE BGN_AZI BGN_LOCATI END_DATE END_TIME COUNTY_END COUNTYENDN
## <char> <num> <char> <char> <char> <char> <num> <lgcl>
## 1: TORNADO 0 0 NA
## 2: TORNADO 0 0 NA
## 3: TORNADO 0 0 NA
## 4: TORNADO 0 0 NA
## 5: TORNADO 0 0 NA
## 6: TORNADO 0 0 NA
## END_RANGE END_AZI END_LOCATI LENGTH WIDTH F MAG FATALITIES INJURIES
## <num> <char> <char> <num> <num> <int> <num> <num> <num>
## 1: 0 14.0 100 3 0 0 15
## 2: 0 2.0 150 2 0 0 0
## 3: 0 0.1 123 2 0 0 2
## 4: 0 0.0 100 2 0 0 2
## 5: 0 0.0 150 2 0 0 2
## 6: 0 1.5 177 2 0 0 6
## PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP WFO STATEOFFIC ZONENAMES LATITUDE
## <num> <char> <num> <char> <char> <char> <char> <num>
## 1: 25.0 K 0 3040
## 2: 2.5 K 0 3042
## 3: 25.0 K 0 3340
## 4: 2.5 K 0 3458
## 5: 2.5 K 0 3412
## 6: 2.5 K 0 3450
## LONGITUDE LATITUDE_E LONGITUDE_ REMARKS REFNUM
## <num> <num> <num> <char> <num>
## 1: 8812 3051 8806 1
## 2: 8755 0 0 2
## 3: 8742 0 0 3
## 4: 8626 0 0 4
## 5: 8642 0 0 5
## 6: 8748 0 0 6
#summary by event type
Fatality_table <- df %>%
group_by(EVTYPE) %>%
summarise(FATALITIES_EACH = sum(FATALITIES, na.rm = TRUE))
print(Fatality_table)
## # A tibble: 985 × 2
## EVTYPE FATALITIES_EACH
## <chr> <dbl>
## 1 " HIGH SURF ADVISORY" 0
## 2 " COASTAL FLOOD" 0
## 3 " FLASH FLOOD" 0
## 4 " LIGHTNING" 0
## 5 " TSTM WIND" 0
## 6 " TSTM WIND (G45)" 0
## 7 " WATERSPOUT" 0
## 8 " WIND" 0
## 9 "?" 0
## 10 "ABNORMAL WARMTH" 0
## # ℹ 975 more rows
#remove 0 fatality and change order
#only top 10 included
Fatality_table_ranking <- Fatality_table %>%
filter(FATALITIES_EACH!=0) %>%
arrange(desc(FATALITIES_EACH)) %>%
slice_head(n = 10)
print(Fatality_table_ranking)
## # A tibble: 10 × 2
## EVTYPE FATALITIES_EACH
## <chr> <dbl>
## 1 TORNADO 5633
## 2 EXCESSIVE HEAT 1903
## 3 FLASH FLOOD 978
## 4 HEAT 937
## 5 LIGHTNING 816
## 6 TSTM WIND 504
## 7 FLOOD 470
## 8 RIP CURRENT 368
## 9 HIGH WIND 248
## 10 AVALANCHE 224
library(ggplot2)
# Change EVTYPE into factor for ordering
Fatality_table_ranking$EVTYPE <- factor(
Fatality_table_ranking$EVTYPE,
levels = Fatality_table_ranking$EVTYPE[order(Fatality_table_ranking$FATALITIES_EACH, decreasing = TRUE)]
)
# Histgram
ggplot(Fatality_table_ranking, aes(x = EVTYPE, y = FATALITIES_EACH)) +
geom_bar(stat = "identity", fill = "blue", color = "black", alpha = 0.7) +
labs(title = "Event ranking", x = "Event type", y = "Fatalities") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1))
The most harmful type of events for population health is TORNADO.
#create numeric prices
df <- df %>% mutate(PROPDMG_PRICE = case_when(
PROPDMGEXP == "K" ~ PROPDMG*1000,
PROPDMGEXP == "M" ~ PROPDMG*1000000,
PROPDMGEXP == "B" ~ PROPDMG*1000000000
))
head(df$PROPDMG_PRICE)
## [1] 25000 2500 25000 2500 2500 2500
#summary by event type
Economy_table <- df %>%
group_by(EVTYPE) %>%
summarise(Economy_EACH = sum(PROPDMG_PRICE, na.rm = TRUE))
print(Economy_table)
## # A tibble: 985 × 2
## EVTYPE Economy_EACH
## <chr> <dbl>
## 1 " HIGH SURF ADVISORY" 200000
## 2 " COASTAL FLOOD" 0
## 3 " FLASH FLOOD" 50000
## 4 " LIGHTNING" 0
## 5 " TSTM WIND" 8100000
## 6 " TSTM WIND (G45)" 8000
## 7 " WATERSPOUT" 0
## 8 " WIND" 0
## 9 "?" 5000
## 10 "ABNORMAL WARMTH" 0
## # ℹ 975 more rows
#remove 0 economy and change order
#only top 10 included
Economy_table_ranking <- Economy_table %>%
filter(Economy_EACH!=0) %>%
arrange(desc(Economy_EACH)) %>%
slice_head(n = 10)
print(Economy_table_ranking)
## # A tibble: 10 × 2
## EVTYPE Economy_EACH
## <chr> <dbl>
## 1 FLOOD 144657709800
## 2 HURRICANE/TYPHOON 69305840000
## 3 TORNADO 56925660480
## 4 STORM SURGE 43323536000
## 5 FLASH FLOOD 16140811510
## 6 HAIL 15727366720
## 7 HURRICANE 11868319010
## 8 TROPICAL STORM 7703890550
## 9 WINTER STORM 6688497250
## 10 HIGH WIND 5270046260
# Change EVTYPE into factor for ordering
Economy_table_ranking$EVTYPE <- factor(
Economy_table_ranking$EVTYPE,
levels = Economy_table_ranking$EVTYPE[order(Economy_table_ranking$Economy_EACH, decreasing = TRUE)]
)
# Histgram
ggplot(Economy_table_ranking, aes(x = EVTYPE, y = Economy_EACH)) +
geom_bar(stat = "identity", fill = "red", color = "black", alpha = 0.7) +
labs(title = "Event ranking", x = "Event type", y = "Economy damage") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1))
The most harmful type of events for economy is FLOOD.