library(dplyr)
library(readr)
library(stringr)
library(ggplot2)
library(gridExtra) #to arrange plots into a single figure
library(scales)
# Remote location of the compressed storm data CSV and the name of its
# local copy in the working directory.
data_url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
destfile <- "StormData.csv.bz2"

# Download only when no local copy exists; binary mode ("wb") keeps the
# bzip2 archive intact.
if (!file.exists(destfile)) {
  download.file(url = data_url, destfile = destfile, mode = "wb")
}

# Load the archive into a data frame and print its dimensions and a
# compact column overview.
storm <- read.csv(file = destfile)
dim(storm)
glimpse(storm, width = 60)
# Convert damage-exponent codes into numeric multipliers.
#
# Args:
#   e: Vector of exponent codes (character, factor, or anything that
#      `as.character()` accepts). Codes are trimmed and upper-cased before
#      they are interpreted.
#
# Returns:
#   An unnamed numeric vector with one multiplier per element of `e`:
#     "K" -> 1e3, "M" -> 1e6, "B" -> 1e9,
#     an all-digit code "n" -> 10^n,
#     anything else (NA, "", "+", "-", "?", "H", ...) -> 1.
#   Empty input yields `numeric(0)`.
exp_to_multiplier <- function(e) {
  code <- toupper(trimws(as.character(e)))

  # Start from the fallback of 1 so blank, missing and unrecognised codes
  # leave the reported figure unscaled.
  multiplier <- rep(1, length(code))

  # Common letter codes seen in StormData: K, M, B.
  letter_scale <- c(K = 1e3, M = 1e6, B = 1e9)
  is_letter <- code %in% names(letter_scale)
  multiplier[is_letter] <- unname(letter_scale[code[is_letter]])

  # Purely numeric codes such as "2" or "3" are read as powers of ten.
  # grepl() returns FALSE for NA, so missing codes keep the fallback.
  is_power <- grepl("^[0-9]+$", code)
  multiplier[is_power] <- 10^as.numeric(code[is_power])

  # NOTE(review): other single-character codes (e.g. "H") are deliberately
  # left at 1, as before -- confirm this is the intended reading of "H".
  multiplier
}
# Derive scaled damage figures and combined impact measures.
#   PROPDMG_MULT / CROPDMG_MULT: multipliers decoded from the *DMGEXP codes
#   PROP_DMG / CROP_DMG:         reported figure times its multiplier
#   CASUALTIES:                  fatalities + injuries (NA counted as 0)
#   ECONOMIC_DMG:                property + crop damage (NA counted as 0)
storm <- storm %>%
  mutate(
    PROPDMG_MULT = exp_to_multiplier(PROPDMGEXP),
    CROPDMG_MULT = exp_to_multiplier(CROPDMGEXP),
    PROP_DMG = as.numeric(PROPDMG) * PROPDMG_MULT,
    CROP_DMG = as.numeric(CROPDMG) * CROPDMG_MULT,
    CASUALTIES = coalesce(FATALITIES, 0) + coalesce(INJURIES, 0),
    ECONOMIC_DMG = coalesce(PROP_DMG, 0) + coalesce(CROP_DMG, 0)
  )

# Quick checks: distribution summaries of the derived columns.
summary(storm$PROP_DMG)
summary(storm$CROP_DMG)
# Columns are extracted with `$`; `&` would be an element-wise logical AND.
summary(storm$CASUALTIES)
# Map free-text event type labels onto a small set of standard categories.
#
# Args:
#   ev: Vector of raw event type labels (coerced with `as.character()`).
#
# Returns:
#   A character vector with one entry per element of `ev`. Each label is
#   upper-cased, has punctuation replaced by spaces and whitespace squished,
#   and is then given the category of the FIRST matching rule below. Labels
#   that match no rule keep their cleaned text.
#
# Rule order matters: more specific rules must come before broader ones
# that would otherwise swallow them (e.g. WINTER STORM before COLD/WINTER).
standardize_evtype <- function(ev) {
  ev0 <- toupper(trimws(as.character(ev)))
  # Replace punctuation with a space (not ""), so "WIND/HAIL" stays two
  # words. Regex escapes need a doubled backslash inside an R string.
  ev0 <- str_replace_all(ev0, "[^A-Z0-9\\s]", " ")
  ev0 <- str_squish(ev0)

  case_when(
    str_detect(ev0, "TORNADO") ~ "TORNADO",
    str_detect(ev0, "(HURRICANE|HU )|^TROPICAL STORM|TROPICAL CYCLONE") ~
      "HURRICANE/TROPICAL STORM",
    str_detect(ev0, "(TSUNAMI)") ~ "TSUNAMI",
    str_detect(ev0, "THUNDERSTORM|TSTM|TSTMWIND|TSTM WIND|TSTM W") &
      str_detect(ev0, "WIND|GUST") ~ "THUNDERSTORM WIND",
    # Some entries are "WIND" or "HIGH WIND" not necessarily
    # thunderstorm-caused. Plural forms ("HIGH WINDS") are accepted, while
    # "WIND CHILL" labels are excluded so they are not classed as wind.
    str_detect(ev0, "\\b(WIND|HIGH WIND|GUST)S?\\b") &
      !str_detect(ev0, "THUNDER|WIND CHILL") ~ "HIGH WIND",
    str_detect(ev0, "\\b(HAIL)\\b") ~ "HAIL",
    str_detect(ev0, "FLOOD|FLASH FLOOD|RIVER FLOOD|URBAN|STREAM|COASTAL FLOOD") ~
      "FLOOD",
    str_detect(ev0, "HEAT|WARM") ~ "HEAT",
    # Must precede COLD/WINTER: that rule's "WINTER" and "FREEZ" would
    # otherwise capture "WINTER STORM" and "FREEZING RAIN".
    str_detect(ev0, "WINTER STORM|SNOW|SLEET|BLIZZARD|ICE STORM|ICE|FREEZING RAIN") ~
      "WINTER STORM",
    str_detect(ev0, "COLD|FREEZ|FROST|WINTER|HYPOTHERM|LOW TEMP") ~ "COLD/WINTER",
    str_detect(ev0, "LIGHTNING") ~ "LIGHTNING",
    str_detect(ev0, "DROUGHT|DRY") ~ "DROUGHT",
    str_detect(ev0, "WILDFIRE|WILD FIRE|FOREST FIRE|BRUSH FIRE") ~ "WILDFIRE",
    str_detect(ev0, "AVALANCHE") ~ "AVALANCHE",
    str_detect(ev0, "LANDSLIDE|MUDSLIDE") ~ "LANDSLIDE",
    # Any remaining fire label that is not one of the wildfire variants.
    str_detect(ev0, "FIRE") ~ "FIRE",
    str_detect(ev0, "FOG") ~ "FOG",
    str_detect(ev0, "VOLCAN") ~ "VOLCANIC ACTIVITY",
    # No rule matched: keep the cleaned label itself as the category.
    TRUE ~ ev0
  )
}
# Attach the standardized category to every report.
storm <- mutate(storm, EVTYPE_CLEAN = standardize_evtype(EVTYPE))

# Show the 20 categories with the most reports.
slice_head(count(storm, EVTYPE_CLEAN, sort = TRUE), n = 20)

# Per-category totals: casualties, fatalities, injuries, economic damage
# and the number of reports. `.groups = "drop"` returns an ungrouped table.
impact_by_event <- summarise(
  group_by(storm, EVTYPE_CLEAN),
  total_casualties = sum(CASUALTIES, na.rm = TRUE),
  total_fatalities = sum(coalesce(FATALITIES, 0), na.rm = TRUE),
  total_injuries = sum(coalesce(INJURIES, 0), na.rm = TRUE),
  total_economic = sum(ECONOMIC_DMG, na.rm = TRUE),
  n_reports = n(),
  .groups = "drop"
)

# Ten categories with the highest casualty totals.
top_casualties <- slice_head(
  arrange(impact_by_event, desc(total_casualties)),
  n = 10
)

# Ten categories with the highest economic damage totals.
top_economic <- slice_head(
  arrange(impact_by_event, desc(total_economic)),
  n = 10
)

# Print both rankings.
top_casualties
top_economic
# Bar plot for casualties. reorder() sorts the bars by their total and
# coord_flip() turns them horizontal so the category labels stay readable.
p1 <- ggplot(
  top_casualties,
  aes(x = reorder(EVTYPE_CLEAN, total_casualties), y = total_casualties)
) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Top 10 Event Types by Population Impact(Fatalities + Injuries)",
    x = "",
    y = "Total Casualties(fatalities + injuries)"
  ) +
  scale_y_continuous(labels = comma)

# Bar plot for economic damage, with totals divided by 1e6 to match the
# "million USD" axis label.
p2 <- ggplot(
  top_economic,
  aes(x = reorder(EVTYPE_CLEAN, total_economic), y = total_economic / 1e6)
) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Top 10 Event Types by Economic Damage",
    x = "",
    y = "Total Economic Damage (million USD)"
  ) +
  scale_y_continuous(labels = comma)

# Arrange side by side in one figure
grid.arrange(p1, p2, ncol = 2)