Natural and technological disasters pose significant threats to human life, economic stability, and sustainable development across the globe. Asia, being the world’s largest and most populous continent, experiences a disproportionately high share of global disasters, making it a critical region for disaster risk analysis. This comprehensive study analyzes disaster patterns, impacts, and trends across Asian countries using the Emergency Events Database (EM-DAT) maintained by the Centre for Research on the Epidemiology of Disasters (CRED).
The EM-DAT database is internationally recognized as the most comprehensive global disaster database, systematically collecting and validating information on natural and technological disasters since 1900. For this analysis, we focus on the Asian subset of the EM-DAT database, covering the period from 2000 to 2025. This dataset encompasses:
The dataset contains over 40 variables capturing multiple dimensions of disaster impacts:
Following this introduction, the report is organized into five main sections:
This analysis provides critical insights for policymakers, disaster management agencies, and international organizations working toward disaster risk reduction and resilience building in Asia.
The objectives planned for the mini project are as below:
ggplot2# Load required libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 4.0.1 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
data <- read.csv("extracted_data/Asia.csv", stringsAsFactors = FALSE)
disaster_data <- data
# Display basic information
cat("Dataset Dimensions:", nrow(disaster_data), "rows and", ncol(disaster_data), "columns\n")
## Dataset Dimensions: 6531 rows and 46 columns
cat("Number of Countries:", length(unique(disaster_data$Country)), "\n")
## Number of Countries: 50
cat("Number of unique disaster types:", length(unique(disaster_data$`Disaster Type`)), "\n\n")
## Number of unique disaster types: 0
# Check column names
cat("Column names in dataset:\n")
## Column names in dataset:
names(disaster_data)
## [1] "DisNo."
## [2] "Historic"
## [3] "Classification.Key"
## [4] "Disaster.Group"
## [5] "Disaster.Subgroup"
## [6] "Disaster.Type"
## [7] "Disaster.Subtype"
## [8] "External.IDs"
## [9] "Event.Name"
## [10] "ISO"
## [11] "Country"
## [12] "Subregion"
## [13] "Region"
## [14] "Location"
## [15] "Origin"
## [16] "Associated.Types"
## [17] "OFDA.BHA.Response"
## [18] "Appeal"
## [19] "Declaration"
## [20] "AID.Contribution...000.US.."
## [21] "Magnitude"
## [22] "Magnitude.Scale"
## [23] "Latitude"
## [24] "Longitude"
## [25] "River.Basin"
## [26] "Start.Year"
## [27] "Start.Month"
## [28] "Start.Day"
## [29] "End.Year"
## [30] "End.Month"
## [31] "End.Day"
## [32] "Total.Deaths"
## [33] "No..Injured"
## [34] "No..Affected"
## [35] "No..Homeless"
## [36] "Total.Affected"
## [37] "Reconstruction.Costs...000.US.."
## [38] "Reconstruction.Costs..Adjusted...000.US.."
## [39] "Insured.Damage...000.US.."
## [40] "Insured.Damage..Adjusted...000.US.."
## [41] "Total.Damage...000.US.."
## [42] "Total.Damage..Adjusted...000.US.."
## [43] "CPI"
## [44] "Admin.Units"
## [45] "Entry.Date"
## [46] "Last.Update"
library(tidyverse)
library(lubridate)
# Convert all empty strings to NA
disaster_data[disaster_data == ""] <- NA
disaster_clean <- disaster_data %>%
mutate(
# --- SAFE DATE CONSTRUCTION ---
Start_Date = make_date(
year = as.integer(Start.Year),
month = as.integer(Start.Month),
day = as.integer(Start.Day)
),
End_Date = make_date(
year = as.integer(End.Year),
month = as.integer(End.Month),
day = as.integer(End.Day)
),
Duration_Days = as.numeric(End_Date - Start_Date),
# --- TEMPORAL VARIABLES ---
Year = as.integer(Start.Year),
Month = as.integer(Start.Month),
Season = case_when(
Month %in% c(12,1,2) ~ "Winter",
Month %in% c(3,4,5) ~ "Spring",
Month %in% c(6,7,8) ~ "Summer",
Month %in% c(9,10,11) ~ "Fall",
TRUE ~ NA_character_
),
Decade = case_when(
Year >= 2000 & Year <= 2010 ~ "2000-2010",
Year >= 2011 & Year <= 2020 ~ "2011-2020",
Year >= 2021 & Year <= 2025 ~ "2021-2025",
TRUE ~ NA_character_
),
Duration_Category = case_when(
is.na(Duration_Days) ~ "Unknown",
Duration_Days == 0 ~ "Single Day",
Duration_Days <= 7 ~ "Short (1-7 days)",
Duration_Days <= 30 ~ "Medium (8-30 days)",
Duration_Days <= 90 ~ "Long (31-90 days)",
Duration_Days > 90 ~ "Very Long (>90 days)"
),
# --- NUMERIC CLEANING ---
Total_Deaths = as.numeric(gsub(",", "", Total.Deaths)),
Injured = as.numeric(gsub(",", "", No..Injured)),
Affected = as.numeric(gsub(",", "", No..Affected)),
Homeless = as.numeric(gsub(",", "", No..Homeless)),
Total_Affected = as.numeric(gsub(",", "", Total.Affected)),
Total_Damage = as.numeric(gsub(",", "", Total.Damage...000.US..)),
Total_Damage_Adjusted = as.numeric(gsub(",", "", Total.Damage..Adjusted...000.US..)),
Reconstruction_Costs = as.numeric(gsub(",", "", Reconstruction.Costs...000.US..)),
Insured_Damage = as.numeric(gsub(",", "", Insured.Damage...000.US..)),
# --- MAGNITUDE ---
Magnitude_Value = suppressWarnings(as.numeric(gsub("[^0-9\\.]", "", Magnitude))),
# --- GEO COORDINATES ---
Lat = as.numeric(Latitude),
Lon = as.numeric(Longitude),
# --- KEEP SAME NAMES AS YOUR ORIGINAL PIPELINE EXPECTS ---
Disaster_Group = Disaster.Group,
Disaster_Type = Disaster.Type,
Disaster_Subtype = Disaster.Subtype
)
cat("Data cleaning completed successfully.\n")
## Data cleaning completed successfully.
cat("Rows:", nrow(disaster_clean), "\n")
## Rows: 6531
cat("Columns:", ncol(disaster_clean), "\n")
## Columns: 69
# Yearly trends
yearly_trends <- disaster_clean %>%
group_by(Year) %>%
summarise(
Count = n(),
Deaths = sum(Total_Deaths, na.rm = TRUE),
Affected = sum(Total_Affected, na.rm = TRUE),
.groups = 'drop'
)
# Trend model
trend_model <- lm(Count ~ Year, data = yearly_trends)
cat("Annual Trend Analysis:\n")
## Annual Trend Analysis:
cat("Average increase per year:", round(coef(trend_model)[2], 2), "disasters\n")
## Average increase per year: -6.87 disasters
cat("R-squared:", round(summary(trend_model)$r.squared, 3), "\n\n")
## R-squared: 0.634
# Monthly and seasonal patterns
monthly_pattern <- disaster_clean %>%
group_by(Month) %>%
summarise(Count = n(), .groups = 'drop')
seasonal_pattern <- disaster_clean %>%
count(Season)
# Visualizations
p1 <- ggplot(yearly_trends, aes(x = Year, y = Count)) +
geom_line(color = "darkblue", size = 1) +
geom_point(color = "red") +
geom_smooth(method = "lm", se = TRUE, alpha = 0.2) +
labs(title = "Yearly Disaster Frequency Trend", x = "Year", y = "Number of Disasters") +
theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
p2 <- ggplot(monthly_pattern, aes(x = factor(Month), y = Count)) +
geom_bar(stat = "identity", fill = "steelblue") +
labs(title = "Monthly Distribution", x = "Month", y = "Frequency") +
scale_x_discrete(labels = month.abb) +
theme_minimal()
p3 <- ggplot(seasonal_pattern, aes(x = Season, y = n, fill = Season)) +
geom_bar(stat = "identity") +
labs(title = "Seasonal Pattern", x = "Season", y = "Frequency") +
theme_minimal() +
theme(legend.position = "none")
print(p1)
## `geom_smooth()` using formula = 'y ~ x'
print(p2)
print(p3)
### 2. Disaster Classification (Objective 3)
# Classification summary
classification_summary <- disaster_clean %>%
group_by(Disaster_Group, Disaster_Type) %>%
summarise(
Count = n(),
Deaths = sum(Total_Deaths, na.rm = TRUE),
Economic_Loss = sum(Total_Damage, na.rm = TRUE),
.groups = 'drop'
) %>%
arrange(desc(Count))
# Top disaster types
top_types <- classification_summary %>%
group_by(Disaster_Type) %>%
summarise(Total_Count = sum(Count), .groups = 'drop') %>%
arrange(desc(Total_Count)) %>%
head(10)
# Visualization
ggplot(top_types, aes(x = reorder(Disaster_Type, Total_Count), y = Total_Count)) +
geom_bar(stat = "identity", fill = "darkgreen") +
coord_flip() +
labs(title = "Top 10 Disaster Types in Asia", x = "Disaster Type", y = "Frequency") +
theme_minimal()
cat("\nDisaster Classification Summary:\n")
##
## Disaster Classification Summary:
cat("Natural Disasters:", sum(disaster_clean$Disaster_Group == "Natural", na.rm = TRUE), "\n")
## Natural Disasters: 4015
cat("Technological Disasters:", sum(disaster_clean$Disaster_Group == "Technological", na.rm = TRUE), "\n")
## Technological Disasters: 2516
# Country analysis
country_impact <- disaster_clean %>%
group_by(Country) %>%
summarise(
Disasters = n(),
Deaths = sum(Total_Deaths, na.rm = TRUE),
Affected = sum(Total_Affected, na.rm = TRUE),
Economic_Loss = sum(Total_Damage, na.rm = TRUE),
Avg_Lat = mean(Lat, na.rm = TRUE),
Avg_Lon = mean(Lon, na.rm = TRUE),
.groups = 'drop'
) %>%
arrange(desc(Disasters))
top10_countries <- head(country_impact, 10)
# Bar chart for top 10 countries
ggplot(top10_countries, aes(x = reorder(Country, Disasters), y = Disasters)) +
geom_bar(stat = "identity", fill = "darkred") +
coord_flip() +
labs(title = "Top 10 Disaster-Prone Countries", x = "", y = "Number of Disasters") +
theme_minimal()
print("Top 5 Most Affected Countries:")
## [1] "Top 5 Most Affected Countries:"
print(top10_countries[1:5, c("Country", "Disasters", "Deaths")])
## # A tibble: 5 × 3
## Country Disasters Deaths
## <chr> <int> <dbl>
## 1 China 1355 134151
## 2 India 820 104492
## 3 Indonesia 564 196463
## 4 Philippines 474 30528
## 5 Pakistan 321 93464
# LEAFLET MAP 1: Disaster Hotspots by Location
library(leaflet) # Ensure leaflet is loaded
map_data <- disaster_clean %>%
filter(!is.na(Lat) & !is.na(Lon))
hotspot_map <- leaflet(map_data) %>%
addTiles() %>%
setView(lng = 100, lat = 20, zoom = 3) %>% # Center on Asia
addCircleMarkers(
~Lon, ~Lat,
radius = ~ifelse(Total_Deaths > 0, log(Total_Deaths + 1), 1),
color = ~ifelse(Disaster_Group == "Natural", "red", "blue"),
fillOpacity = 0.5,
stroke = TRUE,
weight = 1,
popup = ~paste(
"<b>Location:</b>", Location, "<br>",
"<b>Country:</b>", Country, "<br>",
"<b>Type:</b>", Disaster_Type, "<br>",
"<b>Year:</b>", Year, "<br>",
"<b>Deaths:</b>", Total_Deaths, "<br>",
"<b>Affected:</b>", Total_Affected
),
clusterOptions = markerClusterOptions()
) %>%
leaflet::addLegend(
position = "bottomright",
colors = c("red", "blue"),
labels = c("Natural Disasters", "Technological Disasters"),
title = "Disaster Type"
)
# Display the hotspot map
hotspot_map
# LEAFLET MAP 2: Country-level Summary Map
country_map_data <- country_impact %>%
filter(!is.na(Avg_Lat) & !is.na(Avg_Lon))
country_map <- leaflet(country_map_data) %>%
addTiles() %>%
setView(lng = 100, lat = 20, zoom = 3) %>%
addCircles(
~Avg_Lon, ~Avg_Lat,
radius = ~sqrt(Disasters) * 30000, # Scale circle size by disasters
color = "orange",
fillColor = "yellow",
fillOpacity = 0.6,
popup = ~paste(
"<b>Country:</b>", Country, "<br>",
"<b>Total Disasters:</b>", Disasters, "<br>",
"<b>Total Deaths:</b>", format(Deaths, big.mark = ","), "<br>",
"<b>Total Affected:</b>", format(Affected, big.mark = ","), "<br>",
"<b>Economic Loss:</b> $", format(Economic_Loss/1000, big.mark = ","), "M"
),
label = ~Country
)
# Display country map
country_map
# LEAFLET MAP 3: Simple point map as alternative
point_map <- leaflet(map_data) %>%
addTiles() %>%
setView(lng = 100, lat = 20, zoom = 3) %>%
addMarkers(
~Lon, ~Lat,
popup = ~paste(Location, "-", Disaster_Type, "-", Year),
clusterOptions = markerClusterOptions()
)
# Display point map
point_map
human_impact <- disaster_clean %>%
group_by(Disaster_Type) %>%
summarise(
Total_Deaths = sum(Total_Deaths, na.rm = TRUE),
Total_Injured = sum(Injured, na.rm = TRUE),
Total_Affected = sum(Affected, na.rm = TRUE),
.groups = 'drop'
) %>%
arrange(desc(Total_Deaths)) %>%
head(5)
# Visualization
human_impact %>%
pivot_longer(cols = -Disaster_Type, names_to = "Impact", values_to = "Count") %>%
ggplot(aes(x = Disaster_Type, y = Count, fill = Impact)) +
geom_bar(stat = "identity", position = "dodge") +
scale_y_log10() +
labs(title = "Human Impact by Disaster Type") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
economic_impact <- disaster_clean %>%
filter(!is.na(Total_Damage)) %>%
group_by(Year) %>%
summarise(
Nominal = sum(Total_Damage, na.rm = TRUE),
Adjusted = sum(Total_Damage_Adjusted, na.rm = TRUE),
.groups = 'drop'
)
ggplot(economic_impact, aes(x = Year)) +
geom_line(aes(y = Nominal, color = "Nominal"), size = 1) +
geom_line(aes(y = Adjusted, color = "CPI-Adjusted"), size = 1) +
labs(title = "Economic Impact Over Time", y = "Damage ('000 USD)") +
theme_minimal()
correlation_data <- disaster_clean %>%
filter(!is.na(Magnitude_Value) & !is.na(Total_Deaths))
cor_value <- cor(correlation_data$Magnitude_Value,
correlation_data$Total_Deaths,
use = "complete.obs")
cat("Correlation between Magnitude and Deaths:", round(cor_value, 3), "\n")
## Correlation between Magnitude and Deaths: -0.02
ggplot(correlation_data, aes(x = Magnitude_Value, y = log(Total_Deaths + 1))) +
geom_point(alpha = 0.5) +
geom_smooth(method = "lm", se = TRUE) +
labs(title = "Magnitude vs Deaths Relationship") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
### 7. Top Disasters (Objective 9)
# Top 10 deadliest
top_deadly <- disaster_clean %>%
arrange(desc(Total_Deaths)) %>%
select(Year, Country, Disaster_Type, Total_Deaths) %>%
head(10)
# Top 10 economic
top_economic <- disaster_clean %>%
arrange(desc(Total_Damage)) %>%
select(Year, Country, Disaster_Type, Total_Damage) %>%
head(10)
print("Top 10 Deadliest Disasters:")
## [1] "Top 10 Deadliest Disasters:"
print(top_deadly)
## Year Country Disaster_Type Total_Deaths
## 1 2004 Indonesia Earthquake 165708
## 2 2008 Myanmar Storm 138366
## 3 2008 China Earthquake 87476
## 4 2005 Pakistan Earthquake 73338
## 5 2023 Türkiye Earthquake 53000
## 6 2004 Sri Lanka Earthquake 35399
## 7 2003 Iran (Islamic Republic of) Earthquake 26796
## 8 2001 India Earthquake 20005
## 9 2011 Japan Earthquake 19846
## 10 2004 India Earthquake 16389
# Boxplot
disaster_clean %>%
filter(Total_Deaths > 0) %>%
ggplot(aes(x = Disaster_Group, y = log(Total_Deaths), fill = Disaster_Group)) +
geom_boxplot() +
labs(title = "Death Distribution by Group") +
theme_minimal()
# Histogram
disaster_clean %>%
filter(!is.na(Total_Damage) & Total_Damage > 0) %>%
ggplot(aes(x = log(Total_Damage))) +
geom_histogram(bins = 30, fill = "darkgreen", alpha = 0.7) +
labs(title = "Economic Damage Distribution") +
theme_minimal()
# Density
disaster_clean %>%
filter(!is.na(Total_Affected) & Total_Affected > 0) %>%
ggplot(aes(x = log(Total_Affected))) +
geom_density(fill = "purple", alpha = 0.5) +
labs(title = "Affected Population Density") +
theme_minimal()
### 9. Interactive Maps (Objectives 11, 12)
# Prepare map data
map_data <- disaster_clean %>%
filter(!is.na(Lat) & !is.na(Lon))
# Create interactive disaster map
disaster_map <- leaflet(map_data) %>%
addTiles() %>%
setView(lng = 100, lat = 20, zoom = 3) %>%
addCircleMarkers(
~Lon, ~Lat,
radius = ~log(Total_Deaths + 1),
color = ~ifelse(Disaster_Group == "Natural", "red", "blue"),
fillOpacity = 0.5,
popup = ~paste(
"Location:", Location, "<br>",
"Type:", Disaster_Type, "<br>",
"Deaths:", Total_Deaths
),
clusterOptions = markerClusterOptions()
) %>%
leaflet::addLegend(
colors = c("red", "blue"),
labels = c("Natural", "Technological"),
title = "Disaster Type"
)
disaster_map
# Create heatmap using ggplot2 (since heatmap function might cause issues)
heatmap_data <- disaster_clean %>%
group_by(Subregion, Year) %>%
summarise(Count = n(), .groups = 'drop')
ggplot(heatmap_data %>% filter(Year >= 2010),
aes(x = Year, y = Subregion, fill = Count)) +
geom_tile() +
scale_fill_gradient2(low = "white", mid = "orange", high = "darkred") +
labs(title = "Disaster Intensity Heatmap") +
theme_minimal()
developed <- c("Japan", "South Korea", "Singapore", "Israel", "UAE", "Qatar")
development <- disaster_clean %>%
mutate(Status = ifelse(Country %in% developed, "Developed", "Developing")) %>%
group_by(Status) %>%
summarise(
Disasters = n(),
Deaths = sum(Total_Deaths, na.rm = TRUE),
.groups = 'drop'
)
ggplot(development, aes(x = Status, y = Disasters, fill = Status)) +
geom_bar(stat = "identity") +
labs(title = "Developed vs Developing Nations") +
theme_minimal()
### 11. Decade Analysis (Objective 14)
decade_analysis <- disaster_clean %>%
group_by(Decade) %>%
summarise(
Disasters = n(),
Deaths = sum(Total_Deaths, na.rm = TRUE),
.groups = 'drop'
)
ggplot(decade_analysis, aes(x = Decade, y = Disasters, fill = Decade)) +
geom_bar(stat = "identity") +
labs(title = "Decade-wise Analysis") +
theme_minimal()
### 12. Duration Analysis (Objective 15)
duration <- disaster_clean %>%
filter(!is.na(Duration_Days)) %>%
mutate(
Duration_Category = case_when(
Duration_Days == 0 ~ "Single Day",
Duration_Days <= 7 ~ "Short",
Duration_Days <= 30 ~ "Medium",
TRUE ~ "Long"
)
)
ggplot(duration, aes(x = Duration_Category)) +
geom_bar(fill = "steelblue") +
labs(title = "Disaster Duration Distribution") +
theme_minimal()
# Identify outliers
Q1 <- quantile(disaster_clean$Total_Deaths, 0.25, na.rm = TRUE)
Q3 <- quantile(disaster_clean$Total_Deaths, 0.75, na.rm = TRUE)
IQR <- Q3 - Q1
outliers <- disaster_clean %>%
filter(Total_Deaths > Q3 + 1.5 * IQR) %>%
arrange(desc(Total_Deaths)) %>%
head(10)
print("Top Outliers:")
## [1] "Top Outliers:"
print(outliers[, c("Year", "Country", "Disaster_Type", "Total_Deaths")])
## Year Country Disaster_Type Total_Deaths
## 1 2004 Indonesia Earthquake 165708
## 2 2008 Myanmar Storm 138366
## 3 2008 China Earthquake 87476
## 4 2005 Pakistan Earthquake 73338
## 5 2023 Türkiye Earthquake 53000
## 6 2004 Sri Lanka Earthquake 35399
## 7 2003 Iran (Islamic Republic of) Earthquake 26796
## 8 2001 India Earthquake 20005
## 9 2011 Japan Earthquake 19846
## 10 2004 India Earthquake 16389
missing <- disaster_clean %>%
summarise_all(~sum(is.na(.))) %>%
pivot_longer(everything(), names_to = "Variable", values_to = "Missing") %>%
mutate(Percentage = Missing / nrow(disaster_clean) * 100) %>%
filter(Percentage > 0) %>%
arrange(desc(Percentage))
ggplot(missing %>% head(10),
aes(x = reorder(Variable, Percentage), y = Percentage)) +
geom_bar(stat = "identity", fill = "coral") +
coord_flip() +
labs(title = "Missing Data Patterns") +
theme_minimal()
facet_data <- disaster_clean %>%
filter(Disaster_Type %in% c("Flood", "Storm", "Earthquake", "Drought")) %>%
count(Subregion, Disaster_Type)
ggplot(facet_data, aes(x = Disaster_Type, y = n, fill = Disaster_Type)) +
geom_bar(stat = "identity") +
facet_wrap(~Subregion) +
labs(title = "Disaster Types by Subregion") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
### 16. Dashboard for Top 20 Countries (Objectives 19, 20)
top20 <- disaster_clean %>%
group_by(Country) %>%
summarise(
Disasters = n(),
Deaths = sum(Total_Deaths, na.rm = TRUE),
Affected = sum(Total_Affected, na.rm = TRUE),
Economic_Loss = sum(Total_Damage, na.rm = TRUE),
.groups = 'drop'
) %>%
arrange(desc(Disasters)) %>%
head(20)
# Multi-metric visualization
ggplot(top20 %>% head(10),
aes(x = log(Deaths + 1), y = log(Economic_Loss + 1),
size = Affected/1e6, color = Disasters)) +
geom_point(alpha = 0.7) +
geom_text(aes(label = Country), size = 3, check_overlap = TRUE) +
scale_color_gradient(low = "yellow", high = "red") +
labs(title = "Integrated Impact Analysis",
size = "Affected (M)", color = "Disasters") +
theme_minimal()