Student 1: Shivam Kumar (Reg. No.: 12324700) Student 2: Siddharth Shukla (Reg. No.: 12302293)
This project explores how rainfall and related environmental factors contribute to urban flood risk. Using R, we analyze relationships between rainfall intensity, elevation, drainage density, and flood occurrence to understand whether rainfall plays a significant role in triggering urban flooding.
The very first step towards our analysis would be loading up the data set.
data_raw <- read.csv("C:\\Users\\Shivam\\Downloads\\urban_pluvial_flood_risk_dataset.csv", stringsAsFactors = FALSE, na.strings = c("", "NA", "NaN"))
This step fixes inconsistent column names, converts text numbers to numeric types, fills missing values safely, and creates a simple target (flood_binary) for classification. Clean, scaled data reduces errors and makes statistical models more reliable.
names(data_raw) <- names(data_raw) %>% tolower() %>% str_replace_all("\\s+", "_")
to_num <- c("historical_rainfall_intensity_mm_hr","elevation_m","drainage_density_km_per_km2",
"storm_drain_proximity_m","return_period_years")
for (col in to_num) {
if (col %in% names(data_raw)) {
data_raw[[col]] <- as.numeric(as.character(data_raw[[col]]))
}
}
is_num <- sapply(data_raw, is.numeric)
num_cols <- names(data_raw)[is_num]
cat_cols <- names(data_raw)[!is_num]
cat("Numeric columns:\n"); print(num_cols)
## Numeric columns:
## [1] "latitude" "longitude"
## [3] "elevation_m" "drainage_density_km_per_km2"
## [5] "storm_drain_proximity_m" "historical_rainfall_intensity_mm_hr"
## [7] "return_period_years"
cat("Categorical columns:\n"); print(cat_cols)
## Categorical columns:
## [1] "segment_id" "city_name" "admin_ward" "catchment_id"
## [5] "dem_source" "land_use" "soil_group" "storm_drain_type"
## [9] "rainfall_source" "risk_labels"
# Remove rows with missing values
data <- data_raw
data <- na.omit(data)
# Confirm no missing values remain
sum(is.na(data))
## [1] 0
if ("risk_labels" %in% names(data)) {
data$risk_labels <- as.character(data$risk_labels)
data$flood_binary <- ifelse(str_detect(tolower(data$risk_labels), "extreme|ponding|low_lying|hotspot|high"), "High", "Low")
}
data$flood_binary <- as.factor(data$flood_binary)
data_num_scaled <- data %>% select(all_of(num_cols)) %>% scale() %>% as.data.frame()
colnames(data_num_scaled) <- num_cols
# Show cleaned data summary
glimpse(data)
## Rows: 1,821
## Columns: 18
## $ segment_id <chr> "SEG-00003", "SEG-00004", "SEG-000…
## $ city_name <chr> "Ahmedabad, India", "Hong Kong, Ch…
## $ admin_ward <chr> "Sector 12", "Sector 14", "Sector …
## $ latitude <dbl> 23.019473, 22.302602, -29.887602, …
## $ longitude <dbl> 72.63858, 114.07867, 30.91101, 100…
## $ catchment_id <chr> "CAT-023", "CAT-168", "CAT-171", "…
## $ elevation_m <dbl> 30.88, 24.28, 35.70, 15.36, 15.80,…
## $ dem_source <chr> "SRTM_3arc", "SRTM_3arc", "SRTM_3a…
## $ land_use <chr> "Industrial", "Residential", "Indu…
## $ soil_group <chr> "B", "B", "C", "C", "A", "C", "B",…
## $ drainage_density_km_per_km2 <dbl> 11.00, 7.32, 4.50, 8.97, 8.25, 5.8…
## $ storm_drain_proximity_m <dbl> 152.5, 37.0, 292.4, 30.0, 43.0, 31…
## $ storm_drain_type <chr> "OpenChannel", "Manhole", "OpenCha…
## $ rainfall_source <chr> "IMD", "ERA5", "ERA5", "LocalGauge…
## $ historical_rainfall_intensity_mm_hr <dbl> 16.3, 77.0, 20.8, 120.5, 39.3, 74.…
## $ return_period_years <dbl> 5, 10, 5, 50, 10, 10, 25, 5, 10, 5…
## $ risk_labels <chr> "monitor", "monitor", "monitor", "…
## $ flood_binary <fct> Low, Low, Low, High, Low, Low, Low…
summary(data[num_cols])
## latitude longitude elevation_m
## Min. :-36.999 Min. :-123.2930 Min. : -3.00
## 1st Qu.: 6.513 1st Qu.: -0.1862 1st Qu.: 9.06
## Median : 23.800 Median : 46.5207 Median : 26.17
## Mean : 19.188 Mean : 33.3082 Mean : 38.13
## 3rd Qu.: 37.915 3rd Qu.: 101.7380 3rd Qu.: 59.71
## Max. : 55.821 Max. : 174.9113 Max. :266.70
## drainage_density_km_per_km2 storm_drain_proximity_m
## Min. : 1.370 Min. : 0.2
## 1st Qu.: 4.680 1st Qu.: 48.4
## Median : 6.280 Median : 90.5
## Mean : 6.275 Mean :122.9
## 3rd Qu.: 7.830 3rd Qu.:159.4
## Max. :12.070 Max. :688.8
## historical_rainfall_intensity_mm_hr return_period_years
## Min. : 5.4 Min. : 2.00
## 1st Qu.: 25.4 1st Qu.: 5.00
## Median : 37.4 Median : 10.00
## Mean : 43.4 Mean : 18.87
## 3rd Qu.: 54.9 3rd Qu.: 25.00
## Max. :150.0 Max. :100.00
table(data$flood_binary)
##
## High Low
## 528 1293
Visual exploration helps identify patterns and relationships among variables.
rain_col <- "historical_rainfall_intensity_mm_hr"
hist(data[[rain_col]], breaks = 25, col = "skyblue",
main = "Histogram of Historical Rainfall Intensity",
xlab = "Rainfall Intensity (mm per hr)")
ggplot(data, aes(x = flood_binary, y = .data[[rain_col]], fill = flood_binary)) +
geom_boxplot() +
labs(title = "Rainfall distribution by Flood Binary", x = "Flood Binary", y = "Rainfall Intensity (mm per hr)")+
theme_minimal()
drainage_col <- "drainage_density_km_per_km2"
ggplot(data, aes(x = .data[[rain_col]], y = .data[[drainage_col]])) +
geom_point(alpha = 0.7) +
geom_smooth(method = "lm", se = TRUE, color = "red") +
labs(title = paste(rain_col, "vs", drainage_col), x = rain_col, y = drainage_col)
## `geom_smooth()` using formula = 'y ~ x'
if (length(num_cols) >= 2) {
M <- cor(data[num_cols])
corrplot(M, method = "color", tl.cex = 0.8)
} else {
plot.new(); text(0.5, 0.5, "Need at least 2 numeric columns for correlation heatmap.")
}
data_elev_drain <- data_num_scaled %>%
select(elevation_m, drainage_density_km_per_km2)
set.seed(123)
km <- kmeans(data_elev_drain, centers = 4, nstart = 25)
plot_df <- data_elev_drain %>%
mutate(cluster = factor(km$cluster))
ggplot(plot_df, aes(x = elevation_m, y = drainage_density_km_per_km2, color = cluster)) +
geom_point(size = 3) +
labs(
title = "K-means Clusters: Elevation vs Drainage Density",
x = "Elevation (m)",
y = "Drainage Density (km/km²)",
color = "Cluster"
) +
theme_minimal() +
scale_color_brewer(palette = "Set2")
## Insights
Rainfall Histogram: Most rainfall events are moderate, with rare extreme high intensities.
Rainfall by Flood Binary: High flood risk areas tend to have higher median rainfall than low-risk areas.
Rainfall vs Drainage Density Scatter: Drainage density moderately influences how rainfall contributes to flooding.
Correlation Heatmap: Elevation, drainage, and rainfall show patterns that help identify key flood predictors.
K-means Clusters (Elevation vs Drainage): Clustering reveals distinct flood risk groups based on elevation and drainage combinations.
Modeling and analysis is the process of using data to uncover patterns, make predictions, and derive meaningful insights that support informed decision-making.
lm_model <- lm(return_period_years ~ historical_rainfall_intensity_mm_hr, data = data)
summary(lm_model)
##
## Call:
## lm(formula = return_period_years ~ historical_rainfall_intensity_mm_hr,
## data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -38.040 -13.428 -9.441 6.352 87.995
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.03547 1.09362 8.262 2.73e-16 ***
## historical_rainfall_intensity_mm_hr 0.22670 0.02174 10.427 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 23.6 on 1819 degrees of freedom
## Multiple R-squared: 0.0564, Adjusted R-squared: 0.05588
## F-statistic: 108.7 on 1 and 1819 DF, p-value: < 2.2e-16
data$pred_lm <- predict(lm_model)
# Plot
ggplot(data, aes(x = historical_rainfall_intensity_mm_hr, y = return_period_years)) +
geom_point(color = "blue", alpha = 0.5) + # actual data points
geom_line(aes(y = pred_lm), color = "red", linewidth = 1) + # regression line
labs(
title = "Linear Regression: Return Period vs Rainfall Intensity",
x = "Historical Rainfall Intensity (mm/hr)",
y = "Return Period (years)"
) +
theme_minimal()
glm_model <- glm(flood_binary~
historical_rainfall_intensity_mm_hr,
data = data, family = binomial)
print(summary(glm_model))
##
## Call:
## glm(formula = flood_binary ~ historical_rainfall_intensity_mm_hr,
## family = binomial, data = data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.486507 0.125925 19.75 <2e-16 ***
## historical_rainfall_intensity_mm_hr -0.034551 0.002403 -14.38 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2192.9 on 1820 degrees of freedom
## Residual deviance: 1930.7 on 1819 degrees of freedom
## AIC: 1934.7
##
## Number of Fisher Scoring iterations: 4
cat("\nOdds ratios:\n"); print(exp(coef(glm_model)))
##
## Odds ratios:
## (Intercept) historical_rainfall_intensity_mm_hr
## 12.0192182 0.9660389
data$pred_prob <- predict(glm_model, type = "response")
# Plot
ggplot(data, aes(x = historical_rainfall_intensity_mm_hr, y = flood_binary)) +
geom_jitter(height = 0.05, alpha = 0.4, color = "blue") +
geom_line(aes(y = pred_prob), color = "red", size = 1) +
labs(
title = "Logistic Regression: Probability of Flood vs Rainfall Intensity",
x = "Historical Rainfall Intensity (mm/hr)",
y = "Flood Occurrence (0 = No, 1 = Yes)"
) +
theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
aov_elev <- aov(elevation_m ~ flood_binary, data = data)
summary(aov_elev)
## Df Sum Sq Mean Sq F value Pr(>F)
## flood_binary 1 732199 732199 657.1 <2e-16 ***
## Residuals 1819 2026737 1114
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
data$pred_elev <- predict(aov_elev)
# Plot
ggplot(data, aes(x = factor(flood_binary), y = elevation_m, fill = factor(flood_binary))) +
geom_boxplot() +
geom_point(aes(y = pred_elev), color = "red", size = 2, position = position_jitter(width = 0.1)) +
labs(
title = "ANOVA: Elevation vs Flood Binary",
x = "Flood (0 = No, 1 = Yes)",
y = "Elevation (m)",
fill = "Flood"
) +
theme_minimal()
data$cluster <- factor(km$cluster)
ggplot(data, aes(x = elevation_m, y = drainage_density_km_per_km2, color = cluster)) +
geom_point(size = 3, alpha = 0.7) +
labs(
title = "K-Means Clusters: Elevation vs Drainage Density",
x = "Elevation (m)",
y = "Drainage Density (km/km²)",
color = "Cluster"
) +
theme_minimal() +
scale_color_brewer(palette = "Set2")
df_knn <- data %>% select(all_of(num_cols), flood_binary)
df_knn$flood_binary <- as.factor(df_knn$flood_binary)
# Train-test split
set.seed(123)
idx <- createDataPartition(df_knn$flood_binary, p = 0.7, list = FALSE)
train <- df_knn[idx, ]
test <- df_knn[-idx, ]
# Ensure factors in train and test
train$flood_binary <- as.factor(train$flood_binary)
test$flood_binary <- as.factor(test$flood_binary)
# Scale features
sc <- preProcess(train %>% select(all_of(num_cols)), method = c("center", "scale"))
trainX <- predict(sc, train %>% select(all_of(num_cols))) %>% as.matrix()
testX <- predict(sc, test %>% select(all_of(num_cols))) %>% as.matrix()
# Set k safely
k <- min(5, min(table(train$flood_binary)))
# KNN prediction
knn_pred <- class::knn(train = trainX, test = testX, cl = train$flood_binary, k = k)
# Confusion matrix
cat("KNN confusion matrix:\n")
## KNN confusion matrix:
print(table(Predicted = knn_pred, Actual = test$flood_binary))
## Actual
## Predicted High Low
## High 104 28
## Low 54 359
In this analysis, we explored the factors influencing flooding using both descriptive and predictive approaches. We applied K-means clustering to identify patterns in elevation and drainage density, performed linear and logistic regression to quantify the relationship between rainfall intensity and flood occurrence, and implemented a K-Nearest Neighbors (KNN) model to classify areas as high or low flood risk. The models and visualizations provide insights into flood-prone areas, helping in risk assessment and informed decision-making for flood management