1 Introduction

This project explores how rainfall and related environmental factors contribute to urban flood risk. Using R, we analyze relationships between rainfall intensity, elevation, drainage density, and flood occurrence to understand whether rainfall plays a significant role in triggering urban flooding.

2 Load data

The very first step towards our analysis would be loading up the data set.

data_raw <- read.csv("C:\\Users\\Shivam\\Downloads\\urban_pluvial_flood_risk_dataset.csv", stringsAsFactors = FALSE, na.strings = c("", "NA", "NaN"))

3 Data cleaning & preprocessing

This step fixes inconsistent column names, converts text numbers to numeric types, fills missing values safely, and creates a simple target (flood_binary) for classification. Clean, scaled data reduces errors and makes statistical models more reliable.

3.1 Standardize column names (lowercase & replace spaces)

names(data_raw) <- names(data_raw) %>% tolower() %>% str_replace_all("\\s+", "_")

3.2 Convert non-numeric columns to numeric

to_num <- c("historical_rainfall_intensity_mm_hr","elevation_m","drainage_density_km_per_km2",
"storm_drain_proximity_m","return_period_years")
for (col in to_num) {
if (col %in% names(data_raw)) {
data_raw[[col]] <- as.numeric(as.character(data_raw[[col]]))
}
}

3.3 Identify numeric and categorical columns

is_num <- sapply(data_raw, is.numeric)
num_cols <- names(data_raw)[is_num]
cat_cols <- names(data_raw)[!is_num]

cat("Numeric columns:\n"); print(num_cols)
## Numeric columns:
## [1] "latitude"                            "longitude"                          
## [3] "elevation_m"                         "drainage_density_km_per_km2"        
## [5] "storm_drain_proximity_m"             "historical_rainfall_intensity_mm_hr"
## [7] "return_period_years"
cat("Categorical columns:\n"); print(cat_cols)
## Categorical columns:
##  [1] "segment_id"       "city_name"        "admin_ward"       "catchment_id"    
##  [5] "dem_source"       "land_use"         "soil_group"       "storm_drain_type"
##  [9] "rainfall_source"  "risk_labels"

3.4 Handling the missing values simply

# Remove rows with missing values
data <- data_raw
data <- na.omit(data)

# Confirm no missing values remain
sum(is.na(data))
## [1] 0

3.5 Create a binary flood target (flood_binary)

if ("risk_labels" %in% names(data)) {

data$risk_labels <- as.character(data$risk_labels)
data$flood_binary <- ifelse(str_detect(tolower(data$risk_labels), "extreme|ponding|low_lying|hotspot|high"), "High", "Low")
}

data$flood_binary <- as.factor(data$flood_binary)

3.6 Normalized numeric dataframe for models that benefit from scaling

data_num_scaled <- data %>% select(all_of(num_cols)) %>% scale() %>% as.data.frame()
colnames(data_num_scaled) <- num_cols

# Show cleaned data summary

glimpse(data)
## Rows: 1,821
## Columns: 18
## $ segment_id                          <chr> "SEG-00003", "SEG-00004", "SEG-000…
## $ city_name                           <chr> "Ahmedabad, India", "Hong Kong, Ch…
## $ admin_ward                          <chr> "Sector 12", "Sector 14", "Sector …
## $ latitude                            <dbl> 23.019473, 22.302602, -29.887602, …
## $ longitude                           <dbl> 72.63858, 114.07867, 30.91101, 100…
## $ catchment_id                        <chr> "CAT-023", "CAT-168", "CAT-171", "…
## $ elevation_m                         <dbl> 30.88, 24.28, 35.70, 15.36, 15.80,…
## $ dem_source                          <chr> "SRTM_3arc", "SRTM_3arc", "SRTM_3a…
## $ land_use                            <chr> "Industrial", "Residential", "Indu…
## $ soil_group                          <chr> "B", "B", "C", "C", "A", "C", "B",…
## $ drainage_density_km_per_km2         <dbl> 11.00, 7.32, 4.50, 8.97, 8.25, 5.8…
## $ storm_drain_proximity_m             <dbl> 152.5, 37.0, 292.4, 30.0, 43.0, 31…
## $ storm_drain_type                    <chr> "OpenChannel", "Manhole", "OpenCha…
## $ rainfall_source                     <chr> "IMD", "ERA5", "ERA5", "LocalGauge…
## $ historical_rainfall_intensity_mm_hr <dbl> 16.3, 77.0, 20.8, 120.5, 39.3, 74.…
## $ return_period_years                 <dbl> 5, 10, 5, 50, 10, 10, 25, 5, 10, 5…
## $ risk_labels                         <chr> "monitor", "monitor", "monitor", "…
## $ flood_binary                        <fct> Low, Low, Low, High, Low, Low, Low…
summary(data[num_cols])
##     latitude         longitude          elevation_m    
##  Min.   :-36.999   Min.   :-123.2930   Min.   : -3.00  
##  1st Qu.:  6.513   1st Qu.:  -0.1862   1st Qu.:  9.06  
##  Median : 23.800   Median :  46.5207   Median : 26.17  
##  Mean   : 19.188   Mean   :  33.3082   Mean   : 38.13  
##  3rd Qu.: 37.915   3rd Qu.: 101.7380   3rd Qu.: 59.71  
##  Max.   : 55.821   Max.   : 174.9113   Max.   :266.70  
##  drainage_density_km_per_km2 storm_drain_proximity_m
##  Min.   : 1.370              Min.   :  0.2          
##  1st Qu.: 4.680              1st Qu.: 48.4          
##  Median : 6.280              Median : 90.5          
##  Mean   : 6.275              Mean   :122.9          
##  3rd Qu.: 7.830              3rd Qu.:159.4          
##  Max.   :12.070              Max.   :688.8          
##  historical_rainfall_intensity_mm_hr return_period_years
##  Min.   :  5.4                       Min.   :  2.00     
##  1st Qu.: 25.4                       1st Qu.:  5.00     
##  Median : 37.4                       Median : 10.00     
##  Mean   : 43.4                       Mean   : 18.87     
##  3rd Qu.: 54.9                       3rd Qu.: 25.00     
##  Max.   :150.0                       Max.   :100.00
table(data$flood_binary)
## 
## High  Low 
##  528 1293

4 Visualizations

Visual exploration helps identify patterns and relationships among variables.

4.1 Distribution of Historical Rainfall Intensity

rain_col <- "historical_rainfall_intensity_mm_hr"

hist(data[[rain_col]], breaks = 25, col = "skyblue",
main = "Histogram of Historical Rainfall Intensity",
xlab = "Rainfall Intensity (mm per hr)")

4.2 Boxplot of rainfall by flood_binary

ggplot(data, aes(x = flood_binary, y = .data[[rain_col]], fill = flood_binary)) +
geom_boxplot() +
labs(title = "Rainfall distribution by Flood Binary", x = "Flood Binary", y = "Rainfall Intensity (mm per hr)")+
  theme_minimal()

4.3 Scatter: rainfall vs drainage_density

drainage_col <- "drainage_density_km_per_km2"

ggplot(data, aes(x = .data[[rain_col]], y = .data[[drainage_col]])) +
geom_point(alpha = 0.7) +
geom_smooth(method = "lm", se = TRUE, color = "red") +
labs(title = paste(rain_col, "vs", drainage_col), x = rain_col, y = drainage_col)
## `geom_smooth()` using formula = 'y ~ x'

4.4 Correlation heatmap of numeric features

if (length(num_cols) >= 2) {
M <- cor(data[num_cols])
corrplot(M, method = "color", tl.cex = 0.8)
} else {
plot.new(); text(0.5, 0.5, "Need at least 2 numeric columns for correlation heatmap.")
}

4.5 K-means cluster plot

data_elev_drain <- data_num_scaled %>% 
  select(elevation_m, drainage_density_km_per_km2)

set.seed(123)


km <- kmeans(data_elev_drain, centers = 4, nstart = 25)

plot_df <- data_elev_drain %>% 
  mutate(cluster = factor(km$cluster))

ggplot(plot_df, aes(x = elevation_m, y = drainage_density_km_per_km2, color = cluster)) +
  geom_point(size = 3) +
  labs(
    title = "K-means Clusters: Elevation vs Drainage Density",
    x = "Elevation (m)",
    y = "Drainage Density (km/km²)",
    color = "Cluster"
  ) +
  theme_minimal() +
  scale_color_brewer(palette = "Set2")

## Insights

Rainfall Histogram: Most rainfall events are moderate, with rare extreme high intensities.

Rainfall by Flood Binary: High flood risk areas tend to have higher median rainfall than low-risk areas.

Rainfall vs Drainage Density Scatter: Drainage density moderately influences how rainfall contributes to flooding.

Correlation Heatmap: Elevation, drainage, and rainfall show patterns that help identify key flood predictors.

K-means Clusters (Elevation vs Drainage): Clustering reveals distinct flood risk groups based on elevation and drainage combinations.

5 Modeling & Analysis

Modeling and analysis is the process of using data to uncover patterns, make predictions, and derive meaningful insights that support informed decision-making.

5.1 Simple Linear Regression

lm_model <- lm(return_period_years ~ historical_rainfall_intensity_mm_hr, data = data)
summary(lm_model)
## 
## Call:
## lm(formula = return_period_years ~ historical_rainfall_intensity_mm_hr, 
##     data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -38.040 -13.428  -9.441   6.352  87.995 
## 
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                          9.03547    1.09362   8.262 2.73e-16 ***
## historical_rainfall_intensity_mm_hr  0.22670    0.02174  10.427  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 23.6 on 1819 degrees of freedom
## Multiple R-squared:  0.0564, Adjusted R-squared:  0.05588 
## F-statistic: 108.7 on 1 and 1819 DF,  p-value: < 2.2e-16
data$pred_lm <- predict(lm_model)

# Plot

ggplot(data, aes(x = historical_rainfall_intensity_mm_hr, y = return_period_years)) +
  geom_point(color = "blue", alpha = 0.5) +       # actual data points
  geom_line(aes(y = pred_lm), color = "red", linewidth = 1) +  # regression line
  labs(
    title = "Linear Regression: Return Period vs Rainfall Intensity",
    x = "Historical Rainfall Intensity (mm/hr)",
    y = "Return Period (years)"
  ) +
  theme_minimal()

5.2 Generalized Linear Model (Logistic) predicting flood_binary

glm_model <- glm(flood_binary~
                  historical_rainfall_intensity_mm_hr,
                 data = data, family = binomial)

print(summary(glm_model))
## 
## Call:
## glm(formula = flood_binary ~ historical_rainfall_intensity_mm_hr, 
##     family = binomial, data = data)
## 
## Coefficients:
##                                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                          2.486507   0.125925   19.75   <2e-16 ***
## historical_rainfall_intensity_mm_hr -0.034551   0.002403  -14.38   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2192.9  on 1820  degrees of freedom
## Residual deviance: 1930.7  on 1819  degrees of freedom
## AIC: 1934.7
## 
## Number of Fisher Scoring iterations: 4
cat("\nOdds ratios:\n"); print(exp(coef(glm_model)))
## 
## Odds ratios:
##                         (Intercept) historical_rainfall_intensity_mm_hr 
##                          12.0192182                           0.9660389
data$pred_prob <- predict(glm_model, type = "response")

# Plot
ggplot(data, aes(x = historical_rainfall_intensity_mm_hr, y = flood_binary)) +
  geom_jitter(height = 0.05, alpha = 0.4, color = "blue") +
  geom_line(aes(y = pred_prob), color = "red", size = 1) +
  labs(
    title = "Logistic Regression: Probability of Flood vs Rainfall Intensity",
    x = "Historical Rainfall Intensity (mm/hr)",
    y = "Flood Occurrence (0 = No, 1 = Yes)"
  ) +
  theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

5.3 ANOVA

aov_elev <- aov(elevation_m ~ flood_binary, data = data)
summary(aov_elev)
##                Df  Sum Sq Mean Sq F value Pr(>F)    
## flood_binary    1  732199  732199   657.1 <2e-16 ***
## Residuals    1819 2026737    1114                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
data$pred_elev <- predict(aov_elev)

# Plot
ggplot(data, aes(x = factor(flood_binary), y = elevation_m, fill = factor(flood_binary))) +
  geom_boxplot() +
  geom_point(aes(y = pred_elev), color = "red", size = 2, position = position_jitter(width = 0.1)) +
  labs(
    title = "ANOVA: Elevation vs Flood Binary",
    x = "Flood (0 = No, 1 = Yes)",
    y = "Elevation (m)",
    fill = "Flood"
  ) +
  theme_minimal()

5.4 K-Means summary

data$cluster <- factor(km$cluster)


ggplot(data, aes(x = elevation_m, y = drainage_density_km_per_km2, color = cluster)) +
  geom_point(size = 3, alpha = 0.7) +
  labs(
    title = "K-Means Clusters: Elevation vs Drainage Density",
    x = "Elevation (m)",
    y = "Drainage Density (km/km²)",
    color = "Cluster"
  ) +
  theme_minimal() +
  scale_color_brewer(palette = "Set2")

5.5 KNN classification

df_knn <- data %>% select(all_of(num_cols), flood_binary)

df_knn$flood_binary <- as.factor(df_knn$flood_binary)

# Train-test split
set.seed(123)
idx <- createDataPartition(df_knn$flood_binary, p = 0.7, list = FALSE)
train <- df_knn[idx, ]
test  <- df_knn[-idx, ]

# Ensure factors in train and test
train$flood_binary <- as.factor(train$flood_binary)
test$flood_binary  <- as.factor(test$flood_binary)

# Scale features
sc <- preProcess(train %>% select(all_of(num_cols)), method = c("center", "scale"))
trainX <- predict(sc, train %>% select(all_of(num_cols))) %>% as.matrix()
testX  <- predict(sc, test %>% select(all_of(num_cols))) %>% as.matrix()

# Set k safely
k <- min(5, min(table(train$flood_binary)))

# KNN prediction
knn_pred <- class::knn(train = trainX, test = testX, cl = train$flood_binary, k = k)

# Confusion matrix
cat("KNN confusion matrix:\n")
## KNN confusion matrix:
print(table(Predicted = knn_pred, Actual = test$flood_binary))
##          Actual
## Predicted High Low
##      High  104  28
##      Low    54 359

6 Conclusion

In this analysis, we explored the factors influencing flooding using both descriptive and predictive approaches. We applied K-means clustering to identify patterns in elevation and drainage density, performed linear and logistic regression to quantify the relationship between rainfall intensity and flood occurrence, and implemented a K-Nearest Neighbors (KNN) model to classify areas as high or low flood risk. The models and visualizations provide insights into flood-prone areas, helping in risk assessment and informed decision-making for flood management