Night Light Clustering For Poland

Installing Necessary Libraries or Packages

library(terra); # library for reading GeoTIFF data

## Warning: package 'terra' was built under R version 4.5.2

library(tidyverse); # library utilized for ggplot2 and other functions

## Warning: package 'tidyverse' was built under R version 4.5.2

## Warning: package 'ggplot2' was built under R version 4.5.2

## Warning: package 'tidyr' was built under R version 4.5.2

## Warning: package 'readr' was built under R version 4.5.2

## Warning: package 'purrr' was built under R version 4.5.2

## Warning: package 'dplyr' was built under R version 4.5.2

## Warning: package 'forcats' was built under R version 4.5.2

## Warning: package 'lubridate' was built under R version 4.5.2

library(dbscan);

## Warning: package 'dbscan' was built under R version 4.5.2

# library(sf);
library(ClusterR); # library imported to utilize Optimal_Clusters_KMeans function for silhouette score calculation and also for obtaining elbow point method results

## Warning: package 'ClusterR' was built under R version 4.5.2

#library(clustertend); Depricated and suggestion to utilize hopkins library
library(hopkins); # library imported to utilize hopkins function to check data clusterability

## Warning: package 'hopkins' was built under R version 4.5.2

library(fpc); # library for calinhara function import to calculate Calinski-Harabasz index

## Warning: package 'fpc' was built under R version 4.5.2

# library(factoextra);

Reconstructed Poland Maps Data using NOAA/DMSP-OLS/NIGHTTIME_LIGHTS dataset in Google Earth Engine with all country polygons for Night Light data

Javascript code used to retrieve the dataset and construct maps

The javascript code attached is executed in Google Earth Engine code editor and is utilized for extracting required dataset for 2012 for clustering purpose and other GeoTIFF or tif files show cased in the paper

  // Country Boundaries Loaded
  var countries = ee.FeatureCollection("USDOS/LSIB_SIMPLE/2017");
  
  // Poland Boundary Data filtered
  var poland = countries.filter(ee.Filter.eq("country_na", "Poland"));
  
  // Neighboring Country Boundaries filtered
  var neighbors = countries
  .filterBounds(poland.geometry())
  .filter(ee.Filter.neq("country_na", "Poland"));
  
  // Voivodeships Data filtered using GAUL Level 1 Data Set
  var gaul = ee.FeatureCollection("FAO/GAUL/2015/level1");
  var voivodeships = gaul.filter(ee.Filter.eq("ADM0_NAME", "Poland"));
  
  // Setting up width of Poland and respective voivodeship boundaries
  var NEON_BOUNDARY_VALUE = 200;
  var BOUNDARY_WIDTH_PIXELS = 2.25;
  
  var region_of_interest_rect = ee.Geometry.Rectangle([13.0, 48.0, 25.0, 56.0]);
  
  // Loading DMSP‑OLS night lights data from dataset in Google Earth Engine
  var dmspCollection = ee.ImageCollection("NOAA/DMSP-OLS/NIGHTTIME_LIGHTS")
  .filterBounds(region_of_interest_rect);
  
  // Setting up year values for median night light data extraction
  var year_to_process = 2012;
  var startDate = ee.Date(year_to_process + '-01-01');
  var endDate = ee.Date(year_to_process + '-12-31');
  
  var yearImages = dmspCollection.filterDate(startDate, endDate);
  
  if (yearImages.size().getInfo() === 0) {
    print("No DMSP data for " + year_to_process);
  } else {
    // Taking median of the data for entire year
    var nightLightsComposite = yearImages.median()
      .select("stable_lights")
      .clip(poland.geometry());
  
    // Making neighbouring countries night light value 0
    var neighborsMask = ee.Image().byte().paint(neighbors, 1);
    var nightLightsWithBlackNeighbors = nightLightsComposite.where(neighborsMask.eq(1), 0);
  
    // Increasing contrast for better visualization
    var stretchedLights = nightLightsWithBlackNeighbors
      .unitScale(0, 63)   // normalize 0–63 → 0–1
      .multiply(150);     // stretch to 0–150 for stronger contrast
  
    var emptyCanvas = ee.Image(0).rename("stable_lights").byte();
    
    // Including boundaries and night light value
    var polandOutlineRaster = emptyCanvas.paint({
      featureCollection: poland,
      color: NEON_BOUNDARY_VALUE,
      width: BOUNDARY_WIDTH_PIXELS
    });
  
    var voivodeshipsOutlineRaster = emptyCanvas.paint({
      featureCollection: voivodeships,
      color: NEON_BOUNDARY_VALUE,
      width: BOUNDARY_WIDTH_PIXELS
    });
  
    var finalImageForExport = stretchedLights
      .unmask(0)
      .max(polandOutlineRaster)
      .max(voivodeshipsOutlineRaster);
  
    // Color palette and visualization parameters chosen for night lights data
    var finalVisParams = {
      min: 0,
      max: 200,
      palette: [
        'black','blue','purple','cyan','orange','yellow','red','white',
        '#00FF00'
      ]
    };
  
    // Displaying data in Google Earth Engine maps for cross-verification prior to export
    Map.addLayer(finalImageForExport, finalVisParams,
                 "High‑Contrast Night Lights " + year_to_process);
  
    var centerPoint = poland.geometry().centroid();
    Map.setCenter(centerPoint.coordinates().get(0).getInfo(),
                  centerPoint.coordinates().get(1).getInfo(), 6);
  
    // Exporting map images as prepared in previous steps
    Export.image.toDrive({
      image: finalImageForExport,
      description: 'NightLights_' + year_to_process + '_high_contrast',
      folder: 'EarthEngineExports',
      fileNamePrefix: 'nightlights_' + year_to_process + '_high_contrast',
      region: region_of_interest_rect,
      scale: 500,
      maxPixels: 1e13,
      fileFormat: 'GeoTIFF'
    });
    
    var imageWithCoords = finalImageForExport
      .rename("night_light")
      .addBands(ee.Image.pixelLonLat());
    
    
    var samples = imageWithCoords.sampleRegions({
      collection: voivodeships,
      properties: ['ADM1_NAME'],   // province name according to dataset
      scale: 1000,
      geometries: false
    }).map(function (f) {
      return f.set({
        year: year_to_process,
        province: f.get('ADM1_NAME')
      });
    });
    
    // Exporting Longitude, Latitude, Province Name, Night Light intensity data for clustering
    Export.table.toDrive({
      collection: samples,
      description: 'NightLights_CSV_' + year_to_process,
      folder: 'EarthEngineExports',
      fileNamePrefix: 'nightlights_' + year_to_process + '_pixels',
      fileFormat: 'CSV'
    });
  }

Plotting the datasets in Google Earth Engine itself with map overlay and observing the same to validate an increase in Night Light intensity over a period from year 2000 to 2012

# Dividing screen into 9 frames
# windows(width = 12, height = 10);
par(mfrow = c(3, 3));

for(i in seq(2000, 2012, 2)){
  rel_path = paste0("./2000-2012GeospatialNightLightData/nightlights_", i, ".tif");
  r <- rast(rel_path);
  plot(r, main = paste("Year", i))
}

Importing Dataset extracted for 2012 and observing summary for the same

df <- read_csv("./2000-2012GeospatialNightLightData/nightlights_2012_pixels.csv");

## Rows: 507207 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): system:index, ADM1_NAME, province, .geo
## dbl (4): latitude, longitude, night_light, year
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

str(df);

## spc_tbl_ [507,207 × 8] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ system:index: chr [1:507207] "000000000000000003c0_0" "000000000000000003c0_1" "000000000000000003c0_2" "000000000000000003c0_3" ...
##  $ ADM1_NAME   : chr [1:507207] "Dolnoslaskie" "Dolnoslaskie" "Dolnoslaskie" "Dolnoslaskie" ...
##  $ latitude    : num [1:507207] 50.1 50.1 50.1 50.1 50.1 ...
##  $ longitude   : num [1:507207] 16.7 16.7 16.7 16.7 16.7 ...
##  $ night_light : num [1:507207] 200 200 200 200 200 200 200 200 200 200 ...
##  $ province    : chr [1:507207] "Dolnoslaskie" "Dolnoslaskie" "Dolnoslaskie" "Dolnoslaskie" ...
##  $ year        : num [1:507207] 2012 2012 2012 2012 2012 ...
##  $ .geo        : chr [1:507207] "{\"type\":\"MultiPoint\",\"coordinates\":[]}" "{\"type\":\"MultiPoint\",\"coordinates\":[]}" "{\"type\":\"MultiPoint\",\"coordinates\":[]}" "{\"type\":\"MultiPoint\",\"coordinates\":[]}" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   `system:index` = col_character(),
##   ..   ADM1_NAME = col_character(),
##   ..   latitude = col_double(),
##   ..   longitude = col_double(),
##   ..   night_light = col_double(),
##   ..   province = col_character(),
##   ..   year = col_double(),
##   ..   .geo = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

summary(df$night_light); # data should be in range from 0 to 200

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   19.05   26.19   43.97   47.62  200.00

Data Cleanup

Dropping system:index column | Reason : The same is an ID value added in Google Earth Engine Data Set NOAA/DMSP-OLS/NIGHTTIME_LIGHTS which isn’t of much relevance to us. Dropping ADM1_NAME column | Reason : Redundant data since column value is already added by us in Province column Dropping .geo column | Reason : Empty and common JSON object for all data set entries Dropping year column | Reason : Only value is 2012 since data for 2012 is only extracted in CSV format from Google Earth Engine - Remaining years data is just used for visulization and verification of growth trend.

# print(summary(df$year)) # Only value 2012 thus, year column can also be dropped
df <- df[ , !(names(df) %in% c("system:index", ".geo", "ADM1_NAME", "year"))];
print(head(df, 5));

## # A tibble: 5 × 4
##   latitude longitude night_light province    
##      <dbl>     <dbl>       <dbl> <chr>       
## 1     50.1      16.7         200 Dolnoslaskie
## 2     50.1      16.7         200 Dolnoslaskie
## 3     50.1      16.7         200 Dolnoslaskie
## 4     50.1      16.7         200 Dolnoslaskie
## 5     50.1      16.7         200 Dolnoslaskie

Converting Province column value to factor data type since Poland is bifurcated into only 16 Provinces (or voivodeships)

df$province <- as.factor(df$province)
print(head(df, 5));

## # A tibble: 5 × 4
##   latitude longitude night_light province    
##      <dbl>     <dbl>       <dbl> <fct>       
## 1     50.1      16.7         200 Dolnoslaskie
## 2     50.1      16.7         200 Dolnoslaskie
## 3     50.1      16.7         200 Dolnoslaskie
## 4     50.1      16.7         200 Dolnoslaskie
## 5     50.1      16.7         200 Dolnoslaskie

Log transforming the data to reduce skewness and for scaling. Selecting only columns with numerical values eliminating Province (or voivodeship column) since the information for the same can be reconstructed using longitude and latitude column values.

df_scaled <- scale(select(mutate(df, night_light_log = log1p(night_light)), latitude, longitude, night_light_log));

df_scaled <- as.data.frame(df_scaled); # Converting df_scaled from matrix (result of utilization of scale function) to dataframe

df_scaled$original_longitude <- df$longitude;
df_scaled$original_latitude <- df$latitude;
df_scaled$original_night_light <- df$night_light;
df_scaled$province <- df$province;
# print(head(df_scaled, 5));

Accounting to large size of the dataset, we are sampling 11000 random values from the same for ease of clustering

set.seed(11);
df_sample <- df_scaled[sample(nrow(df_scaled), 11000), ];
# print(head(df_sample, 5));

Determining the ideal number of clusters and also validating the clusterability of the dataset

Deciding on ideal number of clusters using Silhouette Score and Elbow Point method

df_sample_selected_columns <- df_sample[, c("latitude", "longitude", "night_light_log")]

res_elbow_point <- Optimal_Clusters_KMeans(
  df_sample_selected_columns,
  max_clusters = 20,
  criterion = "WCSSE"
);

res_silhouette <- Optimal_Clusters_KMeans(
  df_sample_selected_columns,
  max_clusters = 20,
  criterion = "silhouette"
);

par(mfrow = c(1, 2));

plot(res_elbow_point, main = "Elbow Point");
plot(res_silhouette, main = "Silhouette");

As per silhouette score, 4 appears to be the ideal number of clusters (maximum silhouette score of 0.33 - as per visual comparison), post 4, similar silhouette score value (although lower in magnitude - 0.32) is obtained only at 13 (increased number of cluster groups - thus, the same can be discarded).

Plotting the log normalized data set for better visualization

ggplot(df_sample, aes(original_longitude, original_latitude, color = night_light_log)) + geom_point()

Using Hopkins Statistics to determine the clusterability of dataset (passing df_sample accounting to the large size of data set).

hopkins_statistics <- hopkins(df_sample_selected_columns);
print(hopkins_statistics);

## [1] 0.9982452

Hopkins Statistics value of 0.998 indicates high clusterability value for the nightlight dataset.

Proceeding for actual clustering to select the best algorithm and accordingly interpreting the results of clustering

set.seed(11); # to ensure replicability of results

k <- 4; # chosen as per silhouette score 

kmeans_results <- kmeans(df_sample_selected_columns, centers = k, nstart = 11); # nstart indicates the number of random initializations of cluster centers before actually choosing the best solution thus, setting it to 11 for sake of simplicity and speed. The same is a trade-off between speed and stability, higher the value of nstart more stable the model predictions and lower the speed, vice-versa for other combination.

df_sample$cluster <- factor(kmeans_results$cluster);
print(head(df_sample, 5));

##          latitude  longitude night_light_log original_longitude
## 65570  -0.6758266  0.0208703       0.3791067           19.45302
## 459673  0.4869804 -1.1196877      -0.2857201           16.58739
## 281148  0.3490202  1.2007579       1.6085076           22.41746
## 335756  1.6760654 -0.5225931       0.1758876           18.08758
## 291030  0.6906358  1.5475733       0.2841053           23.28882
##        original_latitude original_night_light      province cluster
## 65570           51.19948             40.47619       Lodzkie       3
## 459673          52.78950             16.66667 Wielkopolskie       4
## 281148          52.60085            200.00000     Podlaskie       1
## 335756          54.41545             30.95238     Pomorskie       4
## 291030          53.06798             35.71429     Podlaskie       1

Plotting K-means algorithm clustering results for visualization and comparison of results

ggplot(df_sample, aes(original_longitude, original_latitude)) + geom_point(aes(color = original_night_light, shape = cluster));

Drawing conclusions from results obtained by K-means clustering

Calculating Mean Night Light intensity per cluster to draw conclusions and also intensity per province according to clusters for more details.

aggregate(original_night_light ~ cluster, data = df_sample, mean);

##   cluster original_night_light
## 1       1             38.00939
## 2       2              0.00000
## 3       3             61.51085
## 4       4             45.34555

aggregate(original_night_light ~ province + cluster, data = df_sample, mean);

##               province cluster original_night_light
## 1   Kujawsko-Pomorskie       1             39.77933
## 2              Lodzkie       1             29.49309
## 3            Lubeiskie       1             34.23859
## 4          Mazowieckie       1             43.07585
## 5            Podlaskie       1             34.46574
## 6            Pomorskie       1             48.62637
## 7  Warminsko-Mazurskie       1             34.11403
## 8         Dolnoslaskie       2              0.00000
## 9   Kujawsko-Pomorskie       2              0.00000
## 10             Lodzkie       2              0.00000
## 11           Lubeiskie       2              0.00000
## 12            Lubuskie       2              0.00000
## 13          Malopolske       2              0.00000
## 14         Mazowieckie       2              0.00000
## 15            Opolskie       2              0.00000
## 16        Podkarpackie       2              0.00000
## 17           Podlaskie       2              0.00000
## 18           Pomorskie       2              0.00000
## 19             Slaskie       2              0.00000
## 20      Swietokrzyskie       2              0.00000
## 21 Warminsko-Mazurskie       2              0.00000
## 22       Wielkopolskie       2              0.00000
## 23 Zachodnio-Pomorskie       2              0.00000
## 24        Dolnoslaskie       3             74.14369
## 25             Lodzkie       3             60.44767
## 26           Lubeiskie       3             48.35476
## 27          Malopolske       3             66.14494
## 28         Mazowieckie       3             64.89796
## 29            Opolskie       3             59.53953
## 30        Podkarpackie       3             56.69203
## 31             Slaskie       3             83.04020
## 32      Swietokrzyskie       3             55.21570
## 33       Wielkopolskie       3            154.08163
## 34        Dolnoslaskie       4             49.25080
## 35  Kujawsko-Pomorskie       4             44.09452
## 36             Lodzkie       4             45.10148
## 37            Lubuskie       4             45.94719
## 38         Mazowieckie       4            200.00000
## 39            Opolskie       4             21.42857
## 40           Pomorskie       4             42.57589
## 41       Wielkopolskie       4             46.58170
## 42 Zachodnio-Pomorskie       4             42.19866

Cluster	Interpretation
Cluster 1	Semi‑urban or mixed‑development rural areas with moderate night‑light intensity.
Cluster 2	Uninhabited or sparsely populated regions such as forests, farms, lakes, and other natural landscapes.
Cluster 3	Extremely bright zones with continuous activity, including commercial districts and major industrial belts.
Cluster 4	Urban residential and metropolitan areas with high illumination, including major cities and surrounding suburbs.

Province	Interpretation
Dolnośląskie	Bright urban + high‑intensity zones (Wrocław) mixed with dark rural areas.
Kujawsko‑Pomorskie	Moderate brightness with pockets of high‑intensity urban areas (Bydgoszcz, Toruń).
Łódzkie	Full spectrum from dark rural to high‑intensity urban core (Łódź).
Lubelskie	Moderate to bright regions, fewer extreme peaks; centered around Lublin.
Lubuskie	Mostly dark rural areas with moderately bright cities (Zielona Góra, Gorzów).
Małopolskie	Bright urban regions (Kraków) with surrounding rural areas.
Mazowieckie	Contains Poland’s brightest region (200) — Warsaw; strong mix of all cluster types.
Opolskie	Mostly rural with bright urban pockets around Opole.
Podkarpackie	Bright regional center (Rzeszów) but no extreme metropolitan peaks.
Podlaskie	Moderate brightness with extensive dark forested areas.
Pomorskie	Moderate to high brightness; Tri‑City drives the intensity.
Śląskie	One of the brightest industrial regions (Katowice–Gliwice belt).
Świętokrzyskie	Moderate brightness centered around Kielce; no extreme peaks.
Warmińsko‑Mazurskie	Low population density; moderate brightness with large dark lake regions.
Wielkopolskie	Very high brightness (154 in Cluster 3) due to Poznań and industrial corridors.
Zachodniopomorskie	Dark rural areas with bright coastal and urban centers (Szczecin).

Calculating Calinski-Harabasz index

Calinski-Harabasz index is used to calculate how well clustering results obtained actually classify the data. It is a ration of dispersion between the different cluster points and dispersion within the cluster points.

ch_score <- calinhara(df_sample_selected_columns, kmeans_results$cluster);
print(ch_score);

## [1] 5997.872

The score obtained is high indicating ideal clustering results as a second reference, comparing the KMeans Clustering results with DBSCAN Clustering results.

DBSCAN clustering

eps_value <- 0.15; # radius of neighbourhood around each point. If another point lies within the radius value then is considered a neighbour. Smaller epsilon value indicates that only close points become neighbours while if epsilon value is large then, the cluster size grows quickly and may merge.
minPts_value <- 11; # the same indicates the minimum number of points required to form a cluster center. If a point has at least minimum number of neighbours then, the same becomes a cluster center and subsequently can influence other points to form a cluster. Points with lesser than minimum point value of neighbours account as border points or noise.
dbscan_results <- dbscan::dbscan(df_sample_selected_columns, eps = eps_value, minPts = minPts_value); 

# Adding cluster labels to your original df
df_sample$dbscan_cluster <- as.factor(dbscan_results$cluster);

print(head(df_sample, 5));

##          latitude  longitude night_light_log original_longitude
## 65570  -0.6758266  0.0208703       0.3791067           19.45302
## 459673  0.4869804 -1.1196877      -0.2857201           16.58739
## 281148  0.3490202  1.2007579       1.6085076           22.41746
## 335756  1.6760654 -0.5225931       0.1758876           18.08758
## 291030  0.6906358  1.5475733       0.2841053           23.28882
##        original_latitude original_night_light      province cluster
## 65570           51.19948             40.47619       Lodzkie       3
## 459673          52.78950             16.66667 Wielkopolskie       4
## 281148          52.60085            200.00000     Podlaskie       1
## 335756          54.41545             30.95238     Pomorskie       4
## 291030          53.06798             35.71429     Podlaskie       1
##        dbscan_cluster
## 65570               1
## 459673              1
## 281148              0
## 335756              1
## 291030              0