library(terra); # library for reading GeoTIFF data
## Warning: package 'terra' was built under R version 4.5.2
library(tidyverse); # library utilized for ggplot2 and other functions
## Warning: package 'tidyverse' was built under R version 4.5.2
## Warning: package 'ggplot2' was built under R version 4.5.2
## Warning: package 'tidyr' was built under R version 4.5.2
## Warning: package 'readr' was built under R version 4.5.2
## Warning: package 'purrr' was built under R version 4.5.2
## Warning: package 'dplyr' was built under R version 4.5.2
## Warning: package 'forcats' was built under R version 4.5.2
## Warning: package 'lubridate' was built under R version 4.5.2
library(dbscan);
## Warning: package 'dbscan' was built under R version 4.5.2
# library(sf);
library(ClusterR); # library imported to utilize Optimal_Clusters_KMeans function for silhouette score calculation and also for obtaining elbow point method results
## Warning: package 'ClusterR' was built under R version 4.5.2
#library(clustertend); Depricated and suggestion to utilize hopkins library
library(hopkins); # library imported to utilize hopkins function to check data clusterability
## Warning: package 'hopkins' was built under R version 4.5.2
library(fpc); # library for calinhara function import to calculate Calinski-Harabasz index
## Warning: package 'fpc' was built under R version 4.5.2
# library(factoextra);
The javascript code attached is executed in Google Earth Engine code editor and is utilized for extracting required dataset for 2012 for clustering purpose and other GeoTIFF or tif files show cased in the paper
// Country Boundaries Loaded
var countries = ee.FeatureCollection("USDOS/LSIB_SIMPLE/2017");
// Poland Boundary Data filtered
var poland = countries.filter(ee.Filter.eq("country_na", "Poland"));
// Neighboring Country Boundaries filtered
var neighbors = countries
.filterBounds(poland.geometry())
.filter(ee.Filter.neq("country_na", "Poland"));
// Voivodeships Data filtered using GAUL Level 1 Data Set
var gaul = ee.FeatureCollection("FAO/GAUL/2015/level1");
var voivodeships = gaul.filter(ee.Filter.eq("ADM0_NAME", "Poland"));
// Setting up width of Poland and respective voivodeship boundaries
var NEON_BOUNDARY_VALUE = 200;
var BOUNDARY_WIDTH_PIXELS = 2.25;
var region_of_interest_rect = ee.Geometry.Rectangle([13.0, 48.0, 25.0, 56.0]);
// Loading DMSP‑OLS night lights data from dataset in Google Earth Engine
var dmspCollection = ee.ImageCollection("NOAA/DMSP-OLS/NIGHTTIME_LIGHTS")
.filterBounds(region_of_interest_rect);
// Setting up year values for median night light data extraction
var year_to_process = 2012;
var startDate = ee.Date(year_to_process + '-01-01');
var endDate = ee.Date(year_to_process + '-12-31');
var yearImages = dmspCollection.filterDate(startDate, endDate);
if (yearImages.size().getInfo() === 0) {
print("No DMSP data for " + year_to_process);
} else {
// Taking median of the data for entire year
var nightLightsComposite = yearImages.median()
.select("stable_lights")
.clip(poland.geometry());
// Making neighbouring countries night light value 0
var neighborsMask = ee.Image().byte().paint(neighbors, 1);
var nightLightsWithBlackNeighbors = nightLightsComposite.where(neighborsMask.eq(1), 0);
// Increasing contrast for better visualization
var stretchedLights = nightLightsWithBlackNeighbors
.unitScale(0, 63) // normalize 0–63 → 0–1
.multiply(150); // stretch to 0–150 for stronger contrast
var emptyCanvas = ee.Image(0).rename("stable_lights").byte();
// Including boundaries and night light value
var polandOutlineRaster = emptyCanvas.paint({
featureCollection: poland,
color: NEON_BOUNDARY_VALUE,
width: BOUNDARY_WIDTH_PIXELS
});
var voivodeshipsOutlineRaster = emptyCanvas.paint({
featureCollection: voivodeships,
color: NEON_BOUNDARY_VALUE,
width: BOUNDARY_WIDTH_PIXELS
});
var finalImageForExport = stretchedLights
.unmask(0)
.max(polandOutlineRaster)
.max(voivodeshipsOutlineRaster);
// Color palette and visualization parameters chosen for night lights data
var finalVisParams = {
min: 0,
max: 200,
palette: [
'black','blue','purple','cyan','orange','yellow','red','white',
'#00FF00'
]
};
// Displaying data in Google Earth Engine maps for cross-verification prior to export
Map.addLayer(finalImageForExport, finalVisParams,
"High‑Contrast Night Lights " + year_to_process);
var centerPoint = poland.geometry().centroid();
Map.setCenter(centerPoint.coordinates().get(0).getInfo(),
centerPoint.coordinates().get(1).getInfo(), 6);
// Exporting map images as prepared in previous steps
Export.image.toDrive({
image: finalImageForExport,
description: 'NightLights_' + year_to_process + '_high_contrast',
folder: 'EarthEngineExports',
fileNamePrefix: 'nightlights_' + year_to_process + '_high_contrast',
region: region_of_interest_rect,
scale: 500,
maxPixels: 1e13,
fileFormat: 'GeoTIFF'
});
var imageWithCoords = finalImageForExport
.rename("night_light")
.addBands(ee.Image.pixelLonLat());
var samples = imageWithCoords.sampleRegions({
collection: voivodeships,
properties: ['ADM1_NAME'], // province name according to dataset
scale: 1000,
geometries: false
}).map(function (f) {
return f.set({
year: year_to_process,
province: f.get('ADM1_NAME')
});
});
// Exporting Longitude, Latitude, Province Name, Night Light intensity data for clustering
Export.table.toDrive({
collection: samples,
description: 'NightLights_CSV_' + year_to_process,
folder: 'EarthEngineExports',
fileNamePrefix: 'nightlights_' + year_to_process + '_pixels',
fileFormat: 'CSV'
});
}
Plotting the datasets in Google Earth Engine itself with map overlay and observing the same to validate an increase in Night Light intensity over a period from year 2000 to 2012
# Dividing screen into 9 frames
# windows(width = 12, height = 10);
par(mfrow = c(3, 3));
for(i in seq(2000, 2012, 2)){
rel_path = paste0("./2000-2012GeospatialNightLightData/nightlights_", i, ".tif");
r <- rast(rel_path);
plot(r, main = paste("Year", i))
}
df <- read_csv("./2000-2012GeospatialNightLightData/nightlights_2012_pixels.csv");
## Rows: 507207 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): system:index, ADM1_NAME, province, .geo
## dbl (4): latitude, longitude, night_light, year
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(df);
## spc_tbl_ [507,207 × 8] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ system:index: chr [1:507207] "000000000000000003c0_0" "000000000000000003c0_1" "000000000000000003c0_2" "000000000000000003c0_3" ...
## $ ADM1_NAME : chr [1:507207] "Dolnoslaskie" "Dolnoslaskie" "Dolnoslaskie" "Dolnoslaskie" ...
## $ latitude : num [1:507207] 50.1 50.1 50.1 50.1 50.1 ...
## $ longitude : num [1:507207] 16.7 16.7 16.7 16.7 16.7 ...
## $ night_light : num [1:507207] 200 200 200 200 200 200 200 200 200 200 ...
## $ province : chr [1:507207] "Dolnoslaskie" "Dolnoslaskie" "Dolnoslaskie" "Dolnoslaskie" ...
## $ year : num [1:507207] 2012 2012 2012 2012 2012 ...
## $ .geo : chr [1:507207] "{\"type\":\"MultiPoint\",\"coordinates\":[]}" "{\"type\":\"MultiPoint\",\"coordinates\":[]}" "{\"type\":\"MultiPoint\",\"coordinates\":[]}" "{\"type\":\"MultiPoint\",\"coordinates\":[]}" ...
## - attr(*, "spec")=
## .. cols(
## .. `system:index` = col_character(),
## .. ADM1_NAME = col_character(),
## .. latitude = col_double(),
## .. longitude = col_double(),
## .. night_light = col_double(),
## .. province = col_character(),
## .. year = col_double(),
## .. .geo = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
summary(df$night_light); # data should be in range from 0 to 200
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 19.05 26.19 43.97 47.62 200.00
Dropping system:index column | Reason : The same is an ID value added in Google Earth Engine Data Set NOAA/DMSP-OLS/NIGHTTIME_LIGHTS which isn’t of much relevance to us. Dropping ADM1_NAME column | Reason : Redundant data since column value is already added by us in Province column Dropping .geo column | Reason : Empty and common JSON object for all data set entries Dropping year column | Reason : Only value is 2012 since data for 2012 is only extracted in CSV format from Google Earth Engine - Remaining years data is just used for visulization and verification of growth trend.
# print(summary(df$year)) # Only value 2012 thus, year column can also be dropped
df <- df[ , !(names(df) %in% c("system:index", ".geo", "ADM1_NAME", "year"))];
print(head(df, 5));
## # A tibble: 5 × 4
## latitude longitude night_light province
## <dbl> <dbl> <dbl> <chr>
## 1 50.1 16.7 200 Dolnoslaskie
## 2 50.1 16.7 200 Dolnoslaskie
## 3 50.1 16.7 200 Dolnoslaskie
## 4 50.1 16.7 200 Dolnoslaskie
## 5 50.1 16.7 200 Dolnoslaskie
Converting Province column value to factor data type since Poland is bifurcated into only 16 Provinces (or voivodeships)
df$province <- as.factor(df$province)
print(head(df, 5));
## # A tibble: 5 × 4
## latitude longitude night_light province
## <dbl> <dbl> <dbl> <fct>
## 1 50.1 16.7 200 Dolnoslaskie
## 2 50.1 16.7 200 Dolnoslaskie
## 3 50.1 16.7 200 Dolnoslaskie
## 4 50.1 16.7 200 Dolnoslaskie
## 5 50.1 16.7 200 Dolnoslaskie
Log transforming the data to reduce skewness and for scaling. Selecting only columns with numerical values eliminating Province (or voivodeship column) since the information for the same can be reconstructed using longitude and latitude column values.
df_scaled <- scale(select(mutate(df, night_light_log = log1p(night_light)), latitude, longitude, night_light_log));
df_scaled <- as.data.frame(df_scaled); # Converting df_scaled from matrix (result of utilization of scale function) to dataframe
df_scaled$original_longitude <- df$longitude;
df_scaled$original_latitude <- df$latitude;
df_scaled$original_night_light <- df$night_light;
df_scaled$province <- df$province;
# print(head(df_scaled, 5));
Accounting to large size of the dataset, we are sampling 11000 random values from the same for ease of clustering
set.seed(11);
df_sample <- df_scaled[sample(nrow(df_scaled), 11000), ];
# print(head(df_sample, 5));
Deciding on ideal number of clusters using Silhouette Score and Elbow Point method
df_sample_selected_columns <- df_sample[, c("latitude", "longitude", "night_light_log")]
res_elbow_point <- Optimal_Clusters_KMeans(
df_sample_selected_columns,
max_clusters = 20,
criterion = "WCSSE"
);
res_silhouette <- Optimal_Clusters_KMeans(
df_sample_selected_columns,
max_clusters = 20,
criterion = "silhouette"
);
par(mfrow = c(1, 2));
plot(res_elbow_point, main = "Elbow Point");
plot(res_silhouette, main = "Silhouette");
As per silhouette score, 4 appears to be the ideal number of clusters
(maximum silhouette score of 0.33 - as per visual comparison), post 4,
similar silhouette score value (although lower in magnitude - 0.32) is
obtained only at 13 (increased number of cluster groups - thus, the same
can be discarded).
Plotting the log normalized data set for better visualization
ggplot(df_sample, aes(original_longitude, original_latitude, color = night_light_log)) + geom_point()
Using Hopkins Statistics to determine the clusterability of dataset (passing df_sample accounting to the large size of data set).
hopkins_statistics <- hopkins(df_sample_selected_columns);
print(hopkins_statistics);
## [1] 0.9982452
Hopkins Statistics value of 0.998 indicates high clusterability value for the nightlight dataset.
set.seed(11); # to ensure replicability of results
k <- 4; # chosen as per silhouette score
kmeans_results <- kmeans(df_sample_selected_columns, centers = k, nstart = 11); # nstart indicates the number of random initializations of cluster centers before actually choosing the best solution thus, setting it to 11 for sake of simplicity and speed. The same is a trade-off between speed and stability, higher the value of nstart more stable the model predictions and lower the speed, vice-versa for other combination.
df_sample$cluster <- factor(kmeans_results$cluster);
print(head(df_sample, 5));
## latitude longitude night_light_log original_longitude
## 65570 -0.6758266 0.0208703 0.3791067 19.45302
## 459673 0.4869804 -1.1196877 -0.2857201 16.58739
## 281148 0.3490202 1.2007579 1.6085076 22.41746
## 335756 1.6760654 -0.5225931 0.1758876 18.08758
## 291030 0.6906358 1.5475733 0.2841053 23.28882
## original_latitude original_night_light province cluster
## 65570 51.19948 40.47619 Lodzkie 3
## 459673 52.78950 16.66667 Wielkopolskie 4
## 281148 52.60085 200.00000 Podlaskie 1
## 335756 54.41545 30.95238 Pomorskie 4
## 291030 53.06798 35.71429 Podlaskie 1
Plotting K-means algorithm clustering results for visualization and comparison of results
ggplot(df_sample, aes(original_longitude, original_latitude)) + geom_point(aes(color = original_night_light, shape = cluster));
Calculating Mean Night Light intensity per cluster to draw conclusions and also intensity per province according to clusters for more details.
aggregate(original_night_light ~ cluster, data = df_sample, mean);
## cluster original_night_light
## 1 1 38.00939
## 2 2 0.00000
## 3 3 61.51085
## 4 4 45.34555
aggregate(original_night_light ~ province + cluster, data = df_sample, mean);
## province cluster original_night_light
## 1 Kujawsko-Pomorskie 1 39.77933
## 2 Lodzkie 1 29.49309
## 3 Lubeiskie 1 34.23859
## 4 Mazowieckie 1 43.07585
## 5 Podlaskie 1 34.46574
## 6 Pomorskie 1 48.62637
## 7 Warminsko-Mazurskie 1 34.11403
## 8 Dolnoslaskie 2 0.00000
## 9 Kujawsko-Pomorskie 2 0.00000
## 10 Lodzkie 2 0.00000
## 11 Lubeiskie 2 0.00000
## 12 Lubuskie 2 0.00000
## 13 Malopolske 2 0.00000
## 14 Mazowieckie 2 0.00000
## 15 Opolskie 2 0.00000
## 16 Podkarpackie 2 0.00000
## 17 Podlaskie 2 0.00000
## 18 Pomorskie 2 0.00000
## 19 Slaskie 2 0.00000
## 20 Swietokrzyskie 2 0.00000
## 21 Warminsko-Mazurskie 2 0.00000
## 22 Wielkopolskie 2 0.00000
## 23 Zachodnio-Pomorskie 2 0.00000
## 24 Dolnoslaskie 3 74.14369
## 25 Lodzkie 3 60.44767
## 26 Lubeiskie 3 48.35476
## 27 Malopolske 3 66.14494
## 28 Mazowieckie 3 64.89796
## 29 Opolskie 3 59.53953
## 30 Podkarpackie 3 56.69203
## 31 Slaskie 3 83.04020
## 32 Swietokrzyskie 3 55.21570
## 33 Wielkopolskie 3 154.08163
## 34 Dolnoslaskie 4 49.25080
## 35 Kujawsko-Pomorskie 4 44.09452
## 36 Lodzkie 4 45.10148
## 37 Lubuskie 4 45.94719
## 38 Mazowieckie 4 200.00000
## 39 Opolskie 4 21.42857
## 40 Pomorskie 4 42.57589
## 41 Wielkopolskie 4 46.58170
## 42 Zachodnio-Pomorskie 4 42.19866
| Cluster | Interpretation |
|---|---|
| Cluster 1 | Semi‑urban or mixed‑development rural areas with moderate night‑light intensity. |
| Cluster 2 | Uninhabited or sparsely populated regions such as forests, farms, lakes, and other natural landscapes. |
| Cluster 3 | Extremely bright zones with continuous activity, including commercial districts and major industrial belts. |
| Cluster 4 | Urban residential and metropolitan areas with high illumination, including major cities and surrounding suburbs. |
| Province | Interpretation |
|---|---|
| Dolnośląskie | Bright urban + high‑intensity zones (Wrocław) mixed with dark rural areas. |
| Kujawsko‑Pomorskie | Moderate brightness with pockets of high‑intensity urban areas (Bydgoszcz, Toruń). |
| Łódzkie | Full spectrum from dark rural to high‑intensity urban core (Łódź). |
| Lubelskie | Moderate to bright regions, fewer extreme peaks; centered around Lublin. |
| Lubuskie | Mostly dark rural areas with moderately bright cities (Zielona Góra, Gorzów). |
| Małopolskie | Bright urban regions (Kraków) with surrounding rural areas. |
| Mazowieckie | Contains Poland’s brightest region (200) — Warsaw; strong mix of all cluster types. |
| Opolskie | Mostly rural with bright urban pockets around Opole. |
| Podkarpackie | Bright regional center (Rzeszów) but no extreme metropolitan peaks. |
| Podlaskie | Moderate brightness with extensive dark forested areas. |
| Pomorskie | Moderate to high brightness; Tri‑City drives the intensity. |
| Śląskie | One of the brightest industrial regions (Katowice–Gliwice belt). |
| Świętokrzyskie | Moderate brightness centered around Kielce; no extreme peaks. |
| Warmińsko‑Mazurskie | Low population density; moderate brightness with large dark lake regions. |
| Wielkopolskie | Very high brightness (154 in Cluster 3) due to Poznań and industrial corridors. |
| Zachodniopomorskie | Dark rural areas with bright coastal and urban centers (Szczecin). |
Calinski-Harabasz index is used to calculate how well clustering results obtained actually classify the data. It is a ration of dispersion between the different cluster points and dispersion within the cluster points.
ch_score <- calinhara(df_sample_selected_columns, kmeans_results$cluster);
print(ch_score);
## [1] 5997.872
The score obtained is high indicating ideal clustering results as a second reference, comparing the KMeans Clustering results with DBSCAN Clustering results.
eps_value <- 0.15; # radius of neighbourhood around each point. If another point lies within the radius value then is considered a neighbour. Smaller epsilon value indicates that only close points become neighbours while if epsilon value is large then, the cluster size grows quickly and may merge.
minPts_value <- 11; # the same indicates the minimum number of points required to form a cluster center. If a point has at least minimum number of neighbours then, the same becomes a cluster center and subsequently can influence other points to form a cluster. Points with lesser than minimum point value of neighbours account as border points or noise.
dbscan_results <- dbscan::dbscan(df_sample_selected_columns, eps = eps_value, minPts = minPts_value);
# Adding cluster labels to your original df
df_sample$dbscan_cluster <- as.factor(dbscan_results$cluster);
print(head(df_sample, 5));
## latitude longitude night_light_log original_longitude
## 65570 -0.6758266 0.0208703 0.3791067 19.45302
## 459673 0.4869804 -1.1196877 -0.2857201 16.58739
## 281148 0.3490202 1.2007579 1.6085076 22.41746
## 335756 1.6760654 -0.5225931 0.1758876 18.08758
## 291030 0.6906358 1.5475733 0.2841053 23.28882
## original_latitude original_night_light province cluster
## 65570 51.19948 40.47619 Lodzkie 3
## 459673 52.78950 16.66667 Wielkopolskie 4
## 281148 52.60085 200.00000 Podlaskie 1
## 335756 54.41545 30.95238 Pomorskie 4
## 291030 53.06798 35.71429 Podlaskie 1
## dbscan_cluster
## 65570 1
## 459673 1
## 281148 0
## 335756 1
## 291030 0