3D Road Network (North Jutland, Denmark)

About the Dataset

This dataset contains the detailed information about the road network in North Jutland, Denmark including logitude, latitude and altitude for road segmentation. key features: OSM_ID : Unique identifier for road segments from OpenStreetMap longitude,latitude,altitude : Representing the geographycal location and elevation of each road segment.

Loading libraries

suppressPackageStartupMessages({
  suppressWarnings({
    library(dplyr)
    library(ggplot2)
    library(cluster)
    library(factoextra)
    library(corrplot)
    library(dendextend)
    library(gridExtra)
    library(scatterplot3d)
    library(purrr)
    library(hopkins)
  })
})

Loading the Dataset

setwd("D:/alliance/3rd sem/machine learning_2/R_studo/datasets")
data <- readLines("3D_spatial_network.txt")
head(data)

## [1] "144552912,9.3498486,56.7408757,17.0527715677876"
## [2] "144552912,9.3501884,56.7406785,17.614840244389" 
## [3] "144552912,9.3505485,56.7405445,18.08353563951"  
## [4] "144552912,9.3508058,56.7404845,18.2794652530352"
## [5] "144552912,9.3510534,56.7404863,18.4229736146099"
## [6] "144552912,9.3514747,56.7405022,19.1248885940143"

splited <- strsplit(data, split = ",")

splited <- do.call(rbind, splited)
df <- as.data.frame(splited, stringAsFactor= FALSE)

colnames(df) <- c("OSMID", "longitude", "Latitude", "Altitude")

df[] <- lapply(df, function(x) as.numeric(x))
head(df)

##       OSMID longitude Latitude Altitude
## 1 144552912  9.349849 56.74088 17.05277
## 2 144552912  9.350188 56.74068 17.61484
## 3 144552912  9.350549 56.74054 18.08354
## 4 144552912  9.350806 56.74048 18.27947
## 5 144552912  9.351053 56.74049 18.42297
## 6 144552912  9.351475 56.74050 19.12489

EDA

Summary Statistics

str(df)

## 'data.frame':    434874 obs. of  4 variables:
##  $ OSMID    : num  1.45e+08 1.45e+08 1.45e+08 1.45e+08 1.45e+08 ...
##  $ longitude: num  9.35 9.35 9.35 9.35 9.35 ...
##  $ Latitude : num  56.7 56.7 56.7 56.7 56.7 ...
##  $ Altitude : num  17.1 17.6 18.1 18.3 18.4 ...

summary(df)

##      OSMID             longitude         Latitude        Altitude      
##  Min.   :  4482444   Min.   : 8.146   Min.   :56.58   Min.   : -8.608  
##  1st Qu.: 82678969   1st Qu.: 9.338   1st Qu.:56.85   1st Qu.:  7.028  
##  Median :101979668   Median : 9.887   Median :57.04   Median : 17.575  
##  Mean   : 97869978   Mean   : 9.732   Mean   :57.08   Mean   : 22.185  
##  3rd Qu.:125954704   3rd Qu.:10.172   3rd Qu.:57.31   3rd Qu.: 31.810  
##  Max.   :157742416   Max.   :11.199   Max.   :57.75   Max.   :134.442

print(sum(is.na(df)))

## [1] 0

Distribution of the Varibales

long <- ggplot(data = df, aes(x = longitude)) + geom_histogram(binwidth = .3 ,fill= "steelblue", color= "black") + 
                labs(title = "Distribution of longitude ", xlab="longitude", ylab = "count")

lati <- ggplot(data = df, aes(x = Latitude)) + geom_histogram(binwidth = .2 ,fill= "steelblue", color= "black") + 
                labs(title = "Distribution of Latitude ", xlab="Latitude", ylab = "count")

alti <- ggplot(data = df, aes(x = Altitude)) + geom_histogram(binwidth = 5 ,fill= "steelblue", color= "black") + 
                labs(title = "Distribution of Altitude ", xlab="Altitude", ylab = "count")

grid.arrange(long, lati, alti, nrow=2, ncol=2)

Longitude distribution

The distribution of the longitude values centered around 10 degree. majority of the road segment falls between 9.5 and 10.5

Latitude distribution

The latitude vales are distribution between approximatily 56.5 and 57.6. The distribution shows a slight peak in between 56.8 and 57.2, indicating tat most road network is located in this range

Altitude distribution

The altitude value range from -8 to 134 meter (height). The Diststibution is Right skewed, with majority of the values concentrated on the left side below 50. Indicating that most road network are in low lying area A small number of road above 100 meters indicating there are some high altitude regions.

correlation analysis

corr <- cor(df)
corr

##                 OSMID   longitude    Latitude    Altitude
## OSMID      1.00000000 -0.03571643 -0.05142402  0.10652031
## longitude -0.03571643  1.00000000  0.56685465  0.04232349
## Latitude  -0.05142402  0.56685465  1.00000000 -0.10538305
## Altitude   0.10652031  0.04232349 -0.10538305  1.00000000

corrplot(corr, method = "pie", type = "lower")

Outlier detection

ggplot(data=df, aes(x="", y =  longitude), color="blues") + geom_boxplot(fill = "lightblue", color= "black") + 
                    labs(title="box plot for longitude", x = "Longitude") + theme_minimal()

ggplot(data = df, aes(x = "", y = Latitude)) + 
                    geom_boxplot(fill = "steelblue", color = "black")+
                    labs(title  = "Boxplot of Latitude", x = "Latitude") + theme_minimal()

ggplot(data = df, aes(x = "", y = Altitude)) + 
                    geom_boxplot(fill = "steelblue", color = "black")+
                    labs(title  = "Boxplot of Altitude", x = "Altitude") + theme_minimal()

according to the boxplot Altitude variable in our dataset exhibits significant outliers, as evidenced by the high spread in values beyond the usual range of the data.

removing outliers from the Altitude

q1 <- quantile(df$Altitude, 0.25)
q3 <- quantile(df$Altitude, 0.75)
IQR <- q3 - q1

lower_bound <- q1 - IQR * 1.5
upper_bound <- q3 + IQR * 1.5

cleaned_data <-subset(df, Altitude >= lower_bound & Altitude <= upper_bound)
print(dim(df))

## [1] 434874      4

print(dim(cleaned_data))

## [1] 423208      4

ggplot(data = df , aes(x= longitude, y =Latitude)) + geom_point(aes(color = Altitude)) + ggtitle("geographical Distribution of the road")

#scatterplot3d(df$longitude, df$Latitude, df$Altitude, pch = 19)

Preprocessesisng

Remove null values

print(sum(is.na(df)))

## [1] 0

df <- na.omit(df)

Data scaling/ Normalization

Scaled_df <- scale(cleaned_data[, c("longitude", "Latitude", "Altitude")])
head(Scaled_df)

##    longitude  Latitude    Altitude
## 1 -0.5899961 -1.182628 -0.22001329
## 2 -0.5894581 -1.183308 -0.18519519
## 3 -0.5888880 -1.183771 -0.15616123
## 4 -0.5884806 -1.183978 -0.14402411
## 5 -0.5880886 -1.183971 -0.13513429
## 6 -0.5874216 -1.183917 -0.09165323

Distance matrix

sample_df <- Scaled_df[sample(nrow(Scaled_df), size = 5000),]
#dist_df <- dist(sample_df)

dataset is clusterable or not

hopkins(sample_df, nrow(sample_df)-1)

## [1] 0.9985882

The Hopkins statistic is used to assess the cluster tendency of a dataset, i.e., whether the data contains meaningful clusters or if the data points are randomly distributed. The value of the Hopkins statistic ranges from 0 to 1:

A value close to 1 (like your result of 0.999) indicates that the dataset is highly clustered and not random. A value close to 0.5 suggests that the data is uniformly distributed or random and does not exhibit cluster structure. A value close to 0 would indicate anti-clustering or regular spacing between points.

Since the Hopkins statistic is very high, this strongly indicates that the dataset has a highly clustered structure, making it very suitable for clustering techniques like k-means or hierarchical clustering. The high value confirms that there are distinct groups or clusters within your 3D road spatial data.

k1 <- kmeans(x = sample_df, centers = 2, nstart = 25)
k2 <- kmeans(x = sample_df, centers = 4, nstart = 25)
k3 <- kmeans(x = sample_df, centers = 6, nstart = 25)
k4 <- kmeans(x = sample_df, centers = 8, nstart = 25)

p1 <- fviz_cluster(k1, data = sample_df, geom = "point") + ggtitle("k=1")
p2 <- fviz_cluster(k2, data=sample_df, geom = "point") + ggtitle("k = 2")
p3 <- fviz_cluster(k3, data=sample_df, geom = "point") + ggtitle("k = 3")
p4 <- fviz_cluster(k4, data=sample_df, geom = "point") + ggtitle("k = 4")

p1

p2

p3

p4

Find the best cluster using Elbow method

set.seed(123)
wss <- function(k){
  kmeans(sample_df, k ,nstart = 25)$tot.withinss
}

k.values <- 1:15

wss_values <- map_dbl(k.values, wss)

plot(k.values, wss_values, type = "b", pch=19, frame=FALSE, xlab = "No. of Clusters",ylab="Total Within- Cluster sum of squares")

### best cluster

From the above elbow method to find the optimal number of clusters for the 3D road spatial data by plotting the within- cluster sum of squares(WSS) against the number of cluster(k).The WSS represents the total variance within clusters, and the goal is to minimize this value. The plot above shows how the WSS decrease as the number of clusters increases.The “elbow” in the plot indicates where adding more cluster results in diminishing improvements to the WSS. In this the elbow occurs at clusters, suggesting that 5 is the optimal number of cluster for this Dataset.

k1 <- kmeans(x = sample_df, centers = 5, nstart = 25)
p1 <- fviz_cluster(k1, data = sample_df, geom = "point") + ggtitle("k=5")
p1

hierarchical clustering

Doing some hierarchical clustering in this dataset.

dist_df <- dist(sample_df)
hc_clust_1 <- hclust(dist_df, method = "ward.D2")
fviz_dend(hc_clust_1, cex=.5)

## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
##   Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

hc_clust_2 <- hclust(dist_df, method = "average")
hc_clust_3 <- hclust(dist_df, method = "ward.D")
hc_clust_4 <- hclust(dist_df, method = "single")
hc_clust_5 <- hclust(dist_df, method = "complete")
hc_clust_6 <- hclust(dist_df, method = "centroid")

# or

den1 <- sample_df %>% dist %>% hclust("average") %>% as.dendrogram
den2 <- sample_df %>% dist %>% hclust("ward.D") %>% as.dendrogram
den3 <- sample_df %>% dist %>% hclust("single") %>% as.dendrogram
den4 <- sample_df %>% dist %>% hclust("complete") %>% as.dendrogram
den5 <- sample_df %>% dist %>% hclust("centroid") %>% as.dendrogram
den6 <- sample_df %>% dist %>% hclust("ward.D2") %>% as.dendrogram

dend_list <- dendlist("average" = den1, "ward.D" = den2,
                      "single" = den3, "complete" = den4,
                      "centroid" = den5, "ward.D2" = den6)

corre <-round(cor.dendlist(dend_list), 2)
corre

##          average ward.D single complete centroid ward.D2
## average     1.00   0.77   0.12     0.54     0.70    0.72
## ward.D      0.77   1.00   0.04     0.52     0.48    0.85
## single      0.12   0.04   1.00     0.15     0.22    0.07
## complete    0.54   0.52   0.15     1.00     0.45    0.55
## centroid    0.70   0.48   0.22     0.45     1.00    0.53
## ward.D2     0.72   0.85   0.07     0.55     0.53    1.00

conclusion

In this Dataset, the clustering of road segments based on 3D spatial data(longitude, latitude and altitude) provides insights into how road segments are grouped based ib their geographic location and elevation. by identifiying clusters, we can draw conclusion about similar road characteristics across different region. this information is vital for urban plannersm traffic management system, and navigation tools, as it allows for better decision making related to road maintanance, trafficflow, and route optimization.

Clustering on 3D Road Network

Amal Vinob

2024-09-16