This dataset contains the detailed information about the road network in North Jutland, Denmark including logitude, latitude and altitude for road segmentation. key features: OSM_ID : Unique identifier for road segments from OpenStreetMap longitude,latitude,altitude : Representing the geographycal location and elevation of each road segment.
suppressPackageStartupMessages({
suppressWarnings({
library(dplyr)
library(ggplot2)
library(cluster)
library(factoextra)
library(corrplot)
library(dendextend)
library(gridExtra)
library(scatterplot3d)
library(purrr)
library(hopkins)
})
})
setwd("D:/alliance/3rd sem/machine learning_2/R_studo/datasets")
data <- readLines("3D_spatial_network.txt")
head(data)
## [1] "144552912,9.3498486,56.7408757,17.0527715677876"
## [2] "144552912,9.3501884,56.7406785,17.614840244389"
## [3] "144552912,9.3505485,56.7405445,18.08353563951"
## [4] "144552912,9.3508058,56.7404845,18.2794652530352"
## [5] "144552912,9.3510534,56.7404863,18.4229736146099"
## [6] "144552912,9.3514747,56.7405022,19.1248885940143"
splited <- strsplit(data, split = ",")
splited <- do.call(rbind, splited)
df <- as.data.frame(splited, stringAsFactor= FALSE)
colnames(df) <- c("OSMID", "longitude", "Latitude", "Altitude")
df[] <- lapply(df, function(x) as.numeric(x))
head(df)
## OSMID longitude Latitude Altitude
## 1 144552912 9.349849 56.74088 17.05277
## 2 144552912 9.350188 56.74068 17.61484
## 3 144552912 9.350549 56.74054 18.08354
## 4 144552912 9.350806 56.74048 18.27947
## 5 144552912 9.351053 56.74049 18.42297
## 6 144552912 9.351475 56.74050 19.12489
str(df)
## 'data.frame': 434874 obs. of 4 variables:
## $ OSMID : num 1.45e+08 1.45e+08 1.45e+08 1.45e+08 1.45e+08 ...
## $ longitude: num 9.35 9.35 9.35 9.35 9.35 ...
## $ Latitude : num 56.7 56.7 56.7 56.7 56.7 ...
## $ Altitude : num 17.1 17.6 18.1 18.3 18.4 ...
summary(df)
## OSMID longitude Latitude Altitude
## Min. : 4482444 Min. : 8.146 Min. :56.58 Min. : -8.608
## 1st Qu.: 82678969 1st Qu.: 9.338 1st Qu.:56.85 1st Qu.: 7.028
## Median :101979668 Median : 9.887 Median :57.04 Median : 17.575
## Mean : 97869978 Mean : 9.732 Mean :57.08 Mean : 22.185
## 3rd Qu.:125954704 3rd Qu.:10.172 3rd Qu.:57.31 3rd Qu.: 31.810
## Max. :157742416 Max. :11.199 Max. :57.75 Max. :134.442
print(sum(is.na(df)))
## [1] 0
long <- ggplot(data = df, aes(x = longitude)) + geom_histogram(binwidth = .3 ,fill= "steelblue", color= "black") +
labs(title = "Distribution of longitude ", xlab="longitude", ylab = "count")
lati <- ggplot(data = df, aes(x = Latitude)) + geom_histogram(binwidth = .2 ,fill= "steelblue", color= "black") +
labs(title = "Distribution of Latitude ", xlab="Latitude", ylab = "count")
alti <- ggplot(data = df, aes(x = Altitude)) + geom_histogram(binwidth = 5 ,fill= "steelblue", color= "black") +
labs(title = "Distribution of Altitude ", xlab="Altitude", ylab = "count")
grid.arrange(long, lati, alti, nrow=2, ncol=2)
The distribution of the longitude values centered around 10 degree. majority of the road segment falls between 9.5 and 10.5
The latitude vales are distribution between approximatily 56.5 and 57.6. The distribution shows a slight peak in between 56.8 and 57.2, indicating tat most road network is located in this range
The altitude value range from -8 to 134 meter (height). The Diststibution is Right skewed, with majority of the values concentrated on the left side below 50. Indicating that most road network are in low lying area A small number of road above 100 meters indicating there are some high altitude regions.
corr <- cor(df)
corr
## OSMID longitude Latitude Altitude
## OSMID 1.00000000 -0.03571643 -0.05142402 0.10652031
## longitude -0.03571643 1.00000000 0.56685465 0.04232349
## Latitude -0.05142402 0.56685465 1.00000000 -0.10538305
## Altitude 0.10652031 0.04232349 -0.10538305 1.00000000
corrplot(corr, method = "pie", type = "lower")
ggplot(data=df, aes(x="", y = longitude), color="blues") + geom_boxplot(fill = "lightblue", color= "black") +
labs(title="box plot for longitude", x = "Longitude") + theme_minimal()
ggplot(data = df, aes(x = "", y = Latitude)) +
geom_boxplot(fill = "steelblue", color = "black")+
labs(title = "Boxplot of Latitude", x = "Latitude") + theme_minimal()
ggplot(data = df, aes(x = "", y = Altitude)) +
geom_boxplot(fill = "steelblue", color = "black")+
labs(title = "Boxplot of Altitude", x = "Altitude") + theme_minimal()
according to the boxplot Altitude variable in our dataset exhibits
significant outliers, as evidenced by the high spread in values beyond
the usual range of the data.
q1 <- quantile(df$Altitude, 0.25)
q3 <- quantile(df$Altitude, 0.75)
IQR <- q3 - q1
lower_bound <- q1 - IQR * 1.5
upper_bound <- q3 + IQR * 1.5
cleaned_data <-subset(df, Altitude >= lower_bound & Altitude <= upper_bound)
print(dim(df))
## [1] 434874 4
print(dim(cleaned_data))
## [1] 423208 4
ggplot(data = df , aes(x= longitude, y =Latitude)) + geom_point(aes(color = Altitude)) + ggtitle("geographical Distribution of the road")
#scatterplot3d(df$longitude, df$Latitude, df$Altitude, pch = 19)
print(sum(is.na(df)))
## [1] 0
df <- na.omit(df)
Scaled_df <- scale(cleaned_data[, c("longitude", "Latitude", "Altitude")])
head(Scaled_df)
## longitude Latitude Altitude
## 1 -0.5899961 -1.182628 -0.22001329
## 2 -0.5894581 -1.183308 -0.18519519
## 3 -0.5888880 -1.183771 -0.15616123
## 4 -0.5884806 -1.183978 -0.14402411
## 5 -0.5880886 -1.183971 -0.13513429
## 6 -0.5874216 -1.183917 -0.09165323
sample_df <- Scaled_df[sample(nrow(Scaled_df), size = 5000),]
#dist_df <- dist(sample_df)
hopkins(sample_df, nrow(sample_df)-1)
## [1] 0.9985882
The Hopkins statistic is used to assess the cluster tendency of a dataset, i.e., whether the data contains meaningful clusters or if the data points are randomly distributed. The value of the Hopkins statistic ranges from 0 to 1:
A value close to 1 (like your result of 0.999) indicates that the dataset is highly clustered and not random. A value close to 0.5 suggests that the data is uniformly distributed or random and does not exhibit cluster structure. A value close to 0 would indicate anti-clustering or regular spacing between points.
Since the Hopkins statistic is very high, this strongly indicates that the dataset has a highly clustered structure, making it very suitable for clustering techniques like k-means or hierarchical clustering. The high value confirms that there are distinct groups or clusters within your 3D road spatial data.
k1 <- kmeans(x = sample_df, centers = 2, nstart = 25)
k2 <- kmeans(x = sample_df, centers = 4, nstart = 25)
k3 <- kmeans(x = sample_df, centers = 6, nstart = 25)
k4 <- kmeans(x = sample_df, centers = 8, nstart = 25)
p1 <- fviz_cluster(k1, data = sample_df, geom = "point") + ggtitle("k=1")
p2 <- fviz_cluster(k2, data=sample_df, geom = "point") + ggtitle("k = 2")
p3 <- fviz_cluster(k3, data=sample_df, geom = "point") + ggtitle("k = 3")
p4 <- fviz_cluster(k4, data=sample_df, geom = "point") + ggtitle("k = 4")
p1
p2
p3
p4
set.seed(123)
wss <- function(k){
kmeans(sample_df, k ,nstart = 25)$tot.withinss
}
k.values <- 1:15
wss_values <- map_dbl(k.values, wss)
plot(k.values, wss_values, type = "b", pch=19, frame=FALSE, xlab = "No. of Clusters",ylab="Total Within- Cluster sum of squares")
### best cluster
From the above elbow method to find the optimal number of clusters for the 3D road spatial data by plotting the within- cluster sum of squares(WSS) against the number of cluster(k).The WSS represents the total variance within clusters, and the goal is to minimize this value. The plot above shows how the WSS decrease as the number of clusters increases.The “elbow” in the plot indicates where adding more cluster results in diminishing improvements to the WSS. In this the elbow occurs at clusters, suggesting that 5 is the optimal number of cluster for this Dataset.
k1 <- kmeans(x = sample_df, centers = 5, nstart = 25)
p1 <- fviz_cluster(k1, data = sample_df, geom = "point") + ggtitle("k=5")
p1
Doing some hierarchical clustering in this dataset.
dist_df <- dist(sample_df)
hc_clust_1 <- hclust(dist_df, method = "ward.D2")
fviz_dend(hc_clust_1, cex=.5)
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
## Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
hc_clust_2 <- hclust(dist_df, method = "average")
hc_clust_3 <- hclust(dist_df, method = "ward.D")
hc_clust_4 <- hclust(dist_df, method = "single")
hc_clust_5 <- hclust(dist_df, method = "complete")
hc_clust_6 <- hclust(dist_df, method = "centroid")
# or
den1 <- sample_df %>% dist %>% hclust("average") %>% as.dendrogram
den2 <- sample_df %>% dist %>% hclust("ward.D") %>% as.dendrogram
den3 <- sample_df %>% dist %>% hclust("single") %>% as.dendrogram
den4 <- sample_df %>% dist %>% hclust("complete") %>% as.dendrogram
den5 <- sample_df %>% dist %>% hclust("centroid") %>% as.dendrogram
den6 <- sample_df %>% dist %>% hclust("ward.D2") %>% as.dendrogram
dend_list <- dendlist("average" = den1, "ward.D" = den2,
"single" = den3, "complete" = den4,
"centroid" = den5, "ward.D2" = den6)
corre <-round(cor.dendlist(dend_list), 2)
corre
## average ward.D single complete centroid ward.D2
## average 1.00 0.77 0.12 0.54 0.70 0.72
## ward.D 0.77 1.00 0.04 0.52 0.48 0.85
## single 0.12 0.04 1.00 0.15 0.22 0.07
## complete 0.54 0.52 0.15 1.00 0.45 0.55
## centroid 0.70 0.48 0.22 0.45 1.00 0.53
## ward.D2 0.72 0.85 0.07 0.55 0.53 1.00
In this Dataset, the clustering of road segments based on 3D spatial data(longitude, latitude and altitude) provides insights into how road segments are grouped based ib their geographic location and elevation. by identifiying clusters, we can draw conclusion about similar road characteristics across different region. this information is vital for urban plannersm traffic management system, and navigation tools, as it allows for better decision making related to road maintanance, trafficflow, and route optimization.