# install the packages
# install.packages("factoextra")
# install.packages("flexclust")
# install.packages("fpc")
# install.packages("cluster")
# install.packages("ClusterR")
# install.packages("tidyverse")
# install.packages("dendextend")
# install.packages("NbClust")
# install.packages("fpc")
# install.packages("dbscan")
# install.packages("geosphere")
# install.packages("dplyr")
# install.packages("prophet")
# install.packages("viridis")
# install.packages("MASS")
library(MASS)
library(viridis)
library(prophet)
library(dplyr)
library(geosphere)
library(fpc)
library(dbscan)
library(NbClust)
library(factoextra)
library(flexclust)
library(cluster)
library(ClusterR)
library(tidyverse)
library(dendextend)
uber_data <- read.csv("uber-data.csv")
str(uber_data)
## 'data.frame': 1028136 obs. of 4 variables:
## $ Date.Time: chr "9/1/2014 0:01:00" "9/1/2014 0:01:00" "9/1/2014 0:03:00" "9/1/2014 0:06:00" ...
## $ Lat : num 40.2 40.8 40.8 40.7 40.8 ...
## $ Lon : num -74 -74 -74 -74 -73.9 ...
## $ Base : chr "B02512" "B02512" "B02512" "B02512" ...
summary(uber_data)
## Date.Time Lat Lon Base
## Length:1028136 Min. :39.99 Min. :-74.77 Length:1028136
## Class :character 1st Qu.:40.72 1st Qu.:-74.00 Class :character
## Mode :character Median :40.74 Median :-73.98 Mode :character
## Mean :40.74 Mean :-73.97
## 3rd Qu.:40.76 3rd Qu.:-73.96
## Max. :41.35 Max. :-72.72
# Histogram for Latitude
hist(uber_data$Lat,
     breaks = 50,
     col = "skyblue",
     main = "Distribution of Latitude values",
     xlab = "Latitude")
# Histogram for Longitude
hist(uber_data$Lon,
     breaks = 50,
     col = "lightgreen",
     main = "Distribution of Longitude values",
     xlab = "Longitude")
# Scatterplot on longitude and latitude values
plot(uber_data$Lon, uber_data$Lat,
     pch = 19, col = rgb(0, 0, 1, 0.3),
     main = "Uber trip locations (Longitude vs Latitude)",
     xlab = "Longitude", ylab = "Latitude")
First, we need to check for any missing values in our dataset.
colSums(is.na(uber_data))
## Date.Time Lat Lon Base
## 0 0 0 0
Next, let us convert the Date.Time column to a proper date-time type, which will be useful in later sections.
uber_data$Date.Time <- as.POSIXct(uber_data$Date.Time, format="%m/%d/%Y %H:%M:%S")
str(uber_data)
## 'data.frame': 1028136 obs. of 4 variables:
## $ Date.Time: POSIXct, format: "2014-09-01 00:01:00" "2014-09-01 00:01:00" ...
## $ Lat : num 40.2 40.8 40.8 40.7 40.8 ...
## $ Lon : num -74 -74 -74 -74 -73.9 ...
## $ Base : chr "B02512" "B02512" "B02512" "B02512" ...
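As a quick sanity check (not part of the original output), we can confirm that no timestamps failed to parse, since as.POSIXct returns NA for rows that do not match the supplied format.
# Sanity check: count timestamps that failed to parse (as.POSIXct yields NA on mismatch)
sum(is.na(uber_data$Date.Time))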
Next, we remove duplicate rows and keep only the longitude and latitude columns for clustering.
uber_data <- unique(uber_data)
uber_locations <- uber_data[, c("Lon", "Lat")]
We do not scale the longitude and latitude values, since standardizing geographic coordinates would distort their real spatial meaning.
Reference:
https://datascience.stackexchange.com/questions/56186/clustering-with-geolocation-lat-long-pairs-attributes
# Hopkins stat
set.seed(123)
sample_rows <- sample(1:nrow(uber_locations), 5000)
sample_uber_locations <- uber_locations[sample_rows, ]
get_clust_tendency(sample_uber_locations, 2, graph=TRUE, gradient=list(low="red", mid="white", high="blue"), seed = 123)
## $hopkins_stat
## [1] 0.9818681
##
## $plot
Since the Hopkins statistic is close to 1, we can say that our data is highly clusterable. Running the Hopkins statistic on the entire dataset caused memory issues, so I created a random sample of 5000 rows to serve as the baseline for the clustering and analysis that follows.
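As an additional check (not part of the original analysis), the tendency assessment can be repeated on a few different random samples to confirm that the high Hopkins statistic is not an artifact of the particular 5000 rows drawn; the seed and subsample size below are illustrative.
# Sketch: repeat the Hopkins statistic on several random 5000-row samples
set.seed(42)  # illustrative seed, different from the one used above
hopkins_runs <- sapply(1:5, function(i) {
  rows <- sample(nrow(uber_locations), 5000)
  get_clust_tendency(uber_locations[rows, ], n = 100, graph = FALSE)$hopkins_stat
})
summary(hopkins_runs)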
# Try k values from 2 to 10 for CLARA
sil_width <- numeric(9)
for (k in 2:10) {
  clara_model <- clara(sample_uber_locations, k, metric = "euclidean", stand = FALSE,
                       samples = 10, sampsize = 2000, trace = 0, medoids.x = TRUE,
                       rngR = FALSE, pamLike = FALSE, correct.d = TRUE)
  sil <- silhouette(clara_model$clustering, dist(sample_uber_locations))
  sil_width[k - 1] <- mean(sil[, 3])
}
plot(2:10, sil_width, type = "b",
xlab = "Number of clusters (k)",
ylab = "Average silhouette width",
main = "CLARA silhouette analysis")
best_k <- which.max(sil_width) + 1
cat(sprintf("Optimal number of clusters (CLARA): %d\n", best_k))
## Optimal number of clusters (CLARA): 3
opt1<-NbClust(sample_uber_locations, distance="euclidean", min.nc=2, max.nc=10, method="complete", index="ch")
opt1$Best.nc
## Number_clusters Value_Index
## 4.000 1126.174
The overall silhouette score is poor (around 0.3) and volatile when parameters such as the sample size and number of samples are changed, which suggests that CLARA is far from the optimal clustering algorithm here. Furthermore, the Calinski-Harabasz index points to k = 4, while the silhouette plot contradicts this by giving k = 4 the worst silhouette score, which is another reason to try a different approach. Additionally, the modest value of the index indicates that the clusters computed on the sample are not of high quality with clear separations.
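To make the disagreement between the two criteria explicit, one could compute the Calinski-Harabasz index and the average silhouette width side by side for each k with fpc::cluster.stats; the following is a minimal sketch under the same CLARA settings used above, not part of the original analysis.
# Sketch: compare CH index and average silhouette width per k on the sample
d <- dist(sample_uber_locations)
criteria_by_k <- t(sapply(2:10, function(k) {
  cl <- clara(sample_uber_locations, k, samples = 10, sampsize = 2000)
  st <- fpc::cluster.stats(d, cl$clustering)
  c(k = k, ch = st$ch, avg_silwidth = st$avg.silwidth)
}))
round(criteria_by_k, 3)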
cluster_clara <- eclust(sample_uber_locations,"clara",k=3)
opt<-Optimal_Clusters_KMeans(sample_uber_locations, max_clusters=10, plot_clusters=TRUE, criterion="silhouette")
Running silhouette analysis on k-means tells us that the optimal number of clusters is 2, since it has the highest silhouette width. We can also see that k = 3 and k = 4 are quite close and are possible candidates.
fviz_nbclust(sample_uber_locations, kmeans, method = "gap_stat")
Computing the gap statistic indicates that it is actually k = 1 that has the largest gap value.
cluster_km <- eclust(sample_uber_locations,"kmeans", 2)
cluster_km <- eclust(sample_uber_locations,"kmeans", 3)
cluster_km <- eclust(sample_uber_locations,"kmeans", 4)
Running k-means, we can clearly see that this is not a good clustering, as the clusters are not separated from each other at all. We also see an inconsistency between the gap statistic and the silhouette width as clustering quality metrics.
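To see where the k-means partition breaks down, the per-observation silhouette widths can be inspected directly; this is a minimal sketch (not part of the original output) using factoextra::fviz_silhouette on the eclust object from above.
# Sketch: per-point silhouette widths for the most recent k-means fit (k = 4 from the last eclust call)
fviz_silhouette(cluster_km)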
Based on a few online resources, it is worth trying density-based clustering instead.
uber_data_matrix <- as.matrix(sample_uber_locations)
set.seed(123)
# Choose eps from the "knee" of the 5-NN distance plot; the dashed line marks eps = 0.05
dbscan::kNNdistplot(uber_data_matrix, k = 5)
abline(h = 0.05, lty = 2)
uber_data_dbscan <- fpc::dbscan(uber_data_matrix, eps = 0.05, MinPts = 10)
plot(uber_data_dbscan, uber_data_matrix, main = "DBSCAN on Uber data", frame = FALSE)
fviz_cluster(uber_data_dbscan, data = uber_data_matrix, geom = "point")
table(uber_data_dbscan$cluster)
##
## 0 1 2
## 53 4918 29
Running a density-based approach like DBSCAN gives better clustering: visually, there are two clusters with some separation between them. However, the majority of points are stacked in the pink cluster, and only a few points fall in the other cluster.
This required some tuning of the MinPts parameter, as I wanted to avoid weak mini-clusters forming in close vicinity to a bigger cluster; a small sweep over candidate values is sketched below.
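A minimal sketch of that tuning (the candidate MinPts values here are illustrative, not the exact ones originally tried):
# Sketch: how the number of clusters and noise points reacts to MinPts at eps = 0.05
for (mp in c(5, 10, 20, 50)) {
  db <- fpc::dbscan(uber_data_matrix, eps = 0.05, MinPts = mp)
  cat(sprintf("MinPts = %2d -> clusters: %d, noise points: %d\n",
              mp, max(db$cluster), sum(db$cluster == 0)))
}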
# Pairwise Haversine distances (in metres), converted to km
dist_hav <- distm(uber_data_matrix, fun = distHaversine)
dist_hav_km <- dist_hav / 1000
eps_km <- 3
MinPts <- 10
# dbscan::dbscan accepts a precomputed dist object, so eps is interpreted in km here
db_hav <- dbscan::dbscan(as.dist(dist_hav_km), eps = eps_km, minPts = MinPts, borderPoints = TRUE)
fviz_cluster(db_hav, data = uber_data_matrix, geom = "point")
table(db_hav$cluster)
##
## 0 1 2
## 88 4888 24
Running DBSCAN with a Haversine-based distance gives results aligned with DBSCAN using Euclidean distance. We can conclude from DBSCAN that the biggest cluster could represent a popular city area, while the other, weaker cluster could represent the suburbs.
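To back the city-versus-suburbs reading with numbers, one could measure how geographically spread out each cluster is; the sketch below (not part of the original analysis) reports each cluster's median distance to its own centroid in km using geosphere::distHaversine.
# Sketch: geographic spread of each Haversine-DBSCAN cluster (cluster 0 = noise, excluded)
for (cl in sort(unique(db_hav$cluster[db_hav$cluster > 0]))) {
  pts <- uber_data_matrix[db_hav$cluster == cl, , drop = FALSE]
  centroid <- colMeans(pts)  # columns are Lon, Lat
  d_km <- geosphere::distHaversine(pts, centroid) / 1000
  cat(sprintf("Cluster %d: %d points, median distance to centroid %.2f km\n",
              cl, nrow(pts), median(d_km)))
}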
uber_time_analysis <- uber_data[sample_rows, ]
uber_time_analysis$cluster <- uber_data_dbscan$cluster
uber_time_analysis$hour <- as.numeric(format(uber_time_analysis$Date.Time, "%H"))
uber_time_analysis$date <- as.Date(uber_time_analysis$Date.Time)
# Removing the noise based cluster from analysis (cluster 0)
uber_clusters <- uber_time_analysis %>%
  filter(cluster %in% c(1, 2))
cluster_time_summary <- uber_clusters %>%
  group_by(cluster, hour) %>%
  summarise(trip_count = n(), .groups = "drop")
ggplot(cluster_time_summary,
       aes(x = as.numeric(hour), y = trip_count, color = factor(cluster))) +
  geom_line(size = 1.2) +
  geom_point(size = 2) +
  facet_wrap(~cluster, scales = "free_y") +
  scale_color_manual(
    values = c("1" = "pink", "2" = "cyan"),
    labels = c("1" = "Cluster 1", "2" = "Cluster 2")
  ) +
  labs(
    title = "Cluster Activity by Time of Day",
    subtitle = "Hourly trip counts shown separately per cluster",
    x = "Hour of Day (0–23)",
    y = "Number of Trips",
    color = "Cluster"
  ) +
  theme_minimal() +
  theme(legend.position = "none")
Comparing the activity between the two clusters, we can clearly see that they are distinct. Cluster 1 exhibits behavior that matches people's daily routines: activity rises in the morning between roughly 7:30 and 8:30 AM as people commute to work, and there is another spike at the end of the workday around 4:30-5:30 PM as people leave work or go on outings afterwards. Towards the end of the day, Uber activity decreases, which makes sense since most people tend to avoid late-night trips, particularly on weekdays.
Cluster 2 shows much more irregular activity, likely because it is a more specialized location, such as a suburban area further from the city; the spikes in the early morning and evening can indicate the early departures and late arrivals of commuters returning to the suburbs.
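The peak hours can also be extracted programmatically rather than read off the plot; a minimal sketch (not part of the original analysis) using dplyr::slice_max:
# Sketch: top three busiest hours per cluster
cluster_time_summary %>%
  group_by(cluster) %>%
  slice_max(trip_count, n = 3) %>%
  arrange(cluster, desc(trip_count))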
cluster_date_summary <- uber_time_analysis %>%
  filter(cluster %in% c(1, 2)) %>%
  group_by(cluster, date) %>%
  summarise(trip_count = n(), .groups = "drop")
ggplot(cluster_date_summary,
       aes(x = date, y = trip_count, color = factor(cluster))) +
  geom_line(size = 1.2) +
  geom_point(size = 2) +
  facet_wrap(~cluster, scales = "free_y") +
  scale_color_manual(
    values = c("1" = "pink", "2" = "cyan"),
    labels = c("1" = "Cluster 1", "2" = "Cluster 2")
  ) +
  labs(
    title = "Cluster Activity by Date",
    subtitle = "Daily trip counts shown separately per cluster",
    x = "Date",
    y = "Number of Trips",
    color = "Cluster"
  ) +
  theme_minimal() +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 45, hjust = 1)
  )
Cluster 1 exhibits relatively high and stable daily trip volumes throughout September, with small fluctuations that reflect typical weekday-weekend demand cycles. Peaks occur periodically every few days, indicating that this cluster represents the main urban area with continuous ride activity. The absence of any dramatic long-term decline suggests that demand for Uber drivers here is consistently high.
Cluster 2, on the other hand, displays very low and irregular trip counts, rarely exceeding a few rides per day. The inconsistent pattern implies that this cluster corresponds to a low-demand region, where ride requests are occasional rather than routine.
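The weekday-weekend cycle mentioned above can be checked directly; a minimal sketch, not part of the original analysis, assuming lubridate and tidyr are installed alongside the tidyverse:
# Sketch: trips per weekday for each cluster
uber_clusters %>%
  mutate(weekday = lubridate::wday(date, label = TRUE)) %>%
  count(cluster, weekday) %>%
  tidyr::pivot_wider(names_from = weekday, values_from = n)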
cluster_centers <- uber_time_analysis %>%
  filter(cluster %in% c(1, 2)) %>%
  group_by(cluster) %>%
  summarise(
    center_lon = mean(Lon),
    center_lat = mean(Lat),
    .groups = "drop"
  )
print(cluster_centers)
## # A tibble: 2 × 3
## cluster center_lon center_lat
## <dbl> <dbl> <dbl>
## 1 1 -74.0 40.7
## 2 2 -74.2 40.7
# Take points within ~0.01 degrees (roughly 1 km at this latitude) of each calculated cluster center
radius <- 0.01
uber_center_points <- uber_time_analysis %>%
  inner_join(cluster_centers, by = "cluster") %>%
  filter(abs(Lon - center_lon) < radius,
         abs(Lat - center_lat) < radius)
center_hour_summary <- uber_center_points %>%
  group_by(cluster, hour) %>%
  summarise(trip_count = n(), .groups = "drop")
ggplot(center_hour_summary, aes(x = as.numeric(hour), y = trip_count,
                                color = factor(cluster), group = cluster)) +
  geom_line(size = 1.2) +
  geom_point(size = 2) +
  scale_color_manual(
    values = c("1" = "pink", "2" = "cyan"),
    labels = c("Cluster 1", "Cluster 2")
  ) +
  labs(
    title = "Trip Activity Around Cluster Centers by Hour",
    subtitle = "Trips within 1 km of each cluster center",
    x = "Hour of Day (0–23)",
    y = "Number of Trips",
    color = "Cluster"
  ) +
  theme_minimal()
center_date_summary <- uber_center_points %>%
  group_by(cluster, date) %>%
  summarise(trip_count = n(), .groups = "drop")
ggplot(center_date_summary, aes(x = date, y = trip_count,
                                color = factor(cluster), group = cluster)) +
  geom_line(size = 1.2) +
  geom_point(size = 2) +
  scale_color_manual(
    values = c("1" = "pink", "2" = "cyan"),
    labels = c("Cluster 1", "Cluster 2")
  ) +
  labs(
    title = "Trip Activity Around Cluster Centers by Date",
    subtitle = "Trips within a radius around the calculated center",
    x = "Date",
    y = "Number of Trips",
    color = "Cluster"
  ) +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    axis.text.x = element_text(angle = 45, hjust = 1)
  )
Taking a closer look at the date and time of data points near the cluster centers, the trend of higher overall demand continues for Cluster 1, while Cluster 2 shows far less frequent activity.
set.seed(1)
rides_by_cluster_time <- uber_time_analysis %>%
  group_by(cluster, hour = lubridate::floor_date(Date.Time, "hour")) %>%
  summarise(count = n(), .groups = "drop")
cluster_id <- 1
cluster1 <- rides_by_cluster_time %>%
  filter(cluster == cluster_id) %>%
  rename(ds = hour, y = count)
m <- prophet(cluster1)
future <- make_future_dataframe(m, periods = 24, freq = "hour")
forecast <- predict(m, future)
historical_data <- uber_time_analysis %>%
  filter(cluster == cluster_id)
dens <- kde2d(
  historical_data$Lon,
  historical_data$Lat,
  n = 200
)
dens_df <- data.frame(
  expand.grid(Lon = dens$x, Lat = dens$y),
  density = as.vector(dens$z)
)
# Expected number of rides over the forecast horizon (mean of Prophet's yhat for the future hours)
N_pred <- round(mean(forecast$yhat[forecast$ds > max(cluster1$ds)]))
# Turn the KDE grid into a sampling distribution over grid cells
prob <- as.numeric(dens$z)
prob <- prob / sum(prob)
idx <- sample(length(prob), size = N_pred, replace = TRUE, prob = prob)
# Recover the lon/lat coordinates of the sampled grid cells
# (kde2d's z matrix is flattened column-major, so Lon varies fastest)
lon_grid <- rep(dens$x, times = length(dens$y))
lat_grid <- rep(dens$y, each = length(dens$x))
predicted_points <- data.frame(
  Lon = lon_grid[idx],
  Lat = lat_grid[idx]
)
ggplot() +
  geom_tile(data = dens_df, aes(x = Lon, y = Lat, fill = density), alpha = 0.7) +
  geom_point(data = predicted_points, aes(x = Lon, y = Lat),
             color = "cyan", alpha = 0.6, size = 1) +
  scale_fill_viridis(option = "plasma") +
  labs(
    title = "Forecasted Ride Locations (Next 24 Hours) – Cluster 1",
    subtitle = "Predicted rides sampled from KDE weighted by Prophet forecast",
    x = "Longitude",
    y = "Latitude",
    fill = "Density"
  ) +
  theme_minimal()
The heat map above is built from the historical pickup locations inside the cluster. The cyan points show that all of the newly predicted pickups fall around the highest-demand zone near (−74.0, 40.75), so we can reasonably say that if an Uber driver wants a higher chance of receiving trip requests over the next 24 hours after 2014-09-30 22:51:00 CEST (the latest recorded time in Cluster 1), they should stay around this hotspot.
Because Cluster 1 has significantly more points than the very sparse Cluster 2, this kind of forecasting makes sense to run only on Cluster 1.
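For completeness, here is a minimal sketch (not part of the original output) of how the hourly Prophet forecast and its uncertainty band for those next 24 hours could be inspected before being collapsed into the single expected count N_pred:
# Sketch: inspect the forecast for the 24 future hours used to derive N_pred
next_24h <- forecast %>%
  filter(ds > max(cluster1$ds)) %>%
  dplyr::select(ds, yhat, yhat_lower, yhat_upper)
head(next_24h)
plot(m, forecast)                     # Prophet's built-in forecast plot
prophet_plot_components(m, forecast)  # trend and daily seasonality components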