# install.packages("flexclust")
# install.packages("fpc")
# install.packages("cluster")
# install.packages("ClusterR")
# install.packages("tidyverse")
# install.packages("dendextend")
# install.packages("hopkins")
# install.packages("dplyr")
# install.packages("factoextra")
# install.packages("fpc")
# install.packages("dbscan")
# install.packages("NbClust")
library(factoextra)
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(flexclust)
library(fpc)
library(cluster)
library(ClusterR)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ lubridate 1.9.4     ✔ tibble    3.3.0
## ✔ purrr     1.2.0     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dendextend)
## 
## ---------------------
## Welcome to dendextend version 1.19.1
## Type citation('dendextend') for how to cite the package.
## 
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
## 
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## You may ask questions at stackoverflow, use the r and dendextend tags: 
##   https://stackoverflow.com/questions/tagged/dendextend
## 
##  To suppress this message use:  suppressPackageStartupMessages(library(dendextend))
## ---------------------
## 
## 
## Attaching package: 'dendextend'
## 
## The following object is masked from 'package:stats':
## 
##     cutree
library(hopkins)
library(dplyr)
library(ggplot2)
library(dbscan)
## 
## Attaching package: 'dbscan'
## 
## The following object is masked from 'package:fpc':
## 
##     dbscan
## 
## The following object is masked from 'package:stats':
## 
##     as.dendrogram
library(NbClust)
library(forecast)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(lubridate)

1. Data Preparation

Loading data into uber_data

uber_data <- read.csv("C:/Users/akhil/OneDrive/Desktop/UNI/Unsupervised/uber-data.csv")
head(uber_data)
str(uber_data)
## 'data.frame':    1028136 obs. of  4 variables:
##  $ Date.Time: chr  "9/1/2014 0:01:00" "9/1/2014 0:01:00" "9/1/2014 0:03:00" "9/1/2014 0:06:00" ...
##  $ Lat      : num  40.2 40.8 40.8 40.7 40.8 ...
##  $ Lon      : num  -74 -74 -74 -74 -73.9 ...
##  $ Base     : chr  "B02512" "B02512" "B02512" "B02512" ...

Cleaning Data

  1. Renaming column names
names(uber_data) <- c("Date.Time", "Latitude", "Longitude", "Base")
  2. Converting to date-time
uber_data$Date.Time <- as.POSIXct(uber_data$Date.Time,
                             format = "%m/%d/%Y %H:%M:%S")
  3. Checking for NAs and removing exact duplicates
colSums(is.na(uber_data))
## Date.Time  Latitude Longitude      Base 
##         0         0         0         0
sum(duplicated(uber_data))
## [1] 24037
uber_data <- unique(uber_data)
sum(duplicated(uber_data))
## [1] 0

Visualizing the data

This graph visualizes all the pickup coordinates in uber_data.

X <- as.data.frame(uber_data %>% select(Longitude, Latitude))
ggplot(X, aes(x = Longitude, y = Latitude)) +
  geom_point(alpha = 0.3, color = "blue") +
  labs(title = "Uber pickup locations", x = "Longitude", y = "Latitude") +
  theme_minimal()

My initial impression was that it’s not immediately obvious how many clusters exist or what their shapes might be.

I also added some columns that will help later on, and plotted graphs showing the general trend of the entire data set by hour and by day of the week.

uber_data$hour <- as.numeric(format(uber_data$Date.Time, "%H"))
uber_data$weekday <- weekdays(uber_data$Date.Time)
uber_data$day <- as.numeric(format(uber_data$Date.Time, "%d"))

ggplot(uber_data, aes(x = hour)) +
  geom_histogram(binwidth = 1, fill = "skyblue", color = "white") +
  labs(title = "Trips by Hour", x = "Hour of the Day", y = "Number of Trips") +
  theme_minimal()

ggplot(uber_data, aes(x = weekday)) +
  geom_bar(fill = "steelblue") +
  labs(title = "Trips by Day of Week", x = "Day", y = "Trips") +
  theme_minimal()

Setting up samples with different seeds

set.seed(1)
sample_index1 <- sample(1:nrow(X), 5000)
X_sample1 <- X[sample_index1, ]

set.seed(2)
sample_index2 <- sample(1:nrow(X), 5000)
X_sample2 <- X[sample_index2, ]

set.seed(3)
sample_index3 <- sample(1:nrow(X), 5000)
X_sample3 <- X[sample_index3, ]

set.seed(123)
sample_index123 <- sample(1:nrow(X), 5000)
X_sample <- X[sample_index123, ]

2. Clustering

1. Checking for clusterability

Hopkins Statistic

To check whether the data is clusterable, I used the Hopkins Statistic.

get_clust_tendency(X_sample, 2, graph=TRUE, gradient=list(low="red", mid="white", high="blue"), seed = 1)
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the factoextra package.
##   Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## $hopkins_stat
## [1] 0.9422234
## 
## $plot

Since the Hopkins statistic (0.9422234) is close to 1, we can say that the data has a significant tendency to form clusters rather than being uniformly distributed. Hence we can move forward with finding the optimal number of clusters (k).
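
As a cross-check, the hopkins package (loaded earlier but not otherwise used) computes the statistic directly. A minimal sketch, using the package's conventional sample size of one tenth of the rows:

set.seed(1)
# hopkins() returns values near 1 for clusterable data and near 0.5 for random data
hopkins::hopkins(X_sample, m = ceiling(nrow(X_sample) / 10))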

2. Finding the right k (number of clusters)

Calinski-Harabasz

h1<-NbClust(X_sample1, distance="euclidean", min.nc=2, max.nc=8, method="complete", index="ch")
h2<-NbClust(X_sample2, distance="euclidean", min.nc=2, max.nc=8, method="complete", index="ch")
h3<-NbClust(X_sample3, distance="euclidean", min.nc=2, max.nc=8, method="complete", index="ch")
h1$Best.nc
## Number_clusters     Value_Index 
##            2.00         1610.57
h2$Best.nc
## Number_clusters     Value_Index 
##           7.000        1165.835
h3$Best.nc
## Number_clusters     Value_Index 
##           2.000        2522.793

I tested the Calinski-Harabasz method on three different samples (using different random seeds), and the optimal numbers of clusters obtained were 2, 7, and 2. Even though one sample suggested 7 clusters, I chose the solution with the highest Value Index (2522.793), which corresponds to 2 clusters. The Calinski-Harabasz index is a ratio of between-cluster to within-cluster dispersion, so higher values indicate more distinct, well-separated clusters.
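
For a sanity check on any single partition, fpc::calinhara() computes the index directly. A sketch, assuming a k-means fit at k = 2 on the third sample (km_check is a hypothetical name; the value will differ somewhat from NbClust's complete-linkage partition):

set.seed(3)
km_check <- kmeans(X_sample3, centers = 2, nstart = 25)
# Ratio of between-cluster to within-cluster dispersion for this partition
fpc::calinhara(X_sample3, km_check$cluster)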

Silhouette and Gap Statistics

  1. Hierarchical
fviz_nbclust(X_sample1, FUN = hcut, method = "silhouette")

fviz_nbclust(X_sample2, FUN = hcut, method = "silhouette")

fviz_nbclust(X_sample3, FUN = hcut, method = "silhouette")

fviz_nbclust(X_sample1, FUN = hcut, method = "gap_stat")

fviz_nbclust(X_sample2, FUN = hcut, method = "gap_stat")

fviz_nbclust(X_sample3, FUN = hcut, method = "gap_stat")

  2. K-means
fviz_nbclust(X_sample1, FUN = kmeans, method = "silhouette")

fviz_nbclust(X_sample2, FUN = kmeans, method = "silhouette")

fviz_nbclust(X_sample3, FUN = kmeans, method = "silhouette")

fviz_nbclust(X_sample1, FUN = kmeans, method = "gap_stat")

fviz_nbclust(X_sample2, FUN = kmeans, method = "gap_stat")

fviz_nbclust(X_sample3, FUN = kmeans, method = "gap_stat")

  3. CLARA
fviz_nbclust(X_sample1, FUN = cluster::clara, method = "silhouette")

fviz_nbclust(X_sample2, FUN = cluster::clara, method = "silhouette")

fviz_nbclust(X_sample3, FUN = cluster::clara, method = "silhouette")

fviz_nbclust(X_sample1, FUN = cluster::clara, method = "gap_stat")

fviz_nbclust(X_sample2, FUN = cluster::clara, method = "gap_stat")

fviz_nbclust(X_sample3, FUN = cluster::clara, method = "gap_stat")

Silhouette Width

I used different samples to ensure there was no bias in my analysis. Across all the Silhouette Analyses, the general trend indicated that the optimal number of clusters was 2, with a silhouette score of around 0.7 for both hierarchical and K-means methods. However, the CLARA method didn’t perform as well, as its average silhouette width was typically between 0.4 and 0.5.
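
The average silhouette width behind these plots can also be computed by hand with cluster::silhouette(). A minimal sketch for k-means at k = 2 on the first sample (km_sil and sil_km are hypothetical names):

set.seed(1)
km_sil <- kmeans(X_sample1, centers = 2, nstart = 25)
# silhouette() needs the cluster labels plus a distance matrix
sil_km <- cluster::silhouette(km_sil$cluster, dist(X_sample1))
mean(sil_km[, "sil_width"])  # average silhouette width; ~0.7 would match the plots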

Gap Statistic

I also used the Gap Statistic method, and from the graphs, it consistently indicated that the optimal number of clusters was 1.
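
For a numeric summary rather than just the fviz_nbclust plots, the gap statistic can also be run directly with cluster::clusGap. A sketch, using B = 50 bootstrap references to keep the run time down (gap_km is a hypothetical name):

set.seed(1)
gap_km <- cluster::clusGap(X_sample1, FUN = kmeans, nstart = 25, K.max = 8, B = 50)
# "firstSEmax" is the same rule fviz_nbclust uses to pick k from the gap curve
print(gap_km, method = "firstSEmax")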

3. Applying the optimal k to different clustering algorithms

  1. K-means
km2 <- eclust(X_sample, "kmeans", k = 2)

km3 <- eclust(X_sample, "kmeans", k = 3)

With k = 2, the algorithm splits the dataset almost evenly into two halves, forming a “pie-slice” pattern. The same thing happens at k = 3: another slice is added, and there are still no clear divisions between the clusters. This is most likely because K-means assumes clusters that are roughly spherical and of similar sizes, unlike the irregular patterns here.
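
As a quick numeric check on that near-even split, the cluster sizes can be tabulated:

# Sizes of the two k-means clusters; a near 50/50 split supports the "pie-slice" reading
table(km2$cluster)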

  2. CLARA
clara2 <- eclust(X_sample, "clara", k = 2)

clara3 <- eclust(X_sample, "clara", k = 3)

CLARA shows similar behavior. At k = 2, it again splits the area almost straight down the middle, which is not very useful. At k = 3, it adds another pie slice. Even though CLARA handles outliers better than K-means, it still fails to produce meaningful clusters.

Since both K-means and CLARA tend to partition space geometrically rather than by actual density, I shifted my focus to more density-based methods like DBSCAN.

  3. DBSCAN

I ran DBSCAN on several different random samples to check whether the results were consistent and not biased by any single sample. In most runs, the algorithm found the same two clusters: one large cluster with around 9,800 points (Cluster 1) and a smaller one with about 80 points (Cluster 2).
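
The four runs below repeat the same sample-fit-plot steps explicitly; as a sketch, a small helper (run_dbscan_sample is a hypothetical name) could wrap them:

run_dbscan_sample <- function(seed, n = 10000, eps = 0.05, minpts = 10) {
  set.seed(seed)
  samp <- X[sample(1:nrow(X), n), ]                 # draw a fresh sample of pickups
  db_fit <- fpc::dbscan(samp, eps = eps, MinPts = minpts)
  print(table(db_fit$cluster))                      # cluster sizes (0 = noise)
  invisible(list(sample = samp, fit = db_fit))
}
# e.g. res1 <- run_dbscan_sample(1); res10 <- run_dbscan_sample(10)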

set.seed(1)
sample_index_db1 <- sample(1:nrow(X), 10000)
X_sample_db1 <- X[sample_index_db1, ]

kNNdistplot(X_sample_db1, k = 10)
abline(h = 0.05, lty = 2)

db <- fpc::dbscan(X_sample_db1, eps = 0.05, MinPts = 10)
plot(db, X_sample_db1, main = "DBSCAN", frame = FALSE)

fviz_cluster(db, X_sample_db1, stand = FALSE, geom = "point")

table(db$cluster)
## 
##    0    1    2 
##   85 9835   80
set.seed(10)
sample_index_db10 <- sample(1:nrow(X), 10000)
X_sample_db10 <- X[sample_index_db10, ]

kNNdistplot(X_sample_db10, k = 10)
abline(h = 0.05, lty = 2)

db <- fpc::dbscan(X_sample_db10, eps = 0.05, MinPts = 10)
plot(db, X_sample_db10, main = "DBSCAN", frame = FALSE)

fviz_cluster(db, X_sample_db10, stand = FALSE, geom = "point")

table(db$cluster)
## 
##    0    1    2 
##   67 9858   75
set.seed(50)
sample_index_db50 <- sample(1:nrow(X), 10000)
X_sample_db50 <- X[sample_index_db50, ]

kNNdistplot(X_sample_db50, k = 10)
abline(h = 0.05, lty = 2)

db <- fpc::dbscan(X_sample_db50, eps = 0.05, MinPts = 10)
plot(db, X_sample_db50, main = "DBSCAN", frame = FALSE)

fviz_cluster(db, X_sample_db50, stand = FALSE, geom = "point")

table(db$cluster)
## 
##    0    1    2    3 
##   69 9835   86   10
set.seed(123)
sample_index <- sample(1:nrow(X), 10000)
X_sample <- X[sample_index, ]

kNNdistplot(X_sample, k = 10)
abline(h = 0.05, lty = 2)

db <- fpc::dbscan(X_sample, eps = 0.05, MinPts = 10)
plot(db, X_sample, main = "DBSCAN", frame = FALSE)

fviz_cluster(db, X_sample, stand = FALSE, geom = "point")

table(db$cluster)
## 
##    0    1    2 
##   70 9854   76

During the DBSCAN experiments, I tuned the values of epsilon and MinPts to get stable, meaningful clustering results.

MinPts

When I used a small MinPts value (around 5), many small, insignificant clusters appeared right next to the large cluster. To fix this, I increased MinPts and found a sweet spot at MinPts = 10, which allowed the small fragments near Cluster 1 to merge into it.

Epsilon

I tested several epsilon values to prevent the formation of unnecessary clusters and found that an epsilon of about 0.05 gave the best overall results. The kNNdistplot graphs above also helped me decide on the optimal epsilon value; a compact sketch of this tuning loop follows.
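
A compact version of that tuning loop might look like the sketch below (the grid values are illustrative, not the exact ones I tried):

for (eps_val in c(0.03, 0.05, 0.08)) {
  for (mp in c(5, 10, 15)) {
    fit <- fpc::dbscan(X_sample_db1, eps = eps_val, MinPts = mp)
    n_clusters <- length(setdiff(unique(fit$cluster), 0))  # label 0 is noise
    cat("eps =", eps_val, " MinPts =", mp, " -> clusters:", n_clusters, "\n")
  }
}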

3. Analyzing the clusters

I made sure that the sample data was correctly matched back to the uber_data data frame, so that all columns (Date.Time, day, weekday, hour, Longitude, Latitude, and cluster) were combined into a single data frame.

X_sample$cluster <- db$cluster

X_sample$index <- sample_index
uber_data$index <- 1:nrow(uber_data)
merged <- merge(X_sample, uber_data[, c("index", "Date.Time", "hour", "day", "weekday")], by = "index")

uber_filtered <- merged %>% filter(cluster != 0)
uber_filtered$weekday <- factor(
  format(uber_filtered$Date.Time, "%a"),
  levels = c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"),
  ordered = TRUE
)

To analyze activity around the cluster centers (used later in the time and date analysis), I used a radius-based approach that keeps only the nearby points that best represent each cluster’s core. The selected points are highlighted in yellow.

cluster_centers <- uber_filtered %>%
  group_by(cluster) %>%
  summarise(
    center_long = mean(Longitude),
    center_lat = mean(Latitude)
  )

uber_core <- uber_filtered %>%
  left_join(cluster_centers, by = "cluster") %>%
  mutate(
    dist_center = sqrt((Longitude - center_long)^2 + (Latitude - center_lat)^2)
  )

uber_core <- uber_core %>%
  filter(dist_center <= 0.01)

ggplot(uber_filtered, aes(x = Longitude, y = Latitude, color = factor(cluster))) +
  geom_point(alpha = 0.2) +
  geom_point(data = uber_core, color = "yellow", size = 1) +
  geom_point(data = cluster_centers, aes(x = center_long, y = center_lat),
             color = "black", size = 4, shape = 8) +
  labs(title = "Cluster centers and points within chosen radius",
       x = "Longitude", y = "Latitude", color = "Cluster") +
  theme_minimal()

1. Analysis based on time (hour)

ggplot(uber_filtered, aes(x = hour, fill = factor(cluster))) +
  geom_histogram(binwidth = 1, color = "white") +
  facet_wrap(~ cluster, scales = "free_y") +
  labs(title = "Cluster activity by hour (faceted view)",
       x = "Hour of Day", y = "Trips (per cluster)", fill = "Cluster") +
  theme_minimal()

hourly_cluster_summary <- uber_filtered %>%
  group_by(cluster, hour) %>%
  summarise(trip_count = n(), .groups = "drop")

ggplot(hourly_cluster_summary, aes(x = hour, y = trip_count, color = factor(cluster))) +
  geom_line(linewidth = 1) +
  geom_point() +
  labs(title = "Cluster activity by hour (line plot)",
       x = "Hour of Day", y = "Trips", color = "Cluster") +
  theme_minimal()

These plots show how ride activity changes by hour for each cluster.

Cluster 1

In Cluster 1, ride activity increases around 6 AM, peaks at 8 AM, dips during midday, and rises again between 4 and 6 PM. This pattern makes sense, as these are the times people usually commute between home and work. The high trip volume and dense activity suggest that Cluster 1 mostly represents a city center or a major commercial area.

Cluster 2

Cluster 2 has fewer and more irregular trips. Although there is a small evening peak (similar to Cluster 1), the scattered pattern suggests it probably represents suburban or residential areas where people use Uber less often, mostly for occasional rides.

Analyzing the cluster centers by time

hourly_core_summary <- uber_core %>%
  group_by(cluster, hour) %>%
  summarise(trip_count = n(), .groups = "drop")

ggplot(hourly_core_summary, aes(x = hour, y = trip_count, color = factor(cluster))) +
  geom_line(linewidth = 1) +
  geom_point(size = 2) +
  labs(title = "Hourly activity within a specific radius",
       x = "Hour of Day", y = "Number of Trips", color = "Cluster") +
  theme_minimal()

To focus on the main areas of each cluster, I applied a radius-based filter around each center to extract the points that best represent it. The line chart shows that even with only these core points, Cluster 1 keeps its two-peak pattern (around 8 AM and 5 PM), while Cluster 2 shows small, irregular bursts of activity.

This radius-based filtering (on time of day) shows that most Uber rides come from the main cluster centers, and demand patterns around these areas stay consistent even after removing spatial noise.

2. Analysis based on date (day of the month and weekday)

ggplot(uber_filtered, aes(x = day, fill = factor(cluster))) +
  geom_histogram(binwidth = 1, color = "white") +
  facet_wrap(~ cluster, scales = "free_y") +
  labs(title = "Cluster activity by Day (faceted view)",
       x = "Day of the month", y = "Trips (per cluster)", fill = "Cluster") +
  theme_minimal()

ggplot(uber_filtered, aes(x = weekday, fill = factor(cluster))) +
  geom_bar(color = "white") +
  facet_wrap(~ cluster, scales = "free_y") +
  labs(
    title = "Cluster activity by Weekday (faceted view)",
    x = "Weekdays",
    y = "Trips (per cluster)",
    fill = "Cluster"
  ) +
  theme_minimal()

daily_cluster_summary <- uber_filtered %>%
  group_by(cluster, day) %>%
  summarise(trip_count = n(), .groups = "drop")

ggplot(daily_cluster_summary, aes(x = day, y = trip_count, color = factor(cluster))) +
  geom_line(linewidth = 1.1) +
  geom_point(size = 2) +
  labs(title = "Cluster activity by day of month",
       x = "Day of Month (September 2014)",
       y = "Number of Trips",
       color = "Cluster") +
  theme_minimal()

weekday_cluster_summary <- uber_filtered %>%
  group_by(cluster, weekday) %>%
  summarise(trip_count = n(), .groups = "drop")

ggplot(weekday_cluster_summary, aes(x = weekday, y = trip_count, color = factor(cluster), group = cluster)) +
  geom_line(linewidth = 1.1) +
  geom_point(size = 2) +
  labs(title = "Cluster activity by weekday",
       x = "Day of Week",
       y = "Number of Trips",
       color = "Cluster") +
  theme_minimal()

These plots show the daily and weekday activity across a month.

Cluster 1

Cluster 1 shows a stable and consistent level of demand throughout the month, with daily trip counts generally ranging from 250 to 450 rides. Activity is usually higher on weekdays (especially Tuesday, Thursday, and Friday) and noticeably lower on Sundays, which makes sense, as most people are off work and many shops are closed.

This pattern aligns well with the earlier assumption that these trips are routine (office commutes, errands, etc.) and that Cluster 1 mainly represents a high-density city center or downtown region.

The consistency of this trend across both the daily and weekday views also suggests that demand in this cluster is largely unaffected by short-term factors such as weather or local events, which makes the next few days comparatively easy to forecast.

Cluster 2

Cluster 2 shows a highly irregular and inconsistent level of demand throughout the month, with daily trip counts often remaining below ten rides per day. Unlike Cluster 1, there is no clear pattern or repetition across days, which suggests that ride activity in this region is not driven by regular commuting behavior.

This pattern aligns well with the earlier assumption that Cluster 2 represents low-density suburban areas or outskirts, where Uber demand is scattered and driven more by occasional needs than by daily commuting.

The irregular trip patterns across days and weekdays suggest that demand in this cluster depends heavily on short-term factors like local events, weather, or personal travel plans, making it less predictable than the steady activity seen in Cluster 1.

Analyzing the cluster centers by date

daily_core_summary <- uber_core %>%
  group_by(cluster, day) %>%
  summarise(trip_count = n(), .groups = "drop")

ggplot(daily_core_summary, aes(x = day, y = trip_count, color = factor(cluster))) +
  geom_line(linewidth = 1) +
  geom_point(size = 2) +
  labs(
    title = "Daily activity within a specific radius of cluster centers",
    x = "Day of Month (September 2014)",
    y = "Number of Trips",
    color = "Cluster"
  ) +
  theme_minimal()

Even when looking only at points within a specific radius of each cluster center, Cluster 1 still shows clear ups and downs in activity (even though it’s not as prominent), meaning demand in the central area stays strong and consistent. Cluster 2, on the other hand, remains low and irregular, suggesting that the lower demand is a natural feature of the area rather than just because the trips are more spread out.

4. Forecasting

To estimate future ride demand, I used an ARIMA model, based on recommendations from Google and Stack Overflow. The data was grouped by day and cluster, and I generated a seven-day forecast for each cluster using auto.arima() from the forecast package.

The plots show the historical data to the left of the dashed line and the predicted values for the next week to the right.

daily_cluster_trips <- uber_filtered %>%
  mutate(date = as.Date(Date.Time)) %>%
  group_by(cluster, date) %>%
  summarise(trip_count = n(), .groups = "drop")

daily_cluster_trips_full <- daily_cluster_trips %>%
  group_by(cluster) %>%
  complete(date = seq(min(date), max(date), by = "day"), fill = list(trip_count = 0)) %>%
  ungroup()

forecasted_values <- data.frame()

for (cl in unique(daily_cluster_trips_full$cluster)) {
  cluster_data <- daily_cluster_trips_full %>% filter(cluster == cl)
  
  ts_cluster <- ts(cluster_data$trip_count, frequency = 7)
  
  model <- auto.arima(ts_cluster)
  forecast_cluster <- forecast(model, h = 7)
  
  temp <- data.frame(
    cluster = cl,
    date = seq(max(cluster_data$date) + 1, by = "day", length.out = 7),
    forecast = as.numeric(forecast_cluster$mean)
  )
  
  forecasted_values <- rbind(forecasted_values, temp)
}

combined_data <- rbind(
  daily_cluster_trips_full %>% rename(forecast = trip_count),
  forecasted_values
)

# Cluster 1
ggplot(combined_data %>% filter(cluster == 1),
       aes(x = date, y = forecast)) +
  geom_line(color = "#1f77b4", size = 1.2) +
  geom_vline(xintercept = max(daily_cluster_trips_full$date),
             linetype = "dashed", color = "black") +
  labs(title = "Cluster 1 — Actual and Forecasted Daily Trips",
       x = "Date", y = "Trips") +
  theme_minimal()

# Cluster 2
ggplot(combined_data %>% filter(cluster == 2),
       aes(x = date, y = forecast)) +
  geom_line(color = "#ff7f0e", size = 1.2) +
  geom_vline(xintercept = max(daily_cluster_trips_full$date),
             linetype = "dashed", color = "black") +
  labs(title = "Cluster 2 — Actual and Forecasted Daily Trips",
       x = "Date", y = "Trips") +
  theme_minimal()
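
The plots above show point forecasts only. The forecast objects also carry 80% and 95% prediction intervals; as a sketch (reusing forecast_cluster and cl, which still hold the last cluster fitted in the loop above), autoplot() from the forecast package displays them:

autoplot(forecast_cluster) +
  labs(title = paste("7-day ARIMA forecast with prediction intervals, cluster", cl),
       x = "Week (series frequency = 7)", y = "Trips") +
  theme_minimal()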

Cluster 1

The forecast for Cluster 1 shows the same weekday/weekend cycle of ups and downs. Daily trips range between about 250 and 450, with regular weekday peaks that match the days when people mostly commute for work. The forecast suggests this pattern will hold over the next week, meaning demand in the central area is steady and predictable.

Cluster 2

The forecast for Cluster 2 shows very low and irregular demand, with most days having fewer than five trips. There is no clear weekday/weekend pattern, which, as mentioned before, suggests that rides in this area happen only occasionally or around one-off events. The forecast for the next week shows the same flat trend, meaning demand in these outer areas is unstable and hard to predict.