#Loading packages
library(factoextra)
library(flexclust)
library(fpc)
library(clustertend)
library(cluster)
library(ClusterR)
library(tidyverse)
library(dendextend)
library(dbscan)
library(RColorBrewer)
library(tidyverse)
library(dendextend)
library(leaflet)
library(ggplot2)
#Loading data, separating data and time
uber.data <- read.csv("C:/Users/User/OneDrive/Pulpit/Studia magisterskie/USL/task_1/uber-data.csv")
uber.data = uber.data %>% separate(Date.Time, into = c("date", "time"), sep = " ")
#data preview
str(uber.data)
## 'data.frame': 1028136 obs. of 5 variables:
## $ date: chr "9/1/2014" "9/1/2014" "9/1/2014" "9/1/2014" ...
## $ time: chr "0:01:00" "0:01:00" "0:03:00" "0:06:00" ...
## $ Lat : num 40.2 40.8 40.8 40.7 40.8 ...
## $ Lon : num -74 -74 -74 -74 -73.9 ...
## $ Base: chr "B02512" "B02512" "B02512" "B02512" ...
summary(uber.data)
## date time Lat Lon
## Length:1028136 Length:1028136 Min. :39.99 Min. :-74.77
## Class :character Class :character 1st Qu.:40.72 1st Qu.:-74.00
## Mode :character Mode :character Median :40.74 Median :-73.98
## Mean :40.74 Mean :-73.97
## 3rd Qu.:40.76 3rd Qu.:-73.96
## Max. :41.35 Max. :-72.72
## Base
## Length:1028136
## Class :character
## Mode :character
##
##
##
any(is.na(uber.data))
## [1] FALSE
Here I select only the columns with Longitude and Latitude and I choose a random sample of the data to speed up the code, as the original dataset has over 1 million rows.
uber.data_short<- uber.data[sample(nrow(uber.data), 1000), ]
set.seed(123)
locations_short <- uber.data_short[3:4]
Visualizing the data on a map of New York, I can see that most observations are concentrated in Manhattan.
leaflet(data = locations_short) %>%
addTiles() %>%
addMarkers(~Lon, ~Lat)
I experimented with different clustering models and distance metrics. Based on the silhouette scores, the optimal number of clusters appears to be either 2 or 5.
opt3<-Optimal_Clusters_KMeans(locations_short, max_clusters=10, plot_clusters=TRUE, criterion="silhouette")
opt4<-Optimal_Clusters_Medoids(locations_short, max_clusters=10, plot_clusters=TRUE, criterion="silhouette",distance_metric='euclidean')
##
## Based on the plot give the number of clusters (greater than 1) that you consider optimal?
opt5<-Optimal_Clusters_Medoids(locations_short, max_clusters=10, plot_clusters=TRUE, criterion="silhouette",distance_metric='manhattan')
##
## Based on the plot give the number of clusters (greater than 1) that you consider optimal?
opt_aut<-pamk(locations_short, krange=2:10, criterion="asw", usepam=TRUE, scaling=FALSE, alpha=0.001, diss=inherits(locations_short, "dist"), critout=FALSE)
opt_aut
## $pamobject
## Medoids:
## ID Lat Lon
## 1017137 143 40.7435 -73.9849
## 62995 485 40.7446 -73.8437
## Clustering vector:
## 34964 248356 812924 69114 622188 832696 202515 349104 160372 518549
## 1 1 1 1 1 2 1 1 2 1
## 445114 873758 734479 942973 748237 1009936 659234 803536 863103 846849
## 1 2 1 1 1 1 1 1 1 1
## 832050 22897 53085 685383 736530 384005 712484 787858 216765 606188
## 1 2 1 1 1 1 1 1 1 1
## 319210 739725 575532 712534 496120 7829 784754 44596 464787 414315
## 1 2 1 2 1 1 1 1 1 1
## 128481 188406 551352 231122 995870 417005 465025 1002294 653260 370766
## 1 1 1 1 1 1 1 1 1 2
## 529872 258965 309953 147697 626640 750519 491100 319126 4317 584291
## 1 1 1 1 1 1 1 1 1 1
## 993707 734441 825454 216923 609041 814172 637988 693998 1001170 26568
## 1 1 1 1 1 1 1 1 1 1
## 203212 847316 355332 925566 155516 68480 413284 179412 60884 546334
## 1 1 1 1 1 1 1 1 1 1
## 195934 449937 306089 745259 130491 1002397 173374 754075 182352 567578
## 1 2 1 1 1 1 2 1 1 1
## 723788 235416 930997 143374 143017 593873 697535 842703 478495 70301
## 1 1 1 1 1 1 2 1 1 1
## 987678 354060 480090 271885 500249 375399 741010 832480 740677 928179
## 2 1 1 1 1 1 1 1 1 1
## 819407 843198 1009579 196088 900671 380710 1025054 839645 486046 281419
## 1 1 2 1 1 1 1 2 1 1
## 141973 160493 56896 267276 879346 285454 256924 392667 794137 713083
## 1 1 1 1 1 1 1 1 2 1
## 919986 546505 48995 645119 604265 779042 92082 876896 198304 810942
## 1 1 1 1 1 1 1 2 1 1
## 135105 861985 1017137 127913 894327 508032 918425 460217 9040 532548
## 1 1 1 1 1 1 1 1 1 1
## 851634 1023364 192293 97540 618376 553386 596176 74199 64673 509942
## 1 1 1 1 1 1 1 1 1 1
## 923173 602142 576217 144371 199735 698828 945594 892287 554232 542092
## 1 1 1 1 1 2 1 1 1 1
## 503716 132436 200671 953473 284278 211074 29308 491002 730132 1008051
## 1 1 2 1 1 1 1 1 1 1
## 143362 255247 185117 974018 210817 606915 752857 852462 280093 965251
## 1 1 1 1 1 1 1 1 1 1
## 54772 783671 737364 825340 51389 887896 467143 394954 191920 980827
## 1 1 1 1 1 1 1 1 2 1
## 989183 757440 1024211 742369 492795 778518 103545 870705 588697 721657
## 1 1 1 1 1 1 1 1 1 1
## 379898 173637 134165 223562 568374 432049 970637 504416 140278 775612
## 1 1 1 1 1 1 1 1 1 1
## 222863 721199 1007968 571653 98697 10796 165832 321752 693048 142779
## 1 1 1 1 1 1 1 1 1 1
## 354656 498718 3200 996020 313727 121425 259695 621278 79143 727941
## 1 1 1 1 1 1 1 1 1 1
## 703746 303270 1009859 472620 34255 206110 561362 789546 734639 834877
## 1 1 1 2 1 1 2 1 1 1
## 566431 443643 180248 511872 559414 388541 130422 117045 312385 839302
## 1 1 1 1 1 1 1 1 1 1
## 327547 198995 294262 879957 572619 310265 689294 36141 105978 162597
## 1 1 2 2 1 1 1 1 1 1
## 943211 345656 265333 617814 297086 391884 974286 444310 594797 794726
## 2 1 1 1 1 1 1 1 1 2
## 757097 668498 103406 761992 982789 1020194 634239 208679 854194 566507
## 1 1 1 1 1 1 1 1 1 1
## 462829 728622 421977 68188 536247 765076 773866 77111 639817 199738
## 1 1 1 1 1 2 1 1 1 1
## 387020 447802 292324 783999 102353 837178 468149 281833 515056 483122
## 2 2 2 1 1 1 1 1 1 1
## 1011523 706503 464808 702887 401296 946998 58156 279848 400263 182864
## 1 1 2 2 1 1 1 1 2 1
## 1019796 449469 838733 412256 1007181 518056 671419 296979 843564 382370
## 1 1 2 1 1 1 1 1 1 1
## 1006476 831662 664583 487666 684739 521261 651841 538414 542996 706933
## 1 1 2 1 1 1 1 1 2 1
## 538241 177014 431983 909086 392741 32727 363281 157824 678889 166812
## 1 1 1 1 1 2 1 1 1 1
## 755113 718793 646550 301247 741062 879363 405502 695598 530779 151598
## 1 1 1 1 1 1 1 1 1 1
## 1025678 718590 928641 803208 382271 627585 891809 932297 688763 510823
## 1 1 1 1 1 1 1 1 1 1
## 111770 333837 694867 877714 474736 407192 457824 789852 795794 965300
## 1 1 1 1 1 1 1 1 1 1
## 796950 271121 882703 338321 538734 969380 557565 288653 484845 215939
## 1 1 2 1 2 1 1 1 1 1
## 530547 742583 475084 717116 285574 750939 887647 391914 271104 371993
## 1 1 1 1 1 1 1 1 1 1
## 646554 789123 196887 918768 847239 315463 501741 161971 588818 663486
## 1 1 1 1 1 1 1 1 1 1
## 474915 117106 8 562134 472935 112841 870687 215662 278325 601918
## 1 1 1 1 1 1 1 1 1 1
## 632599 145800 678893 173527 843753 4988 569013 107529 897333 205959
## 1 1 1 1 2 2 1 1 1 1
## 871988 28801 297500 551661 296106 389799 856235 198114 72186 475681
## 1 1 1 1 1 1 1 1 1 1
## 430722 257799 1001500 895084 322429 815450 708802 74501 893002 711979
## 1 1 2 1 1 1 1 1 1 1
## 13402 347634 987475 330889 183758 275531 556212 412644 440156 616764
## 2 1 1 1 1 1 1 1 1 1
## 494080 770339 6885 683046 730469 373921 294162 38203 331152 489925
## 1 1 1 1 1 1 2 1 1 1
## 31866 884864 187594 809847 90728 309342 477462 605358 658722 606069
## 1 1 1 1 1 1 2 2 1 1
## 357750 97354 115330 779791 62995 173926 990517 835742 256231 964346
## 1 1 1 1 2 1 1 2 1 1
## 115995 511667 984354 657685 903665 484731 358417 539726 530767 590912
## 1 2 2 1 1 1 1 1 1 1
## 557375 168802 188935 749708 933874 574044 403386 684523 755573 999413
## 1 1 1 1 1 1 1 1 1 1
## 946247 760167 539340 272627 445452 519399 420264 670889 398766 166533
## 1 1 1 1 1 1 1 1 1 1
## 855839 153834 487156 527166 909832 350422 366563 233345 864266 289545
## 1 1 1 1 1 1 1 1 1 1
## 387988 874343 606724 730729 351109 506498 713622 708128 567503 185578
## 1 2 1 1 2 1 1 2 1 1
## 962494 933373 385040 34683 46514 352900 618756 709495 356613 168767
## 1 1 1 2 2 1 1 1 1 1
## 473072 478411 518755 755920 987908 166177 545963 389490 861310 435606
## 1 1 1 1 1 1 1 1 1 1
## 208304 403819 987898 509177 286523 144652 921519 339965 357011 543660
## 1 1 1 1 1 1 1 1 1 1
## 180758 46722 889293 706990 53327 848293 370657 58942 187353 423495
## 1 1 1 1 1 1 1 1 1 1
## 296623 733172 563897 505843 736676 473151 423872 159971 1026324 665776
## 1 1 1 1 1 1 1 1 1 1
## 681356 545142 93572 959581 412028 471463 247949 74278 831155 261559
## 1 1 1 1 1 1 1 1 1 1
## 268304 376395 538479 285388 768651 474707 965957 749792 804126 693294
## 1 1 1 1 1 1 1 1 1 1
## 655762 55522 553502 83653 788190 193278 709038 153688 976104 689909
## 1 1 1 1 1 1 1 1 1 1
## 309162 500413 342287 658140 310890 714867 762477 31049 785570 561378
## 1 1 1 1 1 1 2 1 1 1
## 778164 34259 714279 681613 543835 942681 38891 237439 484680 321432
## 1 1 1 2 1 1 2 2 1 1
## 826688 623264 422878 1013507 273686 391758 705143 429597 532766 424559
## 1 1 2 1 1 1 1 1 2 1
## 972343 334036 567980 977847 514631 945589 876203 15466 646783 164768
## 1 1 1 1 1 1 1 1 1 1
## 644570 218428 704506 221296 521462 620410 469557 886565 110338 565723
## 1 1 1 1 1 1 1 1 2 1
## 462077 797070 311697 390709 87432 260193 504529 908812 884782 174935
## 1 1 1 1 1 2 1 1 1 1
## 657893 43050 183007 113950 549650 274493 937486 970017 268045 530057
## 1 1 1 1 1 1 1 1 1 1
## 845585 566124 744154 629495 187044 894176 142493 429579 916237 314219
## 1 1 1 1 2 1 1 1 1 1
## 668188 574441 613975 378794 175656 876139 903895 361033 954393 338782
## 2 1 1 1 1 1 1 1 1 1
## 620226 165537 649305 514817 1002867 937705 88148 475104 150584 495967
## 1 1 1 1 1 1 1 1 1 1
## 490136 156963 958040 253277 158440 997789 563205 151833 903578 856467
## 1 1 1 2 1 1 1 1 1 1
## 837836 855665 303706 763783 220080 335892 216607 565429 858368 172907
## 1 1 1 1 1 1 1 2 1 1
## 380462 121766 675694 915813 815942 43211 153179 279942 172717 395288
## 1 1 2 1 1 1 1 1 1 1
## 951595 47990 325699 703276 24553 71150 1022975 927789 970193 679606
## 2 1 1 1 1 1 1 2 2 1
## 25078 985612 236961 7659 716048 732328 483931 567240 919354 153424
## 1 1 1 1 1 1 1 1 1 1
## 320670 886391 809003 69029 99288 185606 95749 447152 7392 957629
## 1 1 1 1 1 2 1 1 1 1
## 556283 659243 297179 933537 112837 247343 960259 306676 408080 48744
## 1 1 1 1 1 1 1 1 1 1
## 288048 461355 448433 910623 668531 521237 953654 596239 410366 130206
## 1 2 1 1 1 1 1 1 1 1
## 651740 565792 867700 1027165 311888 127093 212634 214828 294422 801205
## 1 2 1 1 1 1 1 1 1 2
## 366546 814115 440701 409028 476891 941485 199047 365175 805259 949402
## 1 2 1 1 1 1 1 1 1 1
## 901491 708887 578535 822758 1022097 796503 70057 168780 230881 824138
## 1 1 1 2 1 1 1 1 1 1
## 802293 915865 575751 168019 704502 766331 734393 373170 413535 378559
## 1 1 1 1 1 1 1 1 1 1
## 336811 472240 903666 255176 986530 600469 634407 167127 309499 365625
## 1 1 1 1 1 1 1 1 1 1
## 670814 776532 564835 222791 855934 968673 876442 311606 286554 416334
## 1 1 1 2 1 1 1 1 1 1
## 644706 13937 83524 165361 191032 154575 606695 827792 374724 1026959
## 1 1 1 1 1 2 1 1 1 1
## 726068 999249 55680 297554 961201 627548 370188 7795 487737 465242
## 1 1 1 1 1 1 1 1 2 1
## 409320 767723 901580 289518 930637 283010 939244 389784 501542 7622
## 1 1 1 1 1 1 1 2 1 1
## 188083 995271 589789 633981 97979 811375 328635 84228 656652 702561
## 1 2 1 1 1 1 1 1 1 1
## 319383 784108 563870 563706 858505 766941 802763 105061 270819 840371
## 1 1 1 1 2 1 1 1 1 1
## 64078 555012 995650 909859 368389 885437 267332 497921 113600 714689
## 1 2 1 1 1 1 1 2 1 1
## 119464 904461 278038 18506 348242 301099 1017007 177586 810813 144765
## 1 1 2 2 1 1 1 1 1 1
## 255790 68411 364045 997194 152585 863000 753631 704965 422518 27819
## 1 1 1 1 1 1 1 1 1 1
## 587901 271980 52925 833000 474850 281264 274218 831150 104662 640887
## 1 1 1 1 1 1 1 1 1 1
## 954222 211147 801003 876443 1011965 23035 388949 655763 92513 710838
## 1 1 1 2 1 1 1 2 1 1
## 857352 29751 891360 428038 609581 717273 187233 421500 917005 501300
## 1 1 1 1 1 1 1 1 1 1
## 283506 709156 714794 267648 749713 678816 888747 139349 360423 311791
## 1 1 1 1 1 1 1 1 1 1
## 331756 653606 208247 414175 126917 225007 674622 957915 818431 120711
## 1 1 1 1 1 1 1 1 1 1
## 578866 1016985 1022900 144221 762437 728057 799472 111724 401378 32529
## 1 2 1 1 1 1 1 1 1 1
## Objective function:
## build swap
## 0.03863418 0.03854497
##
## Available components:
## [1] "medoids" "id.med" "clustering" "objective" "isolation"
## [6] "clusinfo" "silinfo" "diss" "call" "data"
##
## $nc
## [1] 2
##
## $crit
## [1] 0.0000000 0.6653447 0.3694718 0.2145569 0.2365516 0.3593187 0.3992730
## [8] 0.4225717 0.3542251 0.3566124
After each clustering model, Iâll visualize the clusters on the map to assess whether this approach makes sense.
k-means
cluster_km <- kmeans(locations_short, 2)
plot(locations_short$Lat, locations_short$Lon, col = cluster_km$cluster, pch = 19,
xlab = "Latitude", ylab = "Longitude", main = "K-means Clustering")
locations_kmeans <- data.frame(locations_short, cluster = as.factor(cluster_km$cluster))
pal <- colorFactor(palette = "Set1", domain = locations_kmeans$cluster)
leaflet(data = locations_kmeans) %>%
addTiles() %>%
addCircleMarkers(
~Lon, ~Lat,
color = ~pal(cluster),
radius = 6, fill = TRUE, fillOpacity = 0.7, stroke = FALSE,
popup = ~paste("Cluster:", cluster)
) %>%
addLegend(position = "topright", pal = pal, values = locations_kmeans$cluster, title = "Clusters")
Using 2 clusters seems too simplistic, as the map shows one large cluster with a few observations on the east. In the next models, Iâll experiment with a larger number of clusters to obtain more interesting and realistic results.
PAM
cluster_pam<-eclust(locations_short, "pam", k= 5)
locations_pam <- data.frame(locations_short, cluster = as.factor(cluster_pam$cluster))
pal <- colorFactor(palette = "Set1", domain = locations_pam$cluster)
leaflet(data = locations_pam) %>%
addTiles() %>%
addCircleMarkers(
~Lon, ~Lat,
color = ~pal(cluster),
radius = 6, fill = TRUE, fillOpacity = 0.7, stroke = FALSE,
popup = ~paste("Cluster:", cluster)
) %>%
addLegend(position = "topright", pal = pal, values = locations_pam$cluster, title = "Clusters")
Hierarchical clustering
Here, I experimented with various functions like chust, agnes, and diana, along with different methods (complete, ward). I then compared the number of observations within each cluster. It turned out that when using 5 clusters, one cluster was usually very small (around 1 observation), which didnât make sense.After several attempts, I decided to use the agnes function with the Ward method and chose 4 clusters.
d <- dist(locations_short, method = "euclidean")
hc <- agnes(d, method = "ward")
# cut tree into 4 groups
sub_grp <- cutree(hc, k = 4)
table(sub_grp)
## sub_grp
## 1 2 3 4
## 496 417 74 13
locations_hierarchical <-locations_short %>%
mutate(cluster = sub_grp)
pltree(hc, cex = 0.6, hang = -1, main = "dendrogram - agnes", labels = FALSE)
rect.hclust(hc, k = 4, border = 2:5)
pal <- colorFactor(palette = "Set1", domain = locations_hierarchical$cluster)
leaflet(data = locations_hierarchical) %>%
addTiles() %>%
addCircleMarkers(
~Lon, ~Lat,
color = ~pal(cluster),
radius = 6, fill = TRUE, fillOpacity = 0.7, stroke = FALSE,
popup = ~paste("Cluster:", cluster)
) %>%
addLegend(position = "topright", pal = pal, values = locations_hierarchical$cluster, title = "Clusters")
DB scan
In this section I chosen eps basen KNNdisplot and by putting different numbers and looking how dbscan splits the map.
dbscan::kNNdistplot(locations_short, k = 2)
db <- fpc::dbscan(locations_short, eps = 0.01, MinPts = 5)
plot(db, locations_short, main = "DBSCAN", frame = FALSE)
locations_dbscan <- data.frame(locations_short, cluster = as.factor(db$cluster))
pal <- colorFactor(palette = "Set1", domain = locations_dbscan$cluster)
leaflet(data = locations_dbscan) %>%
addTiles() %>%
addCircleMarkers(
~Lon, ~Lat,
color = ~pal(cluster),
radius = 6, fill = TRUE, fillOpacity = 0.7, stroke = FALSE,
popup = ~paste("Cluster:", cluster)
) %>%
addLegend(position = "topright", pal = pal, values = locations_dbscan$cluster, title = "Clusters")
Here, using DBSCAN, we obtained 10 clusters plus outliers. On the map, we can see one large cluster, which includes Manhattan. Unfortunately, it was difficult to separate some districts because they are very close to each other.The other clusters are also interesting. The model for separate clusters assigned taxis departing from airports or located in specific regions of New York. In my opinion, DBSCAN performed very well with this task, as it best reflected the actual areas where taxis are picked up.
To sum up I made clustering using 4 models: k-means, pan, hierarchical clustering and dbscan. Now letâs see some statistics of outputs of these models
Calinski-Harabasz index
#K-means
round(calinhara(locations_short, cluster_km$cluster),digits=2)
## [1] 611.48
#pam
round(calinhara(locations_short, cluster_pam$cluster),digits=2)
## [1] 439.32
#hierarchical clustering
round(calinhara(locations_short, locations_hierarchical$cluster),digits=2)
## [1] 566.28
#dbscan
round(calinhara(locations_short, db$cluster),digits=2)
## [1] 704.75
according to Calinski-Harabasz index, the best method is k-means with 2 clusters
Duda-Hart index
#K-means
dudahart2(locations_short, cluster_km$cluster)
## $p.value
## [1] 0.00575951
##
## $dh
## [1] 0.6200758
##
## $compare
## [1] 0.6063293
##
## $cluster1
## [1] TRUE
##
## $alpha
## [1] 0.001
##
## $z
## [1] 3.090232
#pam
dudahart2(locations_short, cluster_pam$cluster)
## $p.value
## [1] 0
##
## $dh
## [1] 0.2337995
##
## $compare
## [1] 0.6063293
##
## $cluster1
## [1] FALSE
##
## $alpha
## [1] 0.001
##
## $z
## [1] 3.090232
#hierarchical clustering
dudahart2(locations_short, locations_hierarchical$cluster)
## $p.value
## [1] 0
##
## $dh
## [1] 0.199517
##
## $compare
## [1] 0.6063293
##
## $cluster1
## [1] FALSE
##
## $alpha
## [1] 0.001
##
## $z
## [1] 3.090232
#dbscan
dudahart2(locations_short, db$cluster)
## $p.value
## [1] 0
##
## $dh
## [1] 0.1668409
##
## $compare
## [1] 0.6063293
##
## $cluster1
## [1] FALSE
##
## $alpha
## [1] 0.001
##
## $z
## [1] 3.090232
In all of these cases Duda-Hart index is equal or really close to 0, which means clusters are not homogenous. Normally, such a conclusion could be alarming; however, considering that we have only two variables, it might be insufficient, and the Duda-Hart index may not be a suitable measure for evaluating our models.
#Adding date and time information
uber.data_short$time <- as.POSIXct(uber.data_short$time, format="%H:%M:%S", tz="UTC")
uber.data_short$date <- as.Date(uber.data_short$date, format = "%m/%d/%Y")
locations_kmeans_dt<- merge(locations_kmeans, uber.data_short[, c("date", "time")], by = "row.names", all.x = TRUE)
locations_pam_dt<- merge(locations_pam, uber.data_short[, c("date", "time")], by = "row.names", all.x = TRUE)
locations_hierarchical_dt<- merge(locations_hierarchical, uber.data_short[, c("date", "time")], by = "row.names", all.x = TRUE)
locations_dbscan_dt<- merge(locations_dbscan, uber.data_short[, c("date", "time")], by = "row.names", all.x = TRUE)
Creating boxplots for every model
ggplot(locations_kmeans_dt, aes(x = factor(cluster), y = time, fill = factor(cluster))) +
geom_boxplot() +
scale_y_datetime(labels = scales::date_format("%H:%M")) +
labs(title = "Boxplot of Time by K-means cluster",
x = "Cluster",
y = "Time") +
scale_fill_brewer(palette = "Set1")+
theme(plot.title = element_text(hjust = 0.5),
legend.position = "none")
ggplot(locations_pam_dt, aes(x = factor(cluster), y = time, fill = factor(cluster))) +
geom_boxplot() +
scale_y_datetime(labels = scales::date_format("%H:%M")) +
labs(title = "Boxplot of Time by pam cluster",
x = "Cluster",
y = "Time") +
scale_fill_brewer(palette = "Set1")+
theme(plot.title = element_text(hjust = 0.5),
legend.position = "none")
ggplot(locations_hierarchical_dt, aes(x = factor(cluster), y = time, fill = factor(cluster))) +
geom_boxplot() +
scale_y_datetime(labels = scales::date_format("%H:%M")) +
labs(title = "Boxplot of Time by hierarchical cluster",
x = "Cluster",
y = "Time") +
scale_fill_brewer(palette = "Set1")+
theme(plot.title = element_text(hjust = 0.5),
legend.position = "none")
ggplot(locations_dbscan_dt, aes(x = factor(cluster), y = time, fill = factor(cluster))) +
geom_boxplot() +
scale_y_datetime(labels = scales::date_format("%H:%M")) +
labs(title = "Boxplot of Time by dbscan cluster",
x = "Cluster",
y = "Time") +
scale_fill_brewer(palette = "Set1")+
theme(plot.title = element_text(hjust = 0.5),
legend.position = "none")
In most cases, the clusters behave similarly in terms of time. However, in the case of DBSCAN, some clusters are characterized by earlier hours, while others are more concentrated around evening hours. After looking at the map, it turns out that these are observations occurring at airports near New York, which is an interesting observation and shows that the creation of these clusters was a good move.
ggplot(locations_kmeans_dt, aes(x = factor(cluster), y = date, fill = factor(cluster))) +
geom_boxplot() +
scale_y_date(labels = scales::date_format("%Y-%m-%d")) +
labs(title = "Boxplot of Date by K-means cluster",
x = "Cluster",
y = "Date") +
scale_fill_brewer(palette = "Set1") +
theme(plot.title = element_text(hjust = 0.5),
legend.position = "none")
ggplot(locations_pam_dt, aes(x = factor(cluster), y = date, fill = factor(cluster))) +
geom_boxplot() +
scale_y_date(labels = scales::date_format("%Y-%m-%d")) +
labs(title = "Boxplot of Date by pam cluster",
x = "Cluster",
y = "Date") +
scale_fill_brewer(palette = "Set1") +
theme(plot.title = element_text(hjust = 0.5),
legend.position = "none")
ggplot(locations_hierarchical_dt, aes(x = factor(cluster), y = date, fill = factor(cluster))) +
geom_boxplot() +
scale_y_date(labels = scales::date_format("%Y-%m-%d")) +
labs(title = "Boxplot of Date by hierarchical cluster",
x = "Cluster",
y = "Date") +
scale_fill_brewer(palette = "Set1") +
theme(plot.title = element_text(hjust = 0.5),
legend.position = "none")
ggplot(locations_dbscan_dt, aes(x = factor(cluster), y = date, fill = factor(cluster))) +
geom_boxplot() +
scale_y_date(labels = scales::date_format("%Y-%m-%d")) +
labs(title = "Boxplot of Date by Cluster",
x = "Cluster",
y = "Date") +
scale_fill_brewer(palette = "Set1") +
theme(plot.title = element_text(hjust = 0.5),
legend.position = "none")
When it comes to date analysis, the situation is very similar to the time analysis. Only clustering with DBSCAN reveals noticeable differences between clusters. Some clusters are more concentrated around specific weeks in September, while others include observations from throughout the entire month.
In the analysis of Uber ride location data, we had only four variables available, with just two used in clustering. The study explored multiple approaches by applying various models, such as k-means, PAM, hierarchical, and DBSCAN, also experimented with parameters, distance measures, and the number of clusters in these models. Based on visual analysis on the map and statistical insights, it can be concluded that DBSCAN performed best in this task, as it most accurately reflected locations on the New York City map and captured the actual, diverse nature of Uber rides.