Uber task

Uber task for USL classes

#Loading packages
library(factoextra)
library(flexclust)
library(fpc)
library(clustertend)
library(cluster)
library(ClusterR)
library(tidyverse)
library(dendextend)
library(dbscan)
library(RColorBrewer)
library(tidyverse)
library(dendextend)
library(leaflet)
library(ggplot2)

#Loading data, separating data and time 
uber.data <- read.csv("C:/Users/User/OneDrive/Pulpit/Studia magisterskie/USL/task_1/uber-data.csv")
uber.data = uber.data %>% separate(Date.Time, into = c("date", "time"), sep = " ")

#data preview
str(uber.data)

## 'data.frame':    1028136 obs. of  5 variables:
##  $ date: chr  "9/1/2014" "9/1/2014" "9/1/2014" "9/1/2014" ...
##  $ time: chr  "0:01:00" "0:01:00" "0:03:00" "0:06:00" ...
##  $ Lat : num  40.2 40.8 40.8 40.7 40.8 ...
##  $ Lon : num  -74 -74 -74 -74 -73.9 ...
##  $ Base: chr  "B02512" "B02512" "B02512" "B02512" ...

summary(uber.data)

##      date               time                Lat             Lon        
##  Length:1028136     Length:1028136     Min.   :39.99   Min.   :-74.77  
##  Class :character   Class :character   1st Qu.:40.72   1st Qu.:-74.00  
##  Mode  :character   Mode  :character   Median :40.74   Median :-73.98  
##                                        Mean   :40.74   Mean   :-73.97  
##                                        3rd Qu.:40.76   3rd Qu.:-73.96  
##                                        Max.   :41.35   Max.   :-72.72  
##      Base          
##  Length:1028136    
##  Class :character  
##  Mode  :character  
##                    
##                    
##

any(is.na(uber.data))

## [1] FALSE

Here I select only the columns with Longitude and Latitude and I choose a random sample of the data to speed up the code, as the original dataset has over 1 million rows.

uber.data_short<- uber.data[sample(nrow(uber.data), 1000), ]
set.seed(123)
locations_short <- uber.data_short[3:4]

Visualizing the data on a map of New York, I can see that most observations are concentrated in Manhattan.

leaflet(data = locations_short) %>%
  addTiles() %>%  
  addMarkers(~Lon, ~Lat)

I experimented with different clustering models and distance metrics. Based on the silhouette scores, the optimal number of clusters appears to be either 2 or 5.

opt3<-Optimal_Clusters_KMeans(locations_short, max_clusters=10, plot_clusters=TRUE, criterion="silhouette")

opt4<-Optimal_Clusters_Medoids(locations_short, max_clusters=10, plot_clusters=TRUE, criterion="silhouette",distance_metric='euclidean')

##   
## Based on the plot give the number of clusters (greater than 1) that you consider optimal?

opt5<-Optimal_Clusters_Medoids(locations_short, max_clusters=10, plot_clusters=TRUE, criterion="silhouette",distance_metric='manhattan')

##   
## Based on the plot give the number of clusters (greater than 1) that you consider optimal?

opt_aut<-pamk(locations_short, krange=2:10, criterion="asw", usepam=TRUE, scaling=FALSE, alpha=0.001, diss=inherits(locations_short, "dist"), critout=FALSE)
opt_aut

## $pamobject
## Medoids:
##          ID     Lat      Lon
## 1017137 143 40.7435 -73.9849
## 62995   485 40.7446 -73.8437
## Clustering vector:
##   34964  248356  812924   69114  622188  832696  202515  349104  160372  518549 
##       1       1       1       1       1       2       1       1       2       1 
##  445114  873758  734479  942973  748237 1009936  659234  803536  863103  846849 
##       1       2       1       1       1       1       1       1       1       1 
##  832050   22897   53085  685383  736530  384005  712484  787858  216765  606188 
##       1       2       1       1       1       1       1       1       1       1 
##  319210  739725  575532  712534  496120    7829  784754   44596  464787  414315 
##       1       2       1       2       1       1       1       1       1       1 
##  128481  188406  551352  231122  995870  417005  465025 1002294  653260  370766 
##       1       1       1       1       1       1       1       1       1       2 
##  529872  258965  309953  147697  626640  750519  491100  319126    4317  584291 
##       1       1       1       1       1       1       1       1       1       1 
##  993707  734441  825454  216923  609041  814172  637988  693998 1001170   26568 
##       1       1       1       1       1       1       1       1       1       1 
##  203212  847316  355332  925566  155516   68480  413284  179412   60884  546334 
##       1       1       1       1       1       1       1       1       1       1 
##  195934  449937  306089  745259  130491 1002397  173374  754075  182352  567578 
##       1       2       1       1       1       1       2       1       1       1 
##  723788  235416  930997  143374  143017  593873  697535  842703  478495   70301 
##       1       1       1       1       1       1       2       1       1       1 
##  987678  354060  480090  271885  500249  375399  741010  832480  740677  928179 
##       2       1       1       1       1       1       1       1       1       1 
##  819407  843198 1009579  196088  900671  380710 1025054  839645  486046  281419 
##       1       1       2       1       1       1       1       2       1       1 
##  141973  160493   56896  267276  879346  285454  256924  392667  794137  713083 
##       1       1       1       1       1       1       1       1       2       1 
##  919986  546505   48995  645119  604265  779042   92082  876896  198304  810942 
##       1       1       1       1       1       1       1       2       1       1 
##  135105  861985 1017137  127913  894327  508032  918425  460217    9040  532548 
##       1       1       1       1       1       1       1       1       1       1 
##  851634 1023364  192293   97540  618376  553386  596176   74199   64673  509942 
##       1       1       1       1       1       1       1       1       1       1 
##  923173  602142  576217  144371  199735  698828  945594  892287  554232  542092 
##       1       1       1       1       1       2       1       1       1       1 
##  503716  132436  200671  953473  284278  211074   29308  491002  730132 1008051 
##       1       1       2       1       1       1       1       1       1       1 
##  143362  255247  185117  974018  210817  606915  752857  852462  280093  965251 
##       1       1       1       1       1       1       1       1       1       1 
##   54772  783671  737364  825340   51389  887896  467143  394954  191920  980827 
##       1       1       1       1       1       1       1       1       2       1 
##  989183  757440 1024211  742369  492795  778518  103545  870705  588697  721657 
##       1       1       1       1       1       1       1       1       1       1 
##  379898  173637  134165  223562  568374  432049  970637  504416  140278  775612 
##       1       1       1       1       1       1       1       1       1       1 
##  222863  721199 1007968  571653   98697   10796  165832  321752  693048  142779 
##       1       1       1       1       1       1       1       1       1       1 
##  354656  498718    3200  996020  313727  121425  259695  621278   79143  727941 
##       1       1       1       1       1       1       1       1       1       1 
##  703746  303270 1009859  472620   34255  206110  561362  789546  734639  834877 
##       1       1       1       2       1       1       2       1       1       1 
##  566431  443643  180248  511872  559414  388541  130422  117045  312385  839302 
##       1       1       1       1       1       1       1       1       1       1 
##  327547  198995  294262  879957  572619  310265  689294   36141  105978  162597 
##       1       1       2       2       1       1       1       1       1       1 
##  943211  345656  265333  617814  297086  391884  974286  444310  594797  794726 
##       2       1       1       1       1       1       1       1       1       2 
##  757097  668498  103406  761992  982789 1020194  634239  208679  854194  566507 
##       1       1       1       1       1       1       1       1       1       1 
##  462829  728622  421977   68188  536247  765076  773866   77111  639817  199738 
##       1       1       1       1       1       2       1       1       1       1 
##  387020  447802  292324  783999  102353  837178  468149  281833  515056  483122 
##       2       2       2       1       1       1       1       1       1       1 
## 1011523  706503  464808  702887  401296  946998   58156  279848  400263  182864 
##       1       1       2       2       1       1       1       1       2       1 
## 1019796  449469  838733  412256 1007181  518056  671419  296979  843564  382370 
##       1       1       2       1       1       1       1       1       1       1 
## 1006476  831662  664583  487666  684739  521261  651841  538414  542996  706933 
##       1       1       2       1       1       1       1       1       2       1 
##  538241  177014  431983  909086  392741   32727  363281  157824  678889  166812 
##       1       1       1       1       1       2       1       1       1       1 
##  755113  718793  646550  301247  741062  879363  405502  695598  530779  151598 
##       1       1       1       1       1       1       1       1       1       1 
## 1025678  718590  928641  803208  382271  627585  891809  932297  688763  510823 
##       1       1       1       1       1       1       1       1       1       1 
##  111770  333837  694867  877714  474736  407192  457824  789852  795794  965300 
##       1       1       1       1       1       1       1       1       1       1 
##  796950  271121  882703  338321  538734  969380  557565  288653  484845  215939 
##       1       1       2       1       2       1       1       1       1       1 
##  530547  742583  475084  717116  285574  750939  887647  391914  271104  371993 
##       1       1       1       1       1       1       1       1       1       1 
##  646554  789123  196887  918768  847239  315463  501741  161971  588818  663486 
##       1       1       1       1       1       1       1       1       1       1 
##  474915  117106       8  562134  472935  112841  870687  215662  278325  601918 
##       1       1       1       1       1       1       1       1       1       1 
##  632599  145800  678893  173527  843753    4988  569013  107529  897333  205959 
##       1       1       1       1       2       2       1       1       1       1 
##  871988   28801  297500  551661  296106  389799  856235  198114   72186  475681 
##       1       1       1       1       1       1       1       1       1       1 
##  430722  257799 1001500  895084  322429  815450  708802   74501  893002  711979 
##       1       1       2       1       1       1       1       1       1       1 
##   13402  347634  987475  330889  183758  275531  556212  412644  440156  616764 
##       2       1       1       1       1       1       1       1       1       1 
##  494080  770339    6885  683046  730469  373921  294162   38203  331152  489925 
##       1       1       1       1       1       1       2       1       1       1 
##   31866  884864  187594  809847   90728  309342  477462  605358  658722  606069 
##       1       1       1       1       1       1       2       2       1       1 
##  357750   97354  115330  779791   62995  173926  990517  835742  256231  964346 
##       1       1       1       1       2       1       1       2       1       1 
##  115995  511667  984354  657685  903665  484731  358417  539726  530767  590912 
##       1       2       2       1       1       1       1       1       1       1 
##  557375  168802  188935  749708  933874  574044  403386  684523  755573  999413 
##       1       1       1       1       1       1       1       1       1       1 
##  946247  760167  539340  272627  445452  519399  420264  670889  398766  166533 
##       1       1       1       1       1       1       1       1       1       1 
##  855839  153834  487156  527166  909832  350422  366563  233345  864266  289545 
##       1       1       1       1       1       1       1       1       1       1 
##  387988  874343  606724  730729  351109  506498  713622  708128  567503  185578 
##       1       2       1       1       2       1       1       2       1       1 
##  962494  933373  385040   34683   46514  352900  618756  709495  356613  168767 
##       1       1       1       2       2       1       1       1       1       1 
##  473072  478411  518755  755920  987908  166177  545963  389490  861310  435606 
##       1       1       1       1       1       1       1       1       1       1 
##  208304  403819  987898  509177  286523  144652  921519  339965  357011  543660 
##       1       1       1       1       1       1       1       1       1       1 
##  180758   46722  889293  706990   53327  848293  370657   58942  187353  423495 
##       1       1       1       1       1       1       1       1       1       1 
##  296623  733172  563897  505843  736676  473151  423872  159971 1026324  665776 
##       1       1       1       1       1       1       1       1       1       1 
##  681356  545142   93572  959581  412028  471463  247949   74278  831155  261559 
##       1       1       1       1       1       1       1       1       1       1 
##  268304  376395  538479  285388  768651  474707  965957  749792  804126  693294 
##       1       1       1       1       1       1       1       1       1       1 
##  655762   55522  553502   83653  788190  193278  709038  153688  976104  689909 
##       1       1       1       1       1       1       1       1       1       1 
##  309162  500413  342287  658140  310890  714867  762477   31049  785570  561378 
##       1       1       1       1       1       1       2       1       1       1 
##  778164   34259  714279  681613  543835  942681   38891  237439  484680  321432 
##       1       1       1       2       1       1       2       2       1       1 
##  826688  623264  422878 1013507  273686  391758  705143  429597  532766  424559 
##       1       1       2       1       1       1       1       1       2       1 
##  972343  334036  567980  977847  514631  945589  876203   15466  646783  164768 
##       1       1       1       1       1       1       1       1       1       1 
##  644570  218428  704506  221296  521462  620410  469557  886565  110338  565723 
##       1       1       1       1       1       1       1       1       2       1 
##  462077  797070  311697  390709   87432  260193  504529  908812  884782  174935 
##       1       1       1       1       1       2       1       1       1       1 
##  657893   43050  183007  113950  549650  274493  937486  970017  268045  530057 
##       1       1       1       1       1       1       1       1       1       1 
##  845585  566124  744154  629495  187044  894176  142493  429579  916237  314219 
##       1       1       1       1       2       1       1       1       1       1 
##  668188  574441  613975  378794  175656  876139  903895  361033  954393  338782 
##       2       1       1       1       1       1       1       1       1       1 
##  620226  165537  649305  514817 1002867  937705   88148  475104  150584  495967 
##       1       1       1       1       1       1       1       1       1       1 
##  490136  156963  958040  253277  158440  997789  563205  151833  903578  856467 
##       1       1       1       2       1       1       1       1       1       1 
##  837836  855665  303706  763783  220080  335892  216607  565429  858368  172907 
##       1       1       1       1       1       1       1       2       1       1 
##  380462  121766  675694  915813  815942   43211  153179  279942  172717  395288 
##       1       1       2       1       1       1       1       1       1       1 
##  951595   47990  325699  703276   24553   71150 1022975  927789  970193  679606 
##       2       1       1       1       1       1       1       2       2       1 
##   25078  985612  236961    7659  716048  732328  483931  567240  919354  153424 
##       1       1       1       1       1       1       1       1       1       1 
##  320670  886391  809003   69029   99288  185606   95749  447152    7392  957629 
##       1       1       1       1       1       2       1       1       1       1 
##  556283  659243  297179  933537  112837  247343  960259  306676  408080   48744 
##       1       1       1       1       1       1       1       1       1       1 
##  288048  461355  448433  910623  668531  521237  953654  596239  410366  130206 
##       1       2       1       1       1       1       1       1       1       1 
##  651740  565792  867700 1027165  311888  127093  212634  214828  294422  801205 
##       1       2       1       1       1       1       1       1       1       2 
##  366546  814115  440701  409028  476891  941485  199047  365175  805259  949402 
##       1       2       1       1       1       1       1       1       1       1 
##  901491  708887  578535  822758 1022097  796503   70057  168780  230881  824138 
##       1       1       1       2       1       1       1       1       1       1 
##  802293  915865  575751  168019  704502  766331  734393  373170  413535  378559 
##       1       1       1       1       1       1       1       1       1       1 
##  336811  472240  903666  255176  986530  600469  634407  167127  309499  365625 
##       1       1       1       1       1       1       1       1       1       1 
##  670814  776532  564835  222791  855934  968673  876442  311606  286554  416334 
##       1       1       1       2       1       1       1       1       1       1 
##  644706   13937   83524  165361  191032  154575  606695  827792  374724 1026959 
##       1       1       1       1       1       2       1       1       1       1 
##  726068  999249   55680  297554  961201  627548  370188    7795  487737  465242 
##       1       1       1       1       1       1       1       1       2       1 
##  409320  767723  901580  289518  930637  283010  939244  389784  501542    7622 
##       1       1       1       1       1       1       1       2       1       1 
##  188083  995271  589789  633981   97979  811375  328635   84228  656652  702561 
##       1       2       1       1       1       1       1       1       1       1 
##  319383  784108  563870  563706  858505  766941  802763  105061  270819  840371 
##       1       1       1       1       2       1       1       1       1       1 
##   64078  555012  995650  909859  368389  885437  267332  497921  113600  714689 
##       1       2       1       1       1       1       1       2       1       1 
##  119464  904461  278038   18506  348242  301099 1017007  177586  810813  144765 
##       1       1       2       2       1       1       1       1       1       1 
##  255790   68411  364045  997194  152585  863000  753631  704965  422518   27819 
##       1       1       1       1       1       1       1       1       1       1 
##  587901  271980   52925  833000  474850  281264  274218  831150  104662  640887 
##       1       1       1       1       1       1       1       1       1       1 
##  954222  211147  801003  876443 1011965   23035  388949  655763   92513  710838 
##       1       1       1       2       1       1       1       2       1       1 
##  857352   29751  891360  428038  609581  717273  187233  421500  917005  501300 
##       1       1       1       1       1       1       1       1       1       1 
##  283506  709156  714794  267648  749713  678816  888747  139349  360423  311791 
##       1       1       1       1       1       1       1       1       1       1 
##  331756  653606  208247  414175  126917  225007  674622  957915  818431  120711 
##       1       1       1       1       1       1       1       1       1       1 
##  578866 1016985 1022900  144221  762437  728057  799472  111724  401378   32529 
##       1       2       1       1       1       1       1       1       1       1 
## Objective function:
##      build       swap 
## 0.03863418 0.03854497 
## 
## Available components:
##  [1] "medoids"    "id.med"     "clustering" "objective"  "isolation" 
##  [6] "clusinfo"   "silinfo"    "diss"       "call"       "data"      
## 
## $nc
## [1] 2
## 
## $crit
##  [1] 0.0000000 0.6653447 0.3694718 0.2145569 0.2365516 0.3593187 0.3992730
##  [8] 0.4225717 0.3542251 0.3566124

Clustering

After each clustering model, I’ll visualize the clusters on the map to assess whether this approach makes sense.

k-means

cluster_km <- kmeans(locations_short, 2)
plot(locations_short$Lat, locations_short$Lon, col = cluster_km$cluster, pch = 19,
     xlab = "Latitude", ylab = "Longitude", main = "K-means Clustering")

locations_kmeans <- data.frame(locations_short, cluster = as.factor(cluster_km$cluster))
pal <- colorFactor(palette = "Set1", domain = locations_kmeans$cluster)
leaflet(data = locations_kmeans) %>%
  addTiles() %>%
  addCircleMarkers(
    ~Lon, ~Lat, 
    color = ~pal(cluster),
    radius = 6, fill = TRUE, fillOpacity = 0.7, stroke = FALSE,
    popup = ~paste("Cluster:", cluster)
  ) %>%
  addLegend(position = "topright", pal = pal, values = locations_kmeans$cluster, title = "Clusters")

Using 2 clusters seems too simplistic, as the map shows one large cluster with a few observations on the east. In the next models, I’ll experiment with a larger number of clusters to obtain more interesting and realistic results.

PAM

cluster_pam<-eclust(locations_short, "pam", k= 5)

locations_pam <- data.frame(locations_short, cluster = as.factor(cluster_pam$cluster))
pal <- colorFactor(palette = "Set1", domain = locations_pam$cluster)
leaflet(data = locations_pam) %>%
  addTiles() %>%
  addCircleMarkers(
    ~Lon, ~Lat, 
    color = ~pal(cluster),
    radius = 6, fill = TRUE, fillOpacity = 0.7, stroke = FALSE,
    popup = ~paste("Cluster:", cluster)
  ) %>%
  addLegend(position = "topright", pal = pal, values = locations_pam$cluster, title = "Clusters")

Hierarchical clustering

Here, I experimented with various functions like chust, agnes, and diana, along with different methods (complete, ward). I then compared the number of observations within each cluster. It turned out that when using 5 clusters, one cluster was usually very small (around 1 observation), which didn’t make sense.After several attempts, I decided to use the agnes function with the Ward method and chose 4 clusters.

d <- dist(locations_short, method = "euclidean")
hc <- agnes(d, method = "ward")
# cut tree into 4 groups
sub_grp <- cutree(hc, k = 4)
table(sub_grp)

## sub_grp
##   1   2   3   4 
## 496 417  74  13

locations_hierarchical <-locations_short %>%
  mutate(cluster = sub_grp)

pltree(hc, cex = 0.6, hang = -1, main = "dendrogram - agnes", labels = FALSE)

rect.hclust(hc, k = 4, border = 2:5)

pal <- colorFactor(palette = "Set1", domain = locations_hierarchical$cluster)
leaflet(data = locations_hierarchical) %>%
  addTiles() %>%
  addCircleMarkers(
    ~Lon, ~Lat, 
    color = ~pal(cluster),
    radius = 6, fill = TRUE, fillOpacity = 0.7, stroke = FALSE,
    popup = ~paste("Cluster:", cluster)
  ) %>%
  addLegend(position = "topright", pal = pal, values = locations_hierarchical$cluster, title = "Clusters")

DB scan

In this section I chosen eps basen KNNdisplot and by putting different numbers and looking how dbscan splits the map.

dbscan::kNNdistplot(locations_short, k =  2)

db <- fpc::dbscan(locations_short, eps = 0.01, MinPts = 5)

plot(db, locations_short, main = "DBSCAN", frame = FALSE)

locations_dbscan <- data.frame(locations_short, cluster = as.factor(db$cluster))
pal <- colorFactor(palette = "Set1", domain = locations_dbscan$cluster)
leaflet(data = locations_dbscan) %>%
  addTiles() %>%
  addCircleMarkers(
    ~Lon, ~Lat, 
    color = ~pal(cluster),
    radius = 6, fill = TRUE, fillOpacity = 0.7, stroke = FALSE,
    popup = ~paste("Cluster:", cluster)
  ) %>%
  addLegend(position = "topright", pal = pal, values = locations_dbscan$cluster, title = "Clusters")

Here, using DBSCAN, we obtained 10 clusters plus outliers. On the map, we can see one large cluster, which includes Manhattan. Unfortunately, it was difficult to separate some districts because they are very close to each other.The other clusters are also interesting. The model for separate clusters assigned taxis departing from airports or located in specific regions of New York. In my opinion, DBSCAN performed very well with this task, as it best reflected the actual areas where taxis are picked up.

To sum up I made clustering using 4 models: k-means, pan, hierarchical clustering and dbscan. Now let’s see some statistics of outputs of these models

Calinski-Harabasz index

#K-means
round(calinhara(locations_short, cluster_km$cluster),digits=2)

## [1] 611.48

#pam
round(calinhara(locations_short, cluster_pam$cluster),digits=2)

## [1] 439.32

#hierarchical clustering
round(calinhara(locations_short, locations_hierarchical$cluster),digits=2)

## [1] 566.28

#dbscan
round(calinhara(locations_short, db$cluster),digits=2)

## [1] 704.75

according to Calinski-Harabasz index, the best method is k-means with 2 clusters

Duda-Hart index

#K-means
dudahart2(locations_short, cluster_km$cluster)

## $p.value
## [1] 0.00575951
## 
## $dh
## [1] 0.6200758
## 
## $compare
## [1] 0.6063293
## 
## $cluster1
## [1] TRUE
## 
## $alpha
## [1] 0.001
## 
## $z
## [1] 3.090232

#pam
dudahart2(locations_short, cluster_pam$cluster)

## $p.value
## [1] 0
## 
## $dh
## [1] 0.2337995
## 
## $compare
## [1] 0.6063293
## 
## $cluster1
## [1] FALSE
## 
## $alpha
## [1] 0.001
## 
## $z
## [1] 3.090232

#hierarchical clustering
dudahart2(locations_short, locations_hierarchical$cluster)

## $p.value
## [1] 0
## 
## $dh
## [1] 0.199517
## 
## $compare
## [1] 0.6063293
## 
## $cluster1
## [1] FALSE
## 
## $alpha
## [1] 0.001
## 
## $z
## [1] 3.090232

#dbscan
dudahart2(locations_short, db$cluster)

## $p.value
## [1] 0
## 
## $dh
## [1] 0.1668409
## 
## $compare
## [1] 0.6063293
## 
## $cluster1
## [1] FALSE
## 
## $alpha
## [1] 0.001
## 
## $z
## [1] 3.090232

In all of these cases Duda-Hart index is equal or really close to 0, which means clusters are not homogenous. Normally, such a conclusion could be alarming; however, considering that we have only two variables, it might be insufficient, and the Duda-Hart index may not be a suitable measure for evaluating our models.

Analyzing cluster by time

#Adding date and time information
uber.data_short$time <- as.POSIXct(uber.data_short$time, format="%H:%M:%S", tz="UTC")
uber.data_short$date <- as.Date(uber.data_short$date, format = "%m/%d/%Y")

locations_kmeans_dt<- merge(locations_kmeans, uber.data_short[, c("date", "time")], by = "row.names", all.x = TRUE)
locations_pam_dt<- merge(locations_pam, uber.data_short[, c("date", "time")], by = "row.names", all.x = TRUE)
locations_hierarchical_dt<- merge(locations_hierarchical, uber.data_short[, c("date", "time")], by = "row.names", all.x = TRUE)
locations_dbscan_dt<- merge(locations_dbscan, uber.data_short[, c("date", "time")], by = "row.names", all.x = TRUE)

Creating boxplots for every model

ggplot(locations_kmeans_dt, aes(x = factor(cluster), y = time, fill = factor(cluster))) +
  geom_boxplot() +
  scale_y_datetime(labels = scales::date_format("%H:%M")) +
  labs(title = "Boxplot of Time by K-means cluster", 
       x = "Cluster", 
       y = "Time") +
  scale_fill_brewer(palette = "Set1")+
  theme(plot.title = element_text(hjust = 0.5),
        legend.position = "none")

ggplot(locations_pam_dt, aes(x = factor(cluster), y = time, fill = factor(cluster))) +
  geom_boxplot() +
  scale_y_datetime(labels = scales::date_format("%H:%M")) +
  labs(title = "Boxplot of Time by pam cluster", 
       x = "Cluster", 
       y = "Time") +
  scale_fill_brewer(palette = "Set1")+
  theme(plot.title = element_text(hjust = 0.5),
        legend.position = "none")

ggplot(locations_hierarchical_dt, aes(x = factor(cluster), y = time, fill = factor(cluster))) +
  geom_boxplot() +
  scale_y_datetime(labels = scales::date_format("%H:%M")) +
  labs(title = "Boxplot of Time by hierarchical cluster", 
       x = "Cluster", 
       y = "Time") +
  scale_fill_brewer(palette = "Set1")+
  theme(plot.title = element_text(hjust = 0.5),
        legend.position = "none")

ggplot(locations_dbscan_dt, aes(x = factor(cluster), y = time, fill = factor(cluster))) +
  geom_boxplot() +
  scale_y_datetime(labels = scales::date_format("%H:%M")) +
  labs(title = "Boxplot of Time by dbscan cluster", 
       x = "Cluster", 
       y = "Time") +
  scale_fill_brewer(palette = "Set1")+
  theme(plot.title = element_text(hjust = 0.5),
        legend.position = "none")

In most cases, the clusters behave similarly in terms of time. However, in the case of DBSCAN, some clusters are characterized by earlier hours, while others are more concentrated around evening hours. After looking at the map, it turns out that these are observations occurring at airports near New York, which is an interesting observation and shows that the creation of these clusters was a good move.

ggplot(locations_kmeans_dt, aes(x = factor(cluster), y = date, fill = factor(cluster))) +
  geom_boxplot() +
  scale_y_date(labels = scales::date_format("%Y-%m-%d")) +
  labs(title = "Boxplot of Date by K-means cluster", 
       x = "Cluster", 
       y = "Date") +
  scale_fill_brewer(palette = "Set1") + 
  theme(plot.title = element_text(hjust = 0.5),
        legend.position = "none")

ggplot(locations_pam_dt, aes(x = factor(cluster), y = date, fill = factor(cluster))) +
  geom_boxplot() +
  scale_y_date(labels = scales::date_format("%Y-%m-%d")) +
  labs(title = "Boxplot of Date by pam cluster", 
       x = "Cluster", 
       y = "Date") +
  scale_fill_brewer(palette = "Set1") + 
  theme(plot.title = element_text(hjust = 0.5),
        legend.position = "none")

ggplot(locations_hierarchical_dt, aes(x = factor(cluster), y = date, fill = factor(cluster))) +
  geom_boxplot() +
  scale_y_date(labels = scales::date_format("%Y-%m-%d")) +
  labs(title = "Boxplot of Date by hierarchical cluster", 
       x = "Cluster", 
       y = "Date") +
  scale_fill_brewer(palette = "Set1") + 
  theme(plot.title = element_text(hjust = 0.5),
        legend.position = "none")

ggplot(locations_dbscan_dt, aes(x = factor(cluster), y = date, fill = factor(cluster))) +
  geom_boxplot() +
  scale_y_date(labels = scales::date_format("%Y-%m-%d")) +
  labs(title = "Boxplot of Date by Cluster", 
       x = "Cluster", 
       y = "Date") +
  scale_fill_brewer(palette = "Set1") + 
  theme(plot.title = element_text(hjust = 0.5),
        legend.position = "none")

When it comes to date analysis, the situation is very similar to the time analysis. Only clustering with DBSCAN reveals noticeable differences between clusters. Some clusters are more concentrated around specific weeks in September, while others include observations from throughout the entire month.

Conclusions

In the analysis of Uber ride location data, we had only four variables available, with just two used in clustering. The study explored multiple approaches by applying various models, such as k-means, PAM, hierarchical, and DBSCAN, also experimented with parameters, distance measures, and the number of clusters in these models. Based on visual analysis on the map and statistical insights, it can be concluded that DBSCAN performed best in this task, as it most accurately reflected locations on the New York City map and captured the actual, diverse nature of Uber rides.

Uber task

Kamil Morański 432426

2024-11-11

Uber task for USL classes

Clustering

Analyzing cluster by time

Conclusions