setwd("G:\\My Drive\\UL\\Facultative homework")
df0 <- read.csv("uber-data.csv")
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.1.1
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.1.1
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(NbClust)
## Warning: package 'NbClust' was built under R version 4.1.1
library(stats)
library(factoextra)
library(flexclust)
## Warning: package 'flexclust' was built under R version 4.1.1
## Loading required package: grid
## Loading required package: lattice
## Loading required package: modeltools
## Warning: package 'modeltools' was built under R version 4.1.1
## Loading required package: stats4
library(fpc)
## Warning: package 'fpc' was built under R version 4.1.1
library(clustertend)
## Warning: package 'clustertend' was built under R version 4.1.1
library(cluster)
## Warning: package 'cluster' was built under R version 4.1.1
library(ClusterR)
## Warning: package 'ClusterR' was built under R version 4.1.1
## Loading required package: gtools
## Warning: package 'gtools' was built under R version 4.1.1
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.1.1
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v tibble  3.1.5     v dplyr   1.0.7
## v tidyr   1.1.4     v stringr 1.4.0
## v readr   2.0.2     v forcats 0.5.1
## v purrr   0.3.4
## Warning: package 'tibble' was built under R version 4.1.1
## Warning: package 'tidyr' was built under R version 4.1.1
## Warning: package 'readr' was built under R version 4.1.1
## Warning: package 'purrr' was built under R version 4.1.1
## Warning: package 'dplyr' was built under R version 4.1.1
## Warning: package 'stringr' was built under R version 4.1.1
## Warning: package 'forcats' was built under R version 4.1.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(dendextend)
## Warning: package 'dendextend' was built under R version 4.1.1
## 
## ---------------------
## Welcome to dendextend version 1.15.2
## Type citation('dendextend') for how to cite the package.
## 
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
## 
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## You may ask questions at stackoverflow, use the r and dendextend tags: 
##   https://stackoverflow.com/questions/tagged/dendextend
## 
##  To suppress this message use:  suppressPackageStartupMessages(library(dendextend))
## ---------------------
## 
## Attaching package: 'dendextend'
## The following object is masked from 'package:stats':
## 
##     cutree
library(NbClust)
library(patchwork)
## Warning: package 'patchwork' was built under R version 4.1.1
library(cowplot)
## Warning: package 'cowplot' was built under R version 4.1.1
## 
## Attaching package: 'cowplot'
## The following object is masked from 'package:patchwork':
## 
##     align_plots
#Clean data
df0 = na.omit(df0)

#Remove duplicated observations and remove the last column
df <- df0[!duplicated(df0),c(1:3)] 
  
#Summary
summary(df$Lon)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -74.77  -74.00  -73.98  -73.97  -73.96  -72.72
summary(df$Lat)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   39.99   40.72   40.74   40.74   40.76   41.35
#Select longitude and latitude
main <- df[2:3]

#z-score standardization
main_z <- data.frame(scale(main))
#Elbow method
opt<-Optimal_Clusters_KMeans(main_z, max_clusters=10, plot_clusters = TRUE)

Because of the large dataset, the Sillouette method doesn’t work well, so the Elbow method is used instead to diagnosed the optimal number of clusters. It seems that the range from 3 to 7 are acceptable, so I check below each of the situation to see which scenario is the most ideal.

# Calinski-Harabasz & Duda-Hart
km3<-kmeans(main_z, 3) # stats::
round(calinhara(main_z, km3$cluster),digits=2) #fpc::calinhara()
## [1] 507867.3
km4<-kmeans(main_z, 4) # stats::
round(calinhara(main_z, km4$cluster),digits=2) #fpc::calinhara()
## [1] 493220
km5<-kmeans(main_z, 5) # stats::
round(calinhara(main_z, km5$cluster),digits=2) #fpc::calinhara()
## [1] 488743.8
km6<-kmeans(main_z, 6) # stats::
round(calinhara(main_z, km6$cluster),digits=2) #fpc::calinhara()
## [1] 529267.8
km7<-kmeans(main_z, 7) # stats::
round(calinhara(main_z, km7$cluster),digits=2) #fpc::calinhara()
## [1] 568512.7
#Combine the cluster column to the main dataframe
df <- cbind(df, km7["cluster"])
head(df)
##          Date.Time     Lat      Lon cluster
## 1 9/1/2014 0:01:00 40.2201 -74.0021       1
## 2 9/1/2014 0:01:00 40.7500 -74.0027       4
## 3 9/1/2014 0:03:00 40.7559 -73.9864       6
## 4 9/1/2014 0:06:00 40.7450 -73.9889       4
## 5 9/1/2014 0:11:00 40.8145 -73.9444       6
## 6 9/1/2014 0:12:00 40.6735 -73.9918       1
#Check the size of each cluster
km7$size
## [1] 103854  17992  42601 444152   9910 355754  29836
#Assign the df1 as the dataframe that I will manipulate
df1 <- df
#The date and time format is changed
df1$Date.Time <- strptime(df$Date.Time, format = "%m/%d/%Y %H:%M:%S") 
head(df1)
##             Date.Time     Lat      Lon cluster
## 1 2014-09-01 00:01:00 40.2201 -74.0021       1
## 2 2014-09-01 00:01:00 40.7500 -74.0027       4
## 3 2014-09-01 00:03:00 40.7559 -73.9864       6
## 4 2014-09-01 00:06:00 40.7450 -73.9889       4
## 5 2014-09-01 00:11:00 40.8145 -73.9444       6
## 6 2014-09-01 00:12:00 40.6735 -73.9918       1
df1$Day <- as.integer(format(df1$Date.Time, "%d"))
df1$Month <- as.integer(format(df1$Date.Time, "%m"))
df1$Year <- as.integer(format(df1$Date.Time, "%Y"))
df1$Week_day <- strftime(df1$Date.Time, "%A")
df1$Time_hour <- as.integer(format(df1$Date.Time, "%H"))
df1$Time_minute <- as.integer(format(df1$Date.Time, "%M"))
df1$Time_second <- as.integer(format(df1$Date.Time, "%S"))
df1$Time_bin <- ifelse(df1$Time_hour >= 6 & df1$Time_hour < 10,"Morning peak", ifelse(df1$Time_hour>= 10 & df1$Time_hour < 15, "Morning off-peak", ifelse(df1$Time_hour >= 15 & df1$Time_hour < 19, "Evening peak","Evening off-peak" )))
summary(df1)
##    Date.Time                        Lat             Lon            cluster     
##  Min.   :2014-09-01 00:00:00   Min.   :39.99   Min.   :-74.77   Min.   :1.000  
##  1st Qu.:2014-09-08 19:04:00   1st Qu.:40.72   1st Qu.:-74.00   1st Qu.:4.000  
##  Median :2014-09-16 06:38:00   Median :40.74   Median :-73.98   Median :4.000  
##  Mean   :2014-09-16 03:30:18   Mean   :40.74   Mean   :-73.97   Mean   :4.419  
##  3rd Qu.:2014-09-23 13:20:00   3rd Qu.:40.76   3rd Qu.:-73.96   3rd Qu.:6.000  
##  Max.   :2014-09-30 22:59:00   Max.   :41.35   Max.   :-72.72   Max.   :7.000  
##       Day            Month        Year        Week_day           Time_hour    
##  Min.   : 1.00   Min.   :9   Min.   :2014   Length:1004099     Min.   : 0.00  
##  1st Qu.: 8.00   1st Qu.:9   1st Qu.:2014   Class :character   1st Qu.:10.00  
##  Median :16.00   Median :9   Median :2014   Mode  :character   Median :15.00  
##  Mean   :15.54   Mean   :9   Mean   :2014                      Mean   :14.09  
##  3rd Qu.:23.00   3rd Qu.:9   3rd Qu.:2014                      3rd Qu.:19.00  
##  Max.   :30.00   Max.   :9   Max.   :2014                      Max.   :23.00  
##   Time_minute     Time_second   Time_bin        
##  Min.   : 0.00   Min.   :0    Length:1004099    
##  1st Qu.:14.00   1st Qu.:0    Class :character  
##  Median :29.00   Median :0    Mode  :character  
##  Mean   :29.36   Mean   :0                      
##  3rd Qu.:44.00   3rd Qu.:0                      
##  Max.   :59.00   Max.   :0

Following the suggestion of Wikipedia about the rush hours, I create 4 bins to analyze time. 6am - 10am is morning peak time, 10am - 3pm is morning off-peak, 3pm - 7pm is evening peak time and 7pm to 6am the next day is evening off-peak time. In addition, when checking the summary of year and month, all observations are collected in September 2014, therefore, I will not analyze about the month and year, rather I will focus on the week days and time bins as explained above.

#Break the total data to 7 sets based on clusters
cluster1 <- df1[df1$cluster == 1,]
cluster2 <- df1[df1$cluster == 2,]
cluster3 <- df1[df1$cluster == 3,]
cluster4 <- df1[df1$cluster == 4,]
cluster5 <- df1[df1$cluster == 5,]
cluster6 <- df1[df1$cluster == 6,]
cluster7 <- df1[df1$cluster == 7,]
#Trips by week day
require(gridExtra)
## Loading required package: gridExtra
## Warning: package 'gridExtra' was built under R version 4.1.1
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
wd_plot0 <- ggplot(data=df1, aes(x=Week_day)) + geom_bar() + ggtitle("Total sample")
wd_plot1 <- ggplot(data=cluster1, aes(x=Week_day)) + geom_bar() + ggtitle("Cluster 1")
wd_plot2 <- ggplot(data=cluster2, aes(x=Week_day)) + geom_bar() + ggtitle("Cluster 2")
wd_plot3 <- ggplot(data=cluster3, aes(x=Week_day)) + geom_bar() + ggtitle("Cluster 3")
wd_plot4 <- ggplot(data=cluster4, aes(x=Week_day)) + geom_bar() + ggtitle("Cluster 4")
wd_plot5 <- ggplot(data=cluster5, aes(x=Week_day)) + geom_bar() + ggtitle("Cluster 5")
wd_plot6 <- ggplot(data=cluster6, aes(x=Week_day)) + geom_bar() + ggtitle("Cluster 6")
wd_plot7 <- ggplot(data=cluster7, aes(x=Week_day)) + geom_bar() + ggtitle("Cluster 7")

#Summarize all charts in 2 rows
plot_grid(wd_plot0, wd_plot1,wd_plot2, wd_plot3, align = "h", nrow = 1)

plot_grid(wd_plot4, wd_plot5,wd_plot6, wd_plot7, align = "h", nrow = 1)

#Show each chart in full size
wd_plot0

wd_plot1

wd_plot2

wd_plot3

wd_plot4

wd_plot5

wd_plot6

wd_plot7

On total sample, people use Uber mostly on Tue, Sat and Fri respectively and they use least on Sun. Cluster 4 have the closer shape with total sample in which Tue, Fri and Sat are busiest and Sun is calmest. Cluster 2 is also close, but its Tue’s consumption is in 3rd place and much lower than in Fri and Sat. Cluster 1 and 7 share the same shape, in which Sat is significantly dominate the 1st place and Wed is the lowest consumption day. Cluster 5 and 6 share some similar pattern: Sun and Mon are the peak days whereas Wed and Sat are the lowest consumption days. Finally cluster 3, it seems to have its own unique trend where Tue and Thu are the highest consumption day and Sunday is the calmest day.

#Trips by hour
require(gridExtra)
hour_plot0 <- ggplot(data=df1, aes(x=Time_bin)) + geom_bar() + ggtitle("Total sample")
hour_plot1 <- ggplot(data=cluster1, aes(x=Time_bin)) + geom_bar() + ggtitle("Cluster 1")
hour_plot2 <- ggplot(data=cluster2, aes(x=Time_bin)) + geom_bar() + ggtitle("Cluster 2")
hour_plot3 <- ggplot(data=cluster3, aes(x=Time_bin)) + geom_bar() + ggtitle("Cluster 3")
hour_plot4 <- ggplot(data=cluster4, aes(x=Time_bin)) + geom_bar() + ggtitle("Cluster 4")
hour_plot5 <- ggplot(data=cluster5, aes(x=Time_bin)) + geom_bar() + ggtitle("Cluster 5")
hour_plot6 <- ggplot(data=cluster6, aes(x=Time_bin)) + geom_bar() + ggtitle("Cluster 6")
hour_plot7 <- ggplot(data=cluster7, aes(x=Time_bin)) + geom_bar() + ggtitle("Cluster 7")

#Summarize all charts in 2 rows
plot_grid(hour_plot0, hour_plot1,hour_plot2, hour_plot3, align = "h", nrow = 1)

plot_grid(hour_plot4, hour_plot5,hour_plot6, hour_plot7, align = "h", nrow = 1)

#Show each chart in full size
hour_plot0

hour_plot1

hour_plot2

hour_plot3

hour_plot4

hour_plot5

hour_plot6

hour_plot7

On total sample, people use Uber more in the evening (after 3pm) than in the morning (6am – 3pm). And they use the most in the evening off peak time. There is an assumption that people use Uber for non-business travel more than business commute. Cluster 1, 2, 3, 5, 6 share the same pattern with the total sample. For cluster 1, the evening peak is slightly higher than morning off-peak. For cluster 3, it is opposite with cluster 1, the evening peak is significantly higher than morning off-peak, and it almost reaches the evening off-peak consumption. For cluster 4, the morning off peak is not the calmest period in the day, it is the 2nd busiest time in the day, and the difference among 4 periods are not so big as in other clusters. For cluster 7, the morning off-peak is higher than evening peak time which makes this cluster unique.

Now I am going to apply the forcast for the dataset:

#Set 20% for testing
ind <- sample(nrow(main_z), 200000)
#Split train and test data
main_z[["train"]] <- TRUE
main_z[["train"]][ind] <- FALSE
#Do cluster again the train data
cl1 = kcca(main_z[main_z[["train"]]==TRUE, 1:2], k=7, kccaFamily("kmeans"))
## Found more than one class "kcca" in cache; using the first, from namespace 'flexclust'
## Also defined by 'kernlab'
## Found more than one class "kcca" in cache; using the first, from namespace 'flexclust'
## Also defined by 'kernlab'
cl1
## kcca object of family 'kmeans' 
## 
## call:
## kcca(x = main_z[main_z[["train"]] == TRUE, 1:2], k = 7, family = kccaFamily("kmeans"))
## 
## cluster sizes:
## 
##      1      2      3      4      5      6      7 
## 285003  83079  34190 355431  23883  14513   8000
# do the prediction
pred_train <- predict(cl1)
pred_test <- predict(cl1, newdata=main_z[main_z[["train"]]==FALSE, 1:2])
# visualize the output
image(cl1)
points(main_z[main_z[["train"]]==TRUE, 1:2], col=pred_train, pch=19, cex=0.3)
points(main_z[main_z[["train"]]==FALSE, 1:2], col=pred_test, pch=22, bg="orange")