setwd("G:\\My Drive\\UL\\Facultative homework")
df0 <- read.csv("uber-data.csv")
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.1.1
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.1.1
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(NbClust)
## Warning: package 'NbClust' was built under R version 4.1.1
library(stats)
library(factoextra)
library(flexclust)
## Warning: package 'flexclust' was built under R version 4.1.1
## Loading required package: grid
## Loading required package: lattice
## Loading required package: modeltools
## Warning: package 'modeltools' was built under R version 4.1.1
## Loading required package: stats4
library(fpc)
## Warning: package 'fpc' was built under R version 4.1.1
library(clustertend)
## Warning: package 'clustertend' was built under R version 4.1.1
library(cluster)
## Warning: package 'cluster' was built under R version 4.1.1
library(ClusterR)
## Warning: package 'ClusterR' was built under R version 4.1.1
## Loading required package: gtools
## Warning: package 'gtools' was built under R version 4.1.1
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.1.1
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v tibble 3.1.5 v dplyr 1.0.7
## v tidyr 1.1.4 v stringr 1.4.0
## v readr 2.0.2 v forcats 0.5.1
## v purrr 0.3.4
## Warning: package 'tibble' was built under R version 4.1.1
## Warning: package 'tidyr' was built under R version 4.1.1
## Warning: package 'readr' was built under R version 4.1.1
## Warning: package 'purrr' was built under R version 4.1.1
## Warning: package 'dplyr' was built under R version 4.1.1
## Warning: package 'stringr' was built under R version 4.1.1
## Warning: package 'forcats' was built under R version 4.1.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dendextend)
## Warning: package 'dendextend' was built under R version 4.1.1
##
## ---------------------
## Welcome to dendextend version 1.15.2
## Type citation('dendextend') for how to cite the package.
##
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
##
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## You may ask questions at stackoverflow, use the r and dendextend tags:
## https://stackoverflow.com/questions/tagged/dendextend
##
## To suppress this message use: suppressPackageStartupMessages(library(dendextend))
## ---------------------
##
## Attaching package: 'dendextend'
## The following object is masked from 'package:stats':
##
## cutree
library(NbClust)
library(patchwork)
## Warning: package 'patchwork' was built under R version 4.1.1
library(cowplot)
## Warning: package 'cowplot' was built under R version 4.1.1
##
## Attaching package: 'cowplot'
## The following object is masked from 'package:patchwork':
##
## align_plots
#Clean data
df0 = na.omit(df0)
#Remove duplicated observations and remove the last column
df <- df0[!duplicated(df0),c(1:3)]
#Summary
summary(df$Lon)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -74.77 -74.00 -73.98 -73.97 -73.96 -72.72
summary(df$Lat)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 39.99 40.72 40.74 40.74 40.76 41.35
#Select longitude and latitude
main <- df[2:3]
#z-score standardization
main_z <- data.frame(scale(main))
#Elbow method
opt<-Optimal_Clusters_KMeans(main_z, max_clusters=10, plot_clusters = TRUE)
Because of the large dataset, the Sillouette method doesn’t work well, so the Elbow method is used instead to diagnosed the optimal number of clusters. It seems that the range from 3 to 7 are acceptable, so I check below each of the situation to see which scenario is the most ideal.
# Calinski-Harabasz & Duda-Hart
km3<-kmeans(main_z, 3) # stats::
round(calinhara(main_z, km3$cluster),digits=2) #fpc::calinhara()
## [1] 507867.3
km4<-kmeans(main_z, 4) # stats::
round(calinhara(main_z, km4$cluster),digits=2) #fpc::calinhara()
## [1] 493220
km5<-kmeans(main_z, 5) # stats::
round(calinhara(main_z, km5$cluster),digits=2) #fpc::calinhara()
## [1] 488743.8
km6<-kmeans(main_z, 6) # stats::
round(calinhara(main_z, km6$cluster),digits=2) #fpc::calinhara()
## [1] 529267.8
km7<-kmeans(main_z, 7) # stats::
round(calinhara(main_z, km7$cluster),digits=2) #fpc::calinhara()
## [1] 568512.7
#Combine the cluster column to the main dataframe
df <- cbind(df, km7["cluster"])
head(df)
## Date.Time Lat Lon cluster
## 1 9/1/2014 0:01:00 40.2201 -74.0021 1
## 2 9/1/2014 0:01:00 40.7500 -74.0027 4
## 3 9/1/2014 0:03:00 40.7559 -73.9864 6
## 4 9/1/2014 0:06:00 40.7450 -73.9889 4
## 5 9/1/2014 0:11:00 40.8145 -73.9444 6
## 6 9/1/2014 0:12:00 40.6735 -73.9918 1
#Check the size of each cluster
km7$size
## [1] 103854 17992 42601 444152 9910 355754 29836
#Assign the df1 as the dataframe that I will manipulate
df1 <- df
#The date and time format is changed
df1$Date.Time <- strptime(df$Date.Time, format = "%m/%d/%Y %H:%M:%S")
head(df1)
## Date.Time Lat Lon cluster
## 1 2014-09-01 00:01:00 40.2201 -74.0021 1
## 2 2014-09-01 00:01:00 40.7500 -74.0027 4
## 3 2014-09-01 00:03:00 40.7559 -73.9864 6
## 4 2014-09-01 00:06:00 40.7450 -73.9889 4
## 5 2014-09-01 00:11:00 40.8145 -73.9444 6
## 6 2014-09-01 00:12:00 40.6735 -73.9918 1
df1$Day <- as.integer(format(df1$Date.Time, "%d"))
df1$Month <- as.integer(format(df1$Date.Time, "%m"))
df1$Year <- as.integer(format(df1$Date.Time, "%Y"))
df1$Week_day <- strftime(df1$Date.Time, "%A")
df1$Time_hour <- as.integer(format(df1$Date.Time, "%H"))
df1$Time_minute <- as.integer(format(df1$Date.Time, "%M"))
df1$Time_second <- as.integer(format(df1$Date.Time, "%S"))
df1$Time_bin <- ifelse(df1$Time_hour >= 6 & df1$Time_hour < 10,"Morning peak", ifelse(df1$Time_hour>= 10 & df1$Time_hour < 15, "Morning off-peak", ifelse(df1$Time_hour >= 15 & df1$Time_hour < 19, "Evening peak","Evening off-peak" )))
summary(df1)
## Date.Time Lat Lon cluster
## Min. :2014-09-01 00:00:00 Min. :39.99 Min. :-74.77 Min. :1.000
## 1st Qu.:2014-09-08 19:04:00 1st Qu.:40.72 1st Qu.:-74.00 1st Qu.:4.000
## Median :2014-09-16 06:38:00 Median :40.74 Median :-73.98 Median :4.000
## Mean :2014-09-16 03:30:18 Mean :40.74 Mean :-73.97 Mean :4.419
## 3rd Qu.:2014-09-23 13:20:00 3rd Qu.:40.76 3rd Qu.:-73.96 3rd Qu.:6.000
## Max. :2014-09-30 22:59:00 Max. :41.35 Max. :-72.72 Max. :7.000
## Day Month Year Week_day Time_hour
## Min. : 1.00 Min. :9 Min. :2014 Length:1004099 Min. : 0.00
## 1st Qu.: 8.00 1st Qu.:9 1st Qu.:2014 Class :character 1st Qu.:10.00
## Median :16.00 Median :9 Median :2014 Mode :character Median :15.00
## Mean :15.54 Mean :9 Mean :2014 Mean :14.09
## 3rd Qu.:23.00 3rd Qu.:9 3rd Qu.:2014 3rd Qu.:19.00
## Max. :30.00 Max. :9 Max. :2014 Max. :23.00
## Time_minute Time_second Time_bin
## Min. : 0.00 Min. :0 Length:1004099
## 1st Qu.:14.00 1st Qu.:0 Class :character
## Median :29.00 Median :0 Mode :character
## Mean :29.36 Mean :0
## 3rd Qu.:44.00 3rd Qu.:0
## Max. :59.00 Max. :0
Following the suggestion of Wikipedia about the rush hours, I create 4 bins to analyze time. 6am - 10am is morning peak time, 10am - 3pm is morning off-peak, 3pm - 7pm is evening peak time and 7pm to 6am the next day is evening off-peak time. In addition, when checking the summary of year and month, all observations are collected in September 2014, therefore, I will not analyze about the month and year, rather I will focus on the week days and time bins as explained above.
#Break the total data to 7 sets based on clusters
cluster1 <- df1[df1$cluster == 1,]
cluster2 <- df1[df1$cluster == 2,]
cluster3 <- df1[df1$cluster == 3,]
cluster4 <- df1[df1$cluster == 4,]
cluster5 <- df1[df1$cluster == 5,]
cluster6 <- df1[df1$cluster == 6,]
cluster7 <- df1[df1$cluster == 7,]
#Trips by week day
require(gridExtra)
## Loading required package: gridExtra
## Warning: package 'gridExtra' was built under R version 4.1.1
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
wd_plot0 <- ggplot(data=df1, aes(x=Week_day)) + geom_bar() + ggtitle("Total sample")
wd_plot1 <- ggplot(data=cluster1, aes(x=Week_day)) + geom_bar() + ggtitle("Cluster 1")
wd_plot2 <- ggplot(data=cluster2, aes(x=Week_day)) + geom_bar() + ggtitle("Cluster 2")
wd_plot3 <- ggplot(data=cluster3, aes(x=Week_day)) + geom_bar() + ggtitle("Cluster 3")
wd_plot4 <- ggplot(data=cluster4, aes(x=Week_day)) + geom_bar() + ggtitle("Cluster 4")
wd_plot5 <- ggplot(data=cluster5, aes(x=Week_day)) + geom_bar() + ggtitle("Cluster 5")
wd_plot6 <- ggplot(data=cluster6, aes(x=Week_day)) + geom_bar() + ggtitle("Cluster 6")
wd_plot7 <- ggplot(data=cluster7, aes(x=Week_day)) + geom_bar() + ggtitle("Cluster 7")
#Summarize all charts in 2 rows
plot_grid(wd_plot0, wd_plot1,wd_plot2, wd_plot3, align = "h", nrow = 1)
plot_grid(wd_plot4, wd_plot5,wd_plot6, wd_plot7, align = "h", nrow = 1)
#Show each chart in full size
wd_plot0
wd_plot1
wd_plot2
wd_plot3
wd_plot4
wd_plot5
wd_plot6
wd_plot7
On total sample, people use Uber mostly on Tue, Sat and Fri respectively and they use least on Sun. Cluster 4 have the closer shape with total sample in which Tue, Fri and Sat are busiest and Sun is calmest. Cluster 2 is also close, but its Tue’s consumption is in 3rd place and much lower than in Fri and Sat. Cluster 1 and 7 share the same shape, in which Sat is significantly dominate the 1st place and Wed is the lowest consumption day. Cluster 5 and 6 share some similar pattern: Sun and Mon are the peak days whereas Wed and Sat are the lowest consumption days. Finally cluster 3, it seems to have its own unique trend where Tue and Thu are the highest consumption day and Sunday is the calmest day.
#Trips by hour
require(gridExtra)
hour_plot0 <- ggplot(data=df1, aes(x=Time_bin)) + geom_bar() + ggtitle("Total sample")
hour_plot1 <- ggplot(data=cluster1, aes(x=Time_bin)) + geom_bar() + ggtitle("Cluster 1")
hour_plot2 <- ggplot(data=cluster2, aes(x=Time_bin)) + geom_bar() + ggtitle("Cluster 2")
hour_plot3 <- ggplot(data=cluster3, aes(x=Time_bin)) + geom_bar() + ggtitle("Cluster 3")
hour_plot4 <- ggplot(data=cluster4, aes(x=Time_bin)) + geom_bar() + ggtitle("Cluster 4")
hour_plot5 <- ggplot(data=cluster5, aes(x=Time_bin)) + geom_bar() + ggtitle("Cluster 5")
hour_plot6 <- ggplot(data=cluster6, aes(x=Time_bin)) + geom_bar() + ggtitle("Cluster 6")
hour_plot7 <- ggplot(data=cluster7, aes(x=Time_bin)) + geom_bar() + ggtitle("Cluster 7")
#Summarize all charts in 2 rows
plot_grid(hour_plot0, hour_plot1,hour_plot2, hour_plot3, align = "h", nrow = 1)
plot_grid(hour_plot4, hour_plot5,hour_plot6, hour_plot7, align = "h", nrow = 1)
#Show each chart in full size
hour_plot0
hour_plot1
hour_plot2
hour_plot3
hour_plot4
hour_plot5
hour_plot6
hour_plot7
On total sample, people use Uber more in the evening (after 3pm) than in the morning (6am – 3pm). And they use the most in the evening off peak time. There is an assumption that people use Uber for non-business travel more than business commute. Cluster 1, 2, 3, 5, 6 share the same pattern with the total sample. For cluster 1, the evening peak is slightly higher than morning off-peak. For cluster 3, it is opposite with cluster 1, the evening peak is significantly higher than morning off-peak, and it almost reaches the evening off-peak consumption. For cluster 4, the morning off peak is not the calmest period in the day, it is the 2nd busiest time in the day, and the difference among 4 periods are not so big as in other clusters. For cluster 7, the morning off-peak is higher than evening peak time which makes this cluster unique.
Now I am going to apply the forcast for the dataset:
#Set 20% for testing
ind <- sample(nrow(main_z), 200000)
#Split train and test data
main_z[["train"]] <- TRUE
main_z[["train"]][ind] <- FALSE
#Do cluster again the train data
cl1 = kcca(main_z[main_z[["train"]]==TRUE, 1:2], k=7, kccaFamily("kmeans"))
## Found more than one class "kcca" in cache; using the first, from namespace 'flexclust'
## Also defined by 'kernlab'
## Found more than one class "kcca" in cache; using the first, from namespace 'flexclust'
## Also defined by 'kernlab'
cl1
## kcca object of family 'kmeans'
##
## call:
## kcca(x = main_z[main_z[["train"]] == TRUE, 1:2], k = 7, family = kccaFamily("kmeans"))
##
## cluster sizes:
##
## 1 2 3 4 5 6 7
## 285003 83079 34190 355431 23883 14513 8000
# do the prediction
pred_train <- predict(cl1)
pred_test <- predict(cl1, newdata=main_z[main_z[["train"]]==FALSE, 1:2])
# visualize the output
image(cl1)
points(main_z[main_z[["train"]]==TRUE, 1:2], col=pred_train, pch=19, cex=0.3)
points(main_z[main_z[["train"]]==FALSE, 1:2], col=pred_test, pch=22, bg="orange")