Introduction

This is exploratory analysis of the Kaggle New York City Taxi Trip Duration dataset which can be found at the following link: https://www.kaggle.com/c/nyc-taxi-trip-duration/data

In this markdown the data is loaded and cleaned and exploratory visualisation of the pickup and drop off locations is done. K-means clustering is then used on the pick up and drop off coordinates to develop a route feature matrix. A regression model is trained on these features along with data on the pick up time and the month the trip took place in.

Load data

First required packages and data are loaded.

setwd("~/Kaggle/Taxis")

library(data.table)
library(ggplot2)
library(lubridate)
library(knitr)
library(leaflet)

test_kaggle = fread("test.csv", verbose = F, showProgress = F)
train_kaggle = fread("train.csv", verbose = F, showProgress = F)

Examining the top rows of the data:

head(train_kaggle)
##           id vendor_id     pickup_datetime    dropoff_datetime
## 1: id2875421         2 2016-03-14 17:24:55 2016-03-14 17:32:30
## 2: id2377394         1 2016-06-12 00:43:35 2016-06-12 00:54:38
## 3: id3858529         2 2016-01-19 11:35:24 2016-01-19 12:10:48
## 4: id3504673         2 2016-04-06 19:32:31 2016-04-06 19:39:40
## 5: id2181028         2 2016-03-26 13:30:55 2016-03-26 13:38:10
## 6: id0801584         2 2016-01-30 22:01:40 2016-01-30 22:09:03
##    passenger_count pickup_longitude pickup_latitude dropoff_longitude
## 1:               1        -73.98215        40.76794         -73.96463
## 2:               1        -73.98042        40.73856         -73.99948
## 3:               1        -73.97903        40.76394         -74.00533
## 4:               1        -74.01004        40.71997         -74.01227
## 5:               1        -73.97305        40.79321         -73.97292
## 6:               6        -73.98286        40.74220         -73.99208
##    dropoff_latitude store_and_fwd_flag trip_duration
## 1:         40.76560                  N           455
## 2:         40.73115                  N           663
## 3:         40.71009                  N          2124
## 4:         40.70672                  N           429
## 5:         40.78252                  N           435
## 6:         40.74918                  N           443

Add Features

For each trip the distance (km) is calculated and added to train and test dataframe. The average speed (km/h) is then calculated and added to the dataframe. The hour of pick up and month the trip took place are taken from the pick up time/date data and added.

# function to get distance between points
distance = function(lat1, long1, lat2, long2) {
    deglen = 110.25
    x = lat1 - lat2
    y = (long1 - long2) * cos(lat1)
    return(deglen * sqrt(x * x + y * y))
}

# Add distance to train_kaggle data
train_kaggle$distance = distance(train_kaggle$pickup_latitude, train_kaggle$pickup_longitude, 
    train_kaggle$dropoff_latitude, train_kaggle$dropoff_longitude)
test_kaggle$distance = distance(test_kaggle$pickup_latitude, test_kaggle$pickup_longitude, 
    test_kaggle$dropoff_latitude, test_kaggle$dropoff_longitude)

# Add speed
train_kaggle$speed = train_kaggle$distance/(train_kaggle$trip_duration/(60 * 
    60))
test_kaggle$speed = test_kaggle$distance/(test_kaggle$trip_duration/(60 * 60))

# Add hour of pickup

# To train data
t.lub.train <- ymd_hms(train_kaggle$pickup_datetime)

train_kaggle$pickup_hour <- as.numeric(format(t.lub.train, "%H")) + as.numeric(format(t.lub.train, 
    "%M"))/60

# To test data
t.lub.test <- ymd_hms(test_kaggle$pickup_datetime)

test_kaggle$pickup_hour <- as.numeric(format(t.lub.test, "%H")) + as.numeric(format(t.lub.test, 
    "%M"))/60

# Add month
train_kaggle$month <- as.factor(format(t.lub.train, "%m"))
test_kaggle$month <- as.factor(format(t.lub.test, "%m"))

Looking at the top rows of the data now:

head(train_kaggle)
##           id vendor_id     pickup_datetime    dropoff_datetime
## 1: id2875421         2 2016-03-14 17:24:55 2016-03-14 17:32:30
## 2: id2377394         1 2016-06-12 00:43:35 2016-06-12 00:54:38
## 3: id3858529         2 2016-01-19 11:35:24 2016-01-19 12:10:48
## 4: id3504673         2 2016-04-06 19:32:31 2016-04-06 19:39:40
## 5: id2181028         2 2016-03-26 13:30:55 2016-03-26 13:38:10
## 6: id0801584         2 2016-01-30 22:01:40 2016-01-30 22:09:03
##    passenger_count pickup_longitude pickup_latitude dropoff_longitude
## 1:               1        -73.98215        40.76794         -73.96463
## 2:               1        -73.98042        40.73856         -73.99948
## 3:               1        -73.97903        40.76394         -74.00533
## 4:               1        -74.01004        40.71997         -74.01227
## 5:               1        -73.97305        40.79321         -73.97292
## 6:               6        -73.98286        40.74220         -73.99208
##    dropoff_latitude store_and_fwd_flag trip_duration distance     speed
## 1:         40.76560                  N           455 1.944101 15.381898
## 2:         40.73115                  N           663 2.245056 12.190349
## 3:         40.71009                  N          2124 6.603957 11.193147
## 4:         40.70672                  N           429 1.481267 12.430211
## 5:         40.78252                  N           435 1.178525  9.753308
## 6:         40.74918                  N           443 1.271931 10.336236
##    pickup_hour month
## 1:  17.4000000    03
## 2:   0.7166667    06
## 3:  11.5833333    01
## 4:  19.5333333    04
## 5:  13.5000000    03
## 6:  22.0166667    01

Examining data for outliers

par(mfrow = c(1,2))
hist(train_kaggle$trip_duration/60,main='Duration (Minutes)',xlim = c(0,80),breaks = c(0,10,20,30,40,50,60,70,80,500000))
boxplot(train_kaggle$trip_duration/60,main='Duration (Minutes)',ylim=c(0,80))

hist(train_kaggle$distance,main = 'Distance (km)',breaks=c(0,5,10,15,20,25,30,35,500000),xlim=c(0,40))
boxplot(train_kaggle$distance,main = 'Distance (km)',ylim=c(0,20))

The above graphs show the presence of a number of outliers. We can remove any items with distance > 20 km and duration > 1 hour, as well as items showing a distance of 0 km or 0 minutes.

train_kaggle = train_kaggle[train_kaggle$distance != 0, ]
train_kaggle = train_kaggle[train_kaggle$trip_duration != 0, ]
train_kaggle = train_kaggle[train_kaggle$distance < 20, ]
train_kaggle = train_kaggle[train_kaggle$trip_duration < 3600, ]

Exploratory Plots

The ggplot2 package is used to visualise the data.

A sample set is used to plot data. The duration versus time of pickup is shown below. Time of day has a clear effect on trip duration with trips in the early morning (3 AM to 6 AM) being quicker on average.

The code used to generate the first plot is shown below.

1. Trip Duration by Time of Day

# Take sample data set
sample_data = train_kaggle[sample(1:dim(train_kaggle)[1], 20000), ]

# Plot
ggplot(mapping = aes(x = sample_data$pickup_hour, y = sample_data$trip_duration/60)) + 
    geom_point(cex = 0.2, alpha = 0.2) + geom_smooth() + scale_x_continuous(breaks = 0:24) + 
    xlab(label = "Pickup time") + ylab(label = "Trip average duration") + theme(legend.position = "none") + 
    ggtitle("Trip duration by time of Day") + scale_y_continuous(limits = c(0, 
    30))

2. Trip Duration versus Distance

Trip duration is plotted versus distance.

3. Pickup and drop off locations

100,000 pick up locations are plotted.

100,000 drop off locations are plotted.

Leaflet Sample Display

A map view of 1000 random Pickup locations using leaflet is shown below:

sample_data = train_kaggle[sample(1:dim(train_kaggle)[1],1000),]

leaflet() %>%
  addTiles() %>%  # Add default OpenStreetMap map tiles
  addAwesomeMarkers(
    lng=sample_data$pickup_longitude, 
    lat=sample_data$pickup_latitude,
    clusterOptions = markerClusterOptions()
    
  )

Map view of corresponding drop off locations using leaflet

leaflet() %>%
  addTiles() %>%  # Add default OpenStreetMap map tiles
  addAwesomeMarkers(
    lng=sample_data$dropoff_longitude, 
    lat=sample_data$dropoff_latitude,
    clusterOptions = markerClusterOptions()
  )

This is a useful tool to get an idea of the distribution of locations.

Dummy Variable Creation

As there is not a linear relationship between the time of day and trip duration dummy variables for hour of pickup are created for regression.

# Training Data
hour.dummies = model.matrix(~cut(train_kaggle$pickup_hour,breaks = c(0,3,6,7,8,9,19,21,24),include.lowest = T))
hour.dummies = hour.dummies[,-1]
colnames(hour.dummies)=paste('hr',c(3,6,7,8,9,19,21))
hour.dummies = hour.dummies==1
train_kaggle = cbind(train_kaggle,hour.dummies)


#Test Data
hour.dummies.test = model.matrix(~cut(test_kaggle$pickup_hour,breaks = c(0,3,6,7,8,9,19,21,24),include.lowest = T))
hour.dummies.test = hour.dummies.test[,-1]
colnames(hour.dummies.test)=paste('hr',c(3,6,7,8,9,19,21))
hour.dummies.test = hour.dummies.test==1
test_kaggle = cbind(test_kaggle,hour.dummies.test)

Similarly for the month:

# Train data
month.dummies = model.matrix(~train_kaggle$month)
month.dummies = month.dummies[,-1]
colnames(month.dummies)=paste('mnth',2:6)
month.dummies = month.dummies==1
train_kaggle = cbind(train_kaggle,month.dummies)

# test data
month.dummies = model.matrix(~test_kaggle$month)
month.dummies = month.dummies[,-1]
colnames(month.dummies)=paste('mnth',2:6)
month.dummies = month.dummies==1
test_kaggle = cbind(test_kaggle,month.dummies)

The data now looks like this:

head(train_kaggle)
##           id vendor_id     pickup_datetime    dropoff_datetime
## 1: id2875421         2 2016-03-14 17:24:55 2016-03-14 17:32:30
## 2: id2377394         1 2016-06-12 00:43:35 2016-06-12 00:54:38
## 3: id3858529         2 2016-01-19 11:35:24 2016-01-19 12:10:48
## 4: id3504673         2 2016-04-06 19:32:31 2016-04-06 19:39:40
## 5: id2181028         2 2016-03-26 13:30:55 2016-03-26 13:38:10
## 6: id0801584         2 2016-01-30 22:01:40 2016-01-30 22:09:03
##    passenger_count pickup_longitude pickup_latitude dropoff_longitude
## 1:               1        -73.98215        40.76794         -73.96463
## 2:               1        -73.98042        40.73856         -73.99948
## 3:               1        -73.97903        40.76394         -74.00533
## 4:               1        -74.01004        40.71997         -74.01227
## 5:               1        -73.97305        40.79321         -73.97292
## 6:               6        -73.98286        40.74220         -73.99208
##    dropoff_latitude store_and_fwd_flag trip_duration distance     speed
## 1:         40.76560                  N           455 1.944101 15.381898
## 2:         40.73115                  N           663 2.245056 12.190349
## 3:         40.71009                  N          2124 6.603957 11.193147
## 4:         40.70672                  N           429 1.481267 12.430211
## 5:         40.78252                  N           435 1.178525  9.753308
## 6:         40.74918                  N           443 1.271931 10.336236
##    pickup_hour month  hr 3  hr 6  hr 7  hr 8  hr 9 hr 19 hr 21 mnth 2
## 1:  17.4000000    03 FALSE FALSE FALSE FALSE  TRUE FALSE FALSE  FALSE
## 2:   0.7166667    06 FALSE FALSE FALSE FALSE FALSE FALSE FALSE  FALSE
## 3:  11.5833333    01 FALSE FALSE FALSE FALSE  TRUE FALSE FALSE  FALSE
## 4:  19.5333333    04 FALSE FALSE FALSE FALSE FALSE  TRUE FALSE  FALSE
## 5:  13.5000000    03 FALSE FALSE FALSE FALSE  TRUE FALSE FALSE  FALSE
## 6:  22.0166667    01 FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  FALSE
##    mnth 3 mnth 4 mnth 5 mnth 6
## 1:   TRUE  FALSE  FALSE  FALSE
## 2:  FALSE  FALSE  FALSE   TRUE
## 3:  FALSE  FALSE  FALSE  FALSE
## 4:  FALSE   TRUE  FALSE  FALSE
## 5:   TRUE  FALSE  FALSE  FALSE
## 6:  FALSE  FALSE  FALSE  FALSE

Clustering of pickup and drop off Locations

So far features collected for regression include distance, time of day and month of year. In order to improve prediction accuracy the route should be taken into account. This can be done by clustering drop off and pick up locations and including combinations as dummy variables.

First k-means clustering of pickup and drop off locations, with 50 clusters for each. Test data is included with training data for clustering to ensure test data has the cluster information available for prediction.

# Define number of clusters
cen = 10

pickup = rbind(cbind(train_kaggle$pickup_longitude,train_kaggle$pickup_latitude),cbind(test_kaggle$pickup_longitude,test_kaggle$pickup_latitude))
clus_pickup = kmeans(pickup, centers = cen, nstart = 1)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 101712550)
train_kaggle$pickupclus = as.factor(clus_pickup$cluster[1:dim(train_kaggle)[1]])
test_kaggle$pickupclus = as.factor(clus_pickup$cluster[(dim(train_kaggle)[1]+1):length(clus_pickup$cluster)])



# cluster locations for dropoff

dropoff = rbind(cbind(train_kaggle$dropoff_longitude,train_kaggle$dropoff_latitude),cbind(test_kaggle$dropoff_longitude,test_kaggle$dropoff_latitude))
clus_dropoff = kmeans(dropoff, centers = cen, nstart = 1)

train_kaggle$dropoffclus = as.factor(clus_dropoff$cluster[1:dim(train_kaggle)[1]])
test_kaggle$dropoffclus = as.factor(clus_dropoff$cluster[(dim(train_kaggle)[1]+1):length(clus_dropoff$cluster)])

The pick up and drop off locations are plotted below coloured by cluster:

sample_data = train_kaggle[sample(1:dim(train_kaggle)[1],100000),]

ggplot(mapping=aes(x=sample_data$pickup_longitude,y=sample_data$pickup_latitude))+
  geom_point(alpha=.2,cex=0.0001,aes(colour=sample_data$pickupclus))+
  scale_x_continuous(limits = c(-74.02,-73.85))+
  scale_y_continuous(limits = c(40.7,40.85))+
  ggtitle('Pick up locations clustered')+
  guides(fill=F)+
  xlab('Longitude')+ylab('Latitude')+ggtitle('Plot of pick up locations with clustering')+
  theme(legend.position = 'none')

# dropoff location by cluster----
  ggplot(mapping=aes(x=sample_data$dropoff_longitude,y=sample_data$dropoff_latitude))+
    geom_point(alpha=.2,cex=0.0001,aes(colour=sample_data$dropoffclus))+
    scale_x_continuous(limits = c(-74.02,-73.85))+
    scale_y_continuous(limits = c(40.7,40.85))+
  ggtitle('Drop off locations clustered')+
  theme(legend.position = 'none')+
  xlab('Longitude')+ylab('Latitude')+ggtitle('Plot of drop off locations with clustering')

We can now create dummy variables for routes, eg. for cluster 2 to cluster 11 the variable will be 2/11 and any route going between these clusters will have the boolean for this feature set as true.

# build dummys for one cluster to another
comb=as.factor(paste(as.character(clus_pickup$cluster),as.character(clus_dropoff$cluster),sep='/'))
route.dummies = (model.matrix(~comb))

route.dummies=route.dummies==1



# Add to train and test data

train_kaggle = cbind(train_kaggle,route.dummies[1:dim(train_kaggle)[1],])
test_kaggle = cbind(test_kaggle,route.dummies[(dim(train_kaggle)[1]+1):length(clus_pickup$cluster),])

Now a number of features are removed to allow model training only using dummy variables and distance (km)

model.data = train_kaggle[,-c(1:10,13,14,15,28,29,30)]

test.model.data = test_kaggle[,-c(1:9,11,12,13,27,28,29)]

Fit Model

Linear regression is used on a training set of 90% of training set provided by kaggle. Firstly a model is produced using only the distance variable to get a baseline idea of performance.

n=0.9
N = dim(model.data)[1]
train_ind = sample(1:N,n*N)
test_ind = setdiff(1:N,train_ind)
  
  train = model.data[train_ind,]
  test = model.data[test_ind,]
  
  #Basic Model
  fitlm = lm(trip_duration~distance, data=train)
  pred=predict.lm(fitlm,newdata = test)
  
  pred[pred<0] = 5
  pred[is.na(pred)]=0

Have a look at results:

plot(test$trip_duration[1:50]/60,type='h',ylab = 'duration (minutes)')
points(pred[1:50]/60,col='red',pch=15)

Generally the predictions do not appear to be very good. The model seems to have low variance, high bias (as expected for a simple linear model).

Linear regression is now used on the full set of features, for the same set (90%).

  fitlm = lm(trip_duration~., data=train)
  pred=predict.lm(fitlm,newdata = test)
## Warning in predict.lm(fitlm, newdata = test): prediction from a rank-
## deficient fit may be misleading
  pred[pred<0] = 5
  pred[is.na(pred)]=0

Have a look at results:

plot(test$trip_duration[1:50]/60,type='h',ylab = 'duration (minutes)')
points(pred[1:50]/60,col='red',pch=15)

The plot shows that predictions in red generally agree with the actual duration in black, certainly outperforming the basic distance model.

Now using the model on the kaggle test set:

  pred = predict.lm(fitlm,newdata = test_kaggle)
## Warning in predict.lm(fitlm, newdata = test_kaggle): prediction from a
## rank-deficient fit may be misleading
  pred[pred<0] = 2
  pred[is.na(pred)]=0
  test_kaggle$trip_duration=pred
out = test_kaggle[,c(1,128)]


write.csv(out,file='results.csv',row.names = F)