This is exploratory analysis of the Kaggle New York City Taxi Trip Duration dataset which can be found at the following link: https://www.kaggle.com/c/nyc-taxi-trip-duration/data
In this markdown the data is loaded and cleaned and exploratory visualisation of the pickup and drop off locations is done. K-means clustering is then used on the pick up and drop off coordinates to develop a route feature matrix. A regression model is trained on these features along with data on the pick up time and the month the trip took place in.
First required packages and data are loaded.
setwd("~/Kaggle/Taxis")
library(data.table)
library(ggplot2)
library(lubridate)
library(knitr)
library(leaflet)
test_kaggle = fread("test.csv", verbose = F, showProgress = F)
train_kaggle = fread("train.csv", verbose = F, showProgress = F)
Examining the top rows of the data:
head(train_kaggle)
## id vendor_id pickup_datetime dropoff_datetime
## 1: id2875421 2 2016-03-14 17:24:55 2016-03-14 17:32:30
## 2: id2377394 1 2016-06-12 00:43:35 2016-06-12 00:54:38
## 3: id3858529 2 2016-01-19 11:35:24 2016-01-19 12:10:48
## 4: id3504673 2 2016-04-06 19:32:31 2016-04-06 19:39:40
## 5: id2181028 2 2016-03-26 13:30:55 2016-03-26 13:38:10
## 6: id0801584 2 2016-01-30 22:01:40 2016-01-30 22:09:03
## passenger_count pickup_longitude pickup_latitude dropoff_longitude
## 1: 1 -73.98215 40.76794 -73.96463
## 2: 1 -73.98042 40.73856 -73.99948
## 3: 1 -73.97903 40.76394 -74.00533
## 4: 1 -74.01004 40.71997 -74.01227
## 5: 1 -73.97305 40.79321 -73.97292
## 6: 6 -73.98286 40.74220 -73.99208
## dropoff_latitude store_and_fwd_flag trip_duration
## 1: 40.76560 N 455
## 2: 40.73115 N 663
## 3: 40.71009 N 2124
## 4: 40.70672 N 429
## 5: 40.78252 N 435
## 6: 40.74918 N 443
For each trip the distance (km) is calculated and added to train and test dataframe. The average speed (km/h) is then calculated and added to the dataframe. The hour of pick up and month the trip took place are taken from the pick up time/date data and added.
# function to get distance between points
distance = function(lat1, long1, lat2, long2) {
deglen = 110.25
x = lat1 - lat2
y = (long1 - long2) * cos(lat1)
return(deglen * sqrt(x * x + y * y))
}
# Add distance to train_kaggle data
train_kaggle$distance = distance(train_kaggle$pickup_latitude, train_kaggle$pickup_longitude,
train_kaggle$dropoff_latitude, train_kaggle$dropoff_longitude)
test_kaggle$distance = distance(test_kaggle$pickup_latitude, test_kaggle$pickup_longitude,
test_kaggle$dropoff_latitude, test_kaggle$dropoff_longitude)
# Add speed
train_kaggle$speed = train_kaggle$distance/(train_kaggle$trip_duration/(60 *
60))
test_kaggle$speed = test_kaggle$distance/(test_kaggle$trip_duration/(60 * 60))
# Add hour of pickup
# To train data
t.lub.train <- ymd_hms(train_kaggle$pickup_datetime)
train_kaggle$pickup_hour <- as.numeric(format(t.lub.train, "%H")) + as.numeric(format(t.lub.train,
"%M"))/60
# To test data
t.lub.test <- ymd_hms(test_kaggle$pickup_datetime)
test_kaggle$pickup_hour <- as.numeric(format(t.lub.test, "%H")) + as.numeric(format(t.lub.test,
"%M"))/60
# Add month
train_kaggle$month <- as.factor(format(t.lub.train, "%m"))
test_kaggle$month <- as.factor(format(t.lub.test, "%m"))
Looking at the top rows of the data now:
head(train_kaggle)
## id vendor_id pickup_datetime dropoff_datetime
## 1: id2875421 2 2016-03-14 17:24:55 2016-03-14 17:32:30
## 2: id2377394 1 2016-06-12 00:43:35 2016-06-12 00:54:38
## 3: id3858529 2 2016-01-19 11:35:24 2016-01-19 12:10:48
## 4: id3504673 2 2016-04-06 19:32:31 2016-04-06 19:39:40
## 5: id2181028 2 2016-03-26 13:30:55 2016-03-26 13:38:10
## 6: id0801584 2 2016-01-30 22:01:40 2016-01-30 22:09:03
## passenger_count pickup_longitude pickup_latitude dropoff_longitude
## 1: 1 -73.98215 40.76794 -73.96463
## 2: 1 -73.98042 40.73856 -73.99948
## 3: 1 -73.97903 40.76394 -74.00533
## 4: 1 -74.01004 40.71997 -74.01227
## 5: 1 -73.97305 40.79321 -73.97292
## 6: 6 -73.98286 40.74220 -73.99208
## dropoff_latitude store_and_fwd_flag trip_duration distance speed
## 1: 40.76560 N 455 1.944101 15.381898
## 2: 40.73115 N 663 2.245056 12.190349
## 3: 40.71009 N 2124 6.603957 11.193147
## 4: 40.70672 N 429 1.481267 12.430211
## 5: 40.78252 N 435 1.178525 9.753308
## 6: 40.74918 N 443 1.271931 10.336236
## pickup_hour month
## 1: 17.4000000 03
## 2: 0.7166667 06
## 3: 11.5833333 01
## 4: 19.5333333 04
## 5: 13.5000000 03
## 6: 22.0166667 01
Examining data for outliers
par(mfrow = c(1,2))
hist(train_kaggle$trip_duration/60,main='Duration (Minutes)',xlim = c(0,80),breaks = c(0,10,20,30,40,50,60,70,80,500000))
boxplot(train_kaggle$trip_duration/60,main='Duration (Minutes)',ylim=c(0,80))
hist(train_kaggle$distance,main = 'Distance (km)',breaks=c(0,5,10,15,20,25,30,35,500000),xlim=c(0,40))
boxplot(train_kaggle$distance,main = 'Distance (km)',ylim=c(0,20))
The above graphs show the presence of a number of outliers. We can remove any items with distance > 20 km and duration > 1 hour, as well as items showing a distance of 0 km or 0 minutes.
train_kaggle = train_kaggle[train_kaggle$distance != 0, ]
train_kaggle = train_kaggle[train_kaggle$trip_duration != 0, ]
train_kaggle = train_kaggle[train_kaggle$distance < 20, ]
train_kaggle = train_kaggle[train_kaggle$trip_duration < 3600, ]
The ggplot2 package is used to visualise the data.
A sample set is used to plot data. The duration versus time of pickup is shown below. Time of day has a clear effect on trip duration with trips in the early morning (3 AM to 6 AM) being quicker on average.
The code used to generate the first plot is shown below.
# Take sample data set
sample_data = train_kaggle[sample(1:dim(train_kaggle)[1], 20000), ]
# Plot
ggplot(mapping = aes(x = sample_data$pickup_hour, y = sample_data$trip_duration/60)) +
geom_point(cex = 0.2, alpha = 0.2) + geom_smooth() + scale_x_continuous(breaks = 0:24) +
xlab(label = "Pickup time") + ylab(label = "Trip average duration") + theme(legend.position = "none") +
ggtitle("Trip duration by time of Day") + scale_y_continuous(limits = c(0,
30))
Trip duration is plotted versus distance.
100,000 pick up locations are plotted.
100,000 drop off locations are plotted.
A map view of 1000 random Pickup locations using leaflet is shown below:
sample_data = train_kaggle[sample(1:dim(train_kaggle)[1],1000),]
leaflet() %>%
addTiles() %>% # Add default OpenStreetMap map tiles
addAwesomeMarkers(
lng=sample_data$pickup_longitude,
lat=sample_data$pickup_latitude,
clusterOptions = markerClusterOptions()
)
Map view of corresponding drop off locations using leaflet
leaflet() %>%
addTiles() %>% # Add default OpenStreetMap map tiles
addAwesomeMarkers(
lng=sample_data$dropoff_longitude,
lat=sample_data$dropoff_latitude,
clusterOptions = markerClusterOptions()
)
This is a useful tool to get an idea of the distribution of locations.
As there is not a linear relationship between the time of day and trip duration dummy variables for hour of pickup are created for regression.
# Training Data
hour.dummies = model.matrix(~cut(train_kaggle$pickup_hour,breaks = c(0,3,6,7,8,9,19,21,24),include.lowest = T))
hour.dummies = hour.dummies[,-1]
colnames(hour.dummies)=paste('hr',c(3,6,7,8,9,19,21))
hour.dummies = hour.dummies==1
train_kaggle = cbind(train_kaggle,hour.dummies)
#Test Data
hour.dummies.test = model.matrix(~cut(test_kaggle$pickup_hour,breaks = c(0,3,6,7,8,9,19,21,24),include.lowest = T))
hour.dummies.test = hour.dummies.test[,-1]
colnames(hour.dummies.test)=paste('hr',c(3,6,7,8,9,19,21))
hour.dummies.test = hour.dummies.test==1
test_kaggle = cbind(test_kaggle,hour.dummies.test)
Similarly for the month:
# Train data
month.dummies = model.matrix(~train_kaggle$month)
month.dummies = month.dummies[,-1]
colnames(month.dummies)=paste('mnth',2:6)
month.dummies = month.dummies==1
train_kaggle = cbind(train_kaggle,month.dummies)
# test data
month.dummies = model.matrix(~test_kaggle$month)
month.dummies = month.dummies[,-1]
colnames(month.dummies)=paste('mnth',2:6)
month.dummies = month.dummies==1
test_kaggle = cbind(test_kaggle,month.dummies)
The data now looks like this:
head(train_kaggle)
## id vendor_id pickup_datetime dropoff_datetime
## 1: id2875421 2 2016-03-14 17:24:55 2016-03-14 17:32:30
## 2: id2377394 1 2016-06-12 00:43:35 2016-06-12 00:54:38
## 3: id3858529 2 2016-01-19 11:35:24 2016-01-19 12:10:48
## 4: id3504673 2 2016-04-06 19:32:31 2016-04-06 19:39:40
## 5: id2181028 2 2016-03-26 13:30:55 2016-03-26 13:38:10
## 6: id0801584 2 2016-01-30 22:01:40 2016-01-30 22:09:03
## passenger_count pickup_longitude pickup_latitude dropoff_longitude
## 1: 1 -73.98215 40.76794 -73.96463
## 2: 1 -73.98042 40.73856 -73.99948
## 3: 1 -73.97903 40.76394 -74.00533
## 4: 1 -74.01004 40.71997 -74.01227
## 5: 1 -73.97305 40.79321 -73.97292
## 6: 6 -73.98286 40.74220 -73.99208
## dropoff_latitude store_and_fwd_flag trip_duration distance speed
## 1: 40.76560 N 455 1.944101 15.381898
## 2: 40.73115 N 663 2.245056 12.190349
## 3: 40.71009 N 2124 6.603957 11.193147
## 4: 40.70672 N 429 1.481267 12.430211
## 5: 40.78252 N 435 1.178525 9.753308
## 6: 40.74918 N 443 1.271931 10.336236
## pickup_hour month hr 3 hr 6 hr 7 hr 8 hr 9 hr 19 hr 21 mnth 2
## 1: 17.4000000 03 FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
## 2: 0.7166667 06 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 3: 11.5833333 01 FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
## 4: 19.5333333 04 FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
## 5: 13.5000000 03 FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
## 6: 22.0166667 01 FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE
## mnth 3 mnth 4 mnth 5 mnth 6
## 1: TRUE FALSE FALSE FALSE
## 2: FALSE FALSE FALSE TRUE
## 3: FALSE FALSE FALSE FALSE
## 4: FALSE TRUE FALSE FALSE
## 5: TRUE FALSE FALSE FALSE
## 6: FALSE FALSE FALSE FALSE
So far features collected for regression include distance, time of day and month of year. In order to improve prediction accuracy the route should be taken into account. This can be done by clustering drop off and pick up locations and including combinations as dummy variables.
First k-means clustering of pickup and drop off locations, with 50 clusters for each. Test data is included with training data for clustering to ensure test data has the cluster information available for prediction.
# Define number of clusters
cen = 10
pickup = rbind(cbind(train_kaggle$pickup_longitude,train_kaggle$pickup_latitude),cbind(test_kaggle$pickup_longitude,test_kaggle$pickup_latitude))
clus_pickup = kmeans(pickup, centers = cen, nstart = 1)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 101712550)
train_kaggle$pickupclus = as.factor(clus_pickup$cluster[1:dim(train_kaggle)[1]])
test_kaggle$pickupclus = as.factor(clus_pickup$cluster[(dim(train_kaggle)[1]+1):length(clus_pickup$cluster)])
# cluster locations for dropoff
dropoff = rbind(cbind(train_kaggle$dropoff_longitude,train_kaggle$dropoff_latitude),cbind(test_kaggle$dropoff_longitude,test_kaggle$dropoff_latitude))
clus_dropoff = kmeans(dropoff, centers = cen, nstart = 1)
train_kaggle$dropoffclus = as.factor(clus_dropoff$cluster[1:dim(train_kaggle)[1]])
test_kaggle$dropoffclus = as.factor(clus_dropoff$cluster[(dim(train_kaggle)[1]+1):length(clus_dropoff$cluster)])
The pick up and drop off locations are plotted below coloured by cluster:
sample_data = train_kaggle[sample(1:dim(train_kaggle)[1],100000),]
ggplot(mapping=aes(x=sample_data$pickup_longitude,y=sample_data$pickup_latitude))+
geom_point(alpha=.2,cex=0.0001,aes(colour=sample_data$pickupclus))+
scale_x_continuous(limits = c(-74.02,-73.85))+
scale_y_continuous(limits = c(40.7,40.85))+
ggtitle('Pick up locations clustered')+
guides(fill=F)+
xlab('Longitude')+ylab('Latitude')+ggtitle('Plot of pick up locations with clustering')+
theme(legend.position = 'none')
# dropoff location by cluster----
ggplot(mapping=aes(x=sample_data$dropoff_longitude,y=sample_data$dropoff_latitude))+
geom_point(alpha=.2,cex=0.0001,aes(colour=sample_data$dropoffclus))+
scale_x_continuous(limits = c(-74.02,-73.85))+
scale_y_continuous(limits = c(40.7,40.85))+
ggtitle('Drop off locations clustered')+
theme(legend.position = 'none')+
xlab('Longitude')+ylab('Latitude')+ggtitle('Plot of drop off locations with clustering')
We can now create dummy variables for routes, eg. for cluster 2 to cluster 11 the variable will be 2/11 and any route going between these clusters will have the boolean for this feature set as true.
# build dummys for one cluster to another
comb=as.factor(paste(as.character(clus_pickup$cluster),as.character(clus_dropoff$cluster),sep='/'))
route.dummies = (model.matrix(~comb))
route.dummies=route.dummies==1
# Add to train and test data
train_kaggle = cbind(train_kaggle,route.dummies[1:dim(train_kaggle)[1],])
test_kaggle = cbind(test_kaggle,route.dummies[(dim(train_kaggle)[1]+1):length(clus_pickup$cluster),])
Now a number of features are removed to allow model training only using dummy variables and distance (km)
model.data = train_kaggle[,-c(1:10,13,14,15,28,29,30)]
test.model.data = test_kaggle[,-c(1:9,11,12,13,27,28,29)]
Linear regression is used on a training set of 90% of training set provided by kaggle. Firstly a model is produced using only the distance variable to get a baseline idea of performance.
n=0.9
N = dim(model.data)[1]
train_ind = sample(1:N,n*N)
test_ind = setdiff(1:N,train_ind)
train = model.data[train_ind,]
test = model.data[test_ind,]
#Basic Model
fitlm = lm(trip_duration~distance, data=train)
pred=predict.lm(fitlm,newdata = test)
pred[pred<0] = 5
pred[is.na(pred)]=0
Have a look at results:
plot(test$trip_duration[1:50]/60,type='h',ylab = 'duration (minutes)')
points(pred[1:50]/60,col='red',pch=15)
Generally the predictions do not appear to be very good. The model seems to have low variance, high bias (as expected for a simple linear model).
Linear regression is now used on the full set of features, for the same set (90%).
fitlm = lm(trip_duration~., data=train)
pred=predict.lm(fitlm,newdata = test)
## Warning in predict.lm(fitlm, newdata = test): prediction from a rank-
## deficient fit may be misleading
pred[pred<0] = 5
pred[is.na(pred)]=0
Have a look at results:
plot(test$trip_duration[1:50]/60,type='h',ylab = 'duration (minutes)')
points(pred[1:50]/60,col='red',pch=15)
The plot shows that predictions in red generally agree with the actual duration in black, certainly outperforming the basic distance model.
Now using the model on the kaggle test set:
pred = predict.lm(fitlm,newdata = test_kaggle)
## Warning in predict.lm(fitlm, newdata = test_kaggle): prediction from a
## rank-deficient fit may be misleading
pred[pred<0] = 2
pred[is.na(pred)]=0
test_kaggle$trip_duration=pred
out = test_kaggle[,c(1,128)]
write.csv(out,file='results.csv',row.names = F)