ECML/PKDD 15: Taxi Trajectory Prediction (I)

In this challenge, we ask you to build a predictive framework that is able to infer the final destination of taxi rides in Porto, Portugal based on their (initial) partial trajectories. The output of such a framework must be the final tripโ€™s destination (WGS84 coordinates).

Output

## Loading required package: usethis

Methodology

library(rjson)
library(bit64)
library(data.table)
library(kableExtra)
library(tidyverse)
train_set <- fread('train.csv', select=c('TRIP_ID', 'TIMESTAMP', 'MISSING_DATA', 'POLYLINE'), stringsAsFactors=F)
test_set <- fread('test.csv', select=c('TRIP_ID', 'TIMESTAMP', 'MISSING_DATA', 'POLYLINE'), stringsAsFactors=F)

head(train_set,2) %>% 
  kbl() %>% 
  kable_styling()
TRIP_ID TIMESTAMP MISSING_DATA POLYLINE
1372636858620000589 1372636858 FALSE [[-8.618643,41.141412],[-8.618499,41.141376],[-8.620326,41.14251],[-8.622153,41.143815],[-8.623953,41.144373],[-8.62668,41.144778],[-8.627373,41.144697],[-8.630226,41.14521],[-8.632746,41.14692],[-8.631738,41.148225],[-8.629938,41.150385],[-8.62911,41.151213],[-8.629128,41.15124],[-8.628786,41.152203],[-8.628687,41.152374],[-8.628759,41.152518],[-8.630838,41.15268],[-8.632323,41.153022],[-8.631144,41.154489],[-8.630829,41.154507],[-8.630829,41.154516],[-8.630829,41.154498],[-8.630838,41.154489]]
1372637303620000596 1372637303 FALSE [[-8.639847,41.159826],[-8.640351,41.159871],[-8.642196,41.160114],[-8.644455,41.160492],[-8.646921,41.160951],[-8.649999,41.161491],[-8.653167,41.162031],[-8.656434,41.16258],[-8.660178,41.163192],[-8.663112,41.163687],[-8.666235,41.1642],[-8.669169,41.164704],[-8.670852,41.165136],[-8.670942,41.166576],[-8.66961,41.167962],[-8.668098,41.168988],[-8.66664,41.170005],[-8.665767,41.170635],[-8.66574,41.170671]]
head(test_set,2)%>% 
  kbl() %>% 
  kable_styling()
TRIP_ID TIMESTAMP MISSING_DATA POLYLINE
T1 1408039037 FALSE [[-8.585676,41.148522],[-8.585712,41.148639],[-8.585685,41.148855],[-8.58573,41.148927],[-8.585982,41.148963],[-8.586396,41.148954],[-8.586072,41.14872],[-8.586324,41.147847],[-8.586999,41.14746],[-8.586576,41.147154],[-8.584884,41.146623]]
T2 1408038611 FALSE [[-8.610876,41.14557],[-8.610858,41.145579],[-8.610903,41.145768],[-8.610444,41.146191],[-8.609445,41.146758],[-8.608896,41.147118],[-8.608968,41.147127],[-8.608707,41.147532],[-8.608347,41.148117],[-8.608149,41.148351],[-8.608041,41.148576],[-8.607654,41.14926],[-8.607348,41.149899],[-8.607393,41.149899],[-8.607357,41.149962],[-8.606817,41.150979],[-8.606358,41.151915],[-8.605719,41.152788],[-8.604981,41.153319],[-8.604783,41.154345],[-8.604828,41.154372],[-8.604801,41.155353],[-8.604648,41.156775],[-8.604522,41.158197],[-8.604513,41.159943],[-8.604378,41.160555],[-8.604378,41.1606],[-8.604369,41.160645],[-8.60436,41.160807],[-8.604162,41.161176],[-8.604126,41.161248],[-8.60409,41.161293],[-8.60409,41.161266],[-8.604108,41.161239],[-8.604126,41.161194],[-8.604135,41.161275],[-8.60391,41.162049],[-8.602929,41.162832],[-8.602551,41.163111],[-8.601894,41.163597]]

Cleansing

train_set[, id:=-seq(.N, 1, -1)]
test_set[, id:=1:.N]
train_set$TRIP_ID<- as.numeric(unlist(train_set[,1]))
train_set <- rbind(train_set, test_set)
setnames(train_set, c('trip', 'start_time', 'incomplete_entry', 'polyline', 'id'))
train_set %>% 
  filter(!(incomplete_entry!='True' & polyline!='[]'))
setkey(train_set, id)
poly <- train_set[, transpose(fromJSON(polyline)), by=id]
setnames(poly, c('id', 'lon', 'lat'))
train_set[, c('ncomplete_entry', 'polyline'):=NULL]
train_set <- merge(train_set, poly[, list(readings=.N), id], by='id', all.x=T)
train_set[, start_time:=as.POSIXct(as.integer(start_time), origin="1970-01-01", tz='GMT')]
train_set[, end_time:=start_time+15*readings]
train_set[, c('dt', 't1', 't2'):=list(as.Date(start_time), as.ITime(start_time), as.ITime(end_time))]
train_set[, day:=wday(dt)]
dates <- as.Date(c('2014-09-30', '2014-10-06', '2014-11-01', '2014-08-14', '2014-12-21'))

# Weekday AM
s <- as.ITime('07:30')
e <- as.ITime('09:30')
train_set[(id<0 & day>=2 & day<=6 & t1<e & t2>s), dt:=dates[1]]

# Weekday PM
s <- as.ITime('16:45')
e <- as.ITime('18:45')
train_set[id<0 & day>=2 & day<=6 & t1<e & t2>s, dt:=dates[2]]

# Saturday last
s <- as.ITime('02:30')
e <- as.ITime('05:00')
train_set[id<0 & day>=7 & t1<e & t2>s, dt:=dates[3]]

# Weeknight before hol
d <- as.Date(c('2013-08-14', '2013-12-24', '2014-04-17', '2014-04-24', '2014-04-30', '2014-06-09'))
s <- as.ITime('16:00')
e <- as.ITime('20:00')
train_set[dt %in% d & t1<e & t2>s, dt:=dates[4]]

# Sunday afternoon before hol
d <- as.Date(c('2013-08-11', '2013-12-08', '2013-12-22', '2013-12-29',  '2014-04-13', '2014-04-27', '2014-06-08'))
s <- as.ITime('13:30')
e <- as.ITime('16:45')
train_set[dt %in% d & t1<e & t2>s, dt:=dates[5]]

train_set <- train_set[dt %in% dates, list(id, trip, dt, readings)]
train_set <- train_set[(readings<900 & readings>8) | id>0]
setkey(train_set, id)
setkey(poly, id)
poly <- poly[train_set[, list(id)]]
rm(d, e, s)

Process

source('util.R')

poly[, n:=1:.N, id]
poly <- merge(poly, poly[, list(id, n=n-1, lon, lat)], by=c('id', 'n'), all.x=T, suffix=c('', '2'))
poly[, max_n:=.N-1L, id]
poly[is.na(lon2), c('lon2', 'lat2'):=list(lon, lat)]
train_set <- merge(train_set, poly[n==1, list(id, lon, lat)], by='id', all.x=T)
train_set <- merge(train_set, poly[n==max_n | max_n==0, list(id, lon1=lon, lat1=lat, lon2, lat2)], by='id', all.x=T)
train_set[id>0, bear:=get_bear(lon, lat, lon2, lat2)]
train_set[id>0, bear_cut:=get_bear(lon1, lat1, lon2, lat2)]
setkey(poly, id)
setkey(train_set, id)

for (d in dates){
  p2 <- poly[train_set[dt==d & id<0, list(id)]]  
  
  for (t in unique(train_set[id>0 & dt==d, trip])){    
    train_set_t <- train_set[trip==t]
    temp <- train_set[dt==d & readings>=train_set_t$readings*.9-1 & id<0, !c('dt', 'bear_cut', paste(c('lon', 'lat', 'readings'), 'sub', sep='_')), with=F]
    p <- p2[temp[, list(id)]][n>=train_set_t$readings*.8-2 & n<=train_set_t$readings*1.2+2 & n!=max_n+1L][, list(id, lon, lat, lon2, lat2)]
    temp <- merge(temp, p, all.x=T, by='id', suffix=c('', '_cut'), allow.cartesian=T)
    temp[, dist:=get_dist(lon, lat, train_set_t$lon, train_set_t$lat)]
    temp[, dist2:=get_dist(lon2_cut, lat2_cut, train_set_t$lon2, train_set_t$lat2)]
    temp[, dist_both:=(dist+dist2)]
    temp[, min_dist:=min(dist_both), id]    
    temp <- unique(temp[min_dist==dist_both])
    if (train_set_t$readings>15){
      temp[, bear:=get_bear(lon, lat, lon2_cut, lat2_cut)]      
      temp[, angle:=abs(bear-train_set_t$bear)]
      temp[angle>180, angle:=angle-180]
      temp[, bear_cut:=get_bear(lon_cut, lat_cut, lon2_cut, lat2_cut)]      
      temp[, angle2:=abs(bear_cut-train_set_t$bear_cut)]
      temp[angle2>180, angle2:=angle2-180]
      temp[lon_cut==lon2_cut & lat_cut==lat2_cut, angle2:=angle]  
      temp[lon==lon2 & lat==lat2, angle:=angle2]        
    } else {
      temp[, angle:=5]
      temp[, angle2:=5]
    }    
    temp[, w:=1/(abs(dist)+abs(dist2)+abs(angle)/5+abs(angle2)/5)]          
    
    test_set <- temp[dist<.5 & dist2<1 & angle<15 & angle2<30]
    if (test_set[,.N]>2){
      test_set <- test_set[order(w, decreasing=T)][1:50][!is.na(w)]
      test_set <- test_set[, list(lon=weighted.mean(lon2, w=w), lat=weighted.mean(lat2, w=w), readings=exp(weighted.mean(log(readings), w=w)))]
      train_set[id==train_set_t$id, paste(c('lon', 'lat', 'readings'), 'sub', sep='_'):=test_set]  
    } else {
      test_set <- temp[dist<.7 & dist2<2 & angle<30 & angle<45]  
      if (test_set[,.N]>2){  
        test_set <- test_set[order(w, decreasing=T)][1:50][!is.na(w)]
        test_set <- test_set[, list(lon=weighted.mean(lon2, w=w), lat=weighted.mean(lat2, w=w), readings=exp(weighted.mean(log(readings), w=w)))]
        train_set[id==train_set_t$id, paste(c('lon', 'lat', 'readings'), 'sub', sep='_'):=test_set]  
      } else {
        test_set <- temp[order(w, decreasing=T)][1:50][!is.na(w)]
        test_set <- test_set[, list(lon=weighted.mean(lon2, w=w), lat=weighted.mean(lat2, w=w), readings=exp(weighted.mean(log(readings), w=w)))]
        train_set[id==train_set_t$id, paste(c('lon', 'lat', 'readings'), 'sub', sep='_'):=test_set]  
      }        
    }  
    rm(temp, train_set_t, test_set, p); gc()
  }      
  rm(p2); gc()  
}

a<-write.csv(train_set[id>0, list(TRIP_ID=trip, LATITUDE=lat_sub, LONGITUDE=lon_sub)], 'output/final_loc.csv', row.names=F)
a1<-write.csv(train_set[id>0, list(TRIP_ID=trip, TRAVEL_TIME=readings_sub*15)], 'output/final_time.csv', row.names=F)  
head(a)
##   TRIP_ID LATITUDE LONGITUDE
## 1      T1 41.14997 -8.636295
## 2      T2 41.17923 -8.603827
## 3      T3 41.17815 -8.569510
## 4      T4 41.15219 -8.620015
## 5      T5 41.15618 -8.618449
## 6      T6 41.15597 -8.606692
head(a1)
##   TRIP_ID TRAVEL_TIME
## 1      T1    926.7360
## 2      T2    896.8615
## 3      T3    654.1118
## 4      T4    685.5907
## 5      T5    669.6103
## 6      T6   2716.8150