require(plyr, quietly = TRUE,warn.conflicts = FALSE)
require(dplyr, quietly = TRUE,warn.conflicts = FALSE)
## Warning: package 'dplyr' was built under R version 3.1.2
require(tidyr, quietly = TRUE,warn.conflicts = FALSE)
## Warning: package 'tidyr' was built under R version 3.1.2
require(reshape2, quietly = TRUE,warn.conflicts = FALSE)
## Warning: package 'reshape2' was built under R version 3.1.2
require(data.table, quietly = TRUE,warn.conflicts = FALSE)
require(ggplot2, quietly = TRUE,warn.conflicts = FALSE)
# Make the project root the working directory: the relative paths used below
# ("drivers/..." in dataRead, "./drivers/" and "./data/") are resolved against it.
# NOTE(review): this absolute path is machine-specific and must be adapted before
# running the script on another computer.
setwd("/Users/SKSN/Documents/Kaggle/Xtelematics")
# STEP 1 - reading files with the fast reader "fread"
# dataRead: read the csv file of one trip of one driver.
#   Files  : trip identifier; the file read is drivers/<driver>/<Files>.csv,
#            relative to the current working directory
#   driver : name of the driver's folder below "drivers"
# Returns the table produced by fread() with one extra column, tripID, that
# holds the value of Files on every row.
dataRead <- function(Files, driver) {
    trip_path <- file.path("drivers", driver, paste0(Files, ".csv"))
    # TRUE rather than T: T is an ordinary variable and can be reassigned.
    tmp <- fread(trip_path, header = TRUE, sep = ",")
    tmp[, tripID := Files] # tag every row with the trip it came from
    tmp
}

# One folder per driver below ./drivers/; the outer parentheses print the result.
# Note: only 2 drivers' data is processed with dataRead and saved on the local drive.
(driverlist <- list.files("./drivers/"))
## [1] "1" "2"
# Create the output folder. showWarnings = FALSE keeps re-runs quiet when
# './data' already exists.
dir.create("./data/", showWarnings = FALSE)
length(driverlist) # 2 drivers each with 200 trips
## [1] 2
# seq_along() instead of 1:length(): with no driver folders 1:0 would still
# run the body twice (for i = 1 and i = 0).
for (i in seq_along(driverlist)) {
    d <- driverlist[i]
    # Trips are expected to be stored as 1.csv ... 200.csv in each driver folder.
    drives <- rbindlist(lapply(1:200, dataRead, d)) # rbindlist: makes one data table from 200 trips
    save(drives, file = paste0("./data/DriverData", d))
}
dir("/Users/SKSN/Documents/Kaggle/Xtelematics/data") # output of the for loop above
## [1] "DriverData1" "DriverData2"
# STEP 2: Create 2 additional columns (trip and times)
setwd("/Users/SKSN/Documents/Kaggle/Xtelematics/drivers/1")
dir_list <- list.files("./"); head(dir_list); tail(dir_list)
## [1] "1.csv"   "10.csv"  "100.csv" "101.csv" "102.csv" "103.csv"
## [1] "94.csv" "95.csv" "96.csv" "97.csv" "98.csv" "99.csv"
driver1 <- lapply(dir_list, read.csv) # read all 200 csv files
# idx: number of rows (length of the route) of each file; first and last 6 displayed.
# vapply() guarantees an integer vector whatever the input looks like.
idx <- vapply(driver1, nrow, integer(1)); head(idx); tail(idx)
## [1]  863  238  231 1640  983 1369
## [1]  352  229  598  336 1010  329
driver1 <- do.call(rbind, driver1) # row binding 200 files
# trip is the position of the file in dir_list, NOT the number in the file name:
# list.files() sorts alphabetically (see the listing above), so "10.csv" is trip 2.
trip <- rep(seq_along(dir_list), idx); head(trip); tail(trip) # trips: 1 thru 200
## [1] 1 1 1 1 1 1
## [1] 200 200 200 200 200 200
# times: running row number 1..n inside each trip (one time stamp per x/y point).
# sequence() also copes with a file that has no rows, where seq(1, 0, by = 1) fails.
times <- sequence(idx); head(times); tail(times)
## [1] 1 2 3 4 5 6
## [1] 324 325 326 327 328 329
driver1 <- cbind(driver1, trip, times); head(driver1); tail(driver1)
##      x     y trip times
## 1  0.0   0.0    1     1
## 2 18.6 -11.1    1     2
## 3 36.1 -21.9    1     3
## 4 53.7 -32.6    1     4
## 5 70.1 -42.8    1     5
## 6 86.5 -52.6    1     6
##            x      y trip times
## 116755 212.7 -236.9  200   324
## 116756 212.7 -236.9  200   325
## 116757 212.0 -236.1  200   326
## 116758 211.9 -236.2  200   327
## 116759 211.9 -236.2  200   328
## 116760 211.9 -236.2  200   329
# VISUALIZATION
# dist: straight-line distance of every point from the origin (0, 0)
driver1[["dist"]] <- with(driver1, sqrt(x^2 + y^2))
# One coloured path per trip; the trip number is printed (with jitter) at the
# point(s) of each trip that lie farthest from the origin.
ggplot(driver1, aes(x = x, y = y, group = trip, color = factor(trip), label = trip)) +
    geom_path() +
    geom_text(
        data = filter(group_by(driver1, trip), dist == max(dist)),
        color = "black", size = 3.5, position = position_jitter()
    ) +
    theme(legend.position = "none") +
    ggtitle("Driver 1 Trips")

#STEP 3: ROTATION for single trip for the demo purposes
# Walk-through of the rotation for trip 1 only: the trip is turned around the
# origin so that its farthest point ends up on the positive x axis.
df <- driver1
(M <- matrix(ncol = 2)) # placeholder matrix holding a single NA row
##      [,1] [,2]
## [1,]   NA   NA
length(unique(df$trip))
## [1] 200
trip1 <- df$trip == 1 # rows that belong to the demo trip
xyDistance <- df[trip1, c('x', 'y', 'dist')]; head(xyDistance) # trip by trip
##      x     y      dist
## 1  0.0   0.0   0.00000
## 2 18.6 -11.1  21.66033
## 3 36.1 -21.9  42.22345
## 4 53.7 -32.6  62.82078
## 5 70.1 -42.8  82.13312
## 6 86.5 -52.6 101.23739
# max.col() on the one-row matrix t(dist) gives the column of the maximum, i.e.
# the row of the farthest point ('last' picks the last one when there are ties).
(maxDist <- xyDistance[max.col(t(xyDistance$dist), 'last'), 1:2])
##          x     y
## 390 4607.2 -2602
# Un-normalised rotation matrix built from the farthest point
matrix(c(maxDist$x, maxDist$y, -maxDist$y, maxDist$x), 2, 2)
##         [,1]   [,2]
## [1,]  4607.2 2602.0
## [2,] -2602.0 4607.2
# Dividing by the length of the farthest-point vector turns it into a pure rotation
rotations <- matrix(c(maxDist$x, maxDist$y, -maxDist$y, maxDist$x), 2, 2) / sqrt(maxDist$x^2 + maxDist$y^2); head(rotations)
##            [,1]      [,2]
## [1,]  0.8707303 0.4917608
## [2,] -0.4917608 0.8707303
rotated.points <- as.matrix(df[trip1, 1:2]) %*% rotations; head(rotated.points) # matrix multiplication
##        [,1]       [,2]
## 1   0.00000  0.0000000
## 2  21.65413 -0.5183559
## 3  42.20293 -1.3164297
## 4  62.78962 -1.9782543
## 5  82.08556 -2.7948267
## 6 101.18479 -3.2631069
# Mirror the trip on the x axis when more points lie below it than above it
if (sum(sign(rotated.points[, 2])) < 0) rotated.points[, 2] <- -rotated.points[, 2]
M <- rbind(M, rotated.points); head(M) # placeholder row followed by the rotated points
##       [,1]      [,2]
##         NA        NA
## 1  0.00000 0.0000000
## 2 21.65413 0.5183559
## 3 42.20293 1.3164297
## 4 62.78962 1.9782543
## 5 82.08556 2.7948267
# Drop the NA placeholder row. complete.cases() selects by row; the former
# -which(is.na(M)) used linear matrix positions as row numbers and only worked
# because the out-of-range position was silently ignored.
M <- M[complete.cases(M), , drop = FALSE]
M <- as.data.frame(M)
# Take trip/times/dist from the same rows that were rotated instead of the
# hard-coded 1:863.
M <- cbind(M, df$trip[trip1], df$times[trip1], df$dist[trip1])
colnames(M) <- c('x', 'y', 'trip', 'times', 'dist'); head(M); tail(M)
##           x         y trip times      dist
## 1   0.00000 0.0000000    1     1   0.00000
## 2  21.65413 0.5183559    1     2  21.66033
## 3  42.20293 1.3164297    1     3  42.22345
## 4  62.78962 1.9782543    1     4  62.82078
## 5  82.08556 2.7948267    1     5  82.13312
## 6 101.18479 3.2631069    1     6 101.23739
##             x         y trip times     dist
## 858 -968.0688 -625.1553    1   858 1152.379
## 859 -968.0196 -625.0683    1   859 1152.290
## 860 -968.0196 -625.0683    1   860 1152.290
## 861 -968.0196 -625.0683    1   861 1152.290
## 862 -968.0196 -625.0683    1   862 1152.290
## 863 -968.0196 -625.0683    1   863 1152.290
#STEP3: ROTATION OF ALL POINTS IN 200 TRIPS FOR DRIVER 1
df <- driver1

# rotate_trip: rotate one trip around the origin so that its farthest point lies
# on the positive x axis, then mirror it on the x axis when more points lie
# below the axis than above it.
#   xy   : the x and y coordinates of the trip (two columns, one row per point)
#   dist : distance of every point of the trip from the origin
# Returns a two-column numeric matrix with one row per input point.
rotate_trip <- function(xy, dist) {
    xy <- as.matrix(xy)
    far <- max(which(dist == max(dist))) # last farthest point, as max.col(t(dist), 'last')
    fx <- xy[far, 1]
    fy <- xy[far, 2]
    reach <- sqrt(fx^2 + fy^2)
    # A trip that never leaves the origin has no direction to align with;
    # return it unchanged instead of dividing by zero (which would give NaN).
    if (reach == 0) {
        return(xy)
    }
    rotations <- matrix(c(fx, fy, -fy, fx), 2, 2) / reach
    rotated.points <- xy %*% rotations # matrix multiplication
    if (sum(sign(rotated.points[, 2])) < 0) {
        rotated.points[, 2] <- -rotated.points[, 2]
    }
    rotated.points
}

# Preallocate the result and fill it trip by trip. Writing back through the row
# index keeps every rotated point on the row of df it came from and avoids
# growing a matrix with rbind() inside the loop.
rotated <- matrix(NA_real_, nrow = nrow(df), ncol = 2)
for (trip_id in unique(df$trip)) { # every trip present in df (200 for driver 1)
    rows <- which(df$trip == trip_id)
    rotated[rows, ] <- rotate_trip(df[rows, c('x', 'y')], df$dist[rows])
}
M <- data.frame(x = rotated[, 1], y = rotated[, 2],
                trip = df$trip, times = df$times, dist = df$dist)
dim(M); head(M); tail(M)
## [1] 116760      5
##           x         y trip times      dist
## 1   0.00000 0.0000000    1     1   0.00000
## 2  21.65413 0.5183559    1     2  21.66033
## 3  42.20293 1.3164297    1     3  42.22345
## 4  62.78962 1.9782543    1     4  62.82078
## 5  82.08556 2.7948267    1     5  82.13312
## 6 101.18479 3.2631069    1     6 101.23739
##               x        y trip times     dist
## 116755 90.78683 305.1568  200   324 318.3754
## 116756 90.78683 305.1568  200   325 318.3754
## 116757 90.47069 304.1418  200   326 317.3125
## 116758 90.60821 304.1089  200   327 317.3201
## 116759 90.60821 304.1089  200   328 317.3201
## 116760 90.60821 304.1089  200   329 317.3201
rdriver1 <- M
#VISUALIZATION
# Trips drawn in the first plot. The title is built from this vector so that it
# always names the trips that are actually plotted (the former hard-coded title
# said "trip 25, 55, 183" while trips 48, 73 and 145 were selected).
selected_trips <- c(48, 73, 145)
rdriver1.sub <- filter(rdriver1, trip %in% selected_trips)
# Selected routes: one path per trip, labelled at its farthest point(s)
ggplot(rdriver1.sub, aes(x = x, y = y, group = trip, color = factor(trip), label = trip)) +
    geom_path() +
    geom_text(data = rdriver1.sub %>% group_by(trip) %>% filter(dist == max(dist)),
              color = 'black', size = 4, position = position_jitter()) +
    theme(legend.position = 'none') +
    ggtitle(paste('Selected overlapped (rotated) trips - trip',
                  paste(selected_trips, collapse = ', ')))

# All routes: every trip is labelled at its highest and lowest y value
ggplot(rdriver1, aes(x = x, y = y, group = trip, color = factor(trip), label = trip)) +
    geom_path() +
    theme_bw() +
    geom_text(data = rdriver1 %>% group_by(trip) %>% filter(y == max(y) | y == min(y)),
              color = 'black', size = 3, position = position_jitter()) +
    theme(legend.position = 'none') +
    ggtitle('All 200 Rotated Trips for Driver 1')