# Package setup.
# library() is used instead of require(): require() only warns and returns
# FALSE when a package is missing, so the script would carry on and fail later
# at the first call into that package. library() stops right here instead.
# plyr stays ahead of dplyr so that dplyr's verbs mask plyr's same-named ones.
library(plyr, quietly = TRUE, warn.conflicts = FALSE)
library(dplyr, quietly = TRUE, warn.conflicts = FALSE)
## Warning: package 'dplyr' was built under R version 3.1.2
library(tidyr, quietly = TRUE, warn.conflicts = FALSE)
## Warning: package 'tidyr' was built under R version 3.1.2
library(reshape2, quietly = TRUE, warn.conflicts = FALSE)
## Warning: package 'reshape2' was built under R version 3.1.2
library(data.table, quietly = TRUE, warn.conflicts = FALSE)
library(ggplot2, quietly = TRUE, warn.conflicts = FALSE)
# Project root: the relative paths used below (drivers/, ./data/) resolve
# against this directory.
setwd("/Users/SKSN/Documents/Kaggle/Xtelematics")
# STEP 1 - reading files with a fast reader "fread"
# Read one trip file of one driver and tag every row with its trip id.
#
# Args:
#   Files:  trip identifier; the file read is drivers/<driver>/<Files>.csv,
#           relative to the current working directory.
#   driver: name of the driver's sub-directory under drivers/.
#
# Returns:
#   The table returned by fread() with one extra column, `tripID`, holding
#   the value of `Files` on every row.
dataRead <- function(Files, driver) {
  trip_path <- file.path("drivers", driver, paste0(Files, ".csv"))
  # TRUE rather than T: T is an ordinary variable that can be reassigned.
  tmp <- fread(trip_path, header = TRUE, sep = ",")
  tmp[, tripID := Files] # := adds the column to tmp by reference
  tmp
}
(driverlist <- list.files("./drivers/")) # !Note that only 2 drivers data is processed with dataRead function and saved in local drive
## [1] "1" "2"
# showWarnings = FALSE: re-running the script must not warn just because
# ./data/ is already there.
dir.create("./data/", showWarnings = FALSE)
length(driverlist) # 2 drivers each with 200 trips
## [1] 2
# Number of trip files read per driver (files 1.csv .. 200.csv).
trips_per_driver <- 200
# Iterating over the names directly is safe when driverlist is empty, whereas
# 1:length(driverlist) would run the body for i = 1 and i = 0.
for (d in driverlist) {
  # rbindlist: makes one data table from all trips of driver d
  drives <- rbindlist(lapply(seq_len(trips_per_driver), dataRead, d))
  # save() stores the object under its variable name, so the name `drives`
  # is what load() will restore; keep it unchanged.
  save(drives, file = paste0("./data/DriverData", d))
}
dir("/Users/SKSN/Documents/Kaggle/Xtelematics/data") #output of above for loop
## [1] "DriverData1" "DriverData2"
# STEP 2: Create 2 additional columns
# Reads every trip file of driver 1, stacks them into one data frame and adds
#   trip  - index of the file within dir_list
#   times - running index of the point within its trip (1..n)
setwd("/Users/SKSN/Documents/Kaggle/Xtelematics/drivers/1")
dir_list <- list.files("./")
head(dir_list)
tail(dir_list)
## [1] "1.csv" "10.csv" "100.csv" "101.csv" "102.csv" "103.csv"
## [1] "94.csv" "95.csv" "96.csv" "97.csv" "98.csv" "99.csv"
# NOTE(review): list.files() returns the names in alphabetical order ("1.csv",
# "10.csv", "100.csv", ...), so the `trip` value assigned below is the file's
# position in dir_list, not the number in its file name (trip 2 is "10.csv").
driver1 <- lapply(dir_list, read.csv) # read all 200 csv files
# Number of points in each trip; vapply() guarantees an integer vector.
idx <- vapply(driver1, nrow, integer(1))
head(idx)
tail(idx)
## [1] 863 238 231 1640 983 1369
## [1] 352 229 598 336 1010 329
driver1 <- do.call(rbind, driver1) # row binding 200 files
trip <- rep(seq_along(dir_list), idx) # trips: 1 thru 200
head(trip)
tail(trip)
## [1] 1 1 1 1 1 1
## [1] 200 200 200 200 200 200
# sequence(idx) is 1..idx[1], 1..idx[2], ... as one flat vector. Unlike
# unlist(sapply(idx, seq)) it cannot collapse into a matrix when all trips
# have the same length, and it copes with a zero-row file.
times <- sequence(idx)
head(times)
tail(times)
## [1] 1 2 3 4 5 6
## [1] 324 325 326 327 328 329
driver1 <- cbind(driver1, trip, times)
head(driver1)
tail(driver1)
## x y trip times
## 1 0.0 0.0 1 1
## 2 18.6 -11.1 1 2
## 3 36.1 -21.9 1 3
## 4 53.7 -32.6 1 4
## 5 70.1 -42.8 1 5
## 6 86.5 -52.6 1 6
## x y trip times
## 116755 212.7 -236.9 200 324
## 116756 212.7 -236.9 200 325
## 116757 212.0 -236.1 200 326
## 116758 211.9 -236.2 200 327
## 116759 211.9 -236.2 200 328
## 116760 211.9 -236.2 200 329
# VISUALIZATION
# dist: straight-line distance of each point from the coordinate origin (0, 0).
driver1$dist <- with(driver1, sqrt(x^2 + y^2))
# One coloured path per trip; each trip's id is printed (with jitter) at the
# point of that trip that lies farthest from the origin.
ggplot(
  data = driver1,
  aes(x = x, y = y, group = trip, color = factor(trip), label = trip)
) +
  geom_path() +
  geom_text(
    data = filter(group_by(driver1, trip), dist == max(dist)),
    color = 'black',
    size = 3.5,
    position = position_jitter()
  ) +
  theme(legend.position = 'none') +
  ggtitle('Driver 1 Trips')

#STEP 3: ROTATION for single trip for the demo purposes
# The trip is rotated about the origin so that its farthest point lands on the
# positive x axis, then mirrored across the x axis when more of its points lie
# below the axis than above it.
df <- driver1
length(unique(df$trip))
## [1] 200
demo_trip <- 1
# Row mask of the demo trip. Every selection below goes through it, so nothing
# depends on the trip's row count or on where its rows sit inside df.
in_trip <- df$trip == demo_trip
xyDistance <- df[in_trip, c('x', 'y', 'dist')]; head(xyDistance) # trip by trip
## x y dist
## 1 0.0 0.0 0.00000
## 2 18.6 -11.1 21.66033
## 3 36.1 -21.9 42.22345
## 4 53.7 -32.6 62.82078
## 5 70.1 -42.8 82.13312
## 6 86.5 -52.6 101.23739
# Last row that attains the maximum distance (same tie rule as
# max.col(..., 'last')).
far_row <- max(which(xyDistance$dist == max(xyDistance$dist)))
(maxDist <- xyDistance[far_row, c('x', 'y')])
## x y
## 390 4607.2 -2602
# Un-normalised rotation matrix built from the farthest point (x, y).
matrix(c(maxDist$x, maxDist$y, -maxDist$y, maxDist$x), 2, 2)
## [,1] [,2]
## [1,] 4607.2 2602.0
## [2,] -2602.0 4607.2
# Dividing by the point's distance from the origin makes it a pure rotation:
# multiplying the row vector (x, y) by it gives (distance, 0).
rotations <- matrix(c(maxDist$x, maxDist$y, -maxDist$y, maxDist$x), 2, 2) /
  sqrt(maxDist$x^2 + maxDist$y^2)
head(rotations)
## [,1] [,2]
## [1,] 0.8707303 0.4917608
## [2,] -0.4917608 0.8707303
rotated.points <- as.matrix(df[in_trip, c('x', 'y')]) %*% rotations # matrix multiplication
head(rotated.points)
## [,1] [,2]
## 1 0.00000 0.0000000
## 2 21.65413 -0.5183559
## 3 42.20293 -1.3164297
## 4 62.78962 -1.9782543
## 5 82.08556 -2.7948267
## 6 101.18479 -3.2631069
# Mirror across the x axis when negative y values outnumber positive ones.
if (sum(sign(rotated.points[, 2])) < 0) {
  rotated.points[, 2] <- -rotated.points[, 2]
}
# The rotated points are converted directly; no NA seed row is created, so
# there is nothing to strip afterwards.
M <- as.data.frame(rotated.points)
M <- cbind(M, df$trip[in_trip], df$times[in_trip], df$dist[in_trip])
colnames(M) <- c('x', 'y', 'trip', 'times', 'dist'); head(M); tail(M)
## x y trip times dist
## 1 0.00000 0.0000000 1 1 0.00000
## 2 21.65413 0.5183559 1 2 21.66033
## 3 42.20293 1.3164297 1 3 42.22345
## 4 62.78962 1.9782543 1 4 62.82078
## 5 82.08556 2.7948267 1 5 82.13312
## 6 101.18479 3.2631069 1 6 101.23739
## x y trip times dist
## 858 -968.0688 -625.1553 1 858 1152.379
## 859 -968.0196 -625.0683 1 859 1152.290
## 860 -968.0196 -625.0683 1 860 1152.290
## 861 -968.0196 -625.0683 1 861 1152.290
## 862 -968.0196 -625.0683 1 862 1152.290
## 863 -968.0196 -625.0683 1 863 1152.290
#STEP3: ROTATION OF ALL POINTS IN 200 TRIPS FOR DRIVER 1
df <- driver1

# Rotate one trip so that its farthest point lies on the positive x axis, then
# mirror it across the x axis when negative y values outnumber positive ones.
#
# Args:
#   xy:   two-column (x, y) data frame or matrix holding the trip's points.
#   dist: distance of each point from the origin, one value per row of xy.
#
# Returns:
#   Numeric matrix with the same number of rows as xy holding the transformed
#   coordinates. A trip whose farthest point is the origin itself is returned
#   unrotated, because its rotation would be a division by zero.
rotate_trip <- function(xy, dist) {
  xy <- as.matrix(xy)
  if (nrow(xy) == 0L) {
    return(xy)
  }
  # Last row attaining the maximum distance (tie rule of max.col(..., 'last')).
  far <- unname(xy[max(which(dist == max(dist))), ])
  radius <- sqrt(far[1]^2 + far[2]^2)
  if (radius == 0) {
    return(xy)
  }
  rotation <- matrix(c(far[1], far[2], -far[2], far[1]), 2, 2) / radius
  rotated <- xy %*% rotation # matrix multiplication
  if (sum(sign(rotated[, 2])) < 0) {
    rotated[, 2] <- -rotated[, 2]
  }
  rotated
}

# Result matrix allocated once and filled trip by trip. Writing each trip back
# to its own rows keeps M aligned with df whatever the row order, and avoids
# both rbind() inside the loop and the NA seed row that had to be removed.
rotated_xy <- matrix(
  NA_real_,
  nrow = nrow(df), ncol = 2,
  dimnames = list(rownames(df), NULL)
)
rows_by_trip <- split(seq_len(nrow(df)), df$trip) # row numbers of every trip
for (rows in rows_by_trip) {
  rotated_xy[rows, ] <- rotate_trip(df[rows, c('x', 'y')], df$dist[rows])
}
M <- as.data.frame(rotated_xy)
M <- cbind(M, df$trip, df$times, df$dist)
colnames(M) <- c('x', 'y', 'trip', 'times', 'dist'); dim(M); head(M); tail(M)
## [1] 116760 5
## x y trip times dist
## 1 0.00000 0.0000000 1 1 0.00000
## 2 21.65413 0.5183559 1 2 21.66033
## 3 42.20293 1.3164297 1 3 42.22345
## 4 62.78962 1.9782543 1 4 62.82078
## 5 82.08556 2.7948267 1 5 82.13312
## 6 101.18479 3.2631069 1 6 101.23739
## x y trip times dist
## 116755 90.78683 305.1568 200 324 318.3754
## 116756 90.78683 305.1568 200 325 318.3754
## 116757 90.47069 304.1418 200 326 317.3125
## 116758 90.60821 304.1089 200 327 317.3201
## 116759 90.60821 304.1089 200 328 317.3201
## 116760 90.60821 304.1089 200 329 317.3201
rdriver1 <- M
#VISUALIZATION
# Trips to overlay. The plot title is built from this vector, so the title
# always names the trips that are actually drawn.
selected_trips <- c(48, 73, 145)
rdriver1.sub <- filter(rdriver1, trip %in% selected_trips)
# One path per selected trip; each trip's id is printed (with jitter) at the
# point of that trip with the largest dist value.
ggplot(rdriver1.sub, aes(x = x, y = y, group = trip, color = factor(trip), label = trip)) +
  geom_path() +
  geom_text(
    data = rdriver1.sub %>% group_by(trip) %>% filter(dist == max(dist)),
    color = 'black', size = 4,
    position = position_jitter()
  ) +
  theme(legend.position = 'none') +
  ggtitle(paste0(
    'Selected overlapped (rotated) trips - trip ',
    paste(selected_trips, collapse = ', ')
  )) #selected routes

# All rotated trips of driver 1 on one canvas. Each trip's id is printed (with
# jitter) at the points where that trip reaches its lowest and highest y.
ggplot(
  rdriver1,
  aes(x = x, y = y, group = trip, color = factor(trip), label = trip)
) +
  geom_path() +
  theme_bw() +
  geom_text(
    data = filter(group_by(rdriver1, trip), y == min(y) | y == max(y)),
    color = 'black',
    size = 3,
    position = position_jitter()
  ) +
  theme(legend.position = 'none') +
  ggtitle('All 200 Rotated Trips for Driver 1') # all routes
