Popular routes

Synopsis

Company called Uber shared some anonymised data about routes gathered from GPS receivers of their clients. I will try to find most often used routes based on this data. Md5sum for data file: fbe6777e510ceab1b1ed112e712d7276

Data Processing

uber.data <- read.delim("infochimps_uber-anonymized-gps-logs/uber_gps_tsv/all.tsv", header = FALSE)

library(tools)
md5sum("infochimps_uber-anonymized-gps-logs/uber_gps_tsv/all.tsv")
## infochimps_uber-anonymized-gps-logs/uber_gps_tsv/all.tsv 
##                       "fbe6777e510ceab1b1ed112e712d7276"
names(uber.data) <- c("trip","timestamp","lat","lon")

Lets calculate left, right and top bottom points. And distance between them.

library(fossil)
## Loading required package: sp
## Loading required package: maps
## Loading required package: shapefiles
## Loading required package: foreign
## 
## Attaching package: 'shapefiles'
## 
## The following objects are masked from 'package:foreign':
## 
##     read.dbf, write.dbf
left <- c(mean(uber.data$lat), min(uber.data$lon))
right <- c(mean(uber.data$lat), max(uber.data$lon))
top <- c(max(uber.data$lat), mean(uber.data$lon))
bot <- c(min(uber.data$lat), mean(uber.data$lon))
edges <- rbind(left,right,top,bot)
edges
##        [,1]   [,2]
## left  37.77 -122.5
## right 37.77 -115.2
## top   37.96 -122.4
## bot   36.12 -122.4
distances <- earth.dist(edges)
left.to.right <- distances[1]
bot.to.top <- distances[6]

Lets build grid 10 x 10 meters. I need to calculate on how many parts should I divide data range.

horizontal.slice <- left.to.right / 0.01
vertical.slice <- bot.to.top / 0.01

Box generation:

uber.data$lat.sliced <- cut(uber.data$lat,vertical.slice)
uber.data$lon.sliced <- cut(uber.data$lon,horizontal.slice)

Putting mean as label for each box:

uber.data$lat.sliced2 <- ave(uber.data$lat, cut(uber.data$lat,vertical.slice), FUN = median)
uber.data$lon.sliced2 <- ave(uber.data$lon, cut(uber.data$lon,horizontal.slice), FUN = median)

Finding and removing the same boxes which are occurring one after another. I’m doing this to avoid situation where slow drive causes many locations aggregated to the same box.

uber.data$dup <- FALSE
for (box in 1:(nrow(uber.data)-1)) {
  if (uber.data$trip[box] == uber.data$trip[box + 1]) {
    if (uber.data$lat.sliced2[box] == uber.data$lat.sliced2[box + 1] & uber.data$lon.sliced2[box]  == uber.data$lon.sliced2[box + 1]) {
      uber.data$dup[box + 1] <- TRUE
    } 
  }
}
uber.data <- uber.data[uber.data$dup == FALSE, ]

Results

Frequency of each box:

freq.table <- as.data.frame(table(uber.data$lat.sliced2,uber.data$lon.sliced2))
freq.table <- freq.table[freq.table$Freq != 0, ]
summary(freq.table)
##         Var1                 Var2             Freq       
##  37.786802:   578   -122.406841:   312   Min.   :  1.00  
##  37.786634:   575   -122.406661:   298   1st Qu.:  1.00  
##  37.786466:   574   -122.406483:   298   Median :  2.00  
##  37.78697 :   553   -122.407022:   296   Mean   :  6.89  
##  37.787639:   552   -122.406753:   289   3rd Qu.:  7.00  
##  37.786301:   543   -122.406391:   289   Max.   :187.00  
##  (Other)  :151545   (Other)    :153138

Lets select top 1% of frequencies

top.freq <- quantile(freq.table$Freq,.99)
top.freq.table <- freq.table[freq.table$Freq >= top.freq, ]
names(top.freq.table) <- c("Lat","Lon","Freq")
 
top.freq.table$Lat <- as.numeric(levels(top.freq.table$Lat))[top.freq.table$Lat]
top.freq.table$Lon <- as.numeric(levels(top.freq.table$Lon))[top.freq.table$Lon]

Printing map

library(ggmap)
## Loading required package: ggplot2
cool.map <- get_map(location = c(lon = median(top.freq.table$Lon), lat = median(top.freq.table$Lat)), zoom = 13)
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=37.777578,-122.422125&zoom=13&size=%20640x640&scale=%202&maptype=terrain&sensor=false
## Google Maps API Terms of Service : http://developers.google.com/maps/terms
ggmap(cool.map) + geom_point(data = top.freq.table, aes(x = Lon, y = Lat)) + ggtitle("Most frequent routes")

plot of chunk unnamed-chunk-9

This analysis don’t give strict street names and routes. It just plot points on map which allow to estimate frequent location of cars. To prepare better analysis one could download road data from open street maps or use different technique to determine popular spots. One can also use reverse geo-coding to find addresses of particular points.

R and packages information

Following versions of R and packages were used.

sessionInfo()
## R version 3.1.1 (2014-07-10)
## Platform: x86_64-pc-linux-gnu (64-bit)
## 
## locale:
##  [1] LC_CTYPE=pl_PL.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=pl_PL.UTF-8        LC_COLLATE=pl_PL.UTF-8    
##  [5] LC_MONETARY=pl_PL.UTF-8    LC_MESSAGES=pl_PL.UTF-8   
##  [7] LC_PAPER=pl_PL.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=pl_PL.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] tools     stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
## [1] mapproj_1.2-2  ggmap_2.3      ggplot2_1.0.0  fossil_0.3.7  
## [5] shapefiles_0.7 foreign_0.8-61 maps_2.3-7     sp_1.0-15     
## 
## loaded via a namespace (and not attached):
##  [1] colorspace_1.2-4    digest_0.6.4        evaluate_0.5.5     
##  [4] formatR_0.10        grid_3.1.1          gtable_0.1.2       
##  [7] htmltools_0.2.4     knitr_1.6           labeling_0.2       
## [10] lattice_0.20-29     MASS_7.3-33         munsell_0.4.2      
## [13] plyr_1.8.1          png_0.1-7           proto_0.3-10       
## [16] Rcpp_0.11.2         reshape2_1.4        RgoogleMaps_1.2.0.6
## [19] rjson_0.2.14        RJSONIO_1.2-0.2     rmarkdown_0.2.54   
## [22] scales_0.2.4        stringr_0.6.2