library(data.table)
data.table 1.10.4
  The fastest way to learn (by data.table authors): https://www.datacamp.com/courses/data-analysis-the-data-table-way
  Documentation: ?data.table, example(data.table) and browseVignettes("data.table")
  Release notes, videos and slides: http://r-datatable.com
library(dplyr)
-----------------------------------------------------------------------------------------------------------------------------
data.table + dplyr code now lives in dtplyr.
Please library(dtplyr)!
-----------------------------------------------------------------------------------------------------------------------------

Attaching package: 'dplyr'

The following objects are masked from 'package:data.table':

    between, first, last

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
library(magrittr)
library(tidyr)

Attaching package: 'tidyr'

The following object is masked from 'package:magrittr':

    extract
library(stringi)
library(stringr)
library(xda)
library(caret)
Loading required package: lattice
Loading required package: ggplot2
library(doMC)
Loading required package: foreach
foreach: simple, scalable parallel programming from Revolution Analytics
Use Revolution R for scalability, fault tolerance and more.
http://www.revolutionanalytics.com
Loading required package: iterators
Loading required package: parallel
library(dummy)
dummy 0.1.3
dummyNews()
library(leaflet)
library(rgdal)
package 'rgdal' was built under R version 3.4.1Loading required package: sp
package 'sp' was built under R version 3.4.1rgdal: version: 1.2-8, (SVN revision 663)
 Geospatial Data Abstraction Library extensions to R successfully loaded
 Loaded GDAL runtime: GDAL 2.1.3, released 2017/20/01
 Path to GDAL shared files: /Users/CA/Library/R/3.4/library/rgdal/gdal
 Loaded PROJ.4 runtime: Rel. 4.9.3, 15 August 2016, [PJ_VERSION: 493]
 Path to PROJ.4 shared files: /Users/CA/Library/R/3.4/library/rgdal/proj
 Linking to sp version: 1.2-4 
library(geojsonio)
package 'geojsonio' was built under R version 3.4.1
Attaching package: 'geojsonio'

The following object is masked from 'package:base':

    pretty
library(lubridate)

Attaching package: 'lubridate'

The following objects are masked from 'package:data.table':

    hour, isoweek, mday, minute, month, quarter, second, wday, week, yday, year

The following object is masked from 'package:base':

    date
library(geosphere)

Attaching package: 'geosphere'

The following object is masked from 'package:geojsonio':

    centroid
library(gridExtra)

Attaching package: 'gridExtra'

The following object is masked from 'package:dplyr':

    combine
library(ggmap)
Google Maps API Terms of Service: http://developers.google.com/maps/terms.
Please cite ggmap if you use it: see citation('ggmap') for details.

Attaching package: 'ggmap'

The following object is masked from 'package:magrittr':

    inset
library(geohash)
package 'geohash' was built under R version 3.4.1
train <- fread("/Users/CA/Downloads/NY_taxi/train.csv", na.strings = "")
test <- fread("/Users/CA/Downloads/NY_taxi/test.csv", na.strings = "")
numSummary(train)
charSummary(train)
coord2distance <- Vectorize(function(lat1, lng1, lat2, lng2) {
  rad_per_deg = pi / 180
  rkm = 6371
  rm = rkm * 1000
  
  dlat_rad = (lat2 - lat1) * rad_per_deg
  dlng_rad = (lng2 - lng1) * rad_per_deg
  
  lat1_rad = lat1 * rad_per_deg
  lng1_rad = lng1 * rad_per_deg
  lat2_rad = lat2 * rad_per_deg
  lng2_rad = lng2 * rad_per_deg
  
  a = sin(dlat_rad/2)**2 + cos(lat1_rad) * cos(lat2_rad) * sin(dlng_rad/2)**2
  c = 2 * atan2(sqrt(a), sqrt(1-a))
  
  return(rm * c)
})
## 시간 데이터( String -> DateTime ), 출도착 직선거리(meter), 직선거리 기준 속도(km/h) 추가 
## train data 의 기간은 1월 ~ 6월까지 총 6개월간의 데이터이다. 
train %<>%
  mutate(pickup_datetime   = ymd_hms(pickup_datetime), 
         dropoff_datetime  = ymd_hms(dropoff_datetime),
         distance          = coord2distance( pickup_longitude, pickup_latitude, dropoff_longitude, dropoff_latitude),
         speed             = round(distance / 1000 / (trip_duration/3600),2)) %>%
  mutate_each_(funs(as.double(.)), c("pickup_longitude", "pickup_latitude", "dropoff_longitude", "dropoff_latitude"))
head(train)
## vendor : a code indicating the provider associated with the trip record
## Q1 : 사업구역? #=> 지역별 사업구역은 별도로 확인되고 있지 않음. 
table(train$vendor_id)

     1      2 
678342 780302 
train$vendor_id <- factor(train$vendor_id)
qmplot(pickup_longitude, pickup_latitude, data = sample_frac(train,0.0001), colour = vendor_id,
       zoom = 12)
Map from URL : http://tile.stamen.com/toner-lite/12/1205/1538.png
Map from URL : http://tile.stamen.com/toner-lite/12/1206/1538.png
Map from URL : http://tile.stamen.com/toner-lite/12/1207/1538.png
Map from URL : http://tile.stamen.com/toner-lite/12/1208/1538.png
Map from URL : http://tile.stamen.com/toner-lite/12/1205/1539.png
Map from URL : http://tile.stamen.com/toner-lite/12/1206/1539.png
Map from URL : http://tile.stamen.com/toner-lite/12/1207/1539.png
Map from URL : http://tile.stamen.com/toner-lite/12/1208/1539.png
Map from URL : http://tile.stamen.com/toner-lite/12/1205/1540.png
Map from URL : http://tile.stamen.com/toner-lite/12/1206/1540.png
Map from URL : http://tile.stamen.com/toner-lite/12/1207/1540.png
Map from URL : http://tile.stamen.com/toner-lite/12/1208/1540.png
Map from URL : http://tile.stamen.com/toner-lite/12/1205/1541.png
Map from URL : http://tile.stamen.com/toner-lite/12/1206/1541.png
Map from URL : http://tile.stamen.com/toner-lite/12/1207/1541.png
Map from URL : http://tile.stamen.com/toner-lite/12/1208/1541.png
`panel.margin` is deprecated. Please use `panel.spacing` property instead

# pickup_datetime : date and time when the meter was engaged
train %<>%
  mutate(pickup_wday = wday(pickup_datetime, label = T), pickup_hour = hour(pickup_datetime)) %>%
  mutate(dropoff_wday = wday(dropoff_datetime, label = T), dropoff_hour = hour(dropoff_datetime))
## Q1. 요일별 pickup 수 분포     #=> 월요일이 제일 적음 ( 의외! , 휴일이라도 끼어 있나 ? ) 
## Q2. 시간대별 pickup 수 분포   #=> 출근보다 퇴근시간에 더 많음  
table(train$pickup_wday)

   Sun    Mon   Tues    Wed  Thurs    Fri    Sat 
195366 187418 202749 210136 218574 223533 220868 
table(train$pickup_hour)

    0     1     2     3     4     5     6     7     8     9    10    11    12    13    14    15    16    17    18    19 
53248 38571 27972 20895 15792 15002 33248 55600 67053 67663 65437 68476 71873 71473 74292 71811 64313 76483 90600 90308 
   20    21    22    23 
84072 84185 80492 69785 
p1 <- ggplot(data = train, mapping = aes(x = pickup_wday, group = vendor_id, fill = vendor_id)) + 
  geom_bar(position="identity", alpha=0.7) + 
  theme_bw()
p2 <- ggplot(data = train, mapping = aes(x = pickup_hour, group = vendor_id, fill = vendor_id)) + 
  geom_bar(position="identity", alpha=0.7) + 
  theme_bw()
grid.arrange(p1, p2, nrow = 2, ncol = 1)

## Q3. 요일/시간대별 평균 속도 
##      #=> 요일별 평균속도는 월/일/토 , 화/금 , 수/목 순으로 좋음 
##      #=> 시간별 평균속도는 퇴근시간대보다 일과시간이 나쁘고, 새벽시간대가 제일 좋음 
## Q4. 요일/시간대별 평균 운행 직선 거리 
##      #=> 요일별 평균 운행 직선 거리는 월/일 이 상대적으로 길고, 기타 요일에는 대동소이 
##      #=> 시간별 평균 운행 직선 거리는 평균속도 패턴과 동일.
avg_speed_by_wday    <- train %>% group_by(pickup_wday) %>% summarise(avg_speed_by_wday = mean(speed))
avg_distance_by_wday <- train %>% group_by(pickup_wday) %>% summarise(avg_distance_by_wday = mean(distance))
avg_speed_by_hour    <- train %>% group_by(pickup_hour) %>% summarise(avg_speed_by_hour = mean(speed))
avg_distance_by_hour <- train %>% group_by(pickup_hour) %>% summarise(avg_distance_by_hour = mean(distance))
head(merge(avg_speed_by_wday, avg_distance_by_wday, by = "pickup_wday") %>% arrange(desc(avg_distance_by_wday)), n = 7)
head(merge(avg_speed_by_hour, avg_distance_by_hour, by = "pickup_hour") %>% arrange(desc(avg_speed_by_hour)), n = 24)
hourly_data <- merge(avg_speed_by_hour, avg_distance_by_hour, by = "pickup_hour") 
#p1 <- ggplot(data = hourly_data, aes(pickup_hour, avg_speed_by_hour)) + geom_line() + geom_point()
#p2 <- ggplot(data = hourly_data, aes(pickup_hour, avg_distance_by_hour)) + geom_line() + geom_point()
#grid.arrange(p1, p2, nrow = 2, ncol = 1)
ggplot(data = hourly_data, aes(x = pickup_hour)) +
  geom_line(aes(y = avg_speed_by_hour, colour = "speed")) +
  geom_line(aes(y = round(avg_distance_by_hour/250 ,2), colour = "distance")) +
  scale_y_continuous(sec.axis = sec_axis(~.*250, name = "Distance ( meter )")) +
  labs(y = "Speed ( km/h )")

# passenger count 에 따른 속도 변화 
# A1 : 합승자가 많으면, 경유지 증가에 따라 거리 증가 및 평균 속도 감소가 있을 것 이다.
table(train$passenger_count)

      0       1       2       3       4       5       6       7       8       9 
     60 1033540  210318   59896   28404   78088   48333       3       1       1 
train$passenger_count <- as.numeric(as.character(train$passenger_count))
avg_speed_by_pc    <- train %>% 
                        filter(passenger_count %in% c(1,2,3,4,5,6)) %>% 
                        group_by(passenger_count) %>% 
                        summarise(avg_speed = mean(speed), avg_distnace = mean(distance))
ggplot(data = avg_speed_by_pc, aes(x = passenger_count)) +
  geom_line(aes(y = avg_speed, colour = "speed")) +
  geom_line(aes(y = round(avg_distnace/260 ,2), colour = "distance")) +
  scale_y_continuous(sec.axis = sec_axis(~.*260, name = "Distance ( meter )")) +
  labs(y = "Speed ( km/h )")

nrow(routes_5)                       #=> 860,132 개
[1] 2214
nrow(routes_5 %>% distinct(pickup))  #=> 9,238 개 
[1] 263
nrow(routes_5 %>% distinct(dropoff)) #=> 21,959 개
[1] 369
route_freq <- as.data.frame(table(train$pickup_geohash_5, train$dropoff_geohash_5))
Warning message:
In scan(file = file, what = what, sep = sep, quote = quote, dec = dec,  :
  EOF within quoted string
route_freq %<>% filter(Freq > 0)
library(plotly)
route_spec <- train %>% 
                group_by(pickup_geohash_5, dropoff_geohash_5) %>%
                summarise(count = n(), avg_speed = mean(speed), avg_distnace = mean(distance)) %>%
                filter(count > 0 & avg_speed > 1 & avg_speed < 70 )
plot_ly(x = route_spec$pickup_geohash_5, y = route_spec$dropoff_geohash_5, z = route_spec$count, type = "heatmap", colorscale = "Greys")

plot_ly(x = route_spec$pickup_geohash_5, y = route_spec$dropoff_geohash_5, z = route_spec$avg_speed, type = "heatmap", colorscale = "Greys")

plot_ly(x = route_spec$pickup_geohash_5, y = route_spec$dropoff_geohash_5, z = route_spec$avg_distnace, type = "heatmap", colorscale = "Greys")

plot_ly(x = route_spec$pickup_geohash_5, y = route_spec$dropoff_geohash_5, z = log(route_spec$count * route_spec$avg_distnace), type = "heatmap", colorscale = "Greys")
# 자.. 그전에 distance 들을 좀 살펴보자. 
par(mfrow = c(1,3))
Warning messages:
1: In scan(file = file, what = what, sep = sep, quote = quote, dec = dec,  :
  EOF within quoted string
2: In scan(file = file, what = what, sep = sep, quote = quote, dec = dec,  :
  EOF within quoted string
plot(density(train$distance))         # 굉장히 심한 L-Shape
plot(density(log(train$distance)))    # log-transform
plot(density(log(train$distance+10))) # +10 씩 움직여주고, log-transform

summary(train$distance)               # 851 km 짜리 직선거리도 있다. ( 명백한 outlier ) , 역시 야들도 GPS 가 튄다.
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
      0     848    1531    2876    2847  851968 
# GPS 가 튀는 것들은 거리와 상관없이 발생할텐데, 851km 는 거리로 잘 보인 케이스일뿐이다.
# 이런 outlier 들은 평균 속도가 이상한 것들로 잡아보자 
summary(train$speed)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
      0       6      10      11      15    8619 
nrow(filter(train, speed > 60))       
[1] 932
# 932 건이다. 이렇게 그냥 평균 속도로 짜를 것인지, 
# 출도착지의 시간대별 평균 속도에 기반하여 솎아낼지는 판단해야한다... 귀찮다;; 
# 위와는 조금 다르게, GPS 가 튀어서 거리가 매우 작게 잡히는 경우도 있다. 
nrow(train %>% filter(distance < 50 | trip_duration > 3000 ))
[1] 35795
# 위치데이터 자체로는 이상값판별이 안되므로, 시간, 속도 등을 기반으로 다 걸러내야 한다. 
plot(train$distance, train$trip_duration)

plot(train$distance, train$speed)

remove_outliers <- function(df, col1, col2) {
  library(ldbod)
  scores <- ldbod(subset(df, select = c(col1, col2)), k = 50)
  head(scores$lrd); head(scores$rkof)
  
  s <- subset(df, select = c(col1, col2))
  plot(s)
  top50outliers <- s[order(scores$lpde,decreasing=FALSE)[1:50],]
  points(top50outliers,col=2)
  
  df <- df[!(as.matrix(df[col1]) %in% as.matrix(top50outliers[col1]) & as.matrix(df[col2]) %in% as.matrix(top50outliers[col2])),]
  
  return(df)
}
# package 'ldbod' : Flexible procedures to compute local density-based outlier scores for ranking outliers
# 이번 케이스는, 밀도기반의 outlier 탐지가 잘 동작할 것으로 기대해봄. 
# 계산량이 많아, parallel 이 지원되어야 함. 
# 몇가지 lof 관련 package 가 있지만, ldbod 가 제일 좋은 결과를 보여줌.. 하지만 속도가 망 
# train <- remove_outliers(train, "speed", "distance")
# train <- remove_outliers(train, "trip_duration", "distance")
spd_dist_time <- subset(train, select = c(speed, distance, trip_duration))
registerDoMC(cores = 4)
sample_train <- sample_frac(train, 0.1)
#sample_train <- rbind(sample_train, train %>% filter(speed > 100))
sample_train <- remove_outliers(sample_train, 'speed', 'distance')

sample_train <- remove_outliers(sample_train, 'trip_duration', 'distance')

sample_train <- remove_outliers(sample_train, 'trip_duration', 'speed')

library(MVN)
sROC 0.1-2 loaded
#res <- mvOutlier(spd_dist_time, label = F, position = 4)
library(Rlof)
Loading required package: doParallel
#sdt <- sample_frac(spd_dist_time, 0.01)
#sdt <- rbind(sdt, spd_dist_time %>% filter(speed > 100))
#sdt <- cbind(sdt,lof(sdt, c(1:5)))
#head(sdt %>% arrange(desc(`1`)), n = 100)
library(DMwR)
Loading required package: grid
#sdt <- sample_frac(spd_dist_time, 0.01)
#sdt <- rbind(sdt, spd_dist_time %>% filter(speed > 100))
#sdt %<>%
#  mutate(speed = round(speed / 10, 0)) %>%
#  mutate(distance = round(distance / 1000, 0)) 
#sdt <- cbind(sdt, lofactor(subset(sdt, select = c(speed, distance)), k = 10))
# density based 로 outlier 를 탐지하는 것의 약점은, normal case 도 제거가능하다는 것이긴하다......
# 일단 lof 로 제거된 후, 다시 distance~duration / distance~speed 상태를 보자 
par(mfrow = c(1,2))
plot(sample_train$distance, sample_train$trip_duration)
plot(sample_train$distance, sample_train$speed)

# distance ~ duration 그래프의 좌상단 과 distance ~ speed 에서 speed ~ 0 의 일직선 부분이 density based 로 못잡은
# outlier 들이다. 이들은 특정패턴 ( 출도착지 좌표는 거의 변화가 없는 케이스 ) 들이 조밀하게 모여 있어서 그런것이다. 
# 즉 특정패턴의 이상현상이 일정 빈도이상 발생하게 되면, density based 로는 잡을 수 없다.
# 이런 녀석들은 rule 로 잡아줘야 할 것 같다. 
# 아니면, density 를 먹이기 전에, 미리 특정 density 이상의 패턴을 갖는 outlier 들을 제거한 후, density 를 먹여도 된다.
# ( 음.. 그러니까.. 다시해야한다.. )
# 사설이 좀 길어졌지만, 일단은 거리, 운행시간, 속도를 각각 최대/최소 기준값으로 1차 filtering 을 한다.
quantile(train$speed, 0.999)          #=> 56 km/h
99.9% 
 56.9 
quantile(train$distance, 0.9999)      #=> 45 km
 100% 
45676 
quantile(train$trip_duration, 0.999)  #=> 85000 seconds
99.9% 
85128 
train %<>% filter(
  speed < quantile(train$speed, 0.999) & 
  distance < quantile(train$distance, 0.9999) & 
  trip_duration < quantile(train$trip_duration, 0.999)
)
par(mfrow = c(1,2))
plot(train$distance, train$trip_duration)
plot(train$distance, train$speed)
# [todo] 일단 geojson 으로 pickup 위치를 변환하여, map 에 올려놓고 대충 파악해본다. ( geojson 은 일단 너무 느리다.. R 에서.. 내가 못하는 것일 수도 있지만  )
#pickups <- sample_frac(train, 0.001)
#coordinates(pickups) = c("pickup_latitude", "pickup_longitude")
#class(pickups)
#writeOGR(pickups, "/Users/CA/pickup_locations.geojson", layer = "train", driver = "GeoJSON")
#pickups_geojson <- rgdal::readOGR("/Users/CA/pickup_locations.geojson", "OGRGeoJSON")
#plot(pickups_geojson)
library(leaflet)
Warning messages:
1: In scan(file = file, what = what, sep = sep, quote = quote, dec = dec,  :
  EOF within quoted string
2: In scan(file = file, what = what, sep = sep, quote = quote, dec = dec,  :
  EOF within quoted string
3: In scan(file = file, what = what, sep = sep, quote = quote, dec = dec,  :
  EOF within quoted string
pickups <- sample_frac(train, 0.001)
leaflet(data = pickups, width = "100%") %>%
  setView(lng = -73.9821548461914, lat = 40.767936706543, zoom = 12) %>%
  addTiles() %>%
  addProviderTiles(providers$CartoDB.Positron) %>%
  addMarkers(lng = ~pickup_longitude, lat = ~pickup_latitude, clusterOptions = markerClusterOptions())
#library(tmap)
#library(tmaptools)
#library(sp)

#pickups <- sample_frac(train, 0.001)
#coordinates(pickups) <- ~ pickup_longitude + pickup_latitude
#pickups              <- set_projection(pickups, current.projection = "longlat")#, projection = 27700)

#tmap_mode("plot") # plot | view
#qtm(pickups)

#pickups_osm <- read_osm(pickups)
#qtm(pickups_osm) + qtm(pickups, symbols.col = "red", symbols.size = 0.01)
# netcount 를 중심으로, 도심 업무지구, 교외 거주지구 등을 구분해본다. 
pickup_count <- train %>% group_by(pickup_geohash_5) %>% summarise(pc = n())
dropoff_count <- train %>% group_by(dropoff_geohash_5) %>% summarise(dc = n())
colnames(pickup_count) <- c("geohash", "pc")
colnames(dropoff_count) <- c("geohash", "dc")
m <- merge(pickup_count, dropoff_count, by = "geohash", all = T) %>% replace_na(list(pc = 0, dc = 0)) 
m %<>% mutate(net_count = pc - dc, total = pc + dc)
plot(mm$total, mm$net_count)
abline(v=median(mm$total), h=median(mm$net_count))

# 속도
# outgoing, incoming, internal 을 구분해서 살펴본다. 
train %<>% mutate(is_internal = pickup_geohash_5 == dropoff_geohash_5)
internal_route <- train %>% filter(is_internal == T)
internal_spec <- internal_route %>% 
  group_by(pickup_geohash_5, pickup_hour) %>% 
  summarise(count = n(), avg_speed = mean(speed), avg_distance = mean(distance))
# internal speed  
boxplot(avg_speed ~ pickup_hour, data = internal_spec %>% filter(count > 1000))

# intersection speed 
intersection_spec <- train %>% 
  filter(is_internal == F) %>%
  group_by(pickup_geohash_5, dropoff_geohash_5) %>%
  summarise(count = n(), avg_speed = median(speed), avg_distance = median(distance))
plot_ly(x = intersection_spec$pickup_geohash_5, 
        y = intersection_spec$dropoff_geohash_5, 
        z = intersection_spec$avg_speed, 
        type = "heatmap", colorscale = "Viridis")
outgoing_spec <- train %>% 
  filter(is_internal == F) %>%
  group_by(pickup_geohash_5) %>%
  summarise(count = n(), avg_speed = median(speed), avg_distance = median(distance))
plot(outgoing_spec$avg_distance, outgoing_spec$avg_speed)

incoming_spec <- train %>% 
  filter(is_internal == F) %>%
  group_by(dropoff_geohash_5) %>%
  summarise(count = n(), avg_speed = median(speed), avg_distance = median(distance))
plot(incoming_spec$avg_distance, incoming_spec$avg_speed)

# try#1
# 경로별 데이터 프레임을 만든다. 
# 경로별 데이터 프레임은 geohash 기준 5, 6, 7 레벨에 대해 각각 만든다. 
# 이들 frame 에서 추가되는 feature 들에는, 경로별 평균 시간, 속도, 거리 와 최빈 운행 시간대 등이다. 
# trip duration 은 각 경로 data frame 에 의해 예측된 값의 mean 을 취한다. 
# baseline 은 요일/시간 + 출도착지별 평균시간 예측값의 RMSE 
routes_lvl_5 <- train %>% 
  subset(select = c("pickup_geohash_5", "dropoff_geohash_5", "route5", 
                    "pickup_datetime", 
                    "passenger_count", 
                    "store_and_fwd_flag",
                    "trip_duration", "distance", "speed")) %>%
  mutate(wday = wday(pickup_datetime, label = T), hour = hour(pickup_datetime), month = month(pickup_datetime)) 
routes_lvl_5.monthly <- routes_lvl_5 %>% 
  group_by(month, route5) %>%
  summarise(count = n(), avg_speed = mean(speed), avg_duration = mean(trip_duration), avg_distance = mean(distance))
# 누적 월 운행수 1,000 건 이상인 경로들에서는 월별로 운행시간 차이가 크지 않음, 
# 이들 경로에서의 운행발생건수는 전체의 86.3% 정도 
boxplot(avg_speed ~ route5, data = routes_lvl_5.monthly %>% filter(count > 1000))

sum((routes_lvl_5.monthly %>% filter(count >= 1000))$count) * 1.0 / sum(routes_lvl_5.monthly$count)
[1] 0.8637101
# 누적 월 운행수 100 ~ 1000 건 경로들에서는 다소 fluctuation 이 있음. 당연;;
boxplot(avg_speed ~ route5, data = routes_lvl_5.monthly %>% filter(count < 1000 & count > 100))

# 월운행건수 평균 1000건 이상 경로들에 대해서만 자세히 살펴보자 ( 핵심 경로 )
core_routes <- routes_lvl_5.monthly %>% group_by(route5) %>% summarise(avg_count = mean(count)) %>% filter(avg_count > 1000)
core_routes_stat <- routes_lvl_5 %>% filter(route5 %in% core_routes$route5)
nrow(core_routes_stat) / nrow(routes_lvl_5) * 1.0 # 86% 
[1] 0.8634319
boxplot(speed ~ route5, data = core_routes_stat)

core_routes_stat %<>% 
  mutate(time_window = ifelse(hour > 22 | hour < 7, "dawn", ifelse(hour>12, "afternoon", "morning")))
ggplot(core_routes_stat, aes(route5, speed)) +
  geom_boxplot(outlier.alpha = 0.1, outlier.colour = "red", aes(colour = time_window)) +
  coord_flip() 

NA
ggplot(core_routes_stat %>% filter(month == 1 & hour == 12), aes(route5, speed)) +
  geom_boxplot(outlier.alpha = 0.1, outlier.colour = "red") +
  coord_flip() 

routes_lvl_5 %>% 
  group_by(route5) %>%
  summarise(sd_speed = sd(speed), sd_distance = sd(distance), count = n()) %>%
  filter(count > 1) -> route5_sd
View(route5_sd)
# dr5r4 에서 dr5r4 로 가는 운행건들은 모두 출도착 좌표가 동일, speed , distance 모두 0, trip_duration 만 있음;;
# 이런 케이스의 경우에는, 아래 density plot 의 모양을 갖는 분포에서 sampling 하는 수밖에 없어 보임 
head(train %>% filter(pickup_geohash_5 == "dr5r4" & dropoff_geohash_5 == "dr5r4"))
plot(density((train %>% filter(pickup_geohash_5 == "dr5r4" & dropoff_geohash_5 == "dr5r4"))$trip_duration))

par(mfrow = c(1,2))
plot(density(route5_sd$avg_distance))
plot(density(route5_sd$avg_speed))

routes_lvl_5 %>%
  filter(pickup_geohash_5 != dropoff_geohash_5) %>%
  group_by(route5, round(hour/3,0)) %>%
  summarise(count = n()) %>% 
  group_by(route5) %>% 
  top_n(1, count) -> route5.most_freq_hour
routes_lvl_5 %>% group_by(route5) %>% summarise(total_count = n()) -> tmp
route5.most_freq_hour %<>% 
  merge(tmp, by = "route5") %>%
  mutate(raito = count * 1.0 / total_count)
colnames(route5.most_freq_hour) <- c("route5", "time_group", "count", "total_count", "ratio")
route5.most_freq_hour %<>% mutate(time_group = paste(3*time_group, "~", 3*(time_group+1)-1))
geo_tokens <- strsplit(route5.most_freq_hour$route5, ">")
route5.most_freq_hour$start = unlist(geo_tokens)[2*(1:nrow(route5.most_freq_hour))-1]
route5.most_freq_hour$end   = unlist(geo_tokens)[2*(1:nrow(route5.most_freq_hour))]
head(route5.most_freq_hour)
table(
  (route5.most_freq_hour %>% filter(start == "dr5ru", total_count > 100))$time_group
)

  0 ~ 2 15 ~ 17 18 ~ 20 21 ~ 23   3 ~ 5  9 ~ 11 
      1       7       2      23       3       1 
route5.most_freq_hour %>% 
  filter(total_count > 50) %>%
  group_by(start, time_group) %>%
  summarise(time_group_count = n()) %>%
  group_by(start) %>%
  top_n(1, time_group_count) -> start_spec
route5.most_freq_hour %>% filter(total_count > 50) %>% group_by(start) %>% summarise(count = n()) -> start_spec_2
start_spec %<>% merge(start_spec_2, by = "start") %>% mutate(ratio = time_group_count * 1.0 / count)
# 특정 지역에서 다른 지역으로 나가는 시간대를 Grouping 하여, 
# 주로 발생되는 시간대를 보면 저녁시간 , 새벽시간 , 아침 시간 으로 나뉜다.
# 이 중 전형적으로 특정시간대에 운행이 집중되는 지역들이 있는데,
# 저녁시간대면 업무지구, 새벽시간대면 유흥지구, 아침시간대면 거주지구 로 추측해 볼 수 있을 것 같다.
head(start_spec %>% arrange(desc(count)), n = 20)
---
title: "New York City Taxi Trip Duration"
output: html_notebook
---

```{r}
library(data.table)
library(dplyr)
library(magrittr)
library(tidyr)
library(stringi)
library(stringr)
library(xda)
library(caret)
library(doMC)
library(dummy)
library(leaflet)
library(rgdal)
library(geojsonio)
library(lubridate)
library(geosphere)
library(gridExtra)
library(ggmap)
library(geohash)
```

```{r}
train <- fread("/Users/CA/Downloads/NY_taxi/train.csv", na.strings = "")
test <- fread("/Users/CA/Downloads/NY_taxi/test.csv", na.strings = "")
```

```{r}
numSummary(train)
charSummary(train)
```

```{r}
coord2distance <- Vectorize(function(lat1, lng1, lat2, lng2) {
  rad_per_deg = pi / 180
  rkm = 6371
  rm = rkm * 1000
  
  dlat_rad = (lat2 - lat1) * rad_per_deg
  dlng_rad = (lng2 - lng1) * rad_per_deg
  
  lat1_rad = lat1 * rad_per_deg
  lng1_rad = lng1 * rad_per_deg
  lat2_rad = lat2 * rad_per_deg
  lng2_rad = lng2 * rad_per_deg
  
  a = sin(dlat_rad/2)**2 + cos(lat1_rad) * cos(lat2_rad) * sin(dlng_rad/2)**2
  c = 2 * atan2(sqrt(a), sqrt(1-a))
  
  return(rm * c)
})
```


```{r}
## 시간 데이터( String -> DateTime ), 출도착 직선거리(meter), 직선거리 기준 속도(km/h) 추가 
## train data 의 기간은 1월 ~ 6월까지 총 6개월간의 데이터이다. 
train %<>%
  mutate(pickup_datetime   = ymd_hms(pickup_datetime), 
         dropoff_datetime  = ymd_hms(dropoff_datetime),
         distance          = coord2distance( pickup_longitude, pickup_latitude, dropoff_longitude, dropoff_latitude),
         speed             = round(distance / 1000 / (trip_duration/3600),2)) %>%
  mutate_each_(funs(as.double(.)), c("pickup_longitude", "pickup_latitude", "dropoff_longitude", "dropoff_latitude"))

head(train)
```


```{r}
## vendor : a code indicating the provider associated with the trip record
## Q1 : 사업구역? #=> 지역별 사업구역은 별도로 확인되고 있지 않음. 
table(train$vendor_id)
train$vendor_id <- factor(train$vendor_id)

qmplot(pickup_longitude, pickup_latitude, data = sample_frac(train,0.0001), colour = vendor_id,
       zoom = 12)
```

```{r}
# pickup_datetime : date and time when the meter was engaged
train %<>%
  mutate(pickup_wday = wday(pickup_datetime, label = T), pickup_hour = hour(pickup_datetime)) %>%
  mutate(dropoff_wday = wday(dropoff_datetime, label = T), dropoff_hour = hour(dropoff_datetime))
```

```{r}
## Q1. 요일별 pickup 수 분포     #=> 월요일이 제일 적음 ( 의외! , 휴일이라도 끼어 있나 ? ) 
## Q2. 시간대별 pickup 수 분포   #=> 출근보다 퇴근시간에 더 많음  

table(train$pickup_wday)
table(train$pickup_hour)

p1 <- ggplot(data = train, mapping = aes(x = pickup_wday, group = vendor_id, fill = vendor_id)) + 
  geom_bar(position="identity", alpha=0.7) + 
  theme_bw()
p2 <- ggplot(data = train, mapping = aes(x = pickup_hour, group = vendor_id, fill = vendor_id)) + 
  geom_bar(position="identity", alpha=0.7) + 
  theme_bw()

grid.arrange(p1, p2, nrow = 2, ncol = 1)
```

```{r}
## Q3. 요일/시간대별 평균 속도 
##      #=> 요일별 평균속도는 월/일/토 , 화/금 , 수/목 순으로 좋음 
##      #=> 시간별 평균속도는 퇴근시간대보다 일과시간이 나쁘고, 새벽시간대가 제일 좋음 
## Q4. 요일/시간대별 평균 운행 직선 거리 
##      #=> 요일별 평균 운행 직선 거리는 월/일 이 상대적으로 길고, 기타 요일에는 대동소이 
##      #=> 시간별 평균 운행 직선 거리는 평균속도 패턴과 동일.

avg_speed_by_wday    <- train %>% group_by(pickup_wday) %>% summarise(avg_speed_by_wday = mean(speed))
avg_distance_by_wday <- train %>% group_by(pickup_wday) %>% summarise(avg_distance_by_wday = mean(distance))

avg_speed_by_hour    <- train %>% group_by(pickup_hour) %>% summarise(avg_speed_by_hour = mean(speed))
avg_distance_by_hour <- train %>% group_by(pickup_hour) %>% summarise(avg_distance_by_hour = mean(distance))

head(merge(avg_speed_by_wday, avg_distance_by_wday, by = "pickup_wday") %>% arrange(desc(avg_distance_by_wday)), n = 7)
head(merge(avg_speed_by_hour, avg_distance_by_hour, by = "pickup_hour") %>% arrange(desc(avg_speed_by_hour)), n = 24)
```

```{r}
hourly_data <- merge(avg_speed_by_hour, avg_distance_by_hour, by = "pickup_hour") 

#p1 <- ggplot(data = hourly_data, aes(pickup_hour, avg_speed_by_hour)) + geom_line() + geom_point()
#p2 <- ggplot(data = hourly_data, aes(pickup_hour, avg_distance_by_hour)) + geom_line() + geom_point()
#grid.arrange(p1, p2, nrow = 2, ncol = 1)

ggplot(data = hourly_data, aes(x = pickup_hour)) +
  geom_line(aes(y = avg_speed_by_hour, colour = "speed")) +
  geom_line(aes(y = round(avg_distance_by_hour/250 ,2), colour = "distance")) +
  scale_y_continuous(sec.axis = sec_axis(~.*250, name = "Distance ( meter )")) +
  labs(y = "Speed ( km/h )")
```

```{r}
# passenger count 에 따른 속도 변화 
# A1 : 합승자가 많으면, 경유지 증가에 따라 거리 증가 및 평균 속도 감소가 있을 것 이다.
table(train$passenger_count)
train$passenger_count <- as.numeric(as.character(train$passenger_count))

avg_speed_by_pc    <- train %>% 
                        filter(passenger_count %in% c(1,2,3,4,5,6)) %>% 
                        group_by(passenger_count) %>% 
                        summarise(avg_speed = mean(speed), avg_distnace = mean(distance))

ggplot(data = avg_speed_by_pc, aes(x = passenger_count)) +
  geom_line(aes(y = avg_speed, colour = "speed")) +
  geom_line(aes(y = round(avg_distnace/260 ,2), colour = "distance")) +
  scale_y_continuous(sec.axis = sec_axis(~.*260, name = "Distance ( meter )")) +
  labs(y = "Speed ( km/h )")
```

```{r}
# 경로 : 출도착지 좌표를 geohash 로 바꿔, geohash 기준으로 경로를 뽑는다. 
# precision level 7 : 150m * 150m 
train %<>%
  mutate(pickup_geohash_7  = gh_encode(pickup_latitude, pickup_longitude, precision = 7)) %>%
  mutate(dropoff_geohash_7 = gh_encode(dropoff_latitude, dropoff_longitude, precision = 7))  %>%
  mutate(pickup_geohash_6 = gh_encode(pickup_latitude, pickup_longitude, precision = 6))  %>%
  mutate(dropoff_geohash_6 = gh_encode(dropoff_latitude, dropoff_longitude, precision = 6)) %>%
  mutate(pickup_geohash_5 = gh_encode(pickup_latitude, pickup_longitude, precision = 5))  %>%
  mutate(dropoff_geohash_5 = gh_encode(dropoff_latitude, dropoff_longitude, precision = 5)) %>%
  mutate(route7 = paste(pickup_geohash_7, dropoff_geohash_7, sep = ">")) %>%
  mutate(route6 = paste(pickup_geohash_6, dropoff_geohash_6, sep = ">")) %>%
  mutate(route5 = paste(pickup_geohash_5, dropoff_geohash_5, sep = ">"))


routes_7 <- as.data.frame(table(train$pickup_geohash_7, train$dropoff_geohash_7)) %>% filter(Freq > 0)
routes_6 <- as.data.frame(table(train$pickup_geohash_6, train$dropoff_geohash_6)) %>% filter(Freq > 0)
routes_5 <- as.data.frame(table(train$pickup_geohash_5, train$dropoff_geohash_5)) %>% filter(Freq > 0)
colnames(routes_7) <- c("pickup", "dropoff", "Freq")
colnames(routes_6) <- c("pickup", "dropoff", "Freq")
colnames(routes_5) <- c("pickup", "dropoff", "Freq")

nrow(routes_6)                       #=> 55,609 개
nrow(routes_6 %>% distinct(pickup))  #=> 1,436 개 
nrow(routes_6 %>% distinct(dropoff)) #=> 2,675 개

nrow(routes_7)                       #=> 860,132 개
nrow(routes_7 %>% distinct(pickup))  #=> 9,238 개 
nrow(routes_7 %>% distinct(dropoff)) #=> 21,959 개

nrow(routes_5)                       #=> 2,214 개
nrow(routes_5 %>% distinct(pickup))  #=> 263 개 
nrow(routes_5 %>% distinct(dropoff)) #=> 369 개

# 자.. 이제 문제를 
# 출발지 1500 개와 도착지 2700 개의 특성과
# 경로 55,000 개에 대해서 시간,요일,탑승자,거리 feature 를 기반으로 소요시간을 예측하는 모델로 접근을 하자. 
#
# kaggle discussion 을 보면, 추가 데이터 사용도 가능한 것 같다. 
# 예를들어서 landmark / wheather / realtime traffic speed
# 하지만, original NYC dataset on BigQuery 를 통한 운행건별 직접 정보는 사용할 수 없다. ( 실제 운행 거리 , 요금 .. )
# https://www.kaggle.com/c/nyc-taxi-trip-duration/discussion/36699
```

```{r}
route_freq <- as.data.frame(table(train$pickup_geohash_5, train$dropoff_geohash_5))
route_freq %<>% filter(Freq > 0)
library(plotly)



route_spec <- train %>% 
                group_by(pickup_geohash_5, dropoff_geohash_5) %>%
                summarise(count = n(), avg_speed = mean(speed), avg_distnace = mean(distance)) %>%
                filter(count > 0 & avg_speed > 1 & avg_speed < 70 )

plot_ly(x = route_spec$pickup_geohash_5, y = route_spec$dropoff_geohash_5, z = route_spec$count, type = "heatmap", colorscale = "Greys")
plot_ly(x = route_spec$pickup_geohash_5, y = route_spec$dropoff_geohash_5, z = route_spec$avg_speed, type = "heatmap", colorscale = "Greys")
plot_ly(x = route_spec$pickup_geohash_5, y = route_spec$dropoff_geohash_5, z = route_spec$avg_distnace, type = "heatmap", colorscale = "Greys")

plot_ly(x = route_spec$pickup_geohash_5, y = route_spec$dropoff_geohash_5, z = log(route_spec$count * route_spec$avg_distnace), type = "heatmap", colorscale = "Greys")
```


```{r}
# 자.. 그전에 distance 들을 좀 살펴보자. 
par(mfrow = c(1,3))

plot(density(train$distance))         # 굉장히 심한 L-Shape
plot(density(log(train$distance)))    # log-transform
plot(density(log(train$distance+10))) # +10 씩 움직여주고, log-transform

summary(train$distance)               # 851 km 짜리 직선거리도 있다. ( 명백한 outlier ) , 역시 야들도 GPS 가 튄다.

# GPS 가 튀는 것들은 거리와 상관없이 발생할텐데, 851km 는 거리로 잘 보인 케이스일뿐이다.
# 이런 outlier 들은 평균 속도가 이상한 것들로 잡아보자 

summary(train$speed)
nrow(filter(train, speed > 60))       

# 932 건이다. 이렇게 그냥 평균 속도로 짜를 것인지, 
# 출도착지의 시간대별 평균 속도에 기반하여 솎아낼지는 판단해야한다... 귀찮다;; 

# 위와는 조금 다르게, GPS 가 튀어서 거리가 매우 작게 잡히는 경우도 있다. 
nrow(train %>% filter(distance < 50 | trip_duration > 3000 ))
```

```{r}
# 위치데이터 자체로는 이상값판별이 안되므로, 시간, 속도 등을 기반으로 다 걸러내야 한다. 
plot(train$distance, train$trip_duration)
plot(train$distance, train$speed)
```

```{r}
remove_outliers <- function(df, col1, col2) {
  library(ldbod)
  scores <- ldbod(subset(df, select = c(col1, col2)), k = 50)
  head(scores$lrd); head(scores$rkof)
  
  s <- subset(df, select = c(col1, col2))
  plot(s)
  top50outliers <- s[order(scores$lpde,decreasing=FALSE)[1:50],]
  points(top50outliers,col=2)
  
  df <- df[!(as.matrix(df[col1]) %in% as.matrix(top50outliers[col1]) & as.matrix(df[col2]) %in% as.matrix(top50outliers[col2])),]
  
  return(df)
}

# package 'ldbod' : Flexible procedures to compute local density-based outlier scores for ranking outliers
# 이번 케이스는, 밀도기반의 outlier 탐지가 잘 동작할 것으로 기대해봄. 
# 계산량이 많아, parallel 이 지원되어야 함. 
# 몇가지 lof 관련 package 가 있지만, ldbod 가 제일 좋은 결과를 보여줌.. 하지만 속도가 망 
# train <- remove_outliers(train, "speed", "distance")
# train <- remove_outliers(train, "trip_duration", "distance")

spd_dist_time <- subset(train, select = c(speed, distance, trip_duration))

registerDoMC(cores = 4)
sample_train <- sample_frac(train, 0.1)
#sample_train <- rbind(sample_train, train %>% filter(speed > 100))
sample_train <- remove_outliers(sample_train, 'speed', 'distance')
sample_train <- remove_outliers(sample_train, 'trip_duration', 'distance')
sample_train <- remove_outliers(sample_train, 'trip_duration', 'speed')

library(MVN)
#res <- mvOutlier(spd_dist_time, label = F, position = 4)

library(Rlof)
#sdt <- sample_frac(spd_dist_time, 0.01)
#sdt <- rbind(sdt, spd_dist_time %>% filter(speed > 100))
#sdt <- cbind(sdt,lof(sdt, c(1:5)))
#head(sdt %>% arrange(desc(`1`)), n = 100)

library(DMwR)
#sdt <- sample_frac(spd_dist_time, 0.01)
#sdt <- rbind(sdt, spd_dist_time %>% filter(speed > 100))
#sdt %<>%
#  mutate(speed = round(speed / 10, 0)) %>%
#  mutate(distance = round(distance / 1000, 0)) 
#sdt <- cbind(sdt, lofactor(subset(sdt, select = c(speed, distance)), k = 10))
```

```{r}
# density based 로 outlier 를 탐지하는 것의 약점은, normal case 도 제거가능하다는 것이긴하다......
# 일단 lof 로 제거된 후, 다시 distance~duration / distance~speed 상태를 보자 
par(mfrow = c(1,2))
plot(sample_train$distance, sample_train$trip_duration)
plot(sample_train$distance, sample_train$speed)
```

```{r}
# distance ~ duration 그래프의 좌상단 과 distance ~ speed 에서 speed ~ 0 의 일직선 부분이 density based 로 못잡은
# outlier 들이다. 이들은 특정패턴 ( 출도착지 좌표는 거의 변화가 없는 케이스 ) 들이 조밀하게 모여 있어서 그런것이다. 
# 즉 특정패턴의 이상현상이 일정 빈도이상 발생하게 되면, density based 로는 잡을 수 없다.
# 이런 녀석들은 rule 로 잡아줘야 할 것 같다. 
# 아니면, density 를 먹이기 전에, 미리 특정 density 이상의 패턴을 갖는 outlier 들을 제거한 후, density 를 먹여도 된다.
# ( 음.. 그러니까.. 다시해야한다.. )
```

```{r}
# 사설이 좀 길어졌지만, 일단은 거리, 운행시간, 속도를 각각 최대/최소 기준값으로 1차 filtering 을 한다.
quantile(train$speed, 0.999)          #=> 56 km/h
quantile(train$distance, 0.9999)      #=> 45 km
quantile(train$trip_duration, 0.999)  #=> 85000 seconds

train %<>% filter(
  speed < quantile(train$speed, 0.999) & 
  distance < quantile(train$distance, 0.9999) & 
  trip_duration < quantile(train$trip_duration, 0.999)
)

par(mfrow = c(1,2))
plot(train$distance, train$trip_duration)
plot(train$distance, train$speed)
```


```{r}
# [todo] 일단 geojson 으로 pickup 위치를 변환하여, map 에 올려놓고 대충 파악해본다. ( geojson 은 일단 너무 느리다.. R 에서.. 내가 못하는 것일 수도 있지만  )
#pickups <- sample_frac(train, 0.001)
#coordinates(pickups) = c("pickup_latitude", "pickup_longitude")
#class(pickups)
#writeOGR(pickups, "/Users/CA/pickup_locations.geojson", layer = "train", driver = "GeoJSON")

#pickups_geojson <- rgdal::readOGR("/Users/CA/pickup_locations.geojson", "OGRGeoJSON")

#plot(pickups_geojson)
```

```{r}
library(leaflet)

pickups <- sample_frac(train, 0.001)

leaflet(data = pickups, width = "100%") %>%
  setView(lng = -73.9821548461914, lat = 40.767936706543, zoom = 12) %>%
  addTiles() %>%
  addProviderTiles(providers$CartoDB.Positron) %>%
  addMarkers(lng = ~pickup_longitude, lat = ~pickup_latitude, clusterOptions = markerClusterOptions())
```

```{r}
#library(tmap)
#library(tmaptools)
#library(sp)

#pickups <- sample_frac(train, 0.001)
#coordinates(pickups) <- ~ pickup_longitude + pickup_latitude
#pickups              <- set_projection(pickups, current.projection = "longlat")#, projection = 27700)

#tmap_mode("plot") # plot | view
#qtm(pickups)

#pickups_osm <- read_osm(pickups)
#qtm(pickups_osm) + qtm(pickups, symbols.col = "red", symbols.size = 0.01)
```

```{r}
route_spec <- train %>% 
                group_by(pickup_geohash_5, dropoff_geohash_5) %>%
                summarise(count = n(), avg_speed = mean(speed), avg_distnace = mean(distance)) %>%
                filter(count > 0 & avg_speed > 1 & avg_speed < 100 )

plot_ly(x = route_spec$pickup_geohash_5, y = route_spec$dropoff_geohash_5, z = log(route_spec$count+10), type = "heatmap", colorscale = "Greys")
plot_ly(x = route_spec$pickup_geohash_5, y = route_spec$dropoff_geohash_5, z = route_spec$avg_speed, type = "heatmap", colorscale = "Greys")
plot_ly(x = route_spec$pickup_geohash_5, y = route_spec$dropoff_geohash_5, z = route_spec$avg_distnace, type = "heatmap", colorscale = "Greys")

plot_ly(x = route_spec$pickup_geohash_5, y = route_spec$dropoff_geohash_5, z = log(route_spec$count * route_spec$avg_distnace), type = "heatmap", colorscale = "Greys")
```

```{r}
# netcount 를 중심으로, 도심 업무지구, 교외 거주지구 등을 구분해본다. 
pickup_count <- train %>% group_by(pickup_geohash_5) %>% summarise(pc = n())
dropoff_count <- train %>% group_by(dropoff_geohash_5) %>% summarise(dc = n())

colnames(pickup_count) <- c("geohash", "pc")
colnames(dropoff_count) <- c("geohash", "dc")

m <- merge(pickup_count, dropoff_count, by = "geohash", all = T) %>% replace_na(list(pc = 0, dc = 0)) 
m %<>% mutate(net_count = pc - dc, total = pc + dc)

plot(mm$total, mm$net_count)
abline(v=median(mm$total), h=median(mm$net_count))
```

```{r}
# 속도
# outgoing, incoming, internal 을 구분해서 살펴본다. 

train %<>% mutate(is_internal = pickup_geohash_5 == dropoff_geohash_5)

internal_route <- train %>% filter(is_internal == T)
internal_spec <- internal_route %>% 
  group_by(pickup_geohash_5, pickup_hour) %>% 
  summarise(count = n(), avg_speed = mean(speed), avg_distance = mean(distance))

# internal speed  
boxplot(avg_speed ~ pickup_hour, data = internal_spec %>% filter(count > 1000))
```

```{r}
# intersection speed 
intersection_spec <- train %>% 
  filter(is_internal == F) %>%
  group_by(pickup_geohash_5, dropoff_geohash_5) %>%
  summarise(count = n(), avg_speed = median(speed), avg_distance = median(distance))

plot_ly(x = intersection_spec$pickup_geohash_5, 
        y = intersection_spec$dropoff_geohash_5, 
        z = intersection_spec$avg_speed, 
        type = "heatmap", colorscale = "Viridis")
```

```{r}
outgoing_spec <- train %>% 
  filter(is_internal == F) %>%
  group_by(pickup_geohash_5) %>%
  summarise(count = n(), avg_speed = median(speed), avg_distance = median(distance))

plot(outgoing_spec$avg_distance, outgoing_spec$avg_speed)
```

```{r}
incoming_spec <- train %>% 
  filter(is_internal == F) %>%
  group_by(dropoff_geohash_5) %>%
  summarise(count = n(), avg_speed = median(speed), avg_distance = median(distance))

plot(incoming_spec$avg_distance, incoming_spec$avg_speed)
```

```{r}
# try#1
# 경로별 데이터 프레임을 만든다. 
# 경로별 데이터 프레임은 geohash 기준 5, 6, 7 레벨에 대해 각각 만든다. 
# 이들 frame 에서 추가되는 feature 들에는, 경로별 평균 시간, 속도, 거리 와 최빈 운행 시간대 등이다. 
# trip duration 은 각 경로 data frame 에 의해 예측된 값의 mean 을 취한다. 
# baseline 은 요일/시간 + 출도착지별 평균시간 예측값의 RMSE 

routes_lvl_5 <- train %>% 
  subset(select = c("pickup_geohash_5", "dropoff_geohash_5", "route5", 
                    "pickup_datetime", 
                    "passenger_count", 
                    "store_and_fwd_flag",
                    "trip_duration", "distance", "speed")) %>%
  mutate(wday = wday(pickup_datetime, label = T), hour = hour(pickup_datetime), month = month(pickup_datetime)) 

routes_lvl_5.monthly <- routes_lvl_5 %>% 
  group_by(month, route5) %>%
  summarise(count = n(), avg_speed = mean(speed), avg_duration = mean(trip_duration), avg_distance = mean(distance))

# 누적 월 운행수 1,000 건 이상인 경로들에서는 월별로 운행시간 차이가 크지 않음, 
# 이들 경로에서의 운행발생건수는 전체의 86.3% 정도 
boxplot(avg_speed ~ route5, data = routes_lvl_5.monthly %>% filter(count > 1000))
```

```{r}
sum((routes_lvl_5.monthly %>% filter(count >= 1000))$count) * 1.0 / sum(routes_lvl_5.monthly$count)

# 누적 월 운행수 100 ~ 1000 건 경로들에서는 다소 fluctuation 이 있음. 당연;;
boxplot(avg_speed ~ route5, data = routes_lvl_5.monthly %>% filter(count < 1000 & count > 100))
```

```{r}
# 월운행건수 평균 1000건 이상 경로들에 대해서만 자세히 살펴보자 ( 핵심 경로 )
core_routes <- routes_lvl_5.monthly %>% group_by(route5) %>% summarise(avg_count = mean(count)) %>% filter(avg_count > 1000)

core_routes_stat <- routes_lvl_5 %>% filter(route5 %in% core_routes$route5)
nrow(core_routes_stat) / nrow(routes_lvl_5) * 1.0 # 86% 

boxplot(speed ~ route5, data = core_routes_stat)
```

```{r}
core_routes_stat %<>% 
  mutate(time_window = ifelse(hour > 22 | hour < 7, "dawn", ifelse(hour>12, "afternoon", "morning")))

ggplot(core_routes_stat, aes(route5, speed)) +
  geom_boxplot(outlier.alpha = 0.1, outlier.colour = "red", aes(colour = time_window)) +
  coord_flip() 
  
```

```{r}
ggplot(core_routes_stat %>% filter(month == 1 & hour == 12), aes(route5, speed)) +
  geom_boxplot(outlier.alpha = 0.1, outlier.colour = "red") +
  coord_flip() 
```

```{r}
routes_lvl_5 %>% 
  group_by(route5) %>%
  summarise(sd_speed = sd(speed), sd_distance = sd(distance), avg_speed = mean(speed), avg_distance = mean(distance), count = n()) %>%
  filter(count > 1) -> route5_sd
```

```{r}
# dr5r4 에서 dr5r4 로 가는 운행건들은 모두 출도착 좌표가 동일, speed , distance 모두 0, trip_duration 만 있음;;
# 이런 케이스의 경우에는, 아래 density plot 의 모양을 갖는 분포에서 sampling 하는 수밖에 없어 보임 
head(train %>% filter(pickup_geohash_5 == "dr5r4" & dropoff_geohash_5 == "dr5r4"))
plot(density((train %>% filter(pickup_geohash_5 == "dr5r4" & dropoff_geohash_5 == "dr5r4"))$trip_duration))
```

```{r}
par(mfrow = c(1,2))

plot(density(route5_sd$avg_distance))
plot(density(route5_sd$avg_speed))
```

```{r}
routes_lvl_5 %>%
  filter(pickup_geohash_5 != dropoff_geohash_5) %>%
  group_by(route5, round(hour/3,0)) %>%
  summarise(count = n()) %>% 
  group_by(route5) %>% 
  top_n(1, count) -> route5.most_freq_hour

routes_lvl_5 %>% group_by(route5) %>% summarise(total_count = n()) -> tmp

route5.most_freq_hour %<>% 
  merge(tmp, by = "route5") %>%
  mutate(raito = count * 1.0 / total_count)

colnames(route5.most_freq_hour) <- c("route5", "time_group", "count", "total_count", "ratio")
route5.most_freq_hour %<>% mutate(time_group = paste(3*time_group, "~", 3*(time_group+1)-1))


geo_tokens <- strsplit(route5.most_freq_hour$route5, ">")
route5.most_freq_hour$start = unlist(geo_tokens)[2*(1:nrow(route5.most_freq_hour))-1]
route5.most_freq_hour$end   = unlist(geo_tokens)[2*(1:nrow(route5.most_freq_hour))]

head(route5.most_freq_hour)

table(
  (route5.most_freq_hour %>% filter(start == "dr5ru", total_count > 100))$time_group
)

route5.most_freq_hour %>% 
  filter(total_count > 50) %>%
  group_by(start, time_group) %>%
  summarise(time_group_count = n()) %>%
  group_by(start) %>%
  top_n(1, time_group_count) -> start_spec

route5.most_freq_hour %>% filter(total_count > 50) %>% group_by(start) %>% summarise(count = n()) -> start_spec_2

start_spec %<>% merge(start_spec_2, by = "start") %>% mutate(ratio = time_group_count * 1.0 / count)

# 특정 지역에서 다른 지역으로 나가는 시간대를 Grouping 하여, 
# 주로 발생되는 시간대를 보면 저녁시간 , 새벽시간 , 아침 시간 으로 나뉜다.
# 이 중 전형적으로 특정시간대에 운행이 집중되는 지역들이 있는데,
# 저녁시간대면 업무지구, 새벽시간대면 유흥지구, 아침시간대면 거주지구 로 추측해 볼 수 있을 것 같다.
head(start_spec %>% arrange(desc(count)), n = 20)
```









































```{r}
```