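# Baseline RMSE:
# Pickup and dropoff locations are bucketed by geohash level 5, and four average-speed tables are prepared:
#   (1) by pickup + dropoff + hour, (2) by pickup + hour, (3) by hour, (4) overall average speed.
# When a target route comes in, take only the values that exist among (1), (2), (3), (4) and average them.
# Divide the target route's distance by that average speed to estimate the trip duration.
# Try2 follows a similar idea:
# on top of the geohash-based per-route features, add passenger count, day of week and season (month) as features and fit a GLM.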
library(data.table)
library(dplyr)
library(magrittr)
library(tidyr)
library(stringi)
library(stringr)
library(xda)
library(caret)
library(doMC)
library(dummy)
library(leaflet)
library(rgdal)
library(lubridate)
library(gridExtra)
library(ggmap)
library(geohash)
# Load the NYC taxi train/test CSVs; empty strings are read as NA.
# The data directory is kept in one place so both reads stay in sync.
data_dir <- "/Users/CA/Downloads/NY_taxi"
train <- fread(file.path(data_dir, "train.csv"), na.strings = "")
# Console output captured from a previous run (kept as comments so the
# script parses):
#> Read 43.2% of 1458644 rows
#> Read 64.4% of 1458644 rows
#> Read 85.7% of 1458644 rows
#> Read 86.4% of 1458644 rows
#> Read 1458644 rows and 11 (of 11) columns from 0.187 GB file in 00:00:08
test <- fread(file.path(data_dir, "test.csv"), na.strings = "")
#> Read 56.0% of 625134 rows
#> Read 625134 rows and 9 (of 9) columns from 0.066 GB file in 00:00:04

# Quick per-column summaries of the numeric and character columns (xda).
numSummary(train)
numSummary(test)
charSummary(train)
charSummary(test)
#' Great-circle (haversine) distance between two coordinates, in meters.
#'
#' All arguments are in decimal degrees and may be vectors; the usual R
#' recycling rules apply, so whole data-frame columns can be passed at once.
#'
#' @param lng1 Longitude of the first point.
#' @param lat1 Latitude of the first point.
#' @param lng2 Longitude of the second point.
#' @param lat2 Latitude of the second point.
#' @return Numeric vector of distances in meters (sphere of radius 6371 km).
coord2distance <- function(lng1, lat1, lng2, lat2) {
  rad_per_deg <- pi / 180
  earth_radius_m <- 6371 * 1000

  dlng_rad <- (lng2 - lng1) * rad_per_deg
  dlat_rad <- (lat2 - lat1) * rad_per_deg
  lat1_rad <- lat1 * rad_per_deg
  lat2_rad <- lat2 * rad_per_deg

  # Haversine: the latitude difference enters directly, while the longitude
  # difference is scaled by the cosines of the two *latitudes*.
  a <- sin(dlat_rad / 2)^2 +
    cos(lat1_rad) * cos(lat2_rad) * sin(dlng_rad / 2)^2
  # Guard against floating-point drift pushing `a` outside [0, 1], which
  # would make sqrt(1 - a) NaN for (near-)antipodal points.
  a <- pmin(pmax(a, 0), 1)
  central_angle <- 2 * atan2(sqrt(a), sqrt(1 - a))

  earth_radius_m * central_angle
}
#' Derive time, distance and speed features for a taxi-trip table.
#'
#' Parses the timestamp columns, adds the pickup/dropoff straight-line
#' `distance` (via `coord2distance()`), the average `speed` in km/h, and
#' weekday / hour / month features, then converts the categorical columns
#' to factors.
#'
#' Columns that depend on `dropoff_datetime` or `trip_duration` are only
#' derived when those inputs are present, so the function can also be
#' applied to a table that lacks them. For a table that has both, the
#' resulting columns and their order are unchanged.
#'
#' @param df A data frame with `pickup_datetime`, the four pickup/dropoff
#'   coordinate columns, `vendor_id`, `store_and_fwd_flag` and
#'   `passenger_count`; optionally `dropoff_datetime` and `trip_duration`.
#' @return `df` with the derived columns added.
refine <- function(df) {
  has_dropoff <- "dropoff_datetime" %in% names(df)
  has_duration <- "trip_duration" %in% names(df)

  df <- df %>% mutate(pickup_datetime = ymd_hms(pickup_datetime))
  if (has_dropoff) {
    df <- df %>% mutate(dropoff_datetime = ymd_hms(dropoff_datetime))
  }

  df <- df %>%
    mutate(distance = coord2distance(pickup_longitude, pickup_latitude,
                                     dropoff_longitude, dropoff_latitude))
  if (has_duration) {
    # distance is divided by 1000 and trip_duration by 3600 before taking
    # the ratio, i.e. km per hour if the inputs are meters and seconds.
    df <- df %>%
      mutate(speed = round(distance / 1000 / (trip_duration / 3600), 2))
  }

  df <- df %>%
    mutate(pickup_wday = wday(pickup_datetime, label = TRUE),
           pickup_hour = hour(pickup_datetime))
  if (has_dropoff) {
    df <- df %>%
      mutate(dropoff_wday = wday(dropoff_datetime, label = TRUE),
             dropoff_hour = hour(dropoff_datetime))
  }

  df <- df %>% mutate(pickup_month = month(pickup_datetime, label = TRUE))
  if (has_dropoff) {
    # NOTE(review): unlike pickup_month this is not labelled (label is not
    # requested); kept as-is because later code may rely on it.
    df <- df %>% mutate(dropoff_month = month(dropoff_datetime))
  }

  # Explicit mutate instead of the deprecated mutate_each_()/funs().
  df %>%
    mutate(vendor_id          = as.factor(vendor_id),
           store_and_fwd_flag = as.factor(store_and_fwd_flag),
           passenger_count    = as.factor(passenger_count))
}
# Add the derived features to the training set and inspect the result.
train <- refine(train)
summary(train)
# Console output captured from a previous run (kept as comments so the
# script parses):
#>       id            vendor_id  pickup_datetime               dropoff_datetime              passenger_count  
#>  Length:1458644     1:678342   Min.   :2016-01-01 00:00:17   Min.   :2016-01-01 00:03:31   1      :1033540  
#>  Class :character   2:780302   1st Qu.:2016-02-17 16:46:04   1st Qu.:2016-02-17 17:05:32   2      : 210318  
#>  Mode  :character              Median :2016-04-01 17:19:40   Median :2016-04-01 17:35:12   5      :  78088  
#>                                Mean   :2016-04-01 10:10:24   Mean   :2016-04-01 10:26:24   3      :  59896  
#>                                3rd Qu.:2016-05-15 03:56:08   3rd Qu.:2016-05-15 04:10:51   6      :  48333  
#>                                Max.   :2016-06-30 23:59:39   Max.   :2016-07-01 23:02:03   4      :  28404  
#>                                                                                            (Other):     65  
#>  pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude store_and_fwd_flag trip_duration        distance     
#>  Min.   :-121.9   Min.   :34.4    Min.   :-121.9    Min.   :32.2     N:1450599          Min.   :      1   Min.   :     0  
#>  1st Qu.: -74.0   1st Qu.:40.7    1st Qu.: -74.0    1st Qu.:40.7     Y:   8045          1st Qu.:    397   1st Qu.:   848  
#>  Median : -74.0   Median :40.8    Median : -74.0    Median :40.8                        Median :    662   Median :  1531  
#>  Mean   : -74.0   Mean   :40.8    Mean   : -74.0    Mean   :40.8                        Mean   :    959   Mean   :  2876  
#>  3rd Qu.: -74.0   3rd Qu.:40.8    3rd Qu.: -74.0    3rd Qu.:40.8                        3rd Qu.:   1075   3rd Qu.:  2847  
#>  Max.   : -61.3   Max.   :51.9    Max.   : -61.3    Max.   :43.9                        Max.   :3526282   Max.   :851968  
#>
#>      speed      pickup_wday     pickup_hour   dropoff_wday    dropoff_hour   pickup_month    dropoff_month 
#>  Min.   :   0   Sun  :195366   Min.   : 0.0   Sun  :197224   Min.   : 0.0   Mar    :256189   Min.   :1.00  
#>  1st Qu.:   6   Mon  :187418   1st Qu.: 9.0   Mon  :187433   1st Qu.: 9.0   Apr    :251645   1st Qu.:2.00  
#>  Median :  10   Tues :202749   Median :14.0   Tues :202518   Median :14.0   May    :248487   Median :4.00  
#>  Mean   :  11   Wed  :210136   Mean   :13.6   Wed  :209790   Mean   :13.6   Feb    :238300   Mean   :3.52  
#>  3rd Qu.:  15   Thurs:218574   3rd Qu.:19.0   Thurs:217746   3rd Qu.:19.0   Jun    :234316   3rd Qu.:5.00  
#>  Max.   :8619   Fri  :223533   Max.   :23.0   Fri  :223031   Max.   :23.0   Jan    :229707   Max.   :7.00  
#>                 Sat  :220868                  Sat  :220902                  (Other):     0                 
# Plot #1: trip volume over time (by weekday, hour of day and month).
# Each panel overlays the per-vendor counts as semi-transparent bars; the
# three panels differ only in the x variable, so the layers are shared.
vendor_bar_layers <- list(
  geom_bar(position = "identity", alpha = 0.7),
  theme_bw()
)
p1 <- ggplot(
  data = train,
  mapping = aes(x = pickup_wday, group = vendor_id, fill = vendor_id)
) + vendor_bar_layers
p2 <- ggplot(
  data = train,
  mapping = aes(x = pickup_hour, group = vendor_id, fill = vendor_id)
) + vendor_bar_layers
p3 <- ggplot(
  data = train,
  mapping = aes(x = pickup_month, group = vendor_id, fill = vendor_id)
) + vendor_bar_layers
# Three panels laid out on a 2 x 2 grid.
grid.arrange(p1, p2, p3, nrow = 2, ncol = 2)

# Plot #2: travel speed over time.
## Q3. Average speed by weekday / hour of day
##      #=> By weekday, average speed is best on Mon/Sun/Sat, then Tue/Fri, then Wed/Thu.
##      #=> By hour, daytime working hours are worse than the evening rush hour; early morning is best.
## Q4. Average straight-line trip distance by weekday / hour of day
##      #=> By weekday, Mon/Sun trips are relatively longer; the other days are about the same.
##      #=> By hour, the average straight-line distance follows the same pattern as average speed.
avg_speed_by_wday    <- train %>% group_by(pickup_wday) %>% summarise(avg_speed_by_wday = mean(speed))
avg_distance_by_wday <- train %>% group_by(pickup_wday) %>% summarise(avg_distance_by_wday = mean(distance))
avg_speed_by_hour    <- train %>% group_by(pickup_hour) %>% summarise(avg_speed_by_hour = mean(speed))
avg_distance_by_hour <- train %>% group_by(pickup_hour) %>% summarise(avg_distance_by_hour = mean(distance))
daily_data  <- merge(avg_speed_by_wday, avg_distance_by_wday, by = "pickup_wday")
hourly_data <- merge(avg_speed_by_hour, avg_distance_by_hour, by = "pickup_hour") 
# Distance is divided by 250 so it shares the speed axis; the secondary
# axis multiplies by 250 again to show the distance values.
p1 <- ggplot(data = daily_data, aes(x = pickup_wday)) +
  # group = 1 is needed to draw a single line across the discrete weekdays.
  geom_line(aes(y = avg_speed_by_wday, colour = "speed", group = 1)) +
  geom_line(aes(y = round(avg_distance_by_wday/250 ,2), colour = "distance", group = 1)) +
  scale_y_continuous(sec.axis = sec_axis(~.*250, name = "Distance ( meter )")) +
  labs(y = "Speed ( km/h )")
p2 <- ggplot(data = hourly_data, aes(x = pickup_hour)) +
  geom_line(aes(y = avg_speed_by_hour, colour = "speed")) +
  geom_line(aes(y = round(avg_distance_by_hour/250 ,2), colour = "distance")) +
  scale_y_continuous(sec.axis = sec_axis(~.*250, name = "Distance ( meter )")) +
  labs(y = "Speed ( km/h )")
# Stack the two panels vertically. grid.arrange() comes from gridExtra,
# which this script loads; multiplot() is not provided by any package
# attached here.
grid.arrange(p1, p2, ncol = 1)

#p1 <- ggplot(data = hourly_data, aes(pickup_hour, avg_speed_by_hour)) + geom_line() + geom_point()
#p2 <- ggplot(data = hourly_data, aes(pickup_hour, avg_distance_by_hour)) + geom_line() + geom_point()
#grid.arrange(p1, p2, nrow = 2, ncol = 1)
#head(merge(avg_speed_by_wday, avg_distance_by_wday, by = "pickup_wday") %>% arrange(desc(avg_distance_by_wday)), n = 7)
#head(merge(avg_speed_by_hour, avg_distance_by_hour, by = "pickup_hour") %>% arrange(desc(avg_speed_by_hour)), n = 24)
# Plot #3: passenger count.
table(train$passenger_count)
# Console output captured from a previous run (kept as comments so the
# script parses):
#>
#>       0       1       2       3       4       5       6       7       8       9 
#>      60 1033540  210318   59896   28404   78088   48333       3       1       1 
# passenger_count is a factor at this point; go through character so the
# original counts (not the factor level codes) become the numeric values.
train$passenger_count <- as.numeric(as.character(train$passenger_count))
# Only passenger counts 1..6 are kept for the averages below.
# NOTE(review): the column name `avg_distnace` is misspelled; it is kept
# unchanged in case code outside this section refers to it.
avg_speed_by_pc <- train %>% 
                     filter(passenger_count %in% c(1,2,3,4,5,6)) %>% 
                     group_by(passenger_count) %>% 
                     summarise(avg_speed = mean(speed), avg_distnace = mean(distance))
avg_pc_by_hour <- train %>% 
                     filter(passenger_count %in% c(1,2,3,4,5,6)) %>% 
                     group_by(pickup_hour) %>% 
                     summarise(avg_pc = mean(passenger_count))
# Distance is divided by 260 so it shares the speed axis; the secondary
# axis multiplies by 260 again to show the distance values.
p1 <- ggplot(data = avg_speed_by_pc, aes(x = passenger_count)) +
  geom_line(aes(y = avg_speed, colour = "speed")) +
  geom_line(aes(y = round(avg_distnace/260 ,2), colour = "distance")) +
  scale_y_continuous(sec.axis = sec_axis(~.*260, name = "Distance ( meter )")) +
  labs(y = "Speed ( km/h )")
p2 <- ggplot(data = avg_pc_by_hour, aes(x = pickup_hour)) +
  geom_line(aes(y = avg_pc)) + labs(y = "average passenger count")
# Stack the two panels vertically. grid.arrange() comes from gridExtra,
# which this script loads; multiplot() is not provided by any package
# attached here.
grid.arrange(p1, p2, ncol = 1)

