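# Baseline RMSE:
# Pickup/dropoff locations are bucketed by geohash level 5, and four average speeds are prepared:
# (1) by pickup+dropoff cell and hour, (2) by pickup cell and hour, (3) by hour, (4) overall.
# When a target route comes in, only the values that exist among (1), (2), (3), (4) are averaged.
# The target route's distance is divided by that average speed to predict the trip duration.
# Try2 follows a similar idea.
# On top of the geohash-based per-route features, passenger count, weekday and season (month)
# features are added and a GLM is tried.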
library(data.table)
library(dplyr)
library(magrittr)
library(tidyr)
library(stringi)
library(stringr)
library(xda)
library(caret)
library(doMC)
library(dummy)
library(leaflet)
library(rgdal)
library(lubridate)
library(gridExtra)
library(ggmap)
library(geohash)
# Load the raw train/test CSVs; empty fields are read as NA (na.strings = "").
# NOTE(review): the paths below are machine-specific absolute paths.
# The "#>" lines are console output captured from an earlier run; they are kept
# as comments so the script still parses.
train <- fread("/Users/CA/Downloads/NY_taxi/train.csv", na.strings = "")
#> Read 43.2% of 1458644 rows
#> Read 64.4% of 1458644 rows
#> Read 85.7% of 1458644 rows
#> Read 86.4% of 1458644 rows
#> Read 1458644 rows and 11 (of 11) columns from 0.187 GB file in 00:00:08
test <- fread("/Users/CA/Downloads/NY_taxi/test.csv", na.strings = "")
#> Read 56.0% of 625134 rows
#> Read 625134 rows and 9 (of 9) columns from 0.066 GB file in 00:00:04

# Quick per-column summaries of the numeric and character columns.
numSummary(train)
numSummary(test)
charSummary(train)
charSummary(test)
# Great-circle (haversine) distance in metres between two lon/lat points.
#
# All arguments are in decimal degrees and may be vectors (e.g. whole
# data-frame columns); the arithmetic is elementwise, so no Vectorize()
# wrapper is needed.
#   lng1, lat1: longitude / latitude of the first point
#   lng2, lat2: longitude / latitude of the second point
# Returns a numeric vector of distances in metres on a sphere of radius
# 6371 km.
coord2distance <- function(lng1, lat1, lng2, lat2) {
  rad_per_deg <- pi / 180
  earth_radius_m <- 6371 * 1000

  dlng_rad <- (lng2 - lng1) * rad_per_deg
  dlat_rad <- (lat2 - lat1) * rad_per_deg
  lat1_rad <- lat1 * rad_per_deg
  lat2_rad <- lat2 * rad_per_deg

  # Haversine: the latitude difference is the bare term, and the longitude
  # difference is scaled by cos(lat1) * cos(lat2) -- not the other way round.
  a <- sin(dlat_rad / 2)^2 +
    cos(lat1_rad) * cos(lat2_rad) * sin(dlng_rad / 2)^2
  # Clamp floating-point noise so sqrt(1 - a) cannot produce NaN.
  a <- pmin(1, pmax(0, a))

  earth_radius_m * 2 * atan2(sqrt(a), sqrt(1 - a))
}
# Add time, distance and speed features to a raw trips table.
#
# df must contain pickup_datetime, dropoff_datetime, the pickup/dropoff
# longitude and latitude columns, trip_duration, vendor_id,
# store_and_fwd_flag and passenger_count.
# Returns df with the datetimes parsed, the derived columns distance, speed,
# pickup/dropoff wday, hour and month added, and vendor_id,
# store_and_fwd_flag and passenger_count converted to factors.
refine <- function(df) {
  df %>%
    mutate(
      pickup_datetime = ymd_hms(pickup_datetime),
      dropoff_datetime = ymd_hms(dropoff_datetime),
      distance = coord2distance(
        pickup_longitude, pickup_latitude,
        dropoff_longitude, dropoff_latitude
      ),
      # distance is in metres; the result is km/h assuming trip_duration is
      # in seconds -- TODO confirm.
      speed = round(distance / 1000 / (trip_duration / 3600), 2)
    ) %>%
    mutate(
      pickup_wday = wday(pickup_datetime, label = TRUE),
      pickup_hour = hour(pickup_datetime),
      dropoff_wday = wday(dropoff_datetime, label = TRUE),
      dropoff_hour = hour(dropoff_datetime),
      pickup_month = month(pickup_datetime, label = TRUE),
      # NOTE(review): unlike pickup_month this is left as a plain month
      # number (no label); kept as-is so downstream code sees the same type.
      dropoff_month = month(dropoff_datetime)
    ) %>%
    # mutate_at() replaces the deprecated mutate_each_() / funs() pair.
    mutate_at(
      c("vendor_id", "store_and_fwd_flag", "passenger_count"),
      as.factor
    )
}
# Enrich the training set with the derived features, then inspect it.
train %<>% refine
summary(train)
# Console output captured from an earlier run, kept as comments so the script
# still parses:
#> id vendor_id pickup_datetime dropoff_datetime passenger_count
#> Length:1458644 1:678342 Min. :2016-01-01 00:00:17 Min. :2016-01-01 00:03:31 1 :1033540
#> Class :character 2:780302 1st Qu.:2016-02-17 16:46:04 1st Qu.:2016-02-17 17:05:32 2 : 210318
#> Mode :character Median :2016-04-01 17:19:40 Median :2016-04-01 17:35:12 5 : 78088
#> Mean :2016-04-01 10:10:24 Mean :2016-04-01 10:26:24 3 : 59896
#> 3rd Qu.:2016-05-15 03:56:08 3rd Qu.:2016-05-15 04:10:51 6 : 48333
#> Max. :2016-06-30 23:59:39 Max. :2016-07-01 23:02:03 4 : 28404
#> (Other): 65
#> pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude store_and_fwd_flag trip_duration distance
#> Min. :-121.9 Min. :34.4 Min. :-121.9 Min. :32.2 N:1450599 Min. : 1 Min. : 0
#> 1st Qu.: -74.0 1st Qu.:40.7 1st Qu.: -74.0 1st Qu.:40.7 Y: 8045 1st Qu.: 397 1st Qu.: 848
#> Median : -74.0 Median :40.8 Median : -74.0 Median :40.8 Median : 662 Median : 1531
#> Mean : -74.0 Mean :40.8 Mean : -74.0 Mean :40.8 Mean : 959 Mean : 2876
#> 3rd Qu.: -74.0 3rd Qu.:40.8 3rd Qu.: -74.0 3rd Qu.:40.8 3rd Qu.: 1075 3rd Qu.: 2847
#> Max. : -61.3 Max. :51.9 Max. : -61.3 Max. :43.9 Max. :3526282 Max. :851968
#> speed pickup_wday pickup_hour dropoff_wday dropoff_hour pickup_month dropoff_month
#> Min. : 0 Sun :195366 Min. : 0.0 Sun :197224 Min. : 0.0 Mar :256189 Min. :1.00
#> 1st Qu.: 6 Mon :187418 1st Qu.: 9.0 Mon :187433 1st Qu.: 9.0 Apr :251645 1st Qu.:2.00
#> Median : 10 Tues :202749 Median :14.0 Tues :202518 Median :14.0 May :248487 Median :4.00
#> Mean : 11 Wed :210136 Mean :13.6 Wed :209790 Mean :13.6 Feb :238300 Mean :3.52
#> 3rd Qu.: 15 Thurs:218574 3rd Qu.:19.0 Thurs:217746 3rd Qu.:19.0 Jun :234316 3rd Qu.:5.00
#> Max. :8619 Fri :223533 Max. :23.0 Fri :223031 Max. :23.0 Jan :229707 Max. :7.00
#> Sat :220868 Sat :220902 (Other): 0
# Plot 1: trip volume over time.
# One bar chart each for pickup weekday, pickup hour and pickup month; the two
# vendors are drawn as overlaid, semi-transparent bars.
p1 <- ggplot(
  train,
  aes(x = pickup_wday, group = vendor_id, fill = vendor_id)
) +
  geom_bar(position = "identity", alpha = 0.7) +
  theme_bw()

p2 <- ggplot(
  train,
  aes(x = pickup_hour, group = vendor_id, fill = vendor_id)
) +
  geom_bar(position = "identity", alpha = 0.7) +
  theme_bw()

p3 <- ggplot(
  train,
  aes(x = pickup_month, group = vendor_id, fill = vendor_id)
) +
  geom_bar(position = "identity", alpha = 0.7) +
  theme_bw()

# Lay the three panels out on a 2 x 2 grid.
grid.arrange(p1, p2, p3, nrow = 2, ncol = 2)

# Plot 2: speed over time.
## Q3. Average speed by weekday / hour of day
## #=> By weekday, average speed is best on Mon/Sun/Sat, then Tue/Fri, then Wed/Thu
## #=> By hour, average speed is worse during working hours than during the
## #   evening commute, and best in the early-morning hours
## Q4. Average straight-line trip distance by weekday / hour of day
## #=> By weekday, the average straight-line distance is relatively long on
## #   Mon/Sun and roughly the same on the other days
## #=> By hour, the average straight-line distance follows the same pattern as
## #   the average speed.
avg_speed_by_wday <- train %>%
  group_by(pickup_wday) %>%
  summarise(avg_speed_by_wday = mean(speed))
avg_distance_by_wday <- train %>%
  group_by(pickup_wday) %>%
  summarise(avg_distance_by_wday = mean(distance))
avg_speed_by_hour <- train %>%
  group_by(pickup_hour) %>%
  summarise(avg_speed_by_hour = mean(speed))
avg_distance_by_hour <- train %>%
  group_by(pickup_hour) %>%
  summarise(avg_distance_by_hour = mean(distance))

daily_data <- merge(avg_speed_by_wday, avg_distance_by_wday, by = "pickup_wday")
hourly_data <- merge(avg_speed_by_hour, avg_distance_by_hour, by = "pickup_hour")

# Distance is divided by 250 so it shares the speed axis; the secondary axis
# multiplies by 250 again to show the distance scale.
p1 <- ggplot(data = daily_data, aes(x = pickup_wday)) +
  geom_line(aes(y = avg_speed_by_wday, colour = "speed", group = 1)) +
  geom_line(aes(y = round(avg_distance_by_wday / 250, 2), colour = "distance", group = 1)) +
  scale_y_continuous(sec.axis = sec_axis(~ . * 250, name = "Distance ( meter )")) +
  labs(y = "Speed ( km/h )")
p2 <- ggplot(data = hourly_data, aes(x = pickup_hour)) +
  geom_line(aes(y = avg_speed_by_hour, colour = "speed")) +
  geom_line(aes(y = round(avg_distance_by_hour / 250, 2), colour = "distance")) +
  scale_y_continuous(sec.axis = sec_axis(~ . * 250, name = "Distance ( meter )")) +
  labs(y = "Speed ( km/h )")

# Stack the two panels in one column. grid.arrange() comes from gridExtra,
# which this script already loads; multiplot() is not provided by any of the
# loaded packages.
grid.arrange(p1, p2, ncol = 1)

#p1 <- ggplot(data = hourly_data, aes(pickup_hour, avg_speed_by_hour)) + geom_line() + geom_point()
#p2 <- ggplot(data = hourly_data, aes(pickup_hour, avg_distance_by_hour)) + geom_line() + geom_point()
#grid.arrange(p1, p2, nrow = 2, ncol = 1)
#head(merge(avg_speed_by_wday, avg_distance_by_wday, by = "pickup_wday") %>% arrange(desc(avg_distance_by_wday)), n = 7)
#head(merge(avg_speed_by_hour, avg_distance_by_hour, by = "pickup_hour") %>% arrange(desc(avg_speed_by_hour)), n = 24)
# Plot 3: passenger count.
table(train$passenger_count)
# Console output captured from an earlier run, kept as comments so the script
# still parses:
#> 0 1 2 3 4 5 6 7 8 9
#> 60 1033540 210318 59896 28404 78088 48333 3 1 1

# passenger_count is a factor at this point; go through character so the
# labels (not the internal level codes) become the numeric values.
train$passenger_count <- as.numeric(as.character(train$passenger_count))

# Only trips with 1-6 passengers are used for the averages below.
# NOTE(review): the column name "avg_distnace" is a typo for "avg_distance";
# it is kept because code outside this section may refer to it.
avg_speed_by_pc <- train %>%
  filter(passenger_count %in% c(1, 2, 3, 4, 5, 6)) %>%
  group_by(passenger_count) %>%
  summarise(avg_speed = mean(speed), avg_distnace = mean(distance))
avg_pc_by_hour <- train %>%
  filter(passenger_count %in% c(1, 2, 3, 4, 5, 6)) %>%
  group_by(pickup_hour) %>%
  summarise(avg_pc = mean(passenger_count))

# Distance is divided by 260 so it shares the speed axis; the secondary axis
# multiplies by 260 again to show the distance scale.
p1 <- ggplot(data = avg_speed_by_pc, aes(x = passenger_count)) +
  geom_line(aes(y = avg_speed, colour = "speed")) +
  geom_line(aes(y = round(avg_distnace / 260, 2), colour = "distance")) +
  scale_y_continuous(sec.axis = sec_axis(~ . * 260, name = "Distance ( meter )")) +
  labs(y = "Speed ( km/h )")
p2 <- ggplot(data = avg_pc_by_hour, aes(x = pickup_hour)) +
  geom_line(aes(y = avg_pc)) +
  labs(y = "average passenger count")

# Stack the two panels in one column. grid.arrange() comes from gridExtra,
# which this script already loads; multiplot() is not provided by any of the
# loaded packages.
grid.arrange(p1, p2, ncol = 1)

