library(data.table)
library(tidyverse)
library(lubridate)
library(gmt)
library(caret)
library(corrplot)
dataset <- fread("C:/Users/user/Desktop/Kaggle/New York Taxi Trip/train.csv")
##
Read 6.2% of 1458644 rows
Read 11.7% of 1458644 rows
Read 19.9% of 1458644 rows
Read 28.1% of 1458644 rows
Read 30.2% of 1458644 rows
Read 39.8% of 1458644 rows
Read 48.7% of 1458644 rows
Read 57.6% of 1458644 rows
Read 60.3% of 1458644 rows
Read 66.5% of 1458644 rows
Read 75.4% of 1458644 rows
Read 84.3% of 1458644 rows
Read 85.7% of 1458644 rows
Read 94.6% of 1458644 rows
Read 1458644 rows and 11 (of 11) columns from 0.187 GB file in 00:00:18
library(h2o)
h2o.init(nthreads = -1, max_mem_size = "16G")
##
## H2O is not running yet, starting it now...
##
## Note: In case of errors look at the following log files:
## C:\Users\user\AppData\Local\Temp\Rtmpg5B8cq/h2o_user_started_from_r.out
## C:\Users\user\AppData\Local\Temp\Rtmpg5B8cq/h2o_user_started_from_r.err
##
##
## Starting H2O JVM and connecting: . Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 3 seconds 920 milliseconds
## H2O cluster version: 3.14.0.7
## H2O cluster version age: 2 months and 28 days
## H2O cluster name: H2O_started_from_R_user_yuu053
## H2O cluster total nodes: 1
## H2O cluster total memory: 14.22 GB
## H2O cluster total cores: 8
## H2O cluster allowed cores: 8
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## H2O API Extensions: AutoML, Algos, Core V3, Core V4
## R Version: R version 3.4.2 (2017-09-28)
dataset.hex <- as.h2o(dataset)
##
|
| | 0%
|
|=================================================================| 100%
splits <- h2o.splitFrame(data = dataset.hex, ratios = c(0.7, 0.15),seed = 1)
train <- splits[[1]]
valid <- splits[[2]]
test <- splits[[3]]
nrow(train)
## [1] 1021195
nrow(valid)
## [1] 218619
nrow(test)
## [1] 218830
y <- "trip_duration"
x <- setdiff(names(train), c(y))
print(x)
## [1] "id" "vendor_id" "pickup_datetime"
## [4] "dropoff_datetime" "passenger_count" "pickup_longitude"
## [7] "pickup_latitude" "dropoff_longitude" "dropoff_latitude"
## [10] "store_and_fwd_flag"
gbm_fit <- h2o.gbm(x = x, y = y,
training_frame = train,
validation_frame = valid,
distribution="gamma",
model_id = "gbm_fit",
seed = 1)
##
|
| | 0%
|
|=== | 4%
|
|====== | 10%
|
|============= | 20%
|
|================== | 28%
|
|========================= | 38%
|
|============================= | 44%
|
|=================================== | 54%
|
|========================================== | 64%
|
|================================================ | 74%
|
|==================================================== | 80%
|
|=================================================================| 100%
print(gbm_fit)
## Model Details:
## ==============
##
## H2ORegressionModel: gbm
## Model ID: gbm_fit
## Model Summary:
## number_of_trees number_of_internal_trees model_size_in_bytes min_depth
## 1 50 50 20708 0
## max_depth mean_depth min_leaves max_leaves mean_leaves
## 1 5 4.40000 1 32 27.94000
##
##
## H2ORegressionMetrics: gbm
## ** Reported on training data. **
##
## MSE: 25922019
## RMSE: 5091.367
## MAE: 461.1278
## RMSLE: 0.6954538
## Mean Residual Deviance : 15.48613
##
##
## H2ORegressionMetrics: gbm
## ** Reported on validation data. **
##
## MSE: 10060792
## RMSE: 3171.875
## MAE: 460.2297
## RMSLE: 0.6963126
## Mean Residual Deviance : 15.50769
h2o.varimp(gbm_fit)
## Variable Importances:
## variable relative_importance scaled_importance percentage
## 1 pickup_latitude 724342.750000 1.000000 0.361419
## 2 pickup_longitude 510416.531250 0.704662 0.254678
## 3 dropoff_longitude 436893.093750 0.603158 0.217993
## 4 dropoff_latitude 245843.500000 0.339402 0.122666
## 5 vendor_id 70997.726562 0.098017 0.035425
## 6 passenger_count 15671.399414 0.021635 0.007819
gbm_pred = h2o.predict(gbm_fit, test)
##
|
| | 0%
|
|=================================================================| 100%
gbm_perf <- h2o.performance(model = gbm_fit,newdata = test)
print(gbm_perf)
## H2ORegressionMetrics: gbm
##
## MSE: 49685165
## RMSE: 7048.77
## MAE: 475.4797
## RMSLE: 0.6907691
## Mean Residual Deviance : 15.54497
class(dataset)
## [1] "data.table" "data.frame"
dim(dataset)
## [1] 1458644 11
names(dataset)
## [1] "id" "vendor_id" "pickup_datetime"
## [4] "dropoff_datetime" "passenger_count" "pickup_longitude"
## [7] "pickup_latitude" "dropoff_longitude" "dropoff_latitude"
## [10] "store_and_fwd_flag" "trip_duration"
str(dataset)
## Classes 'data.table' and 'data.frame': 1458644 obs. of 11 variables:
## $ id : chr "id2875421" "id2377394" "id3858529" "id3504673" ...
## $ vendor_id : int 2 1 2 2 2 2 1 2 1 2 ...
## $ pickup_datetime : chr "2016-03-14 17:24:55" "2016-06-12 00:43:35" "2016-01-19 11:35:24" "2016-04-06 19:32:31" ...
## $ dropoff_datetime : chr "2016-03-14 17:32:30" "2016-06-12 00:54:38" "2016-01-19 12:10:48" "2016-04-06 19:39:40" ...
## $ passenger_count : int 1 1 1 1 1 6 4 1 1 1 ...
## $ pickup_longitude : num -74 -74 -74 -74 -74 ...
## $ pickup_latitude : num 40.8 40.7 40.8 40.7 40.8 ...
## $ dropoff_longitude : num -74 -74 -74 -74 -74 ...
## $ dropoff_latitude : num 40.8 40.7 40.7 40.7 40.8 ...
## $ store_and_fwd_flag: chr "N" "N" "N" "N" ...
## $ trip_duration : int 455 663 2124 429 435 443 341 1551 255 1225 ...
## - attr(*, ".internal.selfref")=<externalptr>
summary(dataset)
## id vendor_id pickup_datetime dropoff_datetime
## Length:1458644 Min. :1.000 Length:1458644 Length:1458644
## Class :character 1st Qu.:1.000 Class :character Class :character
## Mode :character Median :2.000 Mode :character Mode :character
## Mean :1.535
## 3rd Qu.:2.000
## Max. :2.000
## passenger_count pickup_longitude pickup_latitude dropoff_longitude
## Min. :0.000 Min. :-121.93 Min. :34.36 Min. :-121.93
## 1st Qu.:1.000 1st Qu.: -73.99 1st Qu.:40.74 1st Qu.: -73.99
## Median :1.000 Median : -73.98 Median :40.75 Median : -73.98
## Mean :1.665 Mean : -73.97 Mean :40.75 Mean : -73.97
## 3rd Qu.:2.000 3rd Qu.: -73.97 3rd Qu.:40.77 3rd Qu.: -73.96
## Max. :9.000 Max. : -61.34 Max. :51.88 Max. : -61.34
## dropoff_latitude store_and_fwd_flag trip_duration
## Min. :32.18 Length:1458644 Min. : 1
## 1st Qu.:40.74 Class :character 1st Qu.: 397
## Median :40.75 Mode :character Median : 662
## Mean :40.75 Mean : 959
## 3rd Qu.:40.77 3rd Qu.: 1075
## Max. :43.92 Max. :3526282
glimpse(dataset)
## Observations: 1,458,644
## Variables: 11
## $ id <chr> "id2875421", "id2377394", "id3858529", "id3...
## $ vendor_id <int> 2, 1, 2, 2, 2, 2, 1, 2, 1, 2, 2, 2, 2, 2, 2...
## $ pickup_datetime <chr> "2016-03-14 17:24:55", "2016-06-12 00:43:35...
## $ dropoff_datetime <chr> "2016-03-14 17:32:30", "2016-06-12 00:54:38...
## $ passenger_count <int> 1, 1, 1, 1, 1, 6, 4, 1, 1, 1, 1, 4, 2, 1, 1...
## $ pickup_longitude <dbl> -73.98215, -73.98042, -73.97903, -74.01004,...
## $ pickup_latitude <dbl> 40.76794, 40.73856, 40.76394, 40.71997, 40....
## $ dropoff_longitude <dbl> -73.96463, -73.99948, -74.00533, -74.01227,...
## $ dropoff_latitude <dbl> 40.76560, 40.73115, 40.71009, 40.70672, 40....
## $ store_and_fwd_flag <chr> "N", "N", "N", "N", "N", "N", "N", "N", "N"...
## $ trip_duration <int> 455, 663, 2124, 429, 435, 443, 341, 1551, 2...
pMiss <- function(dataset){sum(is.na(dataset))/length(dataset)*100}
apply(dataset,2,pMiss)
## id vendor_id pickup_datetime
## 0 0 0
## dropoff_datetime passenger_count pickup_longitude
## 0 0 0
## pickup_latitude dropoff_longitude dropoff_latitude
## 0 0 0
## store_and_fwd_flag trip_duration
## 0 0
dataset <- dataset %>%
mutate(pickup_datetime = ymd_hms(pickup_datetime),
dropoff_datetime = ymd_hms(dropoff_datetime))
dataset$pickup_date <- as.Date(dataset$pickup_datetime,format='%m/%d/%Y')
dataset$pickup_year <- as.numeric(format(dataset$pickup_date, format = "%Y"))
dataset$pickup_month <- as.numeric(format(dataset$pickup_date, format = "%m"))
dataset$pickup_day <- as.numeric(format(dataset$pickup_date, format = "%d"))
dataset$pickup_dayweek <- weekdays(dataset$pickup_date)
dataset$pickup_time <-strftime(dataset$pickup_datetime,format='%H:%M:%S')
dataset$dropoff_date <- as.Date(dataset$dropoff_datetime,format='%m/%d/%Y')
dataset$dropoff_year <- as.numeric(format(dataset$dropoff_date, format = "%Y"))
dataset$dropoff_month <- as.numeric(format(dataset$dropoff_date, format = "%m"))
dataset$dropoff_day <- as.numeric(format(dataset$dropoff_date, format = "%d"))
dataset$dropoff_time <- strftime(dataset$dropoff_datetime,format='%H:%M:%S')
dataset$distance <- geodist(dataset$pickup_latitude,dataset$pickup_longitude,
dataset$dropoff_latitude, dataset$dropoff_longitude, units="km")
dataset$distance[which(is.na(dataset$distance))] <- mean(dataset$distance, na.rm=T)
summary(dataset$distance)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 1.231 2.093 3.439 3.873 1240.072
dataset$velocity <- dataset$distance/(dataset$trip_duration/3600)
dataset$store_and_fwd_flag <- as.factor(dataset$store_and_fwd_flag)
dataset$pickup_dayweek <- as.factor(dataset$pickup_dayweek)
dataset$passenger_count <- as.numeric(dataset$passenger_count)
dataset$trip_duration <- as.numeric(dataset$trip_duration)
dataset$pickup_time <- as.numeric(as.factor(dataset$pickup_time))
dataset$dropoff_time <- as.numeric(as.factor(dataset$dropoff_time))
dataset$pickup_d <- (dataset$pickup_year+(dataset$pickup_month-1)/12+dataset$pickup_day/365)
dataset$dropoff_d <- (dataset$dropoff_year+(dataset$dropoff_month-1)/12+dataset$dropoff_day/365)
dataset$traff_morn_time <- c(60*60*8)
dataset$traff_midday_time <- c(60*60*12)
dataset$traff_even_time <- c(60*60*18)
dataset$traff_morn_distance <- abs(dataset$pickup_time - dataset$traff_morn_time)
dataset$traff_midday_distance <- abs(dataset$pickup_time - dataset$traff_midday_time)
dataset$traff_even_distance <- abs(dataset$pickup_time - dataset$traff_even_time)
str(dataset)
## 'data.frame': 1458644 obs. of 32 variables:
## $ id : chr "id2875421" "id2377394" "id3858529" "id3504673" ...
## $ vendor_id : int 2 1 2 2 2 2 1 2 1 2 ...
## $ pickup_datetime : POSIXct, format: "2016-03-14 17:24:55" "2016-06-12 00:43:35" ...
## $ dropoff_datetime : POSIXct, format: "2016-03-14 17:32:30" "2016-06-12 00:54:38" ...
## $ passenger_count : num 1 1 1 1 1 6 4 1 1 1 ...
## $ pickup_longitude : num -74 -74 -74 -74 -74 ...
## $ pickup_latitude : num 40.8 40.7 40.8 40.7 40.8 ...
## $ dropoff_longitude : num -74 -74 -74 -74 -74 ...
## $ dropoff_latitude : num 40.8 40.7 40.7 40.7 40.8 ...
## $ store_and_fwd_flag : Factor w/ 2 levels "N","Y": 1 1 1 1 1 1 1 1 1 1 ...
## $ trip_duration : num 455 663 2124 429 435 ...
## $ pickup_date : Date, format: "2016-03-14" "2016-06-12" ...
## $ pickup_year : num 2016 2016 2016 2016 2016 ...
## $ pickup_month : num 3 6 1 4 3 1 6 5 5 3 ...
## $ pickup_day : num 14 12 19 6 26 30 17 21 27 10 ...
## $ pickup_dayweek : Factor w/ 7 levels "domenica","giovedì",..: 3 1 4 5 6 6 7 6 7 2 ...
## $ pickup_time : num 66193 9816 45222 77449 52153 ...
## $ dropoff_date : Date, format: "2016-03-14" "2016-06-12" ...
## $ dropoff_year : num 2016 2016 2016 2016 2016 ...
## $ dropoff_month : num 3 6 1 4 3 1 6 5 5 3 ...
## $ dropoff_day : num 14 12 19 6 26 30 17 21 27 10 ...
## $ dropoff_time : num 66660 10479 47358 77890 52600 ...
## $ distance : num 1.5 1.8 6.38 1.48 1.19 ...
## $ velocity : num 11.85 9.8 10.81 12.46 9.83 ...
## $ pickup_d : num 2016 2016 2016 2016 2016 ...
## $ dropoff_d : num 2016 2016 2016 2016 2016 ...
## $ traff_morn_time : num 28800 28800 28800 28800 28800 28800 28800 28800 28800 28800 ...
## $ traff_midday_time : num 43200 43200 43200 43200 43200 43200 43200 43200 43200 43200 ...
## $ traff_even_time : num 64800 64800 64800 64800 64800 64800 64800 64800 64800 64800 ...
## $ traff_morn_distance : num 37393 18984 16422 48649 23353 ...
## $ traff_midday_distance: num 22993 33384 2022 34249 8953 ...
## $ traff_even_distance : num 1393 54984 19578 12649 12647 ...
dataset <- dataset[,-c(1:4,12:15,18:21,27:29)]
str(dataset)
## 'data.frame': 1458644 obs. of 17 variables:
## $ passenger_count : num 1 1 1 1 1 6 4 1 1 1 ...
## $ pickup_longitude : num -74 -74 -74 -74 -74 ...
## $ pickup_latitude : num 40.8 40.7 40.8 40.7 40.8 ...
## $ dropoff_longitude : num -74 -74 -74 -74 -74 ...
## $ dropoff_latitude : num 40.8 40.7 40.7 40.7 40.8 ...
## $ store_and_fwd_flag : Factor w/ 2 levels "N","Y": 1 1 1 1 1 1 1 1 1 1 ...
## $ trip_duration : num 455 663 2124 429 435 ...
## $ pickup_dayweek : Factor w/ 7 levels "domenica","giovedì",..: 3 1 4 5 6 6 7 6 7 2 ...
## $ pickup_time : num 66193 9816 45222 77449 52153 ...
## $ dropoff_time : num 66660 10479 47358 77890 52600 ...
## $ distance : num 1.5 1.8 6.38 1.48 1.19 ...
## $ velocity : num 11.85 9.8 10.81 12.46 9.83 ...
## $ pickup_d : num 2016 2016 2016 2016 2016 ...
## $ dropoff_d : num 2016 2016 2016 2016 2016 ...
## $ traff_morn_distance : num 37393 18984 16422 48649 23353 ...
## $ traff_midday_distance: num 22993 33384 2022 34249 8953 ...
## $ traff_even_distance : num 1393 54984 19578 12649 12647 ...
nzv <- nearZeroVar(dataset, saveMetrics= TRUE)
nzv[nzv$nzv,][1:10,]
## freqRatio percentUnique zeroVar nzv
## store_and_fwd_flag 180.3106 0.0001371136 FALSE TRUE
## NA NA NA NA NA
## NA.1 NA NA NA NA
## NA.2 NA NA NA NA
## NA.3 NA NA NA NA
## NA.4 NA NA NA NA
## NA.5 NA NA NA NA
## NA.6 NA NA NA NA
## NA.7 NA NA NA NA
## NA.8 NA NA NA NA
nzv <- nearZeroVar(dataset)
dataset <- dataset[, -nzv]
dim(dataset)
## [1] 1458644 12
str(dataset)
## 'data.frame': 1458644 obs. of 12 variables:
## $ passenger_count : num 1 1 1 1 1 6 4 1 1 1 ...
## $ pickup_longitude : num -74 -74 -74 -74 -74 ...
## $ pickup_latitude : num 40.8 40.7 40.8 40.7 40.8 ...
## $ dropoff_longitude : num -74 -74 -74 -74 -74 ...
## $ dropoff_latitude : num 40.8 40.7 40.7 40.7 40.8 ...
## $ trip_duration : num 455 663 2124 429 435 ...
## $ pickup_dayweek : Factor w/ 7 levels "domenica","giovedì",..: 3 1 4 5 6 6 7 6 7 2 ...
## $ pickup_time : num 66193 9816 45222 77449 52153 ...
## $ distance : num 1.5 1.8 6.38 1.48 1.19 ...
## $ velocity : num 11.85 9.8 10.81 12.46 9.83 ...
## $ pickup_d : num 2016 2016 2016 2016 2016 ...
## $ traff_midday_distance: num 22993 33384 2022 34249 8953 ...
training2 <- dataset[1:1458644,]
training2 <- training2[,-7]
str(training2)
## 'data.frame': 1458644 obs. of 11 variables:
## $ passenger_count : num 1 1 1 1 1 6 4 1 1 1 ...
## $ pickup_longitude : num -74 -74 -74 -74 -74 ...
## $ pickup_latitude : num 40.8 40.7 40.8 40.7 40.8 ...
## $ dropoff_longitude : num -74 -74 -74 -74 -74 ...
## $ dropoff_latitude : num 40.8 40.7 40.7 40.7 40.8 ...
## $ trip_duration : num 455 663 2124 429 435 ...
## $ pickup_time : num 66193 9816 45222 77449 52153 ...
## $ distance : num 1.5 1.8 6.38 1.48 1.19 ...
## $ velocity : num 11.85 9.8 10.81 12.46 9.83 ...
## $ pickup_d : num 2016 2016 2016 2016 2016 ...
## $ traff_midday_distance: num 22993 33384 2022 34249 8953 ...
comboInfo <- findLinearCombos(training2)
comboInfo
## $linearCombos
## list()
##
## $remove
## NULL
dataset.hex <- as.h2o(dataset)
##
|
| | 0%
|
|=================================================================| 100%
splits <- h2o.splitFrame(data = dataset.hex, ratios = c(0.7, 0.15),seed = 1)
train <- splits[[1]]
valid <- splits[[2]]
test <- splits[[3]]
nrow(train)
## [1] 1021195
nrow(valid)
## [1] 218619
nrow(test)
## [1] 218830
y <- "trip_duration"
x <- setdiff(names(train), c(y))
print(x)
## [1] "passenger_count" "pickup_longitude"
## [3] "pickup_latitude" "dropoff_longitude"
## [5] "dropoff_latitude" "pickup_dayweek"
## [7] "pickup_time" "distance"
## [9] "velocity" "pickup_d"
## [11] "traff_midday_distance"
gbm_fit1 <- h2o.gbm(x = x, y = y,
training_frame = train,
validation_frame= valid,
distribution="gamma",
model_id = "gbm_fit1",
seed = 1)
##
|
| | 0%
|
|=== | 4%
|
|===== | 8%
|
|======== | 12%
|
|============ | 18%
|
|================ | 24%
|
|===================== | 32%
|
|========================= | 38%
|
|============================= | 44%
|
|================================ | 50%
|
|==================================== | 56%
|
|======================================== | 62%
|
|============================================ | 68%
|
|================================================ | 74%
|
|==================================================== | 80%
|
|======================================================== | 86%
|
|============================================================= | 94%
|
|=================================================================| 100%
print(gbm_fit1)
## Model Details:
## ==============
##
## H2ORegressionModel: gbm
## Model ID: gbm_fit1
## Model Summary:
## number_of_trees number_of_internal_trees model_size_in_bytes min_depth
## 1 50 50 23076 5
## max_depth mean_depth min_leaves max_leaves mean_leaves
## 1 5 5.00000 29 32 31.74000
##
##
## H2ORegressionMetrics: gbm
## ** Reported on training data. **
##
## MSE: 13194839
## RMSE: 3632.47
## MAE: 62.84331
## RMSLE: 0.1435497
## Mean Residual Deviance : 14.93906
##
##
## H2ORegressionMetrics: gbm
## ** Reported on validation data. **
##
## MSE: 619042.8
## RMSE: 786.7927
## MAE: 60.47336
## RMSLE: 0.1466302
## Mean Residual Deviance : 14.94595
h2o.varimp(gbm_fit1)
## Variable Importances:
## variable relative_importance scaled_importance percentage
## 1 velocity 16315355.000000 1.000000 0.621210
## 2 distance 8940252.000000 0.547966 0.340402
## 3 pickup_time 983382.625000 0.060273 0.037442
## 4 traff_midday_distance 11712.104492 0.000718 0.000446
## 5 pickup_longitude 4912.696289 0.000301 0.000187
## 6 pickup_d 4146.401855 0.000254 0.000158
## 7 passenger_count 2160.564941 0.000132 0.000082
## 8 pickup_dayweek 947.537170 0.000058 0.000036
## 9 dropoff_longitude 652.957703 0.000040 0.000025
## 10 pickup_latitude 197.280212 0.000012 0.000008
## 11 dropoff_latitude 110.416336 0.000007 0.000004
plot(gbm_fit,
timestep = "number_of_trees",
metric = "rmse")
plot(gbm_fit,
timestep = "number_of_trees",
metric = "mae")
plot(gbm_fit,
timestep = "number_of_trees",
metric = "deviance")
gbm_pred1 = h2o.predict(gbm_fit1, test)
##
|
| | 0%
|
|=================================================================| 100%
summary(gbm_pred1)
## predict
## Min. :2.299e+00
## 1st Qu.:3.715e+02
## Median :6.177e+02
## Mean :9.345e+02
## 3rd Qu.:9.869e+02
## Max. :1.231e+05
gbm_perf1 <- h2o.performance(model = gbm_fit1,newdata = test)
print(gbm_perf1)
## H2ORegressionMetrics: gbm
##
## MSE: 35921037
## RMSE: 5993.416
## MAE: 76.58464
## RMSLE: 0.1426097
## Mean Residual Deviance : 14.95484