GitHub Code: Unifinished God/Kaggle
library(tidyverse)
library(lubridate)
library(stringr)
library(caret)
library(readr)
library(gridExtra)
library(xgboost)
library(Metrics)
train_set <- read_csv("train.csv")
test_set <- read_csv("test.csv")
submission <- read_csv("sampleSubmission.csv")
# Data Fields
# datetime - hourly date + timestamp
# season - 1 = spring, 2 = summer, 3 = fall, 4 = winter
# holiday - whether the day is considered a holiday
# workingday - whether the day is neither a weekend nor holiday
# weather - 1: Clear, Few clouds, Partly cloudy
# 2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
# 3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds
# 4: Heavy Rain + Ice Pellets + Thunderstorm + Mist, Snow + Fog
# temp - temperature in Celsius
# atemp - "feels like" temperature in Celsius
# humidity - relative humidity
# windspeed - wind speed
# casual - number of non-registered user rentals initiated
# registered - number of registered user rentals initiated
# count - number of total rentals
# Remove casual and registered (absent from the test set), derive date/time features,
# and randomly assign each row to a train or validation group
# set.seed(123) # uncomment to make the random train/valid split reproducible
train_set <- train_set %>%
  select(-casual, -registered) %>%
  mutate(
    year = year(datetime),
    month = month(datetime),
    hour = hour(datetime),
    minute = minute(datetime)) %>%
  mutate(group = sample(
    c("train", "valid"),
    size = nrow(train_set),
    replace = TRUE,
    prob = c(0.7, 0.3) # Set weights for each group here
  ))
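# Quick sanity check that the random split came out roughly 70/30:
prop.table(table(train_set$group))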
train_set_2 <- train_set # keep a copy with numeric season for the xgboost matrices later
train_set$season <- as.factor(train_set$season)
levels(train_set$season) <- c('Spring','Summer','Fall','Winter')
test_set <- test_set %>%
  mutate(
    year = year(datetime),
    month = month(datetime),
    hour = hour(datetime),
    minute = minute(datetime))
# valid_set <- train_set %>%
# filter(group == "valid")
#
#
# train_set <- train_set %>%
# filter(group == "train")
dim(train_set)
## [1] 10886 15
# dim(valid_set)
str(train_set)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 10886 obs. of 15 variables:
## $ datetime : POSIXct, format: "2011-01-01 00:00:00" "2011-01-01 01:00:00" ...
## $ season : Factor w/ 4 levels "Spring","Summer",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ holiday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ workingday: num 0 0 0 0 0 0 0 0 0 0 ...
## $ weather : num 1 1 1 1 1 2 1 1 1 1 ...
## $ temp : num 9.84 9.02 9.02 9.84 9.84 ...
## $ atemp : num 14.4 13.6 13.6 14.4 14.4 ...
## $ humidity : num 81 80 80 75 75 75 80 86 75 76 ...
## $ windspeed : num 0 0 0 0 0 ...
## $ count : num 16 40 32 13 1 1 2 3 8 14 ...
## $ year : num 2011 2011 2011 2011 2011 ...
## $ month : num 1 1 1 1 1 1 1 1 1 1 ...
## $ hour : int 0 1 2 3 4 5 6 7 8 9 ...
## $ minute : int 0 0 0 0 0 0 0 0 0 0 ...
## $ group : chr "train" "train" "valid" "train" ...
# str(valid_set)
colnames(train_set)
## [1] "datetime" "season" "holiday" "workingday" "weather"
## [6] "temp" "atemp" "humidity" "windspeed" "count"
## [11] "year" "month" "hour" "minute" "group"
# colnames(valid_set)
head(train_set)
# head(valid_set)
# Data Visualization
# The count vs temperature plot shows that rental count increases as the temperature increases.
# Temperature v Count plot
# Scatter Plot to show the relationship between count (number of total rentals) and temp (temperature in Celsius)
aa <- ggplot(data = train_set, aes(temp, count)) +
  geom_point(alpha = .2, aes(color = temp)) +
  ggtitle("Count vs Temperature") +
  xlab("Temp (Celsius)") +
  ylab("Rental Count") +
  labs(color = 'Temp(C)') +
  theme_bw() +
  theme(legend.position = "bottom")
# Scatter plot of count (total rentals) against datetime.
bb <- ggplot(data = train_set, aes(datetime, count)) +
  geom_point(alpha = .2, aes(color = temp)) +
  scale_colour_continuous(low = "yellow", high = 'red') +
  theme_bw() +
  ggtitle("Count vs Datetime") +
  xlab("Date") +
  ylab("Rental Count") +
  labs(color = 'Temp(C)') +
  theme(legend.position = "bottom")
grid.arrange(aa, bb, ncol = 2)
# There is a clear seasonal trend: total rentals fall during winter (the months of January and February) and rise during summer.
# The other evident trend is that rental counts grow from 2011 to 2012.
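# One way to see both trends at once (a quick sketch using the columns created
# above) is to average the hourly counts per year and month:
train_set %>%
  group_by(year, month) %>%
  summarise(mean_count = mean(count), .groups = "drop") %>%
  ggplot(aes(month, mean_count, color = factor(year))) +
  geom_line() +
  ggtitle("Mean hourly rentals by month") +
  labs(color = "Year") +
  theme_bw()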
## Correlation between temperature and count.
cor(train_set[,c('temp','count')])
## temp count
## temp 1.0000000 0.3944536
## count 0.3944536 1.0000000
# The correlation between temp and count is positive but only moderate (about 0.39).
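# For context, the same check can be run against the other numeric predictors:
cor(select(train_set, temp, atemp, humidity, windspeed, count))[, "count"]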
## Box Plot
ggplot(data = train_set, aes(season, count, color = season)) +
  geom_boxplot(alpha = .2) +
  ggtitle("Rental count by season") +
  xlab("Season") +
  ylab("Rental Count") +
  labs(color = 'Season') +
  theme_bw() +
  theme(legend.position = "bottom")
# The box plot of rental count by season shows that a straight line cannot capture this non-linear relationship, and that there are more rentals in winter than in spring.
## Feature Engineering
# As part of feature engineering I have added an hour column to the dataset.
## Relationship between hour of the working day and the count of bikes rented.
ggplot(filter(train_set, workingday == 1), aes(hour, count)) +
  geom_point()
# This scatter plot shows an interesting trend: on working days the rental count peaks around the morning commute (about 8 AM) and the evening commute (about 5 PM).
aa <- train_set %>%
  filter(workingday == 1) %>%
  ggplot(aes(hour, count)) +
  geom_point(alpha = .5, position = position_jitter(w = 1, h = 0), aes(color = temp)) +
  scale_color_gradientn(colors = c('blue', 'green', 'yellow', 'orangered', 'red')) +
  ggtitle("Working Day Rental Count") +
  xlab("Hour") +
  ylab("Rental Count") +
  labs(color = 'Temp(C)') +
  theme(legend.position = "bottom")
# This plot also highlights an interesting relationship between temperature and rental count: as the temperature rises the count increases, while cold temperatures show a clear decline.
## Relationship between hour of the non-working day and the count of bikes rented.
bb <- train_set %>%
  filter(workingday == 0) %>%
  ggplot(aes(hour, count)) +
  geom_point(alpha = .5, position = position_jitter(w = 1, h = 0), aes(color = temp)) +
  scale_color_gradientn(colors = c('blue', 'green', 'yellow', 'orangered', 'red')) +
  ggtitle("Non-working Day Rental Count") +
  xlab("Hour") +
  ylab("Rental Count") +
  labs(color = 'Temp(C)') +
  theme(legend.position = "bottom")
grid.arrange(aa,bb, ncol=2)
## Model Building
# This model predicts the bike rental count from the temp variable alone.
# temp.model <- lm(count ~ temp, train_set)
# print(summary(temp.model))
## Model Interpretation
# * The intercept of 6.0462 means the linear model predicts about 6 rentals when the temperature is 0 Celsius.
# * The estimate for the temp coefficient is 9.1705, which signifies that a temperature increase of 1 Celsius, holding all else equal, is associated with a rental increase of about 9 bikes.
# * These findings are not causal; beta_1 would be negative if an increase in temperature were associated with a decrease in rentals.
#
# Next, we want to know how many bikes we would predict to be rented if the temperature were 25 degrees Celsius.
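# A sketch of that prediction, assuming the commented-out temp.model above has
# been fit; the closed form is intercept + slope * 25:
# predict(temp.model, newdata = data.frame(temp = 25))
# 6.0462 + 9.1705 * 25 # about 235 rentals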
## Building Second Model with more features
# Model that attempts to predict count from the following features (a sketch follows the list):
# season
# holiday
# workingday
# weather
# temp
# humidity
# windspeed
# hour (factor)
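# A minimal sketch of that second model (the original fit is not shown here);
# hour is wrapped in factor() so each hour gets its own coefficient:
# model2 <- lm(count ~ season + holiday + workingday + weather + temp +
#                humidity + windspeed + factor(hour), data = train_set)
# summary(model2)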
## Important Finding
# This sort of model doesn't work well given our seasonal, time-series data. We need a model that can account for this type of trend; otherwise the overall growth of the dataset gets accidentally attributed to the winter season instead of being recognized as demand growing over time.
##########################################################################################
##########################################################################################
# train_set <- train_set_2 %>%
# filter(group == "train")
#
# valid_set <- train_set_2 %>%
# filter(group == "valid")
#
#
# train_set$count = log1p(train_set$count)
#
# X_train <- train_set %>%
# select(-count, - datetime, -group) %>%
# as.matrix()
#
# y_train <- train_set$count
#
#
# dtrain = xgb.DMatrix(X_train, label = y_train)
#
#
# # cv <- xgb.cv(data = dtrain, nrounds = 3, nthread = 2, nfold = 5, metrics = list("rmse","auc"),
# # max_depth = 3, eta = 1, objective = "reg:squarederror")
# #
# # print(cv)
# # print(cv, verbose=TRUE)
#
#
# model = xgb.train(data = dtrain,
# nround = 150,
# max_depth = 5,
# eta = 0.1,
# subsample = 0.9)
#
#
# ## cv grid search
# searchGridSubCol <- expand.grid(subsample = c(0.5, 0.6),
# colsample_bytree = c(0.5, 0.6),
# max_depth = c(3, 4, 5),
# min_child = seq(1),
# eta = c(0.1)
# )
#
# ntrees <- 100
#
# system.time(
# rmseErrorsHyperparameters <- apply(searchGridSubCol, 1, function(parameterList){
#
# #Extract Parameters to test
# currentSubsampleRate <- parameterList[["subsample"]]
# currentColsampleRate <- parameterList[["colsample_bytree"]]
# currentDepth <- parameterList[["max_depth"]]
# currentEta <- parameterList[["eta"]]
# currentMinChild <- parameterList[["min_child"]]
#
# xgboostModelCV <- xgb.cv(data = dtrain, nrounds = ntrees, nfold = 5, showsd = TRUE,
# metrics = "rmse", verbose = TRUE, "eval_metric" = "rmse",
# "objective" = "reg:linear", "max.depth" = currentDepth, "eta" = currentEta,
# "subsample" = currentSubsampleRate, "colsample_bytree" = currentColsampleRate
# , print_every_n = 10, "min_child_weight" = currentMinChild, booster = "gbtree",
# early_stopping_rounds = 10)
#
# xvalidationScores <- as.data.frame(xgboostModelCV$evaluation_log)
# rmse <- tail(xvalidationScores$test_rmse_mean, 1)
# trmse <- tail(xvalidationScores$train_rmse_mean,1)
# output <- return(c(rmse, trmse, currentSubsampleRate, currentColsampleRate, currentDepth, currentEta, currentMinChild))}))
#
#
# output <- as.data.frame(t(rmseErrorsHyperparameters))
# varnames <- c("TestRMSE", "TrainRMSE", "SubSampRate", "ColSampRate", "Depth", "eta", "currentMinChild")
# names(output) <- varnames
# head(output)
#
# # xgb.importance(feature_names = colnames(X_train), model) %>%
# # xgb.plot.importance()
#
#
#
# X_valid = valid_set %>%
# select(- datetime, -group, -count) %>%
# as.matrix()
#
# y_valid = valid_set$count
#
# preds = predict(model, X_valid)
# preds = expm1(preds)
#
# solution = data.frame(datetime = valid_set$datetime, count = preds)
#
# rmsle(y_valid, preds)
# write.csv(solution, "solution.csv", row.names = FALSE)
##########################################################################################
##########################################################################################
# Model log1p(count): minimizing RMSE on the log scale corresponds to the
# competition's RMSLE metric.
train_set_2$count = log1p(train_set_2$count)
X_train <- train_set_2 %>%
  select(-count, -datetime, -group) %>%
  as.matrix()
y_train <- train_set_2$count
dtrain = xgb.DMatrix(X_train, label = y_train)
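# A small check of that equivalence with toy values: Metrics::rmsle on the
# back-transformed predictions equals plain RMSE on the log1p scale.
y_toy <- c(10, 100, 250)
p_toy <- log1p(c(12, 90, 240))
all.equal(rmsle(y_toy, expm1(p_toy)), sqrt(mean((log1p(y_toy) - p_toy)^2)))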
# cv <- xgb.cv(data = dtrain, nrounds = 3, nthread = 2, nfold = 5, metrics = list("rmse","auc"),
# max_depth = 3, eta = 1, objective = "reg:squarederror")
# print(cv)
# print(cv, verbose=TRUE)
#####
#####
## cv grid search
searchGridSubCol <- expand.grid(subsample = c(0.5, 0.6),
                                colsample_bytree = c(0.5, 0.6),
                                max_depth = c(3, 4, 5),
                                min_child = 1,
                                eta = 0.1)
# xgboostModelCV <- xgb.cv(data = dtrain, nrounds = ntrees, nfold = 5, showsd = TRUE,
# metrics = "rmse", verbose = TRUE, "eval_metric" = "rmse",
# "objective" = "reg:linear",
# "max.depth" = 3,
# "eta" = 0.1,
# "subsample" = 0.7,
# "colsample_bytree" = 0.5,
# print_every_n = 10,
# "min_child_weight" = 1,
# booster = "gbtree",
# early_stopping_rounds = 10)
ntrees <- 100
system.time(
  rmseErrorsHyperparameters <- apply(searchGridSubCol, 1, function(parameterList) {
    # Extract the parameters to test
    currentSubsampleRate <- parameterList[["subsample"]]
    currentColsampleRate <- parameterList[["colsample_bytree"]]
    currentDepth <- parameterList[["max_depth"]]
    currentEta <- parameterList[["eta"]]
    currentMinChild <- parameterList[["min_child"]]
    # Note: "reg:linear" is the legacy name for "reg:squarederror" in newer xgboost
    xgboostModelCV <- xgb.cv(data = dtrain, nrounds = ntrees, nfold = 5, showsd = TRUE,
                             metrics = "rmse", verbose = TRUE, eval_metric = "rmse",
                             objective = "reg:linear", max.depth = currentDepth, eta = currentEta,
                             subsample = currentSubsampleRate, colsample_bytree = currentColsampleRate,
                             print_every_n = 10, min_child_weight = currentMinChild, booster = "gbtree",
                             early_stopping_rounds = 10)
    xvalidationScores <- as.data.frame(xgboostModelCV$evaluation_log)
    rmse <- tail(xvalidationScores$test_rmse_mean, 1)
    trmse <- tail(xvalidationScores$train_rmse_mean, 1)
    c(rmse, trmse, currentSubsampleRate, currentColsampleRate, currentDepth, currentEta, currentMinChild)
  })
)
## [1] train-rmse:3.927939+0.013082 test-rmse:3.927945+0.035913
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 10 rounds.
##
## [11] train-rmse:1.590040+0.050895 test-rmse:1.592463+0.070776
## [21] train-rmse:0.823419+0.024557 test-rmse:0.827830+0.031172
## [31] train-rmse:0.608933+0.015789 test-rmse:0.615345+0.018163
## [41] train-rmse:0.542733+0.013222 test-rmse:0.549638+0.013453
## [51] train-rmse:0.503688+0.018574 test-rmse:0.511247+0.019685
## [61] train-rmse:0.476399+0.022033 test-rmse:0.484991+0.022961
## [71] train-rmse:0.445876+0.019389 test-rmse:0.454943+0.013955
## [81] train-rmse:0.434424+0.016952 test-rmse:0.444165+0.010733
## [91] train-rmse:0.422002+0.019330 test-rmse:0.432792+0.015258
## [100] train-rmse:0.413198+0.021944 test-rmse:0.424452+0.018866
## ... (fold-by-fold logs for the remaining 11 hyperparameter combinations trimmed;
## final test-RMSE improved from about 0.42 at max_depth = 3 to about 0.33 at max_depth = 5) ...
## user system elapsed
## 50.542 0.152 12.825
output <- as.data.frame(t(rmseErrorsHyperparameters))
varnames <- c("TestRMSE", "TrainRMSE", "SubSampRate", "ColSampRate", "Depth", "eta", "currentMinChild")
names(output) <- varnames
head(output)
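# The combination with the lowest validation RMSE can be read off directly:
output[which.min(output$TestRMSE), ]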
#####
#####
# Final model on the full training data. These hyperparameters were chosen
# manually; the grid search above suggests deeper trees help.
model = xgb.train(data = dtrain,
                  nrounds = 150,
                  max_depth = 5,
                  eta = 0.1,
                  subsample = 0.9)
xgb.importance(feature_names = colnames(X_train), model) %>%
  xgb.plot.importance()
X_test = test_set %>%
  select(-datetime) %>%
  as.matrix()
# y_test = test_set$count
preds = predict(model, X_test)
preds = expm1(preds) # invert the log1p transform back to the count scale
solution = data.frame(datetime = test_set$datetime, count = preds)
write.csv(solution, "solution.csv", row.names = FALSE)
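# A final sanity check before uploading: one prediction per test row.
head(solution)
stopifnot(nrow(solution) == nrow(test_set))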