GitHub Code: Unfinished God/Kaggle

library(tidyverse)   # dplyr/ggplot2 for wrangling and plotting
library(lubridate)   # year(), month(), hour(), minute() helpers
library(stringr)
library(caret)
library(readr)       # read_csv()
library(gridExtra)   # grid.arrange() for side-by-side plots
library(xgboost)     # gradient-boosted trees
library(Metrics)     # rmsle()


train_set <- read_csv("train.csv")
test_set <- read_csv("test.csv")
submission <- read_csv("sampleSubmission.csv")


# Data Fields
# datetime - hourly date + timestamp  
# season -  1 = spring, 2 = summer, 3 = fall, 4 = winter 
# holiday - whether the day is considered a holiday
# workingday - whether the day is neither a weekend nor holiday
# weather - 1: Clear, Few clouds, Partly cloudy
# 2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
# 3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds
# 4: Heavy Rain + Ice Pellets + Thunderstorm + Mist, Snow + Fog 
# temp - temperature in Celsius
# atemp - "feels like" temperature in Celsius
# humidity - relative humidity
# windspeed - wind speed
# casual - number of non-registered user rentals initiated
# registered - number of registered user rentals initiated
# count - number of total rentals



# Drop casual/registered (they are absent from the test set), extract
# date-time features, and randomly assign rows to a 70/30 train/valid split.
# Note: the sampling is random; calling set.seed() beforehand would make the
# split reproducible.
train_set <- train_set %>% 
  select(-casual, -registered) %>%  
  mutate(
    year = year(datetime),
    month = month(datetime),
    hour = hour(datetime),
    minute = minute(datetime)) %>% 
  mutate(group = sample(
    c("train", "valid"),
    size = n(),
    replace = TRUE,
    prob = c(0.7, 0.3) # weights for each group
  ))

# Keep an untouched copy (season still numeric) for the xgboost matrix later
train_set_2 <- train_set

train_set$season <- as.factor(train_set$season)
levels(train_set$season) <- c('Spring','Summer','Fall','Winter')

test_set <- test_set %>% 
  mutate(
    year = year(datetime),
    month = month(datetime),
    hour = hour(datetime),
    minute = minute(datetime))



# valid_set <- train_set %>% 
#   filter(group == "valid")
# 
# 
# train_set <- train_set %>% 
#   filter(group == "train")



dim(train_set)
## [1] 10886    15
# dim(valid_set)

str(train_set)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 10886 obs. of  15 variables:
##  $ datetime  : POSIXct, format: "2011-01-01 00:00:00" "2011-01-01 01:00:00" ...
##  $ season    : Factor w/ 4 levels "Spring","Summer",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ holiday   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ workingday: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weather   : num  1 1 1 1 1 2 1 1 1 1 ...
##  $ temp      : num  9.84 9.02 9.02 9.84 9.84 ...
##  $ atemp     : num  14.4 13.6 13.6 14.4 14.4 ...
##  $ humidity  : num  81 80 80 75 75 75 80 86 75 76 ...
##  $ windspeed : num  0 0 0 0 0 ...
##  $ count     : num  16 40 32 13 1 1 2 3 8 14 ...
##  $ year      : num  2011 2011 2011 2011 2011 ...
##  $ month     : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ hour      : int  0 1 2 3 4 5 6 7 8 9 ...
##  $ minute    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ group     : chr  "train" "train" "valid" "train" ...
# str(valid_set)

colnames(train_set)
##  [1] "datetime"   "season"     "holiday"    "workingday" "weather"   
##  [6] "temp"       "atemp"      "humidity"   "windspeed"  "count"     
## [11] "year"       "month"      "hour"       "minute"     "group"
# colnames(valid_set)

head(train_set)
# head(valid_set)

# Data Visualization
# The count vs temperature plot shows that rental count increases as the temperature increases.


# Temperature v Count plot
# Scatter Plot to show the relationship between count (number of total rentals) and temp (temperature in Celsius)
aa <- ggplot(data = train_set, aes(temp,count)) +
  geom_point(alpha=.2,aes(color=temp)) +
  ggtitle("Count vs Temperature") + xlab("Temp (Celsius)") +
  ylab("Rental Count") + labs(color='Temp(C)') +
  theme_bw() +  
  theme(legend.position = "bottom")


# Scatter Plot to show the relationship between count (number of total rentals) and date time.
bb <- ggplot(data = train_set, aes(datetime,count)) +
  geom_point(alpha = .2,aes(color=temp)) +
  scale_colour_continuous(low = "yellow", high = 'red') + theme_bw() +
  ggtitle("Count vs Datetime") + 
  xlab("Date") + 
  ylab("Rental Count") +
  labs(color='Temp(C)') + 
  theme(legend.position = "bottom")

grid.arrange(aa, bb, ncol=2)

# There is a clear seasonal trend: total rentals drop in winter (January and February) and rise in summer.

# The other evident trend is that rental counts grow from 2011 to 2012.


## Correlation between temperature and count.

cor(train_set[,c('temp','count')])
##            temp     count
## temp  1.0000000 0.3944536
## count 0.3944536 1.0000000
# The correlation between temp and count is positive but moderate (about 0.39).
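
# For context, correlations among the numeric weather variables and count
# (a quick supplementary check, not in the original):
cor(train_set[, c('temp', 'atemp', 'humidity', 'windspeed', 'count')])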

## Box Plot
ggplot(data=train_set,aes(season,count,color = season)) +
  geom_boxplot( alpha = .2) + 
  ggtitle("Rental count by season") + 
  xlab("Season") +
  ylab("Rental Count") +
  labs(color='Season') +
  theme_bw() +
  theme(legend.position = "bottom")

# The box plot of rentals by season shows that a straight line could not capture this non-linear relationship, and that there are more rentals in winter than in spring (see the quick check below).
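
# A quick check of that claim: median rentals per season (a minimal sketch)
train_set %>% 
  group_by(season) %>% 
  summarise(median_count = median(count))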


## Feature Engineering
# As part of feature engineering I have added an hour column (extracted from datetime) to the dataset.



## Relationship between hour of the working day and the count of bikes rented.
ggplot(filter(train_set,workingday == 1), aes(hour,count)) + 
  geom_point()

# This scatter plot shows an interesting trend: the count of rented bikes spikes in the morning commute hours when people leave for the office (around 8 AM) and again in the evening when people leave the office (around 5 PM).

aa <- train_set %>% 
  filter(workingday == 1) %>%
  ggplot(aes(hour,count)) +
  geom_point( alpha = .5,position = position_jitter(w=1,h=0),aes(color=temp)) +
  scale_color_gradientn(colors = c('blue','green','yellow','orangered','red')) +
  ggtitle("workingday Rental Count") + xlab("Hour") + ylab("Rental Count") +
  labs(color='Temp(C)') +  
  theme(legend.position = "bottom")

# This plot gives an interesting finding regarding temperature and rental count: as the temperature rises, the number of rentals increases, and in cold temperatures rentals decline.

## Relationship between hour of the non-working day and the count of bikes rented.

bb <- train_set %>% 
  filter(workingday == 0) %>%
  ggplot(aes(hour,count)) +
  geom_point( alpha = .5,position = position_jitter(w=1,h=0),aes(color=temp)) +
  scale_color_gradientn(colors = c('blue','green','yellow','orangered','red')) +
  ggtitle("Weekday Rental Count") + xlab("Hour") + ylab("Rental Count") + 
  labs(color='Temp(C)') +  
  theme(legend.position = "bottom")


grid.arrange(aa,bb, ncol=2)

## Model Building
# This first model predicts count from the temp variable alone.

temp.model <- lm(count ~ temp, train_set)
print(summary(temp.model))


## Model Interpretation
# * The intercept is 6.0462: the linear regression model predicts about 6
#   rentals when the temperature is 0 °C.
# * The estimate for temp is 9.1705, which signifies that a 1 °C increase in
#   temperature, holding all else equal, is associated with roughly 9.2 more
#   rentals.
# * These findings are correlation, not causation; beta_1 would be negative
#   if an increase in temperature were associated with a decrease in rentals.
#
# Next we want to know how many bikes we would predict to be rented if the
# temperature were 25 degrees Celsius.
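
# Predicting rentals at 25 °C (a minimal sketch using the model fit above;
# the arithmetic uses the coefficients quoted above):
predict(temp.model, newdata = data.frame(temp = 25))
# By hand: 6.0462 + 9.1705 * 25 ≈ 235.3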



## Building Second Model with more features
# Model that attempts to predict count based on the following features
# (a hedged sketch of the fit follows the list):
#   - season
#   - holiday
#   - workingday
#   - weather
#   - temp
#   - humidity
#   - windspeed
#   - hour (factor)
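
# A sketch of that model, assuming it mirrors the first lm call (this fit is
# illustrative and was not run in the original script):
# temp.model2 <- lm(count ~ season + holiday + workingday + weather + temp +
#                     humidity + windspeed + factor(hour), data = train_set)
# print(summary(temp.model2))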

## Important Finding
# This sort of model doesn't work well given our seasonal, time-series data.
# We need a model that can account for this type of trend; otherwise the
# dataset's overall growth gets accidentally attributed to the winter season
# instead of being recognized as demand simply growing over time.
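
# A quick check of that year-over-year growth (a minimal sketch using the
# columns derived earlier):
train_set %>% 
  group_by(year) %>% 
  summarise(mean_count = mean(count))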


##########################################################################################
##########################################################################################

# train_set <- train_set_2 %>% 
#   filter(group == "train") 
# 
# valid_set <- train_set_2 %>% 
#   filter(group == "valid") 
# 
# 
# train_set$count = log1p(train_set$count)
# 
# X_train <- train_set %>%
#   select(-count, - datetime, -group) %>% 
#   as.matrix()
# 
# y_train <- train_set$count
# 
# 
# dtrain = xgb.DMatrix(X_train, label = y_train)
# 
# 
# # cv <- xgb.cv(data = dtrain, nrounds = 3, nthread = 2, nfold = 5, metrics = list("rmse","auc"),
# #              max_depth = 3, eta = 1, objective = "reg:squarederror")
# # 
# # print(cv)
# # print(cv, verbose=TRUE)
# 
# 
# model = xgb.train(data = dtrain, 
#                   nround = 150, 
#                   max_depth = 5, 
#                   eta = 0.1, 
#                   subsample = 0.9)
# 
# 
# ## cv grid search
# searchGridSubCol <- expand.grid(subsample = c(0.5, 0.6), 
#                                 colsample_bytree = c(0.5, 0.6),
#                                 max_depth = c(3, 4, 5),
#                                 min_child = seq(1), 
#                                 eta = c(0.1)
# )
# 
# ntrees <- 100
# 
# system.time(
#   rmseErrorsHyperparameters <- apply(searchGridSubCol, 1, function(parameterList){
#     
#     #Extract Parameters to test
#     currentSubsampleRate <- parameterList[["subsample"]]
#     currentColsampleRate <- parameterList[["colsample_bytree"]]
#     currentDepth <- parameterList[["max_depth"]]
#     currentEta <- parameterList[["eta"]]
#     currentMinChild <- parameterList[["min_child"]]
#     
#     xgboostModelCV <- xgb.cv(data =  dtrain, nrounds = ntrees, nfold = 5, showsd = TRUE, 
#                              metrics = "rmse", verbose = TRUE, "eval_metric" = "rmse",
#                              "objective" = "reg:linear", "max.depth" = currentDepth, "eta" = currentEta,                               
#                              "subsample" = currentSubsampleRate, "colsample_bytree" = currentColsampleRate
#                              , print_every_n = 10, "min_child_weight" = currentMinChild, booster = "gbtree",
#                              early_stopping_rounds = 10)
#     
#     xvalidationScores <- as.data.frame(xgboostModelCV$evaluation_log)
#     rmse <- tail(xvalidationScores$test_rmse_mean, 1)
#     trmse <- tail(xvalidationScores$train_rmse_mean,1)
#     c(rmse, trmse, currentSubsampleRate, currentColsampleRate, currentDepth, currentEta, currentMinChild)}))
# 
# 
# output <- as.data.frame(t(rmseErrorsHyperparameters))
# varnames <- c("TestRMSE", "TrainRMSE", "SubSampRate", "ColSampRate", "Depth", "eta", "currentMinChild")
# names(output) <- varnames
# head(output)
# 
# # xgb.importance(feature_names = colnames(X_train), model) %>% 
# #   xgb.plot.importance()
# 
# 
# 
# X_valid = valid_set %>% 
#   select(- datetime, -group, -count) %>% 
#   as.matrix()
# 
# y_valid = valid_set$count
# 
# preds = predict(model, X_valid)
# preds = expm1(preds)
# 
# solution = data.frame(datetime = valid_set$datetime, count = preds)
# 
# rmsle(y_valid, preds)

# write.csv(solution, "solution.csv", row.names = FALSE)
##########################################################################################
##########################################################################################

# Train on log1p(count): the competition metric is RMSLE, and minimizing RMSE
# on the log1p scale is equivalent to minimizing RMSLE on the original counts.
train_set_2$count = log1p(train_set_2$count)

X_train <- train_set_2 %>%
  select(-count, -datetime, -group) %>% 
  as.matrix()

y_train <- train_set_2$count


dtrain = xgb.DMatrix(X_train, label = y_train)
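
# A small illustration of that equivalence using the Metrics package (the
# numbers here are made up purely for the demo):
demo_actual <- c(10, 100, 250)
demo_pred   <- c(12, 90, 260)
all.equal(rmsle(demo_actual, demo_pred),
          rmse(log1p(demo_actual), log1p(demo_pred)))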


# cv <- xgb.cv(data = dtrain, nrounds = 3, nthread = 2, nfold = 5, metrics = list("rmse","auc"),
#              max_depth = 3, eta = 1, objective = "reg:squarederror")

# print(cv)
# print(cv, verbose=TRUE)




#####
#####
## cv grid search: every combination of these hyperparameters (2 x 2 x 3 = 12)
searchGridSubCol <- expand.grid(subsample = c(0.5, 0.6), 
                                colsample_bytree = c(0.5, 0.6),
                                max_depth = c(3, 4, 5),
                                min_child = 1, 
                                eta = 0.1
)



ntrees <- 100  # boosting rounds per CV run

system.time(
  rmseErrorsHyperparameters <- apply(searchGridSubCol, 1, function(parameterList){
    
    # Extract the hyperparameters to test
    currentSubsampleRate <- parameterList[["subsample"]]
    currentColsampleRate <- parameterList[["colsample_bytree"]]
    currentDepth <- parameterList[["max_depth"]]
    currentEta <- parameterList[["eta"]]
    currentMinChild <- parameterList[["min_child"]]
    
    # 5-fold CV for this combination ("reg:squarederror" is the current name
    # of the deprecated "reg:linear" objective)
    xgboostModelCV <- xgb.cv(data = dtrain, nrounds = ntrees, nfold = 5, showsd = TRUE, 
                             metrics = "rmse", verbose = TRUE, eval_metric = "rmse",
                             objective = "reg:squarederror", max_depth = currentDepth,
                             eta = currentEta, subsample = currentSubsampleRate,
                             colsample_bytree = currentColsampleRate,
                             print_every_n = 10, min_child_weight = currentMinChild,
                             booster = "gbtree", early_stopping_rounds = 10)
    
    # Return the final train/test RMSE together with the hyperparameters
    xvalidationScores <- as.data.frame(xgboostModelCV$evaluation_log)
    rmse <- tail(xvalidationScores$test_rmse_mean, 1)
    trmse <- tail(xvalidationScores$train_rmse_mean, 1)
    c(rmse, trmse, currentSubsampleRate, currentColsampleRate, currentDepth, currentEta, currentMinChild)})
  )
## [1]  train-rmse:3.927939+0.013082    test-rmse:3.927945+0.035913 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 10 rounds.
## 
## [11] train-rmse:1.590040+0.050895    test-rmse:1.592463+0.070776 
## [21] train-rmse:0.823419+0.024557    test-rmse:0.827830+0.031172 
## [31] train-rmse:0.608933+0.015789    test-rmse:0.615345+0.018163 
## [41] train-rmse:0.542733+0.013222    test-rmse:0.549638+0.013453 
## [51] train-rmse:0.503688+0.018574    test-rmse:0.511247+0.019685 
## [61] train-rmse:0.476399+0.022033    test-rmse:0.484991+0.022961 
## [71] train-rmse:0.445876+0.019389    test-rmse:0.454943+0.013955 
## [81] train-rmse:0.434424+0.016952    test-rmse:0.444165+0.010733 
## [91] train-rmse:0.422002+0.019330    test-rmse:0.432792+0.015258 
## [100]    train-rmse:0.413198+0.021944    test-rmse:0.424452+0.018866 
## (similar training logs for the remaining 11 hyperparameter combinations
##  omitted; their final test-rmse values ranged from about 0.33 to 0.43)
##    user  system elapsed 
##  50.542   0.152  12.825
# Collect the grid-search results into a data frame
output <- as.data.frame(t(rmseErrorsHyperparameters))
varnames <- c("TestRMSE", "TrainRMSE", "SubSampRate", "ColSampRate", "Depth", "eta", "currentMinChild")
names(output) <- varnames
head(output)
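
# Ordering the results by validation RMSE makes the best combination easy to
# spot (a small follow-up sketch, not in the original):
head(output[order(output$TestRMSE), ])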
#####
#####

# Final model on the full training data. Note these hyperparameters are
# hand-picked rather than taken from the grid search above.
model = xgb.train(data = dtrain, 
                  nrounds = 150, 
                  max_depth = 5, 
                  eta = 0.1, 
                  subsample = 0.9)

# Plot feature importance for the trained model
xgb.importance(feature_names = colnames(X_train), model) %>% 
  xgb.plot.importance()

# Build the test matrix with the same columns and order as X_train
X_test = test_set %>% 
  select(-datetime) %>% 
  as.matrix()

# y_test = test_set$count

# Predict on the test set and invert the log1p transform with expm1
preds = predict(model, X_test)
preds = expm1(preds)

solution = data.frame(datetime = test_set$datetime, count = preds)


write.csv(solution, "solution.csv", row.names = FALSE)