setwd("C:\\R-Programming\\Bike rental")
train<-read.csv("train.csv")
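##Before dropping the upper tail, it helps to see how extreme it is; a quick
##sketch (not part of the original run):
quantile(train$count,probs=c(0.90,0.95,0.99,1))
sum(train$count>=800)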
##Removing outliers: rows with count >= 800 are dropped (the cutoff can be lowered, which might also improve the score)
train<-subset(train,count<800)
train$season<-as.factor(train$season)
train$holiday<-as.factor(train$holiday)
train$weather<-as.factor(train$weather)
train$workingday<-as.factor(train$workingday)
train$datetime<-as.character(train$datetime)
model<-lm(count~.-datetime-casual-registered,data=train)
summary(model)
##
## Call:
## lm(formula = count ~ . - datetime - casual - registered, data = train)
##
## Residuals:
##     Min      1Q  Median      3Q     Max
## -340.78  -97.74  -28.64   65.02  639.08
##
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)
## (Intercept) 124.52030    8.60003  14.479  < 2e-16 ***
## season2      -0.80577    5.13055  -0.157  0.87521
## season3     -38.72723    6.57675  -5.889 4.01e-09 ***
## season4      61.80222    4.31382  14.327  < 2e-16 ***
## holiday1     -8.88431    8.70476  -1.021  0.30745
## workingday1  -9.08046    3.11885  -2.911  0.00360 **
## weather2     14.73994    3.42528   4.303 1.70e-05 ***
## weather3    -10.09841    5.75771  -1.754  0.07948 .
## weather4    181.76350  146.27227   1.243  0.21403
## temp          7.76278    1.14756   6.765 1.41e-11 ***
## atemp         2.57271    1.00638   2.556  0.01059 *
## humidity     -2.67073    0.08909 -29.976  < 2e-16 ***
## windspeed     0.59726    0.18927   3.156  0.00161 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 146.2 on 10785 degrees of freedom
## Multiple R-squared: 0.2756, Adjusted R-squared: 0.2748
## F-statistic: 341.9 on 12 and 10785 DF, p-value: < 2.2e-16
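##The linear model explains only ~28% of the variance, so it is just a
##baseline; its in-sample RMSE can be computed directly (a sketch, not part
##of the original run):
sqrt(mean(residuals(model)^2))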
##Extracting the month, date and hour components from the datetime string column
month<-c()
date<-c()
hours<-c()
for(i in train$datetime){
strings<-strsplit(i," ")
dateString<-strsplit(strings[[1]][1],"-")
month<-c(month,dateString[[1]][2])
date<-c(date,dateString[[1]][3])
timeString<-strsplit(strings[[1]][2],":")
hours<-c(hours,timeString[[1]][1])
}
train$month<-as.factor(month)
train$date<-as.numeric(date)
train$hours<-as.numeric(hours)
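##Note: the loop grows vectors one element at a time, which is slow in R.
##Given the fixed "YYYY-MM-DD HH:MM:SS" format, the same components can be
##extracted vectorised (a sketch; monthVec/dateVec/hoursVec are illustrative names):
monthVec<-substr(train$datetime,6,7)
dateVec<-substr(train$datetime,9,10)
hoursVec<-substr(train$datetime,12,13)
stopifnot(identical(monthVec,month))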
##To check whether the day was a weekend or not
library(lubridate)
## Warning: package 'lubridate' was built under R version 3.5.1
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
dayOfWeek<-wday(train$datetime)
train$dayOfWeek<-as.factor(dayOfWeek)
train$isWeekend<-ifelse(dayOfWeek %in% c(1,7),1,0)
train$isWeekend<-as.factor(train$isWeekend)
##breaking the hours into time periods
train$categoryhours<-cut(train$hours,breaks = c(0,5,11,16,19,24),include.lowest = TRUE)
levels(train$categoryhours)<-c("Night","Morning","Afternoon","Evening","Night")
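##Assigning a duplicated label in levels<- merges those levels, so "Night"
##covers both the [0,5] and (19,24] bins; a quick check (sketch):
table(train$categoryhours)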
##breaking the humidity into levels
train$humidityCategory<-cut(train$humidity,breaks = c(0,30,50,80,101),include.lowest = TRUE)
levels(train$humidityCategory)<-c("Low","Mid","High","Very High")
##breaking the wind speed into levels
train$windspeedCategory<-cut(train$windspeed,breaks = c(0,7,20,30,60),include.lowest = TRUE)
levels(train$windspeedCategory)<-c("Low","Mid","High","Very High")
##creating new columns flagging whether the temperature was good on that day (thresholds chosen by looking at the distribution of temp vs count)
train$isTempGood<-as.factor(ifelse((train$temp<35 & train$temp>12),1,0))
train$isaTempGood<-as.factor(ifelse((train$atemp<=35 & train$atemp>=17),1,0))
train$date<-NULL
##predicting the casual column (the predictions o1, o2 and o3 from these three linear models are later fed to XGBoost as extra features)
modelcasual<-lm(casual~.,data=train[,-c(1,11,12)])
train$o1<-predict(modelcasual,train)
## Warning in predict.lm(modelcasual, train): prediction from a rank-deficient
## fit may be misleading
##predicting registered column
modelregistered<-lm(registered~.,data=train[,-c(1,10,12)])
train$o2<-predict(modelregistered,train)
## Warning in predict.lm(modelregistered, train): prediction from a rank-
## deficient fit may be misleading
##predicting count column
modelcount<-lm(count~.,data=train[,-c(1,10,11)])
train$o3<-predict(modelcount,train)
## Warning in predict.lm(modelcount, train): prediction from a rank-deficient
## fit may be misleading
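##The rank-deficient warnings arise because some engineered features are
##linear combinations of others (e.g. the isWeekend dummy is a combination
##of the dayOfWeek dummies, and season is determined by month), so lm()
##drops the redundant terms. To list the dropped coefficients (a sketch):
names(coef(modelcasual))[is.na(coef(modelcasual))]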
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:lubridate':
##
## intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library("xgboost")
##
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
##
## slice
library("Ckmeans.1d.dp")
t<- train %>% mutate_if(is.factor,as.numeric)
t$casual<-NULL
t$registered<-NULL
t$datetime<-NULL
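##xgboost needs a numeric matrix, so the factors are converted to their
##integer codes; casual and registered are dropped because they sum to the
##target, and datetime because it is a string. Below, column 9 of t is
##count: it is removed from the feature matrix and used as the label.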
data_variables <- as.matrix(t[,-9])
data_label <- t[,"count"]
data_matrix <- xgb.DMatrix(data = data_variables, label = data_label)
xgb_params <- list("objective" = "reg:linear",eta=0.05,gamma=0.9,max_depth=10,eval_metric="rmse")
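##(eta=0.05 is a small learning rate, so many boosting rounds are needed;
##gamma=0.9 requires a minimum loss reduction before a split is made;
##max_depth=10 allows fairly deep trees)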
xgbcv <- xgb.cv( params = xgb_params, data = data_matrix, nrounds = 300, nfold = 10, showsd = T, stratified = T, print_every_n = 10, early_stopping_rounds = 10, maximize = F)
## [1] train-rmse:241.371886+0.513958 test-rmse:241.458801+4.733973
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 10 rounds.
##
## [11] train-rmse:154.869686+0.349015 test-rmse:157.070825+3.949479
## [21] train-rmse:104.718185+0.282131 test-rmse:110.503851+3.451212
## [31] train-rmse:75.997510+0.241797 test-rmse:86.357245+3.061250
## [41] train-rmse:59.576576+0.434476 test-rmse:74.777360+2.606574
## [51] train-rmse:49.797395+0.549152 test-rmse:69.330086+2.213214
## [61] train-rmse:43.555334+0.540961 test-rmse:66.722548+1.887851
## [71] train-rmse:39.135011+0.432785 test-rmse:65.364407+1.758038
## [81] train-rmse:35.696813+0.449775 test-rmse:64.505077+1.600509
## [91] train-rmse:33.296328+0.464943 test-rmse:64.048376+1.541181
## [101] train-rmse:31.415567+0.446056 test-rmse:63.746746+1.538553
## [111] train-rmse:29.708969+0.445177 test-rmse:63.509613+1.546352
## [121] train-rmse:28.151886+0.362917 test-rmse:63.173001+1.536523
## [131] train-rmse:26.821979+0.484156 test-rmse:62.942278+1.507856
## [141] train-rmse:25.704980+0.538742 test-rmse:62.701207+1.447580
## [151] train-rmse:24.637851+0.626814 test-rmse:62.526959+1.438313
## [161] train-rmse:23.752792+0.670531 test-rmse:62.368913+1.426365
## [171] train-rmse:22.990796+0.602196 test-rmse:62.256218+1.446475
## [181] train-rmse:22.205176+0.486906 test-rmse:62.112324+1.443606
## [191] train-rmse:21.479510+0.499520 test-rmse:62.001143+1.435013
## [201] train-rmse:20.844796+0.459602 test-rmse:61.929614+1.442571
## [211] train-rmse:20.207739+0.381496 test-rmse:61.835682+1.457668
## [221] train-rmse:19.551760+0.400044 test-rmse:61.799272+1.468363
## [231] train-rmse:18.998403+0.467510 test-rmse:61.753053+1.471821
## [241] train-rmse:18.493114+0.461396 test-rmse:61.707334+1.490324
## [251] train-rmse:17.992181+0.423061 test-rmse:61.654713+1.477912
## [261] train-rmse:17.475707+0.473905 test-rmse:61.581411+1.484088
## [271] train-rmse:17.083227+0.421014 test-rmse:61.552593+1.481111
## [281] train-rmse:16.598484+0.434973 test-rmse:61.517161+1.488966
## [291] train-rmse:16.195632+0.433981 test-rmse:61.483906+1.495207
## [300] train-rmse:15.838624+0.426136 test-rmse:61.455986+1.505569
nround <- xgbcv$best_iteration # number of XGBoost rounds
# Fit the final XGBoost model on the full training data with the CV-selected number of rounds
bst_model <- xgb.train(params = xgb_params,
data = data_matrix,
nrounds = nround)
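##Ckmeans.1d.dp was loaded above for the importance-plotting helpers; a
##quick look at which features drive the model (a sketch, not part of the
##scored run):
importance<-xgb.importance(colnames(data_variables),model=bst_model)
xgb.plot.importance(importance,top_n=10)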
## Test data (the same preprocessing is applied as on the train data)
test<-read.csv("test.csv")
test$season<-as.factor(test$season)
test$holiday<-as.factor(test$holiday)
test$weather<-as.factor(test$weather)
test$workingday<-as.factor(test$workingday)
test$datetime<-as.character(test$datetime)
month<-c()
date<-c()
hours<-c()
for(i in test$datetime){
strings<-strsplit(i," ")
dateString<-strsplit(strings[[1]][1],"-")
month<-c(month,dateString[[1]][2])
date<-c(date,dateString[[1]][3])
timeString<-strsplit(strings[[1]][2],":")
hours<-c(hours,timeString[[1]][1])
}
test$month<-as.factor(month)
test$date<-as.numeric(date)
test$hours<-as.numeric(hours)
library(lubridate)
dayOfWeek<-wday(test$datetime)
test$dayOfWeek<-as.factor(dayOfWeek)
test$isWeekend<-ifelse(dayOfWeek %in% c(1,7),1,0)
test$isWeekend<-as.factor(test$isWeekend)
test$categoryhours<-cut(test$hours,breaks = c(0,5,11,16,19,24),include.lowest = TRUE)
levels(test$categoryhours)<-c("Night","Morning","Afternoon","Evening","Night")
test$humidityCategory<-cut(test$humidity,breaks = c(0,30,50,80,101),include.lowest = TRUE)
levels(test$humidityCategory)<-c("Low","Mid","High","Very High")
test$windspeedCategory<-cut(test$windspeed,breaks = c(0,7,20,30,60),include.lowest = TRUE) ##breaks must match the train binning
levels(test$windspeedCategory)<-c("Low","Mid","High","Very High")
test$isTempGood<-as.factor(ifelse((test$temp<35 & test$temp>12),1,0))
test$isaTempGood<-as.factor(ifelse((test$atemp<=35 & test$atemp>=17),1,0))
test$date<-NULL
test$datetime<-NULL
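##A cheap guard against train/test preprocessing drift (a sketch): the
##engineered factor levels should match between the two data frames
stopifnot(identical(levels(train$windspeedCategory),levels(test$windspeedCategory)))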
test$o1<-predict(modelcasual,test)
## Warning in predict.lm(modelcasual, test): prediction from a rank-deficient
## fit may be misleading
test$o2<-predict(modelregistered,test)
## Warning in predict.lm(modelregistered, test): prediction from a rank-
## deficient fit may be misleading
test$o3<-predict(modelcount,test)
## Warning in predict.lm(modelcount, test): prediction from a rank-deficient
## fit may be misleading
t<-test%>%mutate_if(is.factor,as.numeric)
predictions<-predict(bst_model,as.matrix(t))
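##counts cannot be negative, so clip any negative predictions to zero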
predictions[predictions<0]<-0
temp<-read.csv("test.csv")
df<-data.frame(datetime=temp$datetime,count=predictions)
write.csv(df,"output.csv",row.names = FALSE)
A score of 0.479 was achieved with this code! You can tune the XGBoost model further or engineer more features/columns to get a better score.