setwd("C:\\R-Programming\\Bike rental")
train<-read.csv("train.csv")
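##Before dropping the upper tail, it helps to see how extreme it is; a quick
##sketch (not part of the original run):
quantile(train$count,probs=c(0.90,0.95,0.99,1))
sum(train$count>=800)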
##Removing outliers: rows with count >= 800 are dropped (the cutoff can be lowered, which might also improve the score)
train<-subset(train,count<800)
train$season<-as.factor(train$season)
train$holiday<-as.factor(train$holiday)
train$weather<-as.factor(train$weather)
train$workingday<-as.factor(train$workingday)
train$datetime<-as.character(train$datetime)
model<-lm(count~.-datetime-casual-registered,data=train)
summary(model)
##
## Call:
## lm(formula = count ~ . - datetime - casual - registered, data = train)
##
## Residuals:
##     Min      1Q  Median      3Q     Max
## -340.78  -97.74  -28.64   65.02  639.08
##
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)
## (Intercept) 124.52030    8.60003  14.479  < 2e-16 ***
## season2      -0.80577    5.13055  -0.157  0.87521
## season3     -38.72723    6.57675  -5.889 4.01e-09 ***
## season4      61.80222    4.31382  14.327  < 2e-16 ***
## holiday1     -8.88431    8.70476  -1.021  0.30745
## workingday1  -9.08046    3.11885  -2.911  0.00360 **
## weather2     14.73994    3.42528   4.303 1.70e-05 ***
## weather3    -10.09841    5.75771  -1.754  0.07948 .
## weather4    181.76350  146.27227   1.243  0.21403
## temp          7.76278    1.14756   6.765 1.41e-11 ***
## atemp         2.57271    1.00638   2.556  0.01059 *
## humidity     -2.67073    0.08909 -29.976  < 2e-16 ***
## windspeed     0.59726    0.18927   3.156  0.00161 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 146.2 on 10785 degrees of freedom
## Multiple R-squared: 0.2756, Adjusted R-squared: 0.2748
## F-statistic: 341.9 on 12 and 10785 DF, p-value: < 2.2e-16
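##The linear model explains only ~28% of the variance, so it is just a
##baseline; its in-sample RMSE can be computed directly (a sketch, not part
##of the original run):
sqrt(mean(residuals(model)^2))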
##Extracting the month, date and hour components from the datetime string column
month<-c()
date<-c()
hours<-c()
for(i in train$datetime){
strings<-strsplit(i," ")
dateString<-strsplit(strings[[1]][1],"-")
month<-c(month,dateString[[1]][2])
date<-c(date,dateString[[1]][3])
timeString<-strsplit(strings[[1]][2],":")
hours<-c(hours,timeString[[1]][1])
}
train$month<-as.factor(month)
train$date<-as.numeric(date)
train$hours<-as.numeric(hours)
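##Note: the loop grows vectors one element at a time, which is slow in R.
##Given the fixed "YYYY-MM-DD HH:MM:SS" format, the same components can be
##extracted vectorised (a sketch; monthVec/dateVec/hoursVec are illustrative names):
monthVec<-substr(train$datetime,6,7)
dateVec<-substr(train$datetime,9,10)
hoursVec<-substr(train$datetime,12,13)
stopifnot(identical(monthVec,month))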
##To check whether the day was a weekend or not
library(lubridate)
## Warning: package 'lubridate' was built under R version 3.5.1
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
dayOfWeek<-wday(train$datetime)
train$dayOfWeek<-as.factor(dayOfWeek)
train$isWeekend<-ifelse(dayOfWeek %in% c(1,7),1,0)
train$isWeekend<-as.factor(train$isWeekend)
##breaking the hours into time periods
train$categoryhours<-cut(train$hours,breaks = c(0,5,11,16,19,24),include.lowest = TRUE)
levels(train$categoryhours)<-c("Night","Morning","Afternoon","Evening","Night")
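##Assigning a duplicated label in levels<- merges those levels, so "Night"
##covers both the [0,5] and (19,24] bins; a quick check (sketch):
table(train$categoryhours)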
##breaking the humidity into levels
train$humidityCategory<-cut(train$humidity,breaks = c(0,30,50,80,101),include.lowest = TRUE)
levels(train$humidityCategory)<-c("Low","Mid","High","Very High")
##breaking the wind speed into levels
train$windspeedCategory<-cut(train$windspeed,breaks = c(0,7,20,30,60),include.lowest = TRUE)
levels(train$windspeedCategory)<-c("Low","Mid","High","Very High")
##creating new columns flagging whether the temperature was good on that day (thresholds chosen by looking at the distribution of temp vs count)
train$isTempGood<-as.factor(ifelse((train$temp<35 & train$temp>12),1,0))
train$isaTempGood<-as.factor(ifelse((train$atemp<=35 & train$atemp>=17),1,0))
train$date<-NULL
##predicting the casual column (the predictions o1, o2 and o3 from these three linear models are later fed to XGBoost as extra features)
modelcasual<-lm(casual~.,data=train[,-c(1,11,12)])
train$o1<-predict(modelcasual,train)
## Warning in predict.lm(modelcasual, train): prediction from a rank-deficient
## fit may be misleading
##predicting registered column
modelregistered<-lm(registered~.,data=train[,-c(1,10,12)])
train$o2<-predict(modelregistered,train)
## Warning in predict.lm(modelregistered, train): prediction from a rank-
## deficient fit may be misleading
##predicting count column
modelcount<-lm(count~.,data=train[,-c(1,10,11)])
train$o3<-predict(modelcount,train)
## Warning in predict.lm(modelcount, train): prediction from a rank-deficient
## fit may be misleading
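##The rank-deficient warnings arise because some engineered features are
##linear combinations of others (e.g. the isWeekend dummy is a combination
##of the dayOfWeek dummies, and season is determined by month), so lm()
##drops the redundant terms. To list the dropped coefficients (a sketch):
names(coef(modelcasual))[is.na(coef(modelcasual))]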
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:lubridate':
##
## intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library("xgboost")
##
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
##
## slice
library("Ckmeans.1d.dp")
t<- train %>% mutate_if(is.factor,as.numeric)
t$casual<-NULL
t$registered<-NULL
t$datetime<-NULL
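##xgboost needs a numeric matrix, so the factors are converted to their
##integer codes; casual and registered are dropped because they sum to the
##target, and datetime because it is a string. Below, column 9 of t is
##count: it is removed from the feature matrix and used as the label.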
data_variables <- as.matrix(t[,-9])
data_label <- t[,"count"]
data_matrix <- xgb.DMatrix(data = data_variables, label = data_label)
xgb_params <- list("objective" = "reg:linear",eta=0.05,gamma=0.9,max_depth=10,eval_metric="rmse")
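##(eta=0.05 is a small learning rate, so many boosting rounds are needed;
##gamma=0.9 requires a minimum loss reduction before a split is made;
##max_depth=10 allows fairly deep trees)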
xgbcv <- xgb.cv( params = xgb_params, data = data_matrix, nrounds = 300, nfold = 10, showsd = T, stratified = T, print_every_n = 10, early_stopping_rounds = 10, maximize = F)
## [1] train-rmse:241.371886+0.513958 test-rmse:241.458801+4.733973
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 10 rounds.
##
## [11] train-rmse:154.869686+0.349015 test-rmse:157.070825+3.949479
## [21] train-rmse:104.718185+0.282131 test-rmse:110.503851+3.451212
## [31] train-rmse:75.997510+0.241797 test-rmse:86.357245+3.061250
## [41] train-rmse:59.576576+0.434476 test-rmse:74.777360+2.606574
## [51] train-rmse:49.797395+0.549152 test-rmse:69.330086+2.213214
## [61] train-rmse:43.555334+0.540961 test-rmse:66.722548+1.887851
## [71] train-rmse:39.135011+0.432785 test-rmse:65.364407+1.758038
## [81] train-rmse:35.696813+0.449775 test-rmse:64.505077+1.600509
## [91] train-rmse:33.296328+0.464943 test-rmse:64.048376+1.541181
## [101] train-rmse:31.415567+0.446056 test-rmse:63.746746+1.538553
## [111] train-rmse:29.708969+0.445177 test-rmse:63.509613+1.546352
## [121] train-rmse:28.151886+0.362917 test-rmse:63.173001+1.536523
## [131] train-rmse:26.821979+0.484156 test-rmse:62.942278+1.507856
## [141] train-rmse:25.704980+0.538742 test-rmse:62.701207+1.447580
## [151] train-rmse:24.637851+0.626814 test-rmse:62.526959+1.438313
## [161] train-rmse:23.752792+0.670531 test-rmse:62.368913+1.426365
## [171] train-rmse:22.990796+0.602196 test-rmse:62.256218+1.446475
## [181] train-rmse:22.205176+0.486906 test-rmse:62.112324+1.443606
## [191] train-rmse:21.479510+0.499520 test-rmse:62.001143+1.435013
## [201] train-rmse:20.844796+0.459602 test-rmse:61.929614+1.442571
## [211] train-rmse:20.207739+0.381496 test-rmse:61.835682+1.457668
## [221] train-rmse:19.551760+0.400044 test-rmse:61.799272+1.468363
## [231] train-rmse:18.998403+0.467510 test-rmse:61.753053+1.471821
## [241] train-rmse:18.493114+0.461396 test-rmse:61.707334+1.490324
## [251] train-rmse:17.992181+0.423061 test-rmse:61.654713+1.477912
## [261] train-rmse:17.475707+0.473905 test-rmse:61.581411+1.484088
## [271] train-rmse:17.083227+0.421014 test-rmse:61.552593+1.481111
## [281] train-rmse:16.598484+0.434973 test-rmse:61.517161+1.488966
## [291] train-rmse:16.195632+0.433981 test-rmse:61.483906+1.495207
## [300] train-rmse:15.838624+0.426136 test-rmse:61.455986+1.505569
nround <- xgbcv$best_iteration # number of XGBoost rounds
# Fit the final XGBoost model on the full training data with the CV-selected number of rounds
bst_model <- xgb.train(params = xgb_params,
data = data_matrix,
nrounds = nround)
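##Ckmeans.1d.dp was loaded above for the importance-plotting helpers; a
##quick look at which features drive the model (a sketch, not part of the
##scored run):
importance<-xgb.importance(colnames(data_variables),model=bst_model)
xgb.plot.importance(importance,top_n=10)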
## Test data (the same preprocessing is applied as on the train data)
test<-read.csv("test.csv")
test$season<-as.factor(test$season)
test$holiday<-as.factor(test$holiday)
test$weather<-as.factor(test$weather)
test$workingday<-as.factor(test$workingday)
test$datetime<-as.character(test$datetime)
month<-c()
date<-c()
hours<-c()
for(i in test$datetime){
strings<-strsplit(i," ")
dateString<-strsplit(strings[[1]][1],"-")
month<-c(month,dateString[[1]][2])
date<-c(date,dateString[[1]][3])
timeString<-strsplit(strings[[1]][2],":")
hours<-c(hours,timeString[[1]][1])
}
test$month<-as.factor(month)
test$date<-as.numeric(date)
test$hours<-as.numeric(hours)
library(lubridate)
dayOfWeek<-wday(test$datetime)
test$dayOfWeek<-as.factor(dayOfWeek)
test$isWeekend<-ifelse(dayOfWeek %in% c(1,7),1,0)
test$isWeekend<-as.factor(test$isWeekend)
test$categoryhours<-cut(test$hours,breaks = c(0,5,11,16,19,24),include.lowest = TRUE)
levels(test$categoryhours)<-c("Night","Morning","Afternoon","Evening","Night")
test$humidityCategory<-cut(test$humidity,breaks = c(0,30,50,80,101),include.lowest = TRUE)
levels(test$humidityCategory)<-c("Low","Mid","High","Very High")
test$windspeedCategory<-cut(test$windspeed,breaks = c(0,7,20,30,60),include.lowest = TRUE) ##breaks must match the train binning
levels(test$windspeedCategory)<-c("Low","Mid","High","Very High")
test$isTempGood<-as.factor(ifelse((test$temp<35 & test$temp>12),1,0))
test$isaTempGood<-as.factor(ifelse((test$atemp<=35 & test$atemp>=17),1,0))
test$date<-NULL
test$datetime<-NULL
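##A cheap guard against train/test preprocessing drift (a sketch): the
##engineered factor levels should match between the two data frames
stopifnot(identical(levels(train$windspeedCategory),levels(test$windspeedCategory)))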
test$o1<-predict(modelcasual,test)
## Warning in predict.lm(modelcasual, test): prediction from a rank-deficient
## fit may be misleading
test$o2<-predict(modelregistered,test)
## Warning in predict.lm(modelregistered, test): prediction from a rank-
## deficient fit may be misleading
test$o3<-predict(modelcount,test)
## Warning in predict.lm(modelcount, test): prediction from a rank-deficient
## fit may be misleading
t<-test%>%mutate_if(is.factor,as.numeric)
predictions<-predict(bst_model,as.matrix(t))
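##counts cannot be negative, so clip any negative predictions to zero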
predictions[predictions<0]<-0
temp<-read.csv("test.csv")
df<-data.frame(datetime=temp$datetime,count=predictions)
write.csv(df,"output.csv",row.names = FALSE)
A score of 0.479 was achieved with this code! You can tune the XGBoost model further or engineer more features/columns to get a better score.