# read the raw data; the test file lacks the casual/registered/count columns
train <- read.csv("../DATA/train.csv", colClasses=test_classes)
test <- read.csv("../DATA/test.csv", colClasses=test_classes[1:9])
train$count <- as.integer(train$count)
# derive the extra features (hour, wday, month, year) from datetime
train_factor <- set_up_features(train)
test_factor <- set_up_features(test)
# model the log-transformed count; log(x + 1) is equivalent to log1p(x)
train_factor$lgcount <- log(train_factor$count + 1)
# drop column 1 (datetime) and columns 10-12 (presumably casual, registered,
# and the raw count) in one step
train_factor <- train_factor[, -c(1, 10:12)]
test_factor <- test_factor[, -1]
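The helper set_up_features is defined elsewhere in the project. Judging from the model formula used below, it extracts hour, wday, month, and year from datetime and converts the coded columns to factors. A minimal sketch under those assumptions (hypothetical; the real implementation may differ):
# Hypothetical reconstruction of set_up_features, inferred from the model formula
set_up_features <- function(df) {
  dt <- as.POSIXlt(df$datetime)
  df$hour <- as.factor(dt$hour)         # hour of day, 0-23
  df$wday <- as.factor(dt$wday)         # day of week, 0-6
  df$month <- as.factor(dt$mon + 1)     # month, 1-12
  df$year <- as.factor(dt$year + 1900)  # calendar year
  # treat the coded categorical columns as factors
  for (col in c("season", "holiday", "workingday", "weather")) {
    df[[col]] <- as.factor(df[[col]])
  }
  df
}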
Split the sample into an 80% training subsample and a 20% testing subsample.
library(caret)
set.seed(212312)
trainIndex <- createDataPartition(train_factor$lgcount, p = 0.8, list=FALSE, times=1)
subTrain <- train_factor[trainIndex,]
subTest <- train_factor[-trainIndex,]
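The result tables below report Test.rmsle/Test.rmse on subTest and Train.RMSLE/Train.RMSE on subTrain, both on the original count scale. The evaluation code itself is not shown; a helper along these lines (a sketch, the function name is mine) reproduces those columns:
# Sketch of the evaluation behind the Test.rmsle / Test.rmse columns;
# predictions are on the log scale, so invert the log(count + 1) transform first
score_model <- function(fit, newdata) {
  pred <- exp(predict(fit, newdata)) - 1
  actual <- exp(newdata$lgcount) - 1
  c(rmsle = sqrt(mean((log(pred + 1) - log(actual + 1))^2)),
    rmse = sqrt(mean((pred - actual)^2)))
}
# e.g. score_model(fit.gbm, subTest)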
library(party)
#create our formula
formula <- lgcount ~ season + holiday + workingday + weather + temp + atemp + humidity + windspeed + hour + wday + month + year
#build our model
fit.ctree.party <- ctree(formula, data=subTrain,controls=ctree_control(mincriterion=0.95,savesplitstats=FALSE))
## Test.rmsle Test.rmse Train.RMSLE Train.RMSE
## ctree.party 0.4177777 55.68874 0.3658089 46.97015
## [1] object.size: 45.7 Mb
# all features
formula <- lgcount ~ season + holiday + workingday + weather + temp + atemp + humidity + windspeed + hour + wday + month + year
fit.ctree <- train(formula, data=subTrain,method='ctree',tuneGrid=expand.grid(mincriterion=0.95))
ctreeVarImp <- varImp(fit.ctree)
## Test.rmsle Test.rmse Train.RMSLE Train.RMSE
## ctree.caret.plain 0.5436062 84.36475 0.5015594 77.92729
## [1] object.size: 51.4 Mb
Here I use ctree2, since its tuning parameter (maxdepth) is easier to understand, and use RMSE to select the best model.
##ctree2 with CV
fitControl <- trainControl(method = 'cv', number=6,summaryFunction=defaultSummary)
set.seed(123)
Grid <- expand.grid(maxdepth = seq(15, 50,5))
formula <- lgcount ~ season + holiday + workingday + weather + temp + atemp + humidity + windspeed + hour + wday + month + year
fit.ctree2CV <- train(formula, data=subTrain, method = 'ctree2', trControl=fitControl,tuneGrid=Grid,metric='RMSE')
## Test.rmsle Test.rmse Train.RMSLE Train.RMSE
## ctree2.CV 0.5140868 77.55127 0.4329095 65.68954
## [1] object.size: 117.6 Mb
##model2a: CART using rpart with CV
set.seed(123)
fitControl <- trainControl(method = 'cv', number=6)
Grid <- expand.grid(cp=seq(0, 0.05, 0.005))
formula <- lgcount ~ season + holiday + workingday + weather + temp + atemp + humidity + windspeed + hour + wday + month + year
fit.rpartCV <- train(formula, data=subTrain, method = 'rpart', trControl=fitControl, metric='RMSE',maximize=FALSE, tuneGrid = Grid)
##model2b: rpart2 with CV
set.seed(123)
formula <- lgcount ~ season + holiday + workingday + weather + temp + atemp + humidity + windspeed + hour + wday + month + year
fitControl <- trainControl(method = 'cv', number=6)
Grid<-expand.grid(.maxdepth=seq(5,20,5))
fit.rpart2CV <- train(formula, data=subTrain, method = 'rpart2', trControl=fitControl, metric = 'RMSE', maximize=FALSE, tuneGrid=Grid)
plot(fit.rpartCV)
plot(fit.rpart2CV)
## Test.rmsle Test.rmse Train.RMSLE Train.RMSE
## rpart.CV 0.5325579 79.97178 0.4049704 61.81217
## rpart2.CV 0.7847154 148.60228 0.7871033 145.48955
## [1] rpart.CV size: 6.6 Mb
## [1] rpart2.CV size: 6.2 Mb
## gbm fitting
set.seed(123)
fitControl <- trainControl(method = 'cv', number = 6, summaryFunction=defaultSummary)
Grid <- expand.grid( n.trees = seq(50,1000,50), interaction.depth = c(30), shrinkage = c(0.1))
formula <- lgcount ~ season + holiday + workingday + weather + temp + atemp + humidity + windspeed + hour + wday + month + year
fit.gbm <- train(formula, data=subTrain, method = 'gbm', trControl=fitControl,tuneGrid=Grid,metric='RMSE',maximize=FALSE)
plot(fit.gbm)
gbmVarImp <- varImp(fit.gbm)
plot(gbmVarImp)
## Test.rmsle Test.rmse Train.RMSLE Train.RMSE
## gbm.CV 0.3145828 43.72667 0.1658346 29.22251
## [1] object.size: 11.8 Mb
This model is from mboost, another implementation of gradient boosting. Its tuning parameters are mstop, the number of initial boosting iterations, and prune, which is not documented in the original mboost guide.
Note: 1. prune does not appear to be tunable via tuneGrid; it is held constant at the first value of the sequence. 2. It also does not change the result: I have tried prune = 0.1, 0.5, 0.9, and 5, and all give the same result at mstop = 300.
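One way to check which tuning parameters caret actually exposes for this method (assuming a reasonably recent caret version) is modelLookup:
# list the tuning parameters caret exposes for the 'gamboost' method;
# this shows whether 'prune' is among them in the installed version
modelLookup('gamboost')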
## gamboost fitting
set.seed(123)
fitControl <- trainControl(method = 'cv', number=6, summaryFunction=defaultSummary)
Grid <- expand.grid(.mstop=seq(100,1000,100),.prune=c(5))
formula <- lgcount ~ season + holiday + workingday + weather + temp + atemp + humidity + windspeed + hour + wday + month + year
fit.gamboost <- train(formula, data=subTrain, method = 'gamboost', trControl=fitControl,tuneGrid=Grid,metric='RMSE',maximize=FALSE)
## Test.rmsle Test.rmse Train.RMSLE Train.RMSE
## gamboost.CV 0.6140908 98.05113 0.6068851 96.41029
## [1] object.size: 10.9 Mb
## treebag fitting (trainControl method = 'none': a single fit, no resampling)
set.seed(123)
fitControl <- trainControl(method = 'none', summaryFunction=defaultSummary)
formula <- lgcount ~ season + holiday + workingday + weather + temp + atemp + humidity + windspeed + hour + wday + month + year
fit.treebag <- train(formula, data=subTrain, method = 'treebag', trControl=fitControl)
show(fit.treebag)
save(fit.treebag,file='fit_treebag_v1.RData')
## Test.rmsle Test.rmse Train.RMSLE Train.RMSE
## treebag.CV 0.7499689 147.1881 0.7484233 143.1211
## [1] object.size: 132.5 Mb
According to the caret guide, bagEarth is ‘A bagging wrapper for multivariate adaptive regression splines (MARS) via the earth function’.
## bagEarth fitting
set.seed(123)
fitControl <- trainControl(method = 'cv', number = 6, summaryFunction=defaultSummary)
Grid <- expand.grid(degree=c(2), nprune = seq(10,90,20))
formula <- lgcount ~ season + holiday + workingday + weather + temp + atemp + humidity + windspeed + hour + wday + month + year
fit.bagEarth <- train(formula, data=subTrain, method = 'bagEarth', trControl=fitControl,tuneGrid=Grid,metric='RMSE',maximize=FALSE,keepX=FALSE)
#show(fit.bagEarth)
## Test.rmsle Test.rmse Train.RMSLE Train.RMSE
## bagEarth.CV 0.4242703 69.00662 0.4115398 68.0136
## [1] object.size: 383.9 Mb
# random forest tuned on the out-of-bag (OOB) error estimate
set.seed(123)
tc <- trainControl("oob")
Grid <- expand.grid(mtry = seq(4,16,4))
formula <- lgcount ~ season + holiday + workingday + weather + temp + atemp + humidity + windspeed + hour + wday + month + year
fit.rf <- train(formula, data=subTrain , method='rf', trControl=tc,tuneGrid=Grid,metric='RMSE')
## RMSE Rsquared mtry
## 1 0.5943940 0.8238771 4
## 2 0.4905211 0.8800549 8
## 3 0.4718308 0.8890213 12
## 4 0.4618798 0.8936531 16
## Test.rmsle Test.rmse Train.RMSLE Train.RMSE
## rf.oob 0.4718695 74.40514 0.2334819 38.81704
## [1] object.size: 91.9 Mb
# random forest tuned with 4-fold cross-validation
set.seed(123)
tc <- trainControl("cv",number=4)
Grid <- expand.grid(mtry = seq(4,16,4))
formula <- lgcount ~ season + holiday + workingday + weather + temp + atemp + humidity + windspeed + hour + wday + month + year
fit.rf.cv <- train(formula, data=subTrain , method='rf', trControl=tc,tuneGrid=Grid,metric='RMSE')
## mtry RMSE Rsquared RMSESD RsquaredSD
## 1 4 0.6070140 0.8597777 0.012837362 0.005337229
## 2 8 0.5076205 0.8808282 0.010867871 0.005845244
## 3 12 0.4881242 0.8852147 0.008540236 0.006206083
## 4 16 0.4788093 0.8878842 0.009833839 0.006777247
## Test.rmsle Test.rmse Train.RMSLE Train.RMSE
## rf.cv 0.4685932 73.94653 0.2325217 38.51168
## rf.oob 0.4718695 74.40514 0.2334819 38.81704
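The compare data frame gathers these metrics across all fits; its construction is not shown, but it could be assembled along these lines (a sketch reusing the hypothetical score_model helper from above):
# Hypothetical assembly of the comparison table from the fitted models
fits <- list(ctree.party = fit.ctree.party, ctree = fit.ctree,
             ctree2.CV = fit.ctree2CV, rpart.CV = fit.rpartCV,
             rpart2.CV = fit.rpart2CV, gbm.CV = fit.gbm,
             gamboost.CV = fit.gamboost, treebag = fit.treebag,
             bagEarth.CV = fit.bagEarth, rf.oob = fit.rf, rf.cv = fit.rf.cv)
compare <- t(sapply(fits, function(f) {
  test <- score_model(f, subTest)
  train <- score_model(f, subTrain)
  c(Test.rmsle = test[['rmsle']], Test.rmse = test[['rmse']],
    Train.RMSLE = train[['rmsle']], Train.RMSE = train[['rmse']])
}))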
show(compare)
## Test.rmsle Test.rmse Train.RMSLE Train.RMSE
## ctree.party 0.4177777 55.68874 0.3658089 46.97015
## ctree 0.5436062 84.36475 0.5015594 77.92729
## ctree2.CV 0.5140868 77.55127 0.4329095 65.68954
## rpart.CV 0.5325579 79.97178 0.4049704 61.81217
## rpart2.CV 0.7847154 148.60228 0.7871033 145.48955
## gbm.CV 0.3145828 43.72667 0.1658346 29.22251
## gamboost.CV 0.6140908 98.05113 0.6068851 96.41029
## treebag 0.7499689 147.18809 0.7484233 143.12107
## bagEarth.CV 0.4242703 69.00662 0.4115398 68.01360
## rf.oob 0.5011784 84.06601 0.2918352 52.45717
## rf.cv 0.4685932 73.94653 0.2325217 38.51168
#run model against test data set
predict.rf <- predict(fit.rf.cv, test_factor)
predict.rf <- exp(predict.rf) - 1
#build a dataframe with our results
submit.rf <- data.frame(datetime = test$datetime, count=predict.rf)
#write results to .csv for submission
write.csv(submit.rf, file="submit_rf_v1.csv",row.names=FALSE,quote=FALSE)
#run model against test data set
predict.gbm <- predict(fit.gbm, test_factor)
predict.gbm <- exp(predict.gbm) - 1
#build a dataframe with our results
submit.gbm <- data.frame(datetime = test$datetime, count=predict.gbm)
#write results to .csv for submission
write.csv(submit.gbm, file="submit_gbm_v1.csv",row.names=FALSE,quote=FALSE)
#bagEarth
#run model against test data set
predict.bagEarth <- predict(fit.bagEarth, test_factor)
predict.bagEarth <- exp(predict.bagEarth) - 1
#build a dataframe with our results
submit.bagEarth <- data.frame(datetime = test$datetime, count=predict.bagEarth)
#write results to .csv for submission
write.csv(submit.bagEarth, file="submit_bagEarth_v1.csv",row.names=FALSE,quote=FALSE)
#ctree.party
#run model against test data set
predict.ctree <- predict(fit.ctree.party, test_factor)
#party's predict returns a one-column matrix; name it and back-transform
colnames(predict.ctree) <- 'count'
predict.ctree <- exp(predict.ctree) - 1
#build a dataframe with our results
submit.ctree <- data.frame(datetime = test$datetime, count=predict.ctree)
#write results to .csv for submission
write.csv(submit.ctree, file="submit_ctree_v1.csv",row.names=FALSE,quote=FALSE)
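The four submission blocks above repeat the same predict / back-transform / write pattern; a small helper (a refactoring sketch, the function name is mine) would remove the duplication:
# Sketch of a shared submission writer for the repeated blocks above
write_submission <- function(fit, newdata, datetime, file) {
  pred <- exp(predict(fit, newdata)) - 1  # invert the log(count + 1) transform
  submit <- data.frame(datetime = datetime, count = as.vector(pred))
  write.csv(submit, file = file, row.names = FALSE, quote = FALSE)
}
# e.g. write_submission(fit.rf.cv, test_factor, test$datetime, "submit_rf_v1.csv")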
Kaggle submission scores versus the local test-set RMSLE:

| Model | Kaggle score (RMSLE) | Test.rmsle |
|---|---|---|
| rf.cv | 0.517 | 0.469 |
| gbm.cv | 0.402 | 0.315 |
| bagEarth.cv | 0.451 | 0.424 |
| ctree.party | 0.524 | 0.417 |