Problem 1.

Bagging and boosting are both general-purpose ensemble approaches that can be used for regression and classification. Bagging creates multiple bootstrap copies of the original training data set, fits a separate, independent tree to each copy, and averages the resulting predictions; its main goal is to reduce variance. Boosting, on the other hand, learns slowly: trees are grown sequentially, each one fit to the residuals of the current model and therefore dependent on the trees that came before it; its main goal is to increase predictive power by steadily reducing bias.
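
As a quick illustration of the difference (not part of the homework data or code: the data below are simulated and the randomForest package is assumed to be installed), bagging can be run as a random forest that considers all predictors at each split, while boosting grows small trees sequentially with a shrinkage parameter:

# Illustration only: bagging vs. boosting on simulated data
library(randomForest)   # assumed installed; bagging = randomForest with mtry = number of predictors
library(gbm)

set.seed(1)
n  <- 200
x1 <- rnorm(n); x2 <- rnorm(n)
sim <- data.frame(y = 2*x1 - x2 + rnorm(n), x1 = x1, x2 = x2)

# Bagging: bootstrap samples, deep independent trees, predictions averaged
bag_fit <- randomForest(y ~ x1 + x2, data = sim, mtry = 2, ntree = 500)

# Boosting: shallow trees fit sequentially to the residuals, each scaled by the shrinkage (lambda)
boost_fit <- gbm(y ~ x1 + x2, data = sim, distribution = "gaussian",
                 n.trees = 500, interaction.depth = 1, shrinkage = 0.01)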

Problem 2.

In the code below, we fit several generalized boosted regression models with different shrinkage parameters and numbers of trees.

From the first simulation (100 trees), the purple line, with shrinkage parameter lambda = 0.2, has the lowest CV error. The pink line (lambda = 0.5) initially dips below it but comes back up after about 50 trees.

From the second simulation (500 trees), the blue line, with shrinkage parameter lambda = 0.1, dips below the purple and pink, while the purple and pink curves start to come back up.

From the third simulation (1000 trees), blue and purple (lambda = 0.1 and lambda = 0.2, respectively) compete for the lowest CV error, and green (lambda = 0.01) now drops below pink.

From the fourth simulation (100,000 trees), the most interesting one to me personally, the pink, blue, and purple curves all start bouncing back up. This is because those models are now fitting the noise, so increasing the number of trees no longer helps. Green also looks like it is starting to creep back up, but it stays closest to the minimum CV value.

Based on these four simulations, I built an MSE vs. lambda graph (training and test MSE over a grid of shrinkage values). From it, the lowest test MSE occurs near lambda = 0.01, which corresponds to the green curve. However, this plot uses 1000 trees, so it matches the conclusion we had reached before.

In general, what I have learned is that after a certain number of trees, adding more trees no longer reduces CV error because the GBM begins to fit the noise (the epsilon term). Based on the test-MSE curve, I would estimate the best lambda value to be around 0.02 or 0.025.

require(gbm)
## Loading required package: gbm
## Warning: package 'gbm' was built under R version 3.3.2
## Loading required package: survival
## Loading required package: lattice
## Loading required package: splines
## Loading required package: parallel
## Loaded gbm 2.1.1
kajal_hw6<- read.csv("C:/Users/Kajal/Downloads/HW6.csv")

#SIMULATION 1

nt<-100

gbm1<-gbm(Y~x1+x2+x3+x4+x5+x6, data=kajal_hw6, cv.folds = 3, shrinkage=0.001, n.trees=nt)
## Distribution not specified, assuming gaussian ...
gbm2<-gbm(Y~x1+x2+x3+x4+x5+x6, data=kajal_hw6, cv.folds = 3, shrinkage=0.01, n.trees=nt)
## Distribution not specified, assuming gaussian ...
gbm3<- gbm(Y~x1+x2+x3+x4+x5+x6, data=kajal_hw6, cv.folds=3, shrinkage = 0.1, n.trees=nt) 
## Distribution not specified, assuming gaussian ...
gbm4<- gbm(Y~x1+x2+x3+x4+x5+x6, data=kajal_hw6, cv.folds=3, shrinkage = 0.2, n.trees=nt) 
## Distribution not specified, assuming gaussian ...
gbm5<- gbm(Y~x1+x2+x3+x4+x5+x6, data=kajal_hw6, cv.folds=3, shrinkage = 0.5, n.trees=nt)  
## Distribution not specified, assuming gaussian ...
plot(gbm1$cv.error,ylim=c(0,8.5),col="red",type="l")
points(gbm2$cv.error,col="green",type="l")
points(gbm3$cv.error,col="blue",type="l")
points(gbm4$cv.error,col="purple",type="l")
points(gbm5$cv.error,col="pink",type="l")

#SIMULATION 2

nt<-500

gbm1<-gbm(Y~x1+x2+x3+x4+x5+x6, data=kajal_hw6, cv.folds = 3, shrinkage=0.001, n.trees=nt)
## Distribution not specified, assuming gaussian ...
gbm2<-gbm(Y~x1+x2+x3+x4+x5+x6, data=kajal_hw6, cv.folds = 3, shrinkage=0.01, n.trees=nt)
## Distribution not specified, assuming gaussian ...
gbm3<- gbm(Y~x1+x2+x3+x4+x5+x6, data=kajal_hw6, cv.folds=3, shrinkage = 0.1, n.trees=nt) 
## Distribution not specified, assuming gaussian ...
gbm4<- gbm(Y~x1+x2+x3+x4+x5+x6, data=kajal_hw6, cv.folds=3, shrinkage = 0.2, n.trees=nt) 
## Distribution not specified, assuming gaussian ...
gbm5<- gbm(Y~x1+x2+x3+x4+x5+x6, data=kajal_hw6, cv.folds=3, shrinkage = 0.5, n.trees=nt)  
## Distribution not specified, assuming gaussian ...
plot(gbm1$cv.error,ylim=c(0,8.5),col="red",type="l")
points(gbm2$cv.error,col="green",type="l")
points(gbm3$cv.error,col="blue",type="l")
points(gbm4$cv.error,col="purple",type="l")
points(gbm5$cv.error,col="pink",type="l")

#SIMULATION 3

nt<-1000

gbm1<-gbm(Y~x1+x2+x3+x4+x5+x6, data=kajal_hw6, cv.folds = 3, shrinkage=0.001, n.trees=nt)
## Distribution not specified, assuming gaussian ...
gbm2<-gbm(Y~x1+x2+x3+x4+x5+x6, data=kajal_hw6, cv.folds = 3, shrinkage=0.01, n.trees=nt)
## Distribution not specified, assuming gaussian ...
gbm3<- gbm(Y~x1+x2+x3+x4+x5+x6, data=kajal_hw6, cv.folds=3, shrinkage = 0.1, n.trees=nt) 
## Distribution not specified, assuming gaussian ...
gbm4<- gbm(Y~x1+x2+x3+x4+x5+x6, data=kajal_hw6, cv.folds=3, shrinkage = 0.2, n.trees=nt) 
## Distribution not specified, assuming gaussian ...
gbm5<- gbm(Y~x1+x2+x3+x4+x5+x6, data=kajal_hw6, cv.folds=3, shrinkage = 0.5, n.trees=nt)  
## Distribution not specified, assuming gaussian ...
plot(gbm1$cv.error,ylim=c(0,8.5),col="red",type="l")
points(gbm2$cv.error,col="green",type="l")
points(gbm3$cv.error,col="blue",type="l")
points(gbm4$cv.error,col="purple",type="l")
points(gbm5$cv.error,col="pink",type="l")

#SIMULATION 4

nt<-100000

gbm1<-gbm(Y~x1+x2+x3+x4+x5+x6, data=kajal_hw6, cv.folds = 3, shrinkage=0.001, n.trees=nt)
## Distribution not specified, assuming gaussian ...
gbm2<-gbm(Y~x1+x2+x3+x4+x5+x6, data=kajal_hw6, cv.folds = 3, shrinkage=0.01, n.trees=nt)
## Distribution not specified, assuming gaussian ...
gbm3<- gbm(Y~x1+x2+x3+x4+x5+x6, data=kajal_hw6, cv.folds=3, shrinkage = 0.1, n.trees=nt) 
## Distribution not specified, assuming gaussian ...
gbm4<- gbm(Y~x1+x2+x3+x4+x5+x6, data=kajal_hw6, cv.folds=3, shrinkage = 0.2, n.trees=nt) 
## Distribution not specified, assuming gaussian ...
gbm5<- gbm(Y~x1+x2+x3+x4+x5+x6, data=kajal_hw6, cv.folds=3, shrinkage = 0.5, n.trees=nt)
## Distribution not specified, assuming gaussian ...
plot(gbm1$cv.error,ylim=c(0,8.5),col="red",type="l")
points(gbm2$cv.error,col="green",type="l")
points(gbm3$cv.error,col="blue",type="l")
points(gbm4$cv.error,col="purple",type="l")
points(gbm5$cv.error,col="pink",type="l")

# Split the 1000 observations into training and test halves
train <- kajal_hw6[1:500,]
test <- kajal_hw6[501:1000,]


lambda_set <- seq( 1.e-4, 0.3, by=0.001 )


training_set_mse <- rep(NA,length(lambda_set))
test_set_mse <- rep(NA,length(lambda_set))

# For each shrinkage value, fit a 1000-tree boosted model on the training half
# and record the training- and test-set MSE
for( lmi in 1:length(lambda_set) ){
  lm = lambda_set[lmi]
  
  boostkajalset <- gbm( Y ~ ., data=train, distribution="gaussian", n.trees=1000, interaction.depth=4, shrinkage=lm )

  y_hat <- predict(boostkajalset, newdata=train, n.trees=1000 )
  training_set_mse[lmi] <- mean( ( y_hat - train$Y )^2 )
  
  y_hat <- predict(boostkajalset, newdata=test, n.trees=1000 )
  test_set_mse[lmi] <- mean( ( y_hat - test$Y )^2 )
}

# Plot training (red) and test (green) MSE against lambda on a common y-scale
plot( lambda_set, training_set_mse, type='b', pch=19, col='red', xlab='Lambda Value', ylab='MSE',
      ylim=range(c(training_set_mse, test_set_mse)) )
lines( lambda_set, test_set_mse, type='b', pch=19, col='green' )
grid()
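
To pin down the "best" lambda mentioned above instead of reading it off the plot, the minimum of the test-set MSE curve can be extracted directly (a small addition to the code above):

# Shrinkage value with the lowest test-set MSE
lambda_set[ which.min(test_set_mse) ]
min(test_set_mse)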