This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks. You can embed an R code chunk like this:

#q01
rm(list=ls())
library('knitr')
require(gbm)
## Loading required package: gbm
## Loading required package: survival
## Loading required package: splines
## Loading required package: lattice
## Loading required package: parallel
## Loaded gbm 2.1
par(mar=rep(4, 4)) 
# Red wine quality data: 11 physicochemical attributes plus quality (column 12)
wine_data <- read.csv("/home/archana/ML_works_ucsc/winequality-red.csv", header=TRUE, sep=';')
#q02a
library("ridge")
library(help="ridge")
wine_train = wine_data[1:1400,]
wine_test = wine_data[1401:1599,]
# Root-mean-squared error between predictions X and targets Y
rmse=function(X,Y){
  return( sqrt(sum((X-Y)^2)/length(Y)) )
}
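# Quick sanity check of the helper on toy vectors: sqrt(4/3) ~ 1.1547
rmse(c(1, 2, 3), c(1, 2, 5))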
ptm <- proc.time()
# 30 candidate lambdas, log-spaced from 10^3 down to 10^-2.8
xlambda=rep(0, times = 30)
for(i in seq(from = 0, to = 29)){
  expo <- (3 - 4*(i/20))   # "expo" avoids shadowing base::exp
  xlambda[i+1] <- 10^expo
}
wine_train_err_lr = rep(0, times = length(xlambda))
wine_test_err_lr = rep(0, times = length(xlambda))
min_lambda = 10000
y_train = wine_train[,12]
y_test = wine_test[,12]
for(i in 1:length(xlambda)){
  # track the lambda at which test RMSE last dropped noticeably (crude optimum heuristic)
  lrmodel = linearRidge(quality~., data = wine_train, lambda = xlambda[i])
  wine_train_err_lr[i] = rmse(y_train, predict(lrmodel, newdata = wine_train[,-12]))
  wine_test_err_lr[i] = rmse(y_test, predict(lrmodel, newdata = wine_test[,-12]))

  if(i > 1 && wine_test_err_lr[i] < (wine_test_err_lr[i-1]-0.005)){
    min_lambda = xlambda[i]
    index_lambda=i
  }
}
plot(1:length(xlambda),wine_train_err_lr,
     ylim=c(min(wine_train_err_lr, wine_test_err_lr),
            max(wine_train_err_lr, wine_test_err_lr)))
points(1:length(xlambda),wine_test_err_lr, col='red')

(plot: training RMSE (black) and test RMSE (red) versus lambda index for ridge regression)
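# Simpler cross-check of the drop-based heuristic above (a sketch; idx_min is
# a new name, not used below): take the lambda that minimizes test RMSE.
idx_min <- which.min(wine_test_err_lr)
xlambda[idx_min]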

lrmodel2 = linearRidge(quality~., data = wine_train, lambda=min_lambda)
ptm_lr = proc.time() - ptm
wine_train_err_lr2 = rmse(y_train, predict(lrmodel2, newdata = wine_train[,-12]))
wine_test_err_lr2 = rmse(y_test, predict(lrmodel2, newdata = wine_test[,-12]))
ptm <- proc.time()
gbm1 <- gbm(quality~.,
            distribution="gaussian", 
            data=wine_data, 
            n.trees = 1000,
            interaction.depth=1,
            cv.folds=10,
            shrinkage=0.01)
best.iter = gbm.perf(gbm1,method="cv")  # CV-selected number of boosting iterations

(plot: gbm training and CV error versus iteration; gbm.perf marks the CV-optimal tree count)

ptm_bm = proc.time() - ptm
wine_train_error_bm =  min(gbm1$train.error)  # gaussian deviance = mean squared error
wine_test_error_bm =  min(gbm1$cv.error)      # note: MSE, while the ridge figures above are RMSE
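# Hedged cross-check (a sketch, not part of the assignment output): score the
# ridge train/test split at the CV-selected tree count. gbm1 was fit on all of
# wine_data, so the test rows were seen in training and this RMSE is optimistic.
rmse(y_test, predict(gbm1, newdata = wine_test[,-12], n.trees = best.iter))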
cat(sprintf("The %dth lambda is optimal: %.4f\n", index_lambda, min_lambda))
## The 18th lambda is optimal: 0.3981
sprintf("Ridge training error: %f", wine_train_err_lr2)
## [1] "Ridge training error: 0.648245"
sprintf("Ridge test: %f",wine_test_err_lr2)
## [1] "Ridge test: 0.709274"
sprintf("Ridge regression Running time: %f sec",(ptm_lr[3]))
## [1] "Ridge regression Running time: 1.038000 sec"
cat("\n")
sprintf("Boosting training error: %f", wine_train_error_bm)
## [1] "Boosting training error: 0.376938"
sprintf("Boosting test error: %f", wine_test_error_bm)
## [1] "Boosting test error: 0.404349"
sprintf("1000 trees boost Running time: %f sec", ptm_bm[3])
## [1] "1000 trees boost Running time: 4.166000 sec"
## q02b
library(randomForest)
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
x_train<-wine_train[,-12]
y_train<-wine_train[,12]
x_test<-wine_test[,-12]
y_test<-wine_test[,12]
ptm <- proc.time()
fit<-randomForest(x_train, y=y_train, xtest=x_test, ytest=y_test, ntree=1000, oob.prox=FALSE)
ptm_rf = proc.time() - ptm
wine_train_err_rf = min(fit$mse)        # out-of-bag mean squared error, minimized over tree counts
wine_test_err_rf = min(fit$test$mse)    # test-set MSE; again not directly comparable to ridge RMSE
wine_train_err_rf
## [1] 0.31
wine_test_err_rf
## [1] 0.4625
sprintf("Random forest training error: %f", wine_train_err_rf)
## [1] "Random forest training error: 0.309979"
sprintf("Random forest test error: %f", wine_test_err_rf)
## [1] "Random forest test error: 0.462453"
sprintf("Random forest Running time: %f sec", ptm_rf[3])
## [1] "Random forest Running time: 5.260000 sec"
##q02c
# Caveat: the ridge row reports RMSE while the boosting and forest rows report
# MSE, so the error columns are not on an identical scale.
results = data.frame(Training.Error=c(wine_train_err_lr2, wine_train_error_bm, wine_train_err_rf),
                     Test.Error=c(wine_test_err_lr2, wine_test_error_bm, wine_test_err_rf),
                     Running.Time=c(ptm_lr[3], ptm_bm[3], ptm_rf[3]))
row.names(results) <- c("Ridge regression", "Boosted method", "Random Forest")
print(results)
##                  Training.Error Test.Error Running.Time
## Ridge regression         0.6482     0.7093        1.038
## Boosted method           0.3769     0.4043        4.166
## Random Forest            0.3100     0.4625        5.260
##q03a
sonar_train = read.csv("sonar_train.csv", header=FALSE)
sonar_test = read.csv("sonar_test.csv", header=FALSE)
sonar_hold = rbind(sonar_train, sonar_test)
rm(sonar_train, sonar_test)
set_seed <- function(i) {
  # The original save/restore of .Random.seed was a no-op, so this is
  # simply a named wrapper around set.seed().
  set.seed(i)
}
k = 5
num_sample = nrow(sonar_hold)
seed_val=123
set_seed(seed_val)
sonar_full = sonar_hold[sample(num_sample, num_sample, replace=FALSE),]  # shuffle rows
sonar_scaled = as.data.frame(cbind(scale(sonar_full[,-61]), sonar_full[,61]))  # scale attributes; label stays in V61
pick = k #pick kth set
sonar_train = sonar_scaled [1:(num_sample*((k-1)/k)),]
sonar_cv = sonar_scaled [1:(num_sample*(1/k)),]
error_train = 0
error_cv = 0
for(j in 1:k){
  i_tmp = 1
  for(i in 1:k){ 
    if(i == pick){
      sonar_cv = sonar_scaled[((i-1)*num_sample/k+1):(num_sample*(i/k)),]
    } else {
      sonar_train[((i_tmp-1)*num_sample/k+1):(num_sample*(i_tmp/k)), ] = sonar_scaled[((i-1)*num_sample/k+1):(num_sample*i/k),]
      i_tmp = i_tmp + 1
    }
  }
  pick = pick - 1
  y_sonar_train = sonar_train[,61]
  x_sonar_train = sonar_train[,-61]
  y_sonar_cv = sonar_cv[,61]
  x_sonar_cv = sonar_cv[,-61]
  lmodel = lm(V61~., sonar_train)
  yp = (y_sonar_train >= 0.0)
  yh = predict(lmodel, x_sonar_train)
  yhp = (yh > 0.0)
  error_train = error_train + sum(yhp != yp)/(length(y_sonar_train)*k)
  yp = (y_sonar_cv >= 0.0)
  yh = predict(lmodel, x_sonar_cv)
  yhp = (yh > 0.0)
  error_cv = error_cv + sum(yhp != yp)/(length(y_sonar_cv)*k)
}
sprintf("5-fold training error: %f", error_train)
## [1] "5-fold training error: 0.077108"
sprintf("5-fold cross validation error: %f", error_cv)
## [1] "5-fold cross validation error: 0.248780"
##q03b
k = 5
num_sample = nrow(sonar_hold)-3  # 205 rows, evenly divisible into k = 5 folds
seed_val=123
pick = k #pick kth set
error_train=rep(0, times=26)
error_cv=rep(0, times=26)
id_size = 1 
list_sample = c(0)
while((num_sample*(k-1)/k) > 60){  # shrink until the training split approaches 60 rows (one per attribute)
  while(sum(sonar_full[,61]==-1)==0 | sum(sonar_full[,61]==1)==0){  # reshuffle until both classes appear
    seed_val = seed_val + 1
    set_seed(seed_val)
    sonar_full = sonar_hold[sample(num_sample, num_sample, replace=FALSE),]
  }
  sonar_scaled = as.data.frame(cbind(scale(sonar_full[,-61]), sonar_full[,61]))
  sonar_train = sonar_scaled[1:(num_sample*((k-1)/k)),]
  sonar_cv = sonar_scaled[1:(num_sample*(1/k)),]
  pick=k
  for(j in 1:k){
    i_tmp = 1
    for(i in 1:k){
      if(i == pick){
        sonar_cv = sonar_scaled[((i-1)*num_sample/k+1):(num_sample*(i/k)),]
      } else {
        sonar_train[((i_tmp-1)*num_sample/k+1):(num_sample*(i_tmp/k)), ] = sonar_scaled[((i-1)*num_sample/k+1):(num_sample*i/k),]
        i_tmp = i_tmp + 1
      }
    }
    pick = pick - 1
    y_sonar_train = sonar_train[,61]
    x_sonar_train = sonar_train[,-61]
    y_sonar_cv = sonar_cv[,61]
    x_sonar_cv = sonar_cv[,-61]
    lmodel = lm(V61~., sonar_train)
    yp = (y_sonar_train >= 0.0)
    yh = predict(lmodel, x_sonar_train)
    yhp = (yh > 0.0)
    error_train[id_size] = error_train[id_size] + sum(yhp != yp)/(length(y_sonar_train)*k)
    yp = (y_sonar_cv >= 0.0)
    yh = predict(lmodel, x_sonar_cv)
    yhp = (yh > 0.0)
    error_cv[id_size] = error_cv[id_size] + sum(yhp != yp)/(length(y_sonar_cv)*k)
  }

  list_sample[id_size] = num_sample
  num_sample = num_sample - 5
  id_size = id_size + 1

}

pt_overfit = list_sample[which.max(error_cv - error_train)]  # sample size maximizing the gap
sprintf("With %d observations, the gap between cross-validation error and training error is largest.", pt_overfit)
## [1] "With 85 observations, the gap between cross-validation error and training error is largest."
plot(list_sample, error_train,
     ylim=c(min(error_train, error_cv),
            max(error_train, error_cv)),
     ylab="error", xlab="# of observations")
points(list_sample, error_cv, col='red')
lines(spline(list_sample, error_train,n=200))
lines(spline(list_sample, error_cv,n=200), col="red")

(plot: training error (black) and CV error (red) versus number of observations, with spline fits)

##q04a_1
sonar_full = sonar_hold[sample(nrow(sonar_hold), nrow(sonar_hold), replace=FALSE),]  # reshuffle all 208 rows
sonar_scaled = as.data.frame(cbind(scale(sonar_full[,-61]), sonar_full[,61]))

training_set = sonar_scaled[1:pt_overfit,]
holdout_set = sonar_scaled[(pt_overfit+1):nrow(sonar_scaled),]  # +1 so the two sets share no row
n = 15  # attributes sampled per sub-model
# q04a_2
training_set_x = training_set[,-61]
training_set_y = training_set[,61]

train_sub_err = rep(0, times =10)
train=rep(0, times=length(training_set_y))
pred_train = data.frame(train, train, train, train, train, train, train, train, train, train) 
pred_train2 = data.frame(train, train, train, train, train, train, train, train, train, train) 

test_sub_err = rep(0, times =10)
hold=rep(0, times=length(holdout_set[,61]))
pred_hold = data.frame(hold, hold, hold, hold, hold, hold, hold, hold, hold, hold) 
pred_hold2 = data.frame(hold, hold, hold, hold, hold, hold, hold, hold, hold, hold) 

ptm = proc.time()

for(i in 1:10){
  #a) choose n attributes out of 60
  set_seed(seed_val)
  seed_val = seed_val + 1
  training_sub_set_x = training_set_x[,sample(60, n, replace=FALSE)]
  
  #b) fit a linear model
  training_sub_set_yx = cbind(training_sub_set_x, training_set_y)
  lm_sub = lm(training_set_y~., training_sub_set_yx)
  
  #c) check error
  yp = (training_set_y >= 0.0)
  yh = predict(lm_sub, training_sub_set_x )
  yhp = (yh > 0.0)
  train_sub_err[i] = sum(yhp != yp)/(length(training_set_y))
  pred_train[,i]= yh
  pred_train2[yhp,i]= 1
  pred_train2[!yhp,i]= -1
  
  yp = (holdout_set[,61] >= 0.0)
  yh = predict(lm_sub, holdout_set[,-61])
  yhp = (yh > 0.0)
  test_sub_err[i] = sum(yhp != yp)/(length(holdout_set[,61]))
  pred_hold[,i]= yh
  pred_hold2[yhp,i]= 1
  pred_hold2[!yhp,i]= -1
  
}

ptm1 = proc.time() - ptm


for(i in 1:10){
  cat(sprintf("%dth %d attributes traning error: %f", i, n, train_sub_err[i]), "\n")
  cat(sprintf("%dth %d attributes test error: %f", i, n, test_sub_err[i]), "\n")
}
## 1th 15 attributes traning error: 0.258824 
## 1th 15 attributes test error: 0.322581 
## 2th 15 attributes traning error: 0.129412 
## 2th 15 attributes test error: 0.330645 
## 3th 15 attributes traning error: 0.258824 
## 3th 15 attributes test error: 0.387097 
## 4th 15 attributes traning error: 0.164706 
## 4th 15 attributes test error: 0.346774 
## 5th 15 attributes traning error: 0.105882 
## 5th 15 attributes test error: 0.314516 
## 6th 15 attributes traning error: 0.152941 
## 6th 15 attributes test error: 0.314516 
## 7th 15 attributes traning error: 0.188235 
## 7th 15 attributes test error: 0.250000 
## 8th 15 attributes traning error: 0.164706 
## 8th 15 attributes test error: 0.306452 
## 9th 15 attributes traning error: 0.164706 
## 9th 15 attributes test error: 0.338710 
## 10th 15 attributes traning error: 0.141176 
## 10th 15 attributes test error: 0.362903
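# A simple baseline before the ridge stacking in q04b (a sketch; vote_hold and
# vote_err are new names): majority vote over the ten +/-1 sub-model columns.
# sign() returns 0 on a 5-5 tie, which this count treats the same as -1.
vote_hold <- sign(rowSums(pred_hold2))
vote_err <- sum((vote_hold > 0) != (holdout_set[,61] >= 0)) / nrow(holdout_set)
vote_err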
##q04b
library(MASS)
# same 30-lambda grid as q02a
xlambda=rep(0, times = 30)
for(i in seq(from = 0, to = 29)){
  expo <- (3 - 4*(i/20))
  xlambda[i+1] <- 10^expo
}

yx_pred_train = cbind(pred_train2, training_set_y)
holdout_y=holdout_set[,61]
yx_pred_hold = cbind(pred_hold2, holdout_y)

en_train_err=rep(0, times=length(xlambda))
en_hold_err=rep(0, times=length(xlambda))

for(ilambda in 1:length(xlambda)){
  
  enmodel = lm.ridge(training_set_y~., yx_pred_train, lambda=xlambda[ilambda])
  # lm.ridge fits on centred, scaled predictors: unscale the coefficients via
  # $scales, then re-centre predictions with $xm (column means) and $ym (response mean)
  A = as.array(enmodel$coef[1:10]/enmodel$scales)
  
  X = pred_train2
  for( i in seq(from = 1, to = ncol(pred_train2))){
    X[,i] = pred_train2[,i] - enmodel$xm[i]
  }
  X = as.matrix(X)
  yh = X%*%A + enmodel$ym
  
  yhP = (yh >= 0.0)
  yp = (training_set_y >= 0.0)
  
  en_train_err[ilambda] = en_train_err[ilambda] + sum(yhP != yp)/length(training_set_y)
  
  #holdout set
  X = pred_hold2
  for( i in seq(from = 1, to = ncol(pred_hold2))){
    X[,i] = pred_hold2[,i] - enmodel$xm[i]
  }
  X = as.matrix(X)
  yh = X%*%A + enmodel$ym
  
  yhP = (yh >= 0.0)
  yp = (holdout_y >= 0.0)
  
  en_hold_err[ilambda] = en_hold_err[ilambda] + sum(yhP != yp)/length(holdout_y)
} 


# smallest lambda index whose holdout error is within 0.01 of the minimum
th_lambda = min(which(min(en_hold_err)+0.01 > en_hold_err))
min_en_lambda = xlambda[th_lambda]
sprintf("Optimal lambda is %f", min_en_lambda)
## [1] "Optimal lambda is 39.810717"
plot(1:length(xlambda),en_train_err,
     ylim=c(min(en_train_err, en_hold_err),
            max(en_train_err, en_hold_err)))
points(1:length(xlambda),en_hold_err, col='red')
lines(spline(1:length(xlambda), en_train_err,n=200))
lines(spline(1:length(xlambda), en_hold_err,n=200), col="red")

(plot: ensemble ridge training error (black) and holdout error (red) versus lambda index, with spline fits)

##q04c

seed_val=123
sonar_full = sonar_hold[sample(nrow(sonar_hold), nrow(sonar_hold), replace=FALSE),]
sonar_scaled = as.data.frame(cbind(scale(sonar_full[,-61]), sonar_full[,61]))

training_set = sonar_scaled[1:pt_overfit,]
holdout_set = sonar_scaled[(pt_overfit+1):nrow(sonar_scaled),]

training_set_x = training_set[,-61]
training_set_y = training_set[,61]

train_sub_err = rep(0, times =10)
train=rep(0, times=length(training_set_y))
pred_train = data.frame(train, train, train, train, train, train, train, train, train, train) 
pred_train2 = data.frame(train, train, train, train, train, train, train, train, train, train) 

test_sub_err = rep(0, times =10)
hold=rep(0, times=length(holdout_set[,61]))
pred_hold = data.frame(hold, hold, hold, hold, hold, hold, hold, hold, hold, hold) 
pred_hold2 = data.frame(hold, hold, hold, hold, hold, hold, hold, hold, hold, hold) 

n_list = seq(5,30,by=5)

en_train_err=rep(0, times=length(n_list))
en_hold_err=rep(0, times=length(n_list))

id_n=0

for(n in n_list){
  id_n = id_n + 1 
  
  ptm2 = proc.time()
  
  for(i in 1:10){
    #a) choose n attributes out of 60
    set_seed(seed_val)
    seed_val = seed_val + 1
    training_sub_set_x = training_set_x[,sample(60, n, replace=FALSE)]
    
    #b) fit a linear model
    training_sub_set_yx = cbind(training_sub_set_x, training_set_y)
    lm_sub = lm(training_set_y~., training_sub_set_yx)
    
    #c) check error
    yp = (training_set_y >= 0.0)
    yh = predict(lm_sub, training_sub_set_x )
    yhp = (yh > 0.0)
    train_sub_err[i] = sum(yhp != yp)/(length(training_set_y))
    pred_train[,i]= yh
    pred_train2[yhp,i]= 1
    pred_train2[!yhp,i]= -1
    
    yp = (holdout_set[,61] >= 0.0)
    yh = predict(lm_sub, holdout_set[,-61])
    yhp = (yh > 0.0)
    test_sub_err[i] = sum(yhp != yp)/(length(holdout_set[,61]))
    pred_hold[,i]= yh
    pred_hold2[yhp,i]= 1
    pred_hold2[!yhp,i]= -1
    
  }
  
  
  yx_pred_train = cbind(pred_train2, training_set_y)
  holdout_y=holdout_set[,61]
  yx_pred_hold = cbind(pred_hold2, holdout_y)
  
  
  
  enmodel = lm.ridge(training_set_y~., yx_pred_train, lambda=min_en_lambda)
  A = as.array(enmodel$coef[1:10]/enmodel$scales)
  
  X = pred_train2
  for( i in seq(from = 1, to = ncol(pred_train2))){
    X[,i] = pred_train2[,i] - enmodel$xm[i]
  }
  X = as.matrix(X)
  yh = X%*%A + enmodel$ym
  
  yhP = (yh >= 0.0)
  yp = (training_set_y >= 0.0)
  
  en_train_err[id_n] = en_train_err[id_n] + sum(yhP != yp)/length(training_set_y)
  
  #holdout set
  X = pred_hold2
  for( i in seq(from = 1, to = ncol(pred_hold2))){
    X[,i] = pred_hold2[,i] - enmodel$xm[i]
  }
  X = as.matrix(X)
  yh = X%*%A + enmodel$ym
  
  yhP = (yh >= 0.0)
  yp = (holdout_y >= 0.0)
  
  en_hold_err[id_n] = en_hold_err[id_n] + sum(yhP != yp)/length(holdout_y)
  
  if(id_n == 1)
    ptm_en = proc.time() - ptm2  # time one pass only; ptm1 from q04a is added below
} 

ptm_en=ptm_en + ptm1

plot(n_list,en_train_err,
     ylim=c(min(en_train_err, en_hold_err),
            max(en_train_err, en_hold_err)))
points(n_list,en_hold_err, col='red')
lines(spline(n_list, en_train_err,n=200))
lines(spline(n_list, en_hold_err,n=200), col="red")

(plot: ensemble training error (black) and holdout error (red) versus number of sampled attributes n, with spline fits)

min_n = n_list[which(en_hold_err==min(en_hold_err))]
sprintf("When n is %d, the error rate is the lowest.", min_n)
## [1] "When n is 20, the error rate is the lowest."
#q04d
ptm = proc.time()
# Baseline: ordinary least squares on all 60 attributes of the 85-row training set
lmodel_85row = lm(V61~., training_set)
yp = (training_set[,61] >= 0.0)
yh = predict(lmodel_85row, training_set_x)
yhp = (yh > 0.0)
error_train_85 = sum(yhp != yp)/(length(training_set[,61]))
yp = (holdout_y >= 0.0)
yh = predict(lmodel_85row, holdout_set[,-61])
yhp = (yh > 0.0)
error_hold_85 = sum(yhp != yp)/(length(holdout_y))
ptm_lsm = proc.time() - ptm
results = data.frame(Training.Error=c(error_train_85, en_train_err[4]),   # index 4 of n_list is n = 20
                     Test.Error=c(error_hold_85, en_hold_err[4]),
                     Running.Time=c(ptm_lsm[3], ptm_en[3]))
row.names(results) <- c("Linear Model(#o=85)", "Ensemble Method(lambda=40, #n=20)")
print(results)
##                                   Training.Error Test.Error Running.Time
## Linear Model(#o=85)                      0.01176     0.3226        0.060
## Ensemble Method(lambda=40, #n=20)        0.07059     0.2177        0.212
print("#o: number of observations (out of 208), #n: number of attributes (out of 60)")
## [1] "#o: number of observations (out of 208), #n: number of attributes (out of 60)"
##q05a
seed_val=123
set_seed(seed_val)
sonar_train = read.csv("sonar_train.csv", header=FALSE)
sonar_test = read.csv("sonar_test.csv", header=FALSE)
ptm = proc.time()
fit_rf = randomForest(sonar_train[,-61], y=as.factor(sonar_train[,61]), ntree=100)
ptm5 = proc.time() - ptm
train_err_rf = 1-sum(sonar_train[,61]==predict(fit_rf,sonar_train[,-61]))/length(sonar_train[,61])
test_err_rf = 1-sum(sonar_test[,61]==predict(fit_rf,sonar_test[,-61]))/length(sonar_test[,61])
sprintf("Training error: %f", train_err_rf)
## [1] "Training error: 0.000000"
sprintf("Test error: %f", test_err_rf)
## [1] "Test error: 0.141026"
#q05b
require(rpart)
## Loading required package: rpart
sonar_train_x = sonar_train[, -61]
sonar_train_y = as.factor(sonar_train[, 61])
sonar_test_x = sonar_test[, -61]
sonar_test_y = sonar_test[, 61]

pred_dt2 = as.data.frame(matrix(0, length(sonar_train_y), 100))  # one column of predictions per tree
seed_val=123
fit_list <- list()
for(i in 1:100){
  set_seed(seed_val)
  seed_val=seed_val+1
  #sample 10 attributes
  train_subset = cbind(sonar_train_x[, sample(60, 10, replace=FALSE)], sonar_train_y)
  
  # fit one of 100 depth-2 rpart trees (cp = -1 forces splitting to full depth)
  fit_dt = rpart(sonar_train_y~., train_subset, control = rpart.control(cp = -1, maxdepth = 2))
  fit_list[[i]] = fit_dt
  pred_dt2[, i] = as.numeric(as.character(predict(fit_dt, train_subset, type="class")))
  
}

V101 = as.numeric(as.character(sonar_train_y))  # recover the -1/1 labels from the factor
pred_dt2_yx= cbind(pred_dt2, V101)
##5-fold cross validation
k = 5
num_sample = nrow(pred_dt2_yx)  # must be set before the slices below
xlambda = rep(0, times = 30)    # 30 lambdas, log-spaced from 10^5 down to 10^-0.8
for(i in seq(from = 0, to = 29)){
  expo <- (5 - 4*(i/20))
  xlambda[i+1] <- 10^expo
}
error_train_total = matrix(0, nrow = length(xlambda), ncol = 1)
error_cross_total = matrix(0, nrow = length(xlambda), ncol = 1)
dt2_train = pred_dt2_yx[1:(num_sample*((k-1)/k)),]
dt2_cross = pred_dt2_yx[1:(num_sample*(1/k)),]

for(ilambda in 1:length(xlambda)){
  pick = k #pick kth set
  
  error_train = 0
  error_cross = 0
  
  for(j in 1:k){
    i_tmp = 1
    for(i in 1:k){
      
      #choose training set, and cross validation set 
      if(i == pick){
        dt2_cross = pred_dt2_yx[((i-1)*num_sample/k+1):(num_sample*(i/k)),]
      } else {
        dt2_train[((i_tmp-1)*num_sample/k+1):(num_sample*(i_tmp/k)), ] = pred_dt2_yx[((i-1)*num_sample/k+1):(num_sample*i/k),]
        i_tmp = i_tmp + 1
      }
    }
    pick = pick - 1
    row.names(dt2_cross)<-NULL
    row.names(dt2_train)<-NULL
    y_dt2_train = dt2_train[,101]
    x_dt2_train = dt2_train[,-101]
    yx_dt2_train = cbind(x_dt2_train, y_dt2_train)
    
    y_dt2_cross = dt2_cross[,101]
    x_dt2_cross = dt2_cross[,-101]
    
    dt2_model = lm.ridge(y_dt2_train~., yx_dt2_train, lambda=xlambda[ilambda])
    A = as.array(dt2_model$coef[1:100]/dt2_model$scales)  # same unscaling recipe as in q04b
    X_train = x_dt2_train
    for( i in seq(from = 1, to = ncol(x_dt2_train))){
      X_train[,i] = x_dt2_train[,i] - dt2_model$xm[i]
    }
    X_train=as.matrix(X_train)
    yh = X_train%*%A + dt2_model$ym
    
    yhP = (yh >= 0.0)
    yp = (y_dt2_train >= 0.0)
    
    error_train = error_train + sum(yhP != yp)/(length(y_dt2_train)*k)
    
    X_cross = x_dt2_cross
    for( i in seq(from = 1, to = ncol(x_dt2_cross))){
      X_cross[,i] = x_dt2_cross[,i] - dt2_model$xm[i]
    }
    X_cross=as.matrix(X_cross)
    yh = X_cross%*%A + dt2_model$ym
    
    
    yhP = (yh >= 0.0)
    yp = (y_dt2_cross >= 0.0)
    
    error_cross = error_cross + sum(yhP != yp)/(length(y_dt2_cross)*k)
    
  }
  
  error_train_total[ilambda] = error_train
  error_cross_total[ilambda] = error_cross
  
}

th_lambda = which.min(error_cross_total)  # first lambda index achieving the minimum CV error
min_dt2_lambda = xlambda[th_lambda]

sprintf("Optimal lambda is %f.", min_dt2_lambda)
## [1] "Optimal lambda is 1000.000000."
plot(1:length(xlambda),error_train_total,
     ylim=c(min(error_train_total, error_cross_total),
            max(error_train_total, error_cross_total)))
points(1:length(xlambda),error_cross_total, col='red')
lines(spline(1:length(xlambda), error_train_total,n=200))
lines(spline(1:length(xlambda), error_cross_total,n=200), col="red")

(plot: bagged-tree ridge training error (black) and CV error (red) versus lambda index, with spline fits)

test_dt2 = as.data.frame(matrix(0, length(sonar_test_y), 100))  # test-set predictions, one column per tree

ptm = proc.time()

for(i in 1:100){
  fit_dt = fit_list[[i]] 
  test_dt2[, i] = as.numeric(as.character(predict(fit_dt, sonar_test_x, type="class")))
}

dt2_min_model = lm.ridge(V101~., pred_dt2_yx, lambda=min_dt2_lambda)
A = as.array(dt2_min_model$coef[1:100]/dt2_min_model$scales)

X = pred_dt2_yx[,-101]
for( i in seq(from = 1, to = (ncol(pred_dt2_yx)-1) ) ){
  X[,i] = pred_dt2_yx[,i] - dt2_min_model$xm[i]
}
X=as.matrix(X)
yh = X%*%A + dt2_min_model$ym

yhP = (yh >= 0.0)
yp = (pred_dt2_yx[,101] >= 0.0)

error_dt2_train = sum(yhP != yp)/length(pred_dt2_yx[,101])


X = test_dt2
for( i in seq(from = 1, to = (ncol(test_dt2)) ) ){
  X[,i] = test_dt2[,i] - dt2_min_model$xm[i]
}
X=as.matrix(X)
yh = X%*%A + dt2_min_model$ym

yhP = (yh >= 0.0)
yp = (sonar_test_y >= 0.0)

error_dt2_test = sum(yhP != yp)/length(sonar_test_y)

ptm10 = proc.time()-ptm

#q05c
sprintf("Training error: %f", train_err_rf)
## [1] "Training error: 0.000000"
sprintf("Test error: %f", test_err_rf)
## [1] "Test error: 0.141026"
sprintf("Training error: %f", error_dt2_train)
## [1] "Training error: 0.023077"
sprintf("Test error: %f", error_dt2_test)
## [1] "Test error: 0.179487"
results = data.frame(Training.Error=c(train_err_rf, error_dt2_train),
                     Test.Error=c(test_err_rf, error_dt2_test),
                     Running.Time=c(ptm5[3], ptm10[3]))
row.names(results) <- c("Random Forest", "Decision Tree bagging")
print(results)
##                       Training.Error Test.Error Running.Time
## Random Forest                0.00000     0.1410        0.052
## Decision Tree bagging        0.02308     0.1795        0.505
print("Random forest has n_tree=100, decision tree bagging has depth=2, ntree=100")
## [1] "Random forest has n_tree=100, decision tree bagging has depth=2, ntree=100"