### gbm, randomForest, rpart, ridge
rm(list=ls())
library(gbm)
## Warning: package 'gbm' was built under R version 3.1.3
## Loading required package: survival
## Loading required package: splines
## Loading required package: lattice
## Loading required package: parallel
## Loaded gbm 2.1.1
wine_data<-read.csv("C:/Users/Andrew/SkyDrive/AGZ_Home/workspace_R/UCSC/MachinLearning/All_data/winequality-red.txt",sep=";",header=TRUE)
str(wine_data)
## 'data.frame': 1599 obs. of 12 variables:
## $ fixed.acidity : num 7.4 7.8 7.8 11.2 7.4 7.4 7.9 7.3 7.8 7.5 ...
## $ volatile.acidity : num 0.7 0.88 0.76 0.28 0.7 0.66 0.6 0.65 0.58 0.5 ...
## $ citric.acid : num 0 0 0.04 0.56 0 0 0.06 0 0.02 0.36 ...
## $ residual.sugar : num 1.9 2.6 2.3 1.9 1.9 1.8 1.6 1.2 2 6.1 ...
## $ chlorides : num 0.076 0.098 0.092 0.075 0.076 0.075 0.069 0.065 0.073 0.071 ...
## $ free.sulfur.dioxide : num 11 25 15 17 11 13 15 15 9 17 ...
## $ total.sulfur.dioxide: num 34 67 54 60 34 40 59 21 18 102 ...
## $ density : num 0.998 0.997 0.997 0.998 0.998 ...
## $ pH : num 3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.35 ...
## $ sulphates : num 0.56 0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.8 ...
## $ alcohol : num 9.4 9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 10.5 ...
## $ quality : int 5 5 5 6 5 5 5 7 7 5 ...
x_train <-wine_data[, 1:11]
colnames(wine_data)[12]<- "quality"
gbm_wine_quality <- gbm(quality~.,
data=wine_data,
shrinkage=0.005,
interaction.depth=3,
#distribution= "gaussian",
distribution= "multinomial",
cv.folds=5,
n.trees=2000)
best.iter <- gbm.perf(gbm_wine_quality,method="cv") # estimated best iteration = 2000
mins <-min(gbm_wine_quality$cv.error) # error 0.3826038
NumOfTrees <-2000
splitRow<-1400
TrainSet<-seq(1,splitRow,1)
TestSet<-seq(splitRow+1,nrow(wine_data),1)
#First 1400 observations are used to create a boosted model
trainData<-wine_data[TrainSet,1:11]
trainTarget<-wine_data[TrainSet,12]
trainData1<-wine_data[TrainSet,]
# Use the last 199 observations as the hold-out set
testData<-wine_data[TestSet,1:11]
testTarget<-wine_data[TestSet,12]
Ytrain<-trainTarget
Xtrain<-trainData
#quality<-trainTarget
#names(Ytrain)<-"quality"
# fit initial model
gbmModelNumeric <- gbm(Ytrain~., # formula
data=Xtrain, # dataset
distribution="gaussian", # bernoulli, adaboost, gaussian,
# poisson, coxph, and quantile available
n.trees=NumOfTrees, # number of trees
shrinkage=0.005, # shrinkage or learning rate,
# 0.001 to 0.1 usually work
interaction.depth=5, # 1: additive model, 2: two-way interactions, etc. (was 3)
bag.fraction = 0.5, # subsampling fraction, 0.5 is probably best
train.fraction = 1.0, # fraction of data used for training (was .5);
# the first train.fraction*N rows are used for training
#n.minobsinnode = 10, # minimum total weight needed in each node
cv.folds = 5, # do 5-fold cross-validation
keep.data=TRUE, # keep a copy of the dataset with the object
verbose=FALSE) # print out progress
# check performance using an out-of-bag estimator
# OOB underestimates the optimal number of iterations
best.iter <- gbm.perf(gbmModelNumeric,method="OOB")
## Warning in gbm.perf(gbmModelNumeric, method = "OOB"): OOB generally
## underestimates the optimal number of iterations although predictive
## performance is reasonably competitive. Using cv.folds>0 when calling gbm
## usually results in improved predictive performance.
print(best.iter)
## [1] 573
# check performance using 5-fold cross-validation
best.iter <- gbm.perf(gbmModelNumeric,method="cv")
print(best.iter)
## [1] 1909
legend(1000, .5, c("OOB (out-of-bag estimate)", "CV (cross-validation)"), cex=0.8, col=c("black", "green"), lty=1)
# plot variable influence
summary(gbmModelNumeric,n.trees=1) # based on the first tree
## var rel.inf
## alcohol alcohol 54.504801
## sulphates sulphates 25.649484
## volatile.acidity volatile.acidity 12.191087
## total.sulfur.dioxide total.sulfur.dioxide 7.654628
## fixed.acidity fixed.acidity 0.000000
## citric.acid citric.acid 0.000000
## residual.sugar residual.sugar 0.000000
## chlorides chlorides 0.000000
## free.sulfur.dioxide free.sulfur.dioxide 0.000000
## density density 0.000000
## pH pH 0.000000
summary(gbmModelNumeric,n.trees=best.iter) # based on the estimated best number of trees
## var rel.inf
## alcohol alcohol 31.433129
## volatile.acidity volatile.acidity 15.017583
## sulphates sulphates 13.929913
## total.sulfur.dioxide total.sulfur.dioxide 8.945110
## chlorides chlorides 5.762426
## citric.acid citric.acid 4.582381
## pH pH 4.552241
## residual.sugar residual.sugar 4.551760
## density density 4.275012
## fixed.acidity fixed.acidity 4.202975
## free.sulfur.dioxide free.sulfur.dioxide 2.747470
# compactly print the first and last trees for curiosity
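# reading guide: SplitVar indexes predictors from 0 and is -1 for a terminal node;
# SplitCodePred is the split threshold for internal nodes and the prediction for leaves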
print(pretty.gbm.tree(gbmModelNumeric,1))
## SplitVar SplitCodePred LeftNode RightNode MissingNode ErrorReduction
## 0 10 1.155000e+01 1 14 15 92.480361
## 1 9 6.450000e-01 2 6 13 34.795390
## 2 9 5.250000e-01 3 4 5 8.725062
## 3 -1 -2.724687e-03 -1 -1 -1 0.000000
## 4 -1 -1.072311e-03 -1 -1 -1 0.000000
## 5 -1 -1.566723e-03 -1 -1 -1 0.000000
## 6 1 3.625000e-01 7 8 12 20.685078
## 7 -1 3.547243e-03 -1 -1 -1 0.000000
## 8 6 5.050000e+01 9 10 11 12.987897
## 9 -1 1.108647e-03 -1 -1 -1 0.000000
## 10 -1 -1.980728e-03 -1 -1 -1 0.000000
## 11 -1 2.316602e-06 -1 -1 -1 0.000000
## 12 -1 9.879791e-04 -1 -1 -1 0.000000
## 13 -1 -6.730132e-04 -1 -1 -1 0.000000
## 14 -1 4.248997e-03 -1 -1 -1 0.000000
## 15 -1 1.285714e-04 -1 -1 -1 0.000000
## Weight Prediction
## 0 700 1.285714e-04
## 1 586 -6.730132e-04
## 2 381 -1.566723e-03
## 3 114 -2.724687e-03
## 4 267 -1.072311e-03
## 5 381 -1.566723e-03
## 6 205 9.879791e-04
## 7 57 3.547243e-03
## 8 148 2.316602e-06
## 9 95 1.108647e-03
## 10 53 -1.980728e-03
## 11 148 2.316602e-06
## 12 205 9.879791e-04
## 13 586 -6.730132e-04
## 14 114 4.248997e-03
## 15 700 1.285714e-04
print(pretty.gbm.tree(gbmModelNumeric,gbmModelNumeric$n.trees))
## SplitVar SplitCodePred LeftNode RightNode MissingNode ErrorReduction
## 0 3 3.5000000000 1 14 15 1.756963
## 1 10 13.0500000000 2 12 13 1.990077
## 2 10 9.1500000000 3 7 11 1.679094
## 3 6 39.0000000000 4 5 6 2.214289
## 4 -1 0.0021992564 -1 -1 -1 0.000000
## 5 -1 -0.0006059907 -1 -1 -1 0.000000
## 6 -1 0.0010384645 -1 -1 -1 0.000000
## 7 6 72.5000000000 8 9 10 1.875747
## 8 -1 -0.0003264218 -1 -1 -1 0.000000
## 9 -1 0.0004021992 -1 -1 -1 0.000000
## 10 -1 -0.0001937219 -1 -1 -1 0.000000
## 11 -1 -0.0001362727 -1 -1 -1 0.000000
## 12 -1 0.0021120997 -1 -1 -1 0.000000
## 13 -1 -0.0001006971 -1 -1 -1 0.000000
## 14 -1 0.0007451411 -1 -1 -1 0.000000
## 15 -1 -0.0000185300 -1 -1 -1 0.000000
## Weight Prediction
## 0 700 -0.0000185300
## 1 632 -0.0001006971
## 2 622 -0.0001362727
## 3 29 0.0010384645
## 4 17 0.0021992564
## 5 12 -0.0006059907
## 6 29 0.0010384645
## 7 593 -0.0001937219
## 8 485 -0.0003264218
## 9 108 0.0004021992
## 10 593 -0.0001937219
## 11 622 -0.0001362727
## 12 10 0.0021120997
## 13 632 -0.0001006971
## 14 68 0.0007451411
## 15 700 -0.0000185300
#predict <- predict(gbmModelNumeric,x_train,best.iter)
pred<-predict(gbmModelNumeric, newdata=testData, n.trees=best.iter, type="response")
#What is RMS error using Euclidean Distance
rms_error <- sqrt(sum((testTarget-pred)^2)/length(testTarget))
cat("GBM: Test RMS Error on hold out set using 2000 Trees, using a Numeric response:", rms_error)
## GBM: Test RMS Error on hold out set using 2000 Trees, using a Numeric response: 0.6808431
rm(list=ls())
library(rpart)
library(ridge)
## Warning: package 'ridge' was built under R version 3.1.2
library(gbm)
wine_data <- read.csv("C:/Users/Andrew/SkyDrive/AGZ_Home/workspace_R/UCSC/MachinLearning/All_data/winequality-red.txt",sep=";",header=TRUE)
wine_train = wine_data[1:1400,]
wine_test = wine_data[1401:1599,]
rmse=function(X,Y){
return( sqrt(sum((X-Y)^2)/length(Y)) )
}
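A quick hand-checked example of the helper:
rmse(c(3, 5), c(3, 1)) # sqrt((0^2 + 4^2)/2)
## [1] 2.828427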
xlambda=rep(0, times = 30)
for(i in seq(from = 0, to = 29)){
exp <- (+3 -4*(i/20))
xlambda[i+1] <- 10^exp
}
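The loop builds a log-spaced grid of 30 lambda values running from 10^3 down to 10^(-2.8) in steps of 10^(-0.2); an equivalent one-liner, if preferred:
# same 30 values as the loop above
xlambda <- 10^seq(3, -2.8, length.out = 30)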
wine_train_err_lr = rep(0, times = length(xlambda))
wine_test_err_lr = rep(0, times = length(xlambda))
min_lambda = 10000
y_train = wine_train[,12]
y_test = wine_test[,12]
for(i in 1:length(xlambda)){
#heuristic search for the optimal lambda: record it whenever the test error drops noticeably
lrmodel = linearRidge(quality~., data = wine_train, lambda = xlambda[i])
wine_train_err_lr[i] = rmse(y_train, predict(lrmodel, newdata = wine_train[,-12]))
wine_test_err_lr[i] = rmse(y_test, predict(lrmodel, newdata = wine_test[,-12]))
if(i > 1 && wine_test_err_lr[i] < (wine_test_err_lr[i-1]-0.005)){
min_lambda = xlambda[i]
index_lambda=i
}
}
plot(1:length(xlambda),wine_train_err_lr,
ylim=c(min(wine_train_err_lr, wine_test_err_lr),
max(wine_train_err_lr, wine_test_err_lr)))
points(1:length(xlambda),wine_test_err_lr, col='red')
lrmodel2 = linearRidge(quality~., data = wine_train, lambda=min_lambda)
wine_train_err_lr2 = rmse(y_train, predict(lrmodel2, newdata = wine_train[,-12]))
wine_test_err_lr2 = rmse(y_test, predict(lrmodel2, newdata = wine_test[,-12]))
gbm1 <- gbm(quality~.,
distribution="gaussian",
data=wine_data,
n.trees = 3000,
interaction.depth=3,
cv.folds=5,
shrinkage=0.01)
best.iter = gbm.perf(gbm1,method="cv")
summary(gbm1,n.trees=best.iter)
## var rel.inf
## alcohol alcohol 24.939224
## volatile.acidity volatile.acidity 15.152118
## sulphates sulphates 13.218844
## total.sulfur.dioxide total.sulfur.dioxide 8.554816
## chlorides chlorides 6.837760
## density density 6.135272
## citric.acid citric.acid 5.634590
## pH pH 5.570655
## fixed.acidity fixed.acidity 5.388811
## residual.sugar residual.sugar 5.143485
## free.sulfur.dioxide free.sulfur.dioxide 3.424424
wine_train_error_bm = min(gbm1$train.error)
wine_test_error_bm = min(gbm1$cv.error)
cat(index_lambda, "th ", "lambda is optimal: ", min_lambda, "\n")
## 18 th lambda is optimal: 0.3981072
sprintf("Ridge training error: %f", wine_train_err_lr2)
## [1] "Ridge training error: 0.648245"
sprintf("Ridge test: %f",wine_test_err_lr2)
## [1] "Ridge test: 0.709274"
#sprintf("Ridge regression Running time: %f sec",(ptm_lr[3]))
cat("\n")
sprintf("Boosting training error: %f", wine_train_error_bm)
## [1] "Boosting training error: 0.229054"
sprintf("Boosting test error: %f", wine_test_error_bm)
## [1] "Boosting test error: 0.376224"
#sprintf("3000 trees boost Running time: %f sec", ptm_bm[3])
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.1.3
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
x_train<-wine_train[,-12]
y_train<-wine_train[,12]
x_test<-wine_test[,-12]
y_test<-wine_test[,12]
fit<-randomForest(x_train, y=y_train, xtest=x_test, ytest=y_test, ntree=3000, oob.prox=FALSE)
wine_train_err_rf = min(fit$mse)
wine_test_err_rf = min(fit$test$mse)
wine_train_err_rf
## [1] 0.3099382
wine_test_err_rf
## [1] 0.472383
sprintf("Random forest training error: %f", wine_train_err_rf)
## [1] "Random forest training error: 0.309938"
sprintf("Random forest test error: %f", wine_test_err_rf)
## [1] "Random forest test error: 0.472383"
#sprintf("Random forest Running time: %f sec", ptm_rf[3])
results = data.frame(Training.Error=c(wine_train_err_lr2, wine_train_error_bm, wine_train_err_rf),
Test.Error=c(wine_test_err_lr2, wine_test_error_bm, wine_test_err_rf))
# , Running.Time=c(ptm_lr[3], ptm_bm[3], ptm_rf[3]))
row.names(results) <- c("Ridge regression", "Boosted method", "Random Forest")
print(results)
## Training.Error Test.Error
## Ridge regression 0.6482447 0.7092735
## Boosted method 0.2290545 0.3762239
## Random Forest 0.3099382 0.4723830
#### 3a) Start with the full sonar data set. Estimate the error of an ordinary least squares linear model on this data set by using 5-fold cross validation on the whole data set. Average the 5 training errors from each of these runs. Average the 5 test errors from each of these runs. What are these averages?
#### 3b) Next run a series of cross validations on a series of data sets which are decreasing in size (decrease the number of observations). (At a minimum you must have more observations than attributes to use lm. Also, make sure that your smallest data set has at least one observation of both rock and mine.) Record the training and test errors for each data set size.
#### 3c) Plot both the training error and test error versus data set size on the same graph. The horizontal axis should be the number of observations in the data set and the vertical axis should be error rates.
#3a)
sonar_train = read.csv("C:/Users/Andrew/SkyDrive/AGZ_Home/workspace_R/UCSC/MachinLearning/All_data/sonar_train.csv", header=FALSE)
sonar_test = read.csv("C:/Users/Andrew/SkyDrive/AGZ_Home/workspace_R/UCSC/MachinLearning/All_data/sonar_test.csv", header=FALSE)
sonar_hold = rbind(sonar_train, sonar_test)
rm(sonar_train, sonar_test)
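A compact way to assign folds, shown here as a sketch (the code below instead slices contiguous blocks of the shuffled data by index arithmetic):
# sketch: random fold labels for 5-fold CV on the combined data
k <- 5
folds <- sample(rep(1:k, length.out = nrow(sonar_hold)))
# on iteration j, rows with folds == j form the validation set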
# reproducibility helper: sets the RNG seed
set_seed <- function(i) {
set.seed(i)
}
k = 5
num_sample = nrow(sonar_hold)
seed_val=123
set_seed(seed_val)
sonar_full = sonar_hold[sample(num_sample, num_sample, replace=FALSE),]
sonar_scaled = as.data.frame(cbind(scale(sonar_full[,-61]), sonar_full[,61]))
pick = k #pick kth set
sonar_train = sonar_scaled [1:(num_sample*((k-1)/k)),]
sonar_cv = sonar_scaled [1:(num_sample*(1/k)),]
error_train = 0
error_cv = 0
for(j in 1:k){
i_tmp = 1
for(i in 1:k){
if(i == pick){
sonar_cv = sonar_scaled[((i-1)*num_sample/k+1):(num_sample*(i/k)),]
} else {
sonar_train[((i_tmp-1)*num_sample/k+1):(num_sample*(i_tmp/k)), ] = sonar_scaled[((i-1)*num_sample/k+1):(num_sample*i/k),]
i_tmp = i_tmp + 1
}
}
pick = pick - 1
y_sonar_train = sonar_train[,61]
x_sonar_train = sonar_train[,-61]
y_sonar_cv = sonar_cv[,61]
x_sonar_cv = sonar_cv[,-61]
lmodel = lm(V61~., sonar_train)
yp = (y_sonar_train >= 0.0)
yh = predict(lmodel, x_sonar_train)
yhp = (yh > 0.0)
error_train = error_train + sum(yhp != yp)/(length(y_sonar_train)*k)
yp = (y_sonar_cv >= 0.0)
yh = predict(lmodel, x_sonar_cv)
yhp = (yh > 0.0)
error_cv= error_cv + sum(yhp != yp)/(length(y_sonar_cv)*k)
}
sprintf("5-fold training error: %f", error_train)
## [1] "5-fold training error: 0.077108"
## [1] "5-fold training error: 0.077108"
sprintf("5-fold cross validation error: %f", error_cv)
## [1] "5-fold cross validation error: 0.248780"
## [1] "5-fold cross validation error: 0.248780"
k = 5
num_sample = nrow(sonar_hold)-3
seed_val=123
pick = k #pick kth set
error_train=rep(0, times=26)
error_cv=rep(0, times=26)
id_size = 1
list_sample = c(0)
while((num_sample*(k-1)/k) > 60){
while(sum(sonar_full[,61]==-1)==0 | sum(sonar_full[,61]==1)==0){
seed_val = seed_val + 1
set_seed(seed_val)
sonar_full = sonar_hold[sample(num_sample, num_sample, replace=FALSE),]
}
sonar_scaled = as.data.frame(cbind(scale(sonar_full[,-61]), sonar_full[,61]))
sonar_train = sonar_scaled[1:(num_sample*((k-1)/k)),]
sonar_cv = sonar_scaled[1:(num_sample*(1/k)),]
pick=k
for(j in 1:k){
i_tmp = 1
for(i in 1:k){
if(i == pick){
sonar_cv = sonar_scaled[((i-1)*num_sample/k+1):(num_sample*(i/k)),]
} else {
sonar_train[((i_tmp-1)*num_sample/k+1):(num_sample*(i_tmp/k)), ] = sonar_scaled[((i-1)*num_sample/k+1):(num_sample*i/k),]
i_tmp = i_tmp + 1
}
}
pick = pick - 1
y_sonar_train = sonar_train[,61]
x_sonar_train = sonar_train[,-61]
y_sonar_cv = sonar_cv[,61]
x_sonar_cv = sonar_cv[,-61]
lmodel = lm(V61~., sonar_train)
yp = (y_sonar_train >= 0.0)
yh = predict(lmodel, x_sonar_train)
yhp = (yh > 0.0)
error_train[id_size] = error_train[id_size] + sum(yhp != yp)/(length(y_sonar_train)*k)
yp = (y_sonar_cv >= 0.0)
yh = predict(lmodel, x_sonar_cv)
yhp = (yh > 0.0)
error_cv[id_size]= error_cv[id_size] + sum(yhp != yp)/(length(y_sonar_cv)*k)
}
list_sample[id_size]=num_sample
num_sample = num_sample-5
id_size = id_size + 1
}
pt_overfit=list_sample[which(max(error_cv-error_train)==(error_cv-error_train))]
sprintf("When there are %d of observations, differnece between cross-validation error and training error is highest.", pt_overfit)
## [1] "When there are 85 of observations, differnece between cross-validation error and training error is highest."
## [1] "When there are 85 of observations, differnece between cross-validation error and training error is highest."
#Problem 3(c)
plot(list_sample, error_train,
ylim=c(min(error_train, error_cv),
max(error_train, error_cv)),
ylab="error", xlab="# of observations")
points(list_sample, error_cv, col='red')
lines(spline(list_sample, error_train,n=200))
lines(spline(list_sample, error_cv,n=200), col="red")
### Problem 4
This problem illustrates some of the concepts of ensemble methods. (For debugging purposes you may want to use the command set.seed(pi) to make the runs consistent.)
To generate one of these linear models:
A) Choose n attributes randomly from the 60 available attributes (without replacement). For example, if you fixed n to be 11, then choose 11 attributes randomly (without replacement) from the 60 available sonar attributes.
B) Fit a linear model to the training set using only these n attributes.
C) Use this model to make predictions on both the training set and the hold-out set. These predictions become one of the attributes in the ensemble model training set (one attribute for problem 4b).
Then, for (n in seq(5,30,by=5)){… create ensemble model}. Plot the training error and test error as a function of n. How well did the new ensemble models do in comparison to the original model created by problem 3?
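A sketch of steps A-C for a single base learner, using the training_set_x / training_set_y objects constructed just below:
#a) pick n = 11 of the 60 attributes at random, without replacement
cols <- sample(60, 11, replace = FALSE)
#b) fit a linear model on just those attributes
base <- lm(training_set_y~., cbind(training_set_x[, cols], training_set_y))
#c) its predictions on the training and hold-out sets become one ensemble attribute
score <- predict(base, training_set_x[, cols])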
#Problem 4(a)
sonar_full = sonar_hold[sample(length(sonar_hold[,61]), length(sonar_hold[,61]), replace=FALSE),]
sonar_scaled = as.data.frame(cbind(scale(sonar_full[,-61]), sonar_full[,61]))
training_set = sonar_scaled[1:pt_overfit,]
holdout_set = sonar_scaled[pt_overfit:length(sonar_scaled[,61]),] # note: row pt_overfit lands in both sets; pt_overfit+1 would make the split disjoint
n = 11
# q04a_2
training_set_x = training_set[,-61]
training_set_y = training_set[,61]
train_sub_err = rep(0, times =10)
train=rep(0, times=length(training_set_y))
pred_train = data.frame(train, train, train, train, train, train, train, train, train, train)
pred_train2 = data.frame(train, train, train, train, train, train, train, train, train, train)
test_sub_err = rep(0, times =10)
hold=rep(0, times=length(holdout_set[,61]))
pred_hold = data.frame(hold, hold, hold, hold, hold, hold, hold, hold, hold, hold)
pred_hold2 = data.frame(hold, hold, hold, hold, hold, hold, hold, hold, hold, hold)
ptm = proc.time()
for(i in 1:10){
#a) choose n attributes out of 60
set_seed(seed_val)
seed_val = seed_val + 1
training_sub_set_x = training_set_x[,sample(60, n, replace=FALSE)]
#b) fit a linear model
training_sub_set_yx = cbind(training_sub_set_x, training_set_y)
lm_sub = lm(training_set_y~., training_sub_set_yx)
#c) check error
yp = (training_set_y >= 0.0)
yh = predict(lm_sub, training_sub_set_x )
yhp = (yh > 0.0)
train_sub_err[i] = sum(yhp != yp)/(length(training_set_y))
pred_train[,i]= yh
pred_train2[yhp,i]= 1
pred_train2[!yhp,i]= -1
yp = (holdout_set[,61] >= 0.0)
yh = predict(lm_sub, holdout_set[,-61])
yhp = (yh > 0.0)
test_sub_err[i] = sum(yhp != yp)/(length(holdout_set[,61]))
pred_hold[,i]= yh
pred_hold2[yhp,i]= 1
pred_hold2[!yhp,i]= -1
}
for(i in 1:10){
cat(sprintf("%dth %d attributes traning error: %f", i, n, train_sub_err[i]), "\n")
cat(sprintf("%dth %d attributes test error: %f", i, n, test_sub_err[i]), "\n")
}
## 1th 11 attributes training error: 0.282353
## 1th 11 attributes test error: 0.354839
## 2th 11 attributes training error: 0.188235
## 2th 11 attributes test error: 0.370968
## 3th 11 attributes training error: 0.294118
## 3th 11 attributes test error: 0.387097
## 4th 11 attributes training error: 0.141176
## 4th 11 attributes test error: 0.370968
## 5th 11 attributes training error: 0.129412
## 5th 11 attributes test error: 0.282258
## 6th 11 attributes training error: 0.188235
## 6th 11 attributes test error: 0.290323
## 7th 11 attributes training error: 0.211765
## 7th 11 attributes test error: 0.266129
## 8th 11 attributes training error: 0.188235
## 8th 11 attributes test error: 0.290323
## 9th 11 attributes training error: 0.164706
## 9th 11 attributes test error: 0.346774
## 10th 11 attributes training error: 0.141176
## 10th 11 attributes test error: 0.370968
library(MASS)
## Warning: package 'MASS' was built under R version 3.1.2
xlambda=rep(0, times = 30)
for(i in seq(from = 0, to = 29)){
exp <- (+3 -4*(i/20))
xlambda[i+1] <- 10^exp
}
yx_pred_train = cbind(pred_train2, training_set_y)
holdout_y=holdout_set[,61]
yx_pred_hold = cbind(pred_hold2, holdout_y)
en_train_err=rep(0, times=length(xlambda))
en_hold_err=rep(0, times=length(xlambda))
for(ilambda in 1:length(xlambda)){
enmodel = lm.ridge(training_set_y~., yx_pred_train, lambda=xlambda[ilambda])
A = as.array(enmodel$coef[1:10]/enmodel$scales)
X = pred_train2
for( i in seq(from = 1, to = ncol(pred_train2))){
X[,i] = pred_train2[,i] - enmodel$xm[i]
}
X = as.matrix(X)
yh = X%*%A + enmodel$ym
yhP = (yh >= 0.0)
yp = (training_set_y >= 0.0)
en_train_err[ilambda] = en_train_err[ilambda] + sum(yhP != yp)/length(training_set_y)
#holdout set
X = pred_hold2
for( i in seq(from = 1, to = ncol(pred_hold2))){
X[,i] = pred_hold2[,i] - enmodel$xm[i]
}
X = as.matrix(X)
yh = X%*%A + enmodel$ym
yhP = (yh >= 0.0)
yp = (holdout_y >= 0.0)
en_hold_err[ilambda] = en_hold_err[ilambda] + sum(yhP != yp)/length(holdout_y)
}
th_lambda = min(which(min(en_hold_err)+0.01 > en_hold_err))
min_en_lambda = xlambda[th_lambda]
sprintf("Optimal lambda is %f", min_en_lambda)
## [1] "Optimal lambda is 100.000000"
## [1] "Optimal lambda is 100.000000"
plot(1:length(xlambda),en_train_err,
ylim=c(min(en_train_err, en_hold_err),
max(en_train_err, en_hold_err)))
points(1:length(xlambda),en_hold_err, col='red')
lines(spline(1:length(xlambda), en_train_err,n=200))
lines(spline(1:length(xlambda), en_hold_err,n=200), col="red")
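MASS::lm.ridge provides no predict() method, which is why the centering arithmetic above is done by hand; the same math, wrapped as a small helper (a sketch):
# sketch: prediction from an lm.ridge fit, mirroring the manual steps above
predict_lm_ridge <- function(model, X) {
A <- model$coef / model$scales # coefficients back on the original scale
Xc <- sweep(as.matrix(X), 2, model$xm) # center columns with the fit's column means
as.vector(Xc %*% A + model$ym) # add back the response mean
}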
#### 4c) Repeat Homework problem 4b with various values for n (the number of randomly chosen attributes). In part 4a of the example above, n was fixed to be 11. Now, put n in a loop. That is, in pseudocode: for (n in seq(5,30,by=5)){… create ensemble model}. Plot the training error and test error as a function of n. How well did the new ensemble models do in comparison to the original model created by problem 3?
seed_val=123
sonar_full = sonar_hold[sample(length(sonar_hold[,61]), length(sonar_hold[,61]), replace=FALSE),]
sonar_scaled = as.data.frame(cbind(scale(sonar_full[,-61]), sonar_full[,61]))
training_set = sonar_scaled[1:pt_overfit,]
holdout_set = sonar_scaled[pt_overfit:length(sonar_scaled[,61]),] # note: row pt_overfit lands in both sets; pt_overfit+1 would make the split disjoint
training_set_x = training_set[,-61]
training_set_y = training_set[,61]
train_sub_err = rep(0, times =10)
train=rep(0, times=length(training_set_y))
pred_train = data.frame(train, train, train, train, train, train, train, train, train, train)
pred_train2 = data.frame(train, train, train, train, train, train, train, train, train, train)
test_sub_err = rep(0, times =10)
hold=rep(0, times=length(holdout_set[,61]))
pred_hold = data.frame(hold, hold, hold, hold, hold, hold, hold, hold, hold, hold)
pred_hold2 = data.frame(hold, hold, hold, hold, hold, hold, hold, hold, hold, hold)
n_list = seq(5,30,by=5)
en_train_err=rep(0, times=length(n_list))
en_hold_err=rep(0, times=length(n_list))
id_n=0
for(n in n_list){
id_n = id_n + 1
ptm2 = proc.time()
for(i in 1:10){
#a) choose n attributes out of 60
set_seed(seed_val)
seed_val = seed_val + 1
training_sub_set_x = training_set_x[,sample(60, n, replace=FALSE)]
#b) fit a linear model
training_sub_set_yx = cbind(training_sub_set_x, training_set_y)
lm_sub = lm(training_set_y~., training_sub_set_yx)
#c) check error
yp = (training_set_y >= 0.0)
yh = predict(lm_sub, training_sub_set_x )
yhp = (yh > 0.0)
train_sub_err[i] = sum(yhp != yp)/(length(training_set_y))
pred_train[,i]= yh
pred_train2[yhp,i]= 1
pred_train2[!yhp,i]= -1
yp = (holdout_set[,61] >= 0.0)
yh = predict(lm_sub, holdout_set[,-61])
yhp = (yh > 0.0)
test_sub_err[i] = sum(yhp != yp)/(length(holdout_set[,61]))
pred_hold[,i]= yh
pred_hold2[yhp,i]= 1
pred_hold2[!yhp,i]= -1
}
yx_pred_train = cbind(pred_train2, training_set_y)
holdout_y=holdout_set[,61]
yx_pred_hold = cbind(pred_hold2, holdout_y)
enmodel = lm.ridge(training_set_y~., yx_pred_train, lambda=min_en_lambda)
A = as.array(enmodel$coef[1:10]/enmodel$scales)
X = pred_train2
for( i in seq(from = 1, to = ncol(pred_train2))){
X[,i] = pred_train2[,i] - enmodel$xm[i]
}
X = as.matrix(X)
yh = X%*%A + enmodel$ym
yhP = (yh >= 0.0)
yp = (training_set_y >= 0.0)
en_train_err[id_n] = en_train_err[id_n] + sum(yhP != yp)/length(training_set_y)
#holdout set
X = pred_hold2
for( i in seq(from = 1, to = ncol(pred_hold2))){
X[,i] = pred_hold2[,i] - enmodel$xm[i]
}
X = as.matrix(X)
yh = X%*%A + enmodel$ym
yhP = (yh >= 0.0)
yp = (holdout_y >= 0.0)
en_hold_err[id_n] = en_hold_err[id_n] + sum(yhP != yp)/length(holdout_y)
# if(id_n == 1)
# ptm_en = proc.time() - ptm2
}
# ptm_en=ptm_en + ptm1
plot(n_list,en_train_err,
ylim=c(min(en_train_err, en_hold_err),
max(en_train_err, en_hold_err)))
points(n_list,en_hold_err, col='red')
lines(spline(n_list, en_train_err,n=200))
lines(spline(n_list, en_hold_err,n=200), col="red")
min_n = n_list[which(en_hold_err==min(en_hold_err))]
sprintf("When n is %d, the error rate is the lowest.", min_n)
## [1] "When n is 15, the error rate is the lowest."
## [1] "When n is 15, the error rate is the lowest."
lmodel_70row = lm(V61~., training_set)
yp = (training_set[,61] >= 0.0)
yh = predict(lmodel_70row, training_set_x)
yhp = (yh > 0.0)
error_train_70 = sum(yhp != yp)/(length(training_set[,61]))
yp = (holdout_y >= 0.0)
yh = predict(lmodel_70row, holdout_set[,-61])
yhp = (yh > 0.0)
error_hold_70 = sum(yhp != yp)/(length(holdout_y))
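# note: en_train_err[4]/en_hold_err[4] correspond to n_list[4] = 20;
# which(n_list == min_n) would select the n = 15 model reported as best above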
results = data.frame(Training.Error=c(error_train_70, en_train_err[4]),
Test.Error=c(error_hold_70, en_hold_err[4]))
row.names(results) <- c("Linear Model", "Ensemble Method(#n=15)")
print(results)
## Training.Error Test.Error
## Linear Model 0.01176471 0.3548387
## Ensemble Method(#n=15) 0.09411765 0.2983871
print("#o: number of observations (out of 208), #n: number of attributes (out of 60)")
## [1] "#o: number of observations (out of 208), #n: number of attributes (out of 60)"
#### 5a) Use Random Forest to classify the sonar data.
#### 5b) Use rpart to generate trees with a depth of two on randomly selected attributes. Create at least 100 rpart models. The results of these models applied to the whole data set become new attributes. Use ridge regression to combine these trees to make predictions. The input columns for ridge regression are the attributes that were created from the rpart models. Use cross validation to choose the best lambda and calculate the test error for this lambda. (Use the example BabyEnsemble.R. Instead of using lm to create the initial models, use rpart. Instead of creating 10 initial models, create at least 100 models.)
#### 5c) Which model resulted in the smallest test error?
#### 5d) Extra credit: Add additional rpart models to 5b). As the number of rpart models increases, does the error rate decrease? Try using a majority count instead of ridge regression to combine the 100 models. Normally the cut off is 50%. What happens when you change this cut off to favor classifying mines instead of rocks?
seed_val=123
set_seed(seed_val)
require(randomForest)
sonar_train = read.csv("C:/Users/Andrew/SkyDrive/AGZ_Home/workspace_R/UCSC/MachinLearning/All_data/sonar_train.csv", header=FALSE)
sonar_test = read.csv("C:/Users/Andrew/SkyDrive/AGZ_Home/workspace_R/UCSC/MachinLearning/All_data/sonar_test.csv", header=FALSE)
fit_rf = randomForest(sonar_train[,-61], y=as.factor(sonar_train[,61]), ntree=100)
train_err_rf = 1-sum(sonar_train[,61]==predict(fit_rf,sonar_train[,-61]))/length(sonar_train[,61])
test_err_rf = 1-sum(sonar_test[,61]==predict(fit_rf,sonar_test[,-61]))/length(sonar_test[,61])
sprintf("Training error: %f", train_err_rf)
## [1] "Training error: 0.000000"
sprintf("Test error: %f", test_err_rf)
## [1] "Test error: 0.141026"
library(rpart)
sonar_train_x = sonar_train[, -61]
sonar_train_y = as.factor(sonar_train[, 61])
sonar_test_x = sonar_test[, -61]
sonar_test_y = sonar_test[, 61]
a_list = rep(0, times=length(sonar_train_y)*100)
a_matrix = matrix(a_list,length(sonar_train_y),100)
pred_dt2 = as.data.frame(a_matrix)
seed_val=123
fit_list <- list()
for(i in 1:100){
set_seed(seed_val)
seed_val=seed_val+1
#sample 10 attributes
train_subset = cbind(sonar_train_x[, sample(60, 10, replace=FALSE)], sonar_train_y)
#make 100 rpart models of depth 2
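# cp = -1 forces every candidate split to be made regardless of fit improvement;
# maxdepth = 2 caps each tree at two levels of splits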
fit_dt = rpart(sonar_train_y~., train_subset, control = rpart.control(cp = -1, maxdepth = 2))
fit_list[[i]] = fit_dt
pred_dt2[, i] = as.numeric(as.character(predict(fit_dt, train_subset, type="class")))
}
V101 = as.numeric(as.character(sonar_train_y))
pred_dt2_yx= cbind(pred_dt2, V101)
## 5-fold cross-validation
k=5
# define num_sample and the lambda grid before they are used in the subsets below
num_sample = nrow(pred_dt2_yx)
xlambda=rep(0, times = 30)
for(i in seq(from = 0, to = 29)){
exp <- (+5 -4*(i/20))
xlambda[i+1] <- 10^exp
}
dt2_train = pred_dt2_yx[1:(num_sample*((k-1)/k)),]
dt2_cross = pred_dt2_yx[1:(num_sample*(1/k)),]
error_train_total = matrix(0, nrow = length(xlambda), ncol = 1)
error_cross_total = matrix(0, nrow = length(xlambda), ncol = 1)
library("ridge")
for(ilambda in 1:length(xlambda)){
pick = k #pick kth set
error_train = 0
error_cross = 0
for(j in 1:k){
i_tmp = 1
for(i in 1:k){
#choose training set, and cross validation set
if(i == pick){
dt2_cross = pred_dt2_yx[((i-1)*num_sample/k+1):(num_sample*(i/k)),]
} else {
dt2_train[((i_tmp-1)*num_sample/k+1):(num_sample*(i_tmp/k)), ] = pred_dt2_yx[((i-1)*num_sample/k+1):(num_sample*i/k),]
i_tmp = i_tmp + 1
}
}
pick = pick - 1
row.names(dt2_cross)<-NULL
row.names(dt2_train)<-NULL
y_dt2_train = dt2_train[,101]
x_dt2_train = dt2_train[,-101]
yx_dt2_train = cbind(x_dt2_train, y_dt2_train)
y_dt2_cross = dt2_cross[,101]
x_dt2_cross = dt2_cross[,-101]
dt2_model = lm.ridge(y_dt2_train~., yx_dt2_train, lambda=xlambda[ilambda])
A = as.array(dt2_model$coef[1:100]/dt2_model$scales)
X_train = x_dt2_train
for( i in seq(from = 1, to = ncol(x_dt2_train))){
X_train[,i] = x_dt2_train[,i] - dt2_model$xm[i]
}
X_train=as.matrix(X_train)
yh = X_train%*%A + dt2_model$ym
yhP = (yh >= 0.0)
yp = (y_dt2_train >= 0.0)
error_train = error_train + sum(yhP != yp)/(length(y_dt2_train)*k)
X_cross = x_dt2_cross
for( i in seq(from = 1, to = ncol(x_dt2_cross))){
X_cross[,i] = x_dt2_cross[,i] - dt2_model$xm[i]
}
X_cross=as.matrix(X_cross)
yh = X_cross%*%A + dt2_model$ym
yhP = (yh >= 0.0)
yp = (y_dt2_cross >= 0.0)
error_cross = error_cross + sum(yhP != yp)/(length(y_dt2_cross)*k)
}
error_train_total[ilambda] = error_train
error_cross_total[ilambda] = error_cross
}
th_lambda = min(which(min(error_cross_total) == error_cross_total))
min_dt2_lambda = xlambda[th_lambda]
sprintf("Optimal lambda is %f.", min_dt2_lambda)
## [1] "Optimal lambda is 1000.000000."
## [1] "Optimal lambda is 100000.000000."
plot(1:length(xlambda),error_train_total,
ylim=c(min(error_train_total, error_cross_total),
max(error_train_total, error_cross_total)))
points(1:length(xlambda),error_cross_total, col='red')
lines(spline(1:length(xlambda), error_train_total,n=200))
lines(spline(1:length(xlambda), error_cross_total,n=200), col="red")
a_list = rep(0, times=length(sonar_test_y)*100)
a_matrix = matrix(a_list,length(sonar_test_y),100)
test_dt2 = as.data.frame(a_matrix)
for(i in 1:100){
fit_dt = fit_list[[i]]
test_dt2[, i] = as.numeric(as.character(predict(fit_dt, sonar_test_x, type="class")))
}
dt2_min_model = lm.ridge(V101~., pred_dt2_yx, lambda=min_dt2_lambda)
A = as.array(dt2_min_model$coef[1:100]/dt2_min_model$scales)
X = pred_dt2_yx[,-101]
for( i in seq(from = 1, to = (ncol(pred_dt2_yx)-1) ) ){
X[,i] = pred_dt2_yx[,i] - dt2_min_model$xm[i]
}
X=as.matrix(X)
yh = X%*%A + dt2_min_model$ym
yhP = (yh >= 0.0)
yp = (pred_dt2_yx[,101] >= 0.0)
error_dt2_train = sum(yhP != yp)/length(pred_dt2_yx[,101])
X = test_dt2
for( i in seq(from = 1, to = (ncol(test_dt2)) ) ){
X[,i] = test_dt2[,i] - dt2_min_model$xm[i]
}
X=as.matrix(X)
yh = X%*%A + dt2_min_model$ym
yhP = (yh >= 0.0)
yp = (sonar_test_y >= 0.0)
error_dt2_test = sum(yhP != yp)/length(sonar_test_y)
sprintf("Training error: %f", train_err_rf)
## [1] "Training error: 0.000000"
## [1] "Training error: 0.000000"
sprintf("Test error: %f", test_err_rf)
## [1] "Test error: 0.141026"
## [1] "Test error: 0.141026"
sprintf("Training error: %f", error_dt2_train)
## [1] "Training error: 0.023077"
## [1] "Training error: 0.023077"
sprintf("Test error: %f", error_dt2_test)
## [1] "Test error: 0.179487"
## [1] "Test error: 0.179487"
results = data.frame(Training.Error=c(train_err_rf, error_dt2_train),
Test.Error=c(test_err_rf, error_dt2_test))
row.names(results) <- c("Random Forest", "Decision Tree bagging")
print(results)
## Training.Error Test.Error
## Random Forest 0.00000000 0.1410256
## Decision Tree bagging 0.02307692 0.1794872
print("Random forest has n_tree=100, decision tree bagging has depth=2, ntree=100")
## [1] "Random forest has n_tree=100, decision tree bagging has depth=2, ntree=100"
## [1] "Random forest has n_tree=100, decision tree bagging has depth=2, ntree=100"