This document is a presentation of my master's thesis, "An Efficient Smooth Quantile Boost Algorithm for Binary Classification", which has been published in the journal Advances in Pure Mathematics. It also means I can finally graduate with my master's degree!
The thesis improves a Boosting-based classification algorithm built on the quantile framework, and the improvement works well. The algorithm is evaluated on three kinds of data and compared with four other Boosting-based classifiers. The experiments were implemented in R, so this document includes my experimental code together with the data. Because the experiments are computationally heavy and would take a long time to run, the code is not executed here; instead, the result tables from the paper are used to present the experimental results.
This document was put together with the R packages rmarkdown and knitr. It was assembled in a hurry and my ability is limited, so please forgive any shortcomings.
For the theoretical part, please see the paper at the link above. Here I give my improved classification algorithm, SQBC: the weak learners are regression stumps fitted with CART (the rpart package), and the comparison Boosting algorithms are implemented with the gbm package.
## R packages used
library(rpart)
library(caret)
## Standard normal CDF with bandwidth (G) and its derivative (DG)
G = function(x, y) {pnorm(x/y)}
DG = function(x, y) {dnorm(x/y) / y}
## Training function
BS_QC = function(train_data, shrinkage, iter, tau, alpha, h, ...) {
  learner = list()
  t = dim(train_data)[2]  # index of the response column y (assumed to be last)
  f = 0                   # boosted score, initialised at 0
  ## U: gradient-based working response fitted by each regression stump
  train_data$U = DG(f, h) * (tau - 1/(1 + exp((train_data$y - G(f, h))/alpha)))
  for (i in 1:iter) {
    learner[[i]] = rpart(U ~ .,
                         data = train_data[, -t],
                         control = rpart.control(maxdepth = 1, cp = 0))
    f = f + shrinkage * predict(learner[[i]])
    train_data$U = DG(f, h) * (tau - 1/(1 + exp((train_data$y - G(f, h))/alpha)))
  }
  result = list('learner' = learner,
                'shrinkage' = shrinkage,
                'iter' = iter,
                'tau' = tau,
                'alpha' = alpha,
                'h' = h)
  return(result)
}
## Prediction function
predict_BS_QC = function(model, test_data) {
  predict_y = list()
  learner = model$learner
  shrinkage = model$shrinkage
  iter = model$iter
  tau = model$tau
  alpha = model$alpha
  h = model$h
  n = nrow(test_data)
  m = dim(test_data)[2]  # index of the response column y (assumed to be last)
  f = 0
  ## predict_y[[i]] is the boosted score after i iterations
  predict_y[[1]] = f + shrinkage * predict(learner[[1]], test_data[, -m])
  for (i in 2:iter) {
    predict_y[[i]] = predict_y[[i-1]] + shrinkage * predict(learner[[i]], test_data[, -m])
  }
  result = list('predict_y' = predict_y)
  return(result)
}
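As a quick illustration of how the two functions work together, here is a minimal usage sketch (not from the paper; the toy dataset, its size and the parameter values are my own choices):
## Minimal usage sketch (illustration only): train SQBC on a small toy dataset
## and classify by the sign of the final boosted score
set.seed(1)
toy_data = data.frame(x1 = rnorm(200), x2 = rnorm(200))
toy_data$y = ifelse(toy_data$x1 + toy_data$x2 + rnorm(200) > 0, 1, 0)
toy_model = BS_QC(toy_data, shrinkage = 0.1, iter = 50, tau = 0.5,
                  alpha = 0.2, h = 0.1)
toy_score = predict_BS_QC(toy_model, toy_data)$predict_y[[toy_model$iter]]
mean(toy_data$y != ifelse(toy_score >= 0, 1, 0))   # training error rate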
## R packages used
library(rpart)
library(caret)
## Objective function
# obje_fun = function(x, y, tau, h) {
#   sum((y - (1 - tau)) * pnorm(x/h))
# }
## Training function
QBC = function(train_data, shrinkage, iter, tau, h, ...) {
  learner = list()
  # obj_value = vector()
  t = dim(train_data)[2]  # index of the response column y (assumed to be last)
  f = 0
  ## U: gradient of the smoothed objective with respect to f (working response)
  train_data$U = (train_data$y - (1 - tau)) * dnorm(f/h) / h
  for (i in 1:iter) {
    learner[[i]] = rpart(U ~ .,
                         data = train_data[, -t],
                         control = rpart.control(maxdepth = 1, cp = 0))
    f = f + shrinkage * predict(learner[[i]])
    train_data$U = (train_data$y - (1 - tau)) * dnorm(f/h) / h
    # obj_value[i] = obje_fun(f, train_data$y, tau, h)
  }
  result = list('learner' = learner,
                'shrinkage' = shrinkage,
                'iter' = iter,
                'tau' = tau,
                'h' = h
                # 'obj_value' = obj_value
                )
  return(result)
}
## Prediction function
predict_QBC = function(model, test_data) {
  predict_y = list()
  learner = model$learner
  shrinkage = model$shrinkage
  iter = model$iter
  tau = model$tau
  h = model$h
  m = dim(test_data)[2]  # index of the response column y (assumed to be last)
  n = nrow(test_data)
  f = 0
  predict_y[[1]] = f + shrinkage * predict(learner[[1]], test_data[, -m])
  for (i in 2:iter) {
    predict_y[[i]] = predict_y[[i-1]] + shrinkage * predict(learner[[i]],
                                                            test_data[, -m])
  }
  result = list('predict_y' = predict_y)
  return(result)
}
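## LogitBoost baseline: gbm with distribution = "bernoulli", evaluated on 500 random 80/20 train/test splits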
Measure = function(data) {
library(caret)
library(gbm)
train_error = vector()
test_error = vector()
for(i in 1:500) {
inTrain = createDataPartition(y = data$y, p = 0.8, list = FALSE)
train = data[inTrain, ]
test = data[-inTrain, ]
LogitBoost_model = gbm(y ~ .,
distribution = "bernoulli",
n.minobsinnode = 1,
bag.fraction = 1,
data = train,
n.trees = 100,
shrinkage = 0.1)
iter = LogitBoost_model$n.trees
y_pred_train = predict(LogitBoost_model, train, iter)
y_final_train = ifelse(y_pred_train > 0.5, 1, 0)
train_error[i] = mean(train$y != y_final_train)
y_pred_test = predict(LogitBoost_model, test, iter)
y_final_test = ifelse(y_pred_test > 0.5, 1, 0)
test_error[i] = mean(test$y != y_final_test)
}
train_error_rate = mean(train_error)
test_error_rate = mean(test_error)
result = list("train_error" = train_error,
"test_error" = test_error,
"train_error_rate" = train_error_rate,
"test_error_rate" = test_error_rate)
return(result)
}
MEASURE = Measure(data)
objects(MEASURE)
MEASURE$train_error
MEASURE$test_error
MEASURE$train_error_rate
MEASURE$test_error_rate
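## L2Boost baseline: gbm with distribution = "gaussian", evaluated on 500 random 80/20 train/test splits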
Measure = function(data) {
library(caret)
library(gbm)
train_error = vector()
test_error = vector()
for(i in 1:500) {
inTrain = createDataPartition(y = data$y, p = 0.8, list = FALSE)
train = data[inTrain, ]
test = data[-inTrain, ]
L2_Boost_model = gbm(y ~ .,
distribution = "gaussian",
n.minobsinnode = 1,
bag.fraction = 1,
data = train,
n.trees = 100,
shrinkage = 0.1)
iter = L2_Boost_model$n.trees
y_pred_train = predict(L2_Boost_model, train, iter)
y_final_train = ifelse(y_pred_train > 0.5, 1, 0)
train_error[i] = mean(train$y != y_final_train)
y_pred_test = predict(L2_Boost_model, test, iter)
y_final_test = ifelse(y_pred_test > 0.5, 1, 0)
test_error[i] = mean(test$y != y_final_test)
}
train_error_rate = mean(train_error)
test_error_rate = mean(test_error)
result = list("train_error" = train_error,
"test_error" = test_error,
"train_error_rate" = train_error_rate,
"test_error_rate" = test_error_rate)
return(result)
}
MEASURE = Measure(data)
objects(MEASURE)
MEASURE$train_error
MEASURE$test_error
MEASURE$train_error_rate
MEASURE$test_error_rate
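## AdaBoost baseline: gbm with distribution = "adaboost", evaluated on 500 random 80/20 train/test splits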
Measure = function(data) {
library(caret)
library(gbm)
train_error = vector()
test_error = vector()
for(i in 1:500) {
inTrain = createDataPartition(y = data$y, p = 0.8, list = FALSE)
train = data[inTrain, ]
test = data[-inTrain, ]
AdaBoost_model = gbm(y ~ .,
distribution = "adaboost",
n.minobsinnode = 1,
bag.fraction = 1,
data = train,
n.trees = 100,
shrinkage = 0.1)
iter = AdaBoost_model$n.trees
y_pred_train = predict(AdaBoost_model, train, iter)
y_final_train = ifelse(y_pred_train > 0.5, 1, 0)
train_error[i] = mean(train$y != y_final_train)
y_pred_test = predict(AdaBoost_model, test, iter)
y_final_test = ifelse(y_pred_test > 0.5, 1, 0)
test_error[i] = mean(test$y != y_final_test)
}
train_error_rate = mean(train_error)
test_error_rate = mean(test_error)
result = list("train_error" = train_error,
"test_error" = test_error,
"train_error_rate" = train_error_rate,
"test_error_rate" = test_error_rate)
return(result)
}
MEASURE = Measure(data)
objects(MEASURE)
MEASURE$train_error
MEASURE$test_error
MEASURE$train_error_rate
MEASURE$test_error_rate
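All four simulated datasets follow the same design: the predictors are drawn as \(X \sim N(0, \Sigma)\) with \(\Sigma_{ij} = 0.5^{|i-j|}\), a linear score \(X\beta + \varepsilon\) with \(\varepsilon \sim N(0, 1)\) is formed, and the binary response is generated as \(y \sim \mathrm{Bernoulli}(p)\) with \(p = 1/(1 + e^{-(X\beta + \varepsilon)})\). Only the dimension \(d\) and the coefficient vector \(\beta\) differ across the four datasets below.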
## d = 5
library(MASS)
I = matrix(0, 5, 5)
for(i in 1:5) {
for(j in 1:5) {
I[i, j] = ifelse(i == j, 1, 0.5 ^ abs(i-j))
}
}
X = mvrnorm(n = 10000, mu = rep(0, 5), Sigma = I)
beta = c(7, 0, 4, 10, 3)
epsilon = rnorm(n = 10000, mean = 0, sd = 1)
linear_y = X %*% beta + epsilon
p = 1 / (1 + exp(-linear_y))
y = rbinom(n = 10000, size = 1, prob = p)
X = as.data.frame(X)
names(X) = paste0(rep("x", 5), 1:5)
data = cbind(X, y)
library(DT)
datatable(data)
## d = 10
library(MASS)
I = matrix(0, 10, 10)
for(i in 1:10) {
for(j in 1:10) {
I[i, j] = ifelse(i == j, 1, 0.5 ^ abs(i-j))
}
}
X = mvrnorm(n = 10000, mu = rep(0, 10), Sigma = I)
beta = c(7, 7, 5, 8, 0, 6, 1, 0, 6, 5)
epsilon = rnorm(n = 10000, mean = 0, sd = 1)
linear_y = X %*% beta + epsilon
p = 1 / (1 + exp(-linear_y))
y = rbinom(n = 10000, size = 1, prob = p)
X = as.data.frame(X)
names(X) = paste0(rep("x", 10), 1:10)
data = cbind(X, y)
head(data)
## x1 x2 x3 x4 x5 x6
## 1 0.76339191 -1.0836563 0.71008272 0.06622242 0.1583119 -1.5393307
## 2 -1.14842282 -1.3876882 0.60910616 1.13871280 0.1637562 -0.2083448
## 3 -1.37819503 -0.5302778 -0.66496345 0.41166718 0.6555960 1.5969750
## 4 -0.00290653 -1.4117553 -0.59523074 0.68600636 -0.2763362 -1.0843156
## 5 -0.90052291 -0.2977888 0.19500308 0.25993477 -1.4878824 0.4569048
## 6 -0.34716388 -0.6019838 0.09668925 0.32226253 0.7844415 0.2412834
## x7 x8 x9 x10 y
## 1 -0.3997970 1.0342091 1.1097329 0.02756102 0
## 2 0.0704264 -0.9923676 -1.6033658 -1.35823599 0
## 3 1.0456126 -0.2872267 -0.5103496 -0.91562779 0
## 4 -1.2327104 0.5112847 -0.3482465 -1.63812787 0
## 5 0.7700809 -0.8486176 -1.5200546 -0.69531323 0
## 6 0.8420909 0.9463423 1.5732465 1.13692112 1
## d = 20
library(MASS)
I = matrix(0, 20, 20)
for(i in 1:20) {
for(j in 1:20) {
I[i, j] = ifelse(i == j, 1, 0.5 ^ abs(i-j))
}
}
X = mvrnorm(n = 10000, mu = rep(0, 20), Sigma = I)
beta = c(0, 9, 10, 2, 4, 7, 4, 10, 5, 10,
10, 2, 6, 5, 9, 1, 2, 6, 1, 0)
epsilon = rnorm(n = 10000, mean = 0, sd = 1)
linear_y = X %*% beta + epsilon
p = 1 / (1 + exp(-linear_y))
y = rbinom(n = 10000, size = 1, prob = p)
X = as.data.frame(X)
names(X) = paste0(rep("x", 20), 1:20)
data = cbind(X, y)
head(data)
## x1 x2 x3 x4 x5 x6
## 1 -0.9271522 1.5291801 1.73131726 1.16657800 1.13288815 0.49926454
## 2 -0.3198332 -0.2649894 0.39663771 0.55966094 -0.84808232 -1.12550621
## 3 2.2621756 0.1020649 0.65955215 -0.13923886 -0.09234651 0.47817648
## 4 -0.7633544 -0.3549601 -0.87430554 0.01983628 1.81936278 -0.04221294
## 5 -0.4015405 0.9768219 0.42331673 0.34224928 -0.24540222 -0.09985831
## 6 1.3649247 0.4029899 -0.09676228 -0.63849153 -1.22625612 -1.43555173
## x7 x8 x9 x10 x11 x12
## 1 0.8788791 -0.4629553 -0.23871897 0.62593991 -0.3195274 0.71341405
## 2 -0.5362605 -0.6061843 0.01547504 0.08235889 0.3562879 -0.48458599
## 3 -0.2128729 1.2217888 -0.76808728 -0.90248413 -0.9244526 -0.61209513
## 4 -0.5379076 0.2516663 0.56862284 0.49371146 -0.1589400 0.66068483
## 5 -0.2731114 -0.9111114 0.73270194 0.75327043 0.7178108 1.69018477
## 6 -0.6325476 -1.1333057 -0.99945267 1.35926321 1.3846692 0.06168821
## x13 x14 x15 x16 x17 x18
## 1 0.41923810 1.4840803 1.1185362 1.10057234 -0.08167604 0.5712753
## 2 -0.96440337 -1.2447026 0.5513099 -0.59701720 -1.13726452 -1.5957152
## 3 0.74222208 1.0000790 0.4643866 0.05958271 -0.04345482 1.3901690
## 4 -0.90265612 -0.6265617 -1.5399095 -2.32209765 -2.88138365 -2.5722124
## 5 0.69123661 0.4093474 -0.4427204 -0.54737819 -1.16501653 0.3597966
## 6 0.04882694 0.2855142 -0.4991765 0.17146132 -0.29893980 -1.5903108
## x19 x20 y
## 1 1.97795146 1.5521724 1
## 2 -1.57462571 -1.4734853 0
## 3 -0.49277169 0.5380048 1
## 4 0.05432769 1.1724154 0
## 5 0.18753073 0.4245079 1
## 6 -0.36281430 -0.4316065 0
## d = 30
library(MASS)
I = matrix(0, 30, 30)
for(i in 1:30) {
for(j in 1:30) {
I[i, j] = ifelse(i == j, 1, 0.5 ^ abs(i-j))
}
}
X = mvrnorm(n = 10000, mu = rep(0, 30), Sigma = I)
beta = c(8, 2, 2, 3, 9, 4, 4, 0, 8, 1,
6, 1, 3, 5, 6, 10, 6, 1, 9, 0,
6, 3, 4, 6, 9, 10, 9, 5, 9, 0)
epsilon = rnorm(n = 10000, mean = 0, sd = 1)
linear_y = X %*% beta + epsilon
p = 1 / (1 + exp(-linear_y))
y = rbinom(n = 10000, size = 1, prob = p)
X = as.data.frame(X)
names(X) = paste0(rep("x", 30), 1:30)
data = cbind(X, y = y)
head(data)
## x1 x2 x3 x4 x5 x6
## 1 -0.03462461 -0.7080424 0.2016466 -0.02894544 0.0442317 -1.64304128
## 2 -1.14425133 -0.9519240 -1.1719876 -0.31564242 1.0227871 0.63019216
## 3 -1.21288371 0.1471827 -1.1785423 -1.52537274 -1.1239963 -0.14164907
## 4 -0.77887905 0.5516724 0.9107677 1.23596713 0.7716891 -0.04876431
## 5 0.02848825 -1.0843074 -0.3159069 0.60962976 1.8213806 0.18663443
## 6 -0.62381468 -0.7176902 0.2547147 2.25673190 1.2559530 1.52120271
## x7 x8 x9 x10 x11 x12
## 1 -0.72001864 1.4546129 0.97274203 1.2691041 0.0004834632 0.41225731
## 2 0.05125958 0.9540079 -0.45232963 0.3116382 -0.6546978404 -0.77191725
## 3 1.31734202 0.6668914 0.89076294 1.0147441 -0.8499234493 -0.09381324
## 4 -0.56060072 -1.1647220 -0.57794147 -0.3828260 -0.7721727720 -1.37857264
## 5 -1.46837190 -1.3651940 0.07141187 -0.4320732 0.6145670723 0.46359015
## 6 1.62884587 0.8850743 -0.57459202 -0.8223610 -0.3685440929 0.92022992
## x13 x14 x15 x16 x17 x18
## 1 -0.54291161 0.13958724 -0.08889993 -0.74409785 -0.4898396 -1.0302919
## 2 0.59111936 0.43465902 -0.97872725 0.47653407 0.3138035 0.2205606
## 3 -0.07103618 0.77868730 -0.04318692 -0.85465182 -1.7946110 -0.3224638
## 4 -1.11613070 -0.09972625 0.21817299 0.86516658 0.7229972 -0.3168603
## 5 0.23986850 -1.42314293 -0.31521112 -0.66485700 1.7078898 1.6948596
## 6 0.80707928 1.59531875 -0.24643074 -0.05055092 -0.6512441 0.2529438
## x19 x20 x21 x22 x23 x24
## 1 -1.22134458 -0.4922352 0.2894784 0.981305541 0.4296749 0.1835128
## 2 -0.65904677 -0.3178818 -0.6820901 0.077345118 0.6300441 1.1259243
## 3 0.07491263 0.3270166 0.1998535 -0.362055326 1.5968700 1.9271654
## 4 0.34895592 -0.1744281 0.6041154 -0.204135496 0.8866096 -0.4089966
## 5 1.85306650 0.2952780 1.1917152 1.172212831 0.3031969 -0.2538372
## 6 0.05339309 0.5651339 0.9628562 0.004149327 -0.0133694 -0.8089041
## x25 x26 x27 x28 x29 x30 y
## 1 0.05342664 -3.3658676 -1.5854423 0.30556741 0.2239299 0.2763083 0
## 2 0.95545372 1.2830113 1.5686744 2.12687895 -0.9049997 -0.5968240 1
## 3 1.21077881 1.4015516 0.4374940 0.21003300 -0.6884262 -1.1511653 1
## 4 -0.78702082 -2.2277481 -1.0211877 -0.45118205 1.0490631 -0.4804341 0
## 5 -1.17229832 0.9555658 0.9513294 2.53109885 1.7620638 0.7265436 1
## 6 -0.05188930 2.0091098 1.0432561 0.09304176 -0.4307724 -0.9380655 1
For the model parameter \(\alpha\) I chose 14 candidate values; the model selects, among these 14 values, the \(\alpha\) for which the SQBC algorithm performs best on the data. The R code for the parameter selection is below; plugging the different datasets into this selection procedure gives the model's performance on each dataset under each parameter value:
## 14 candidate values for the model parameter alpha
ALPHA = c(0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5,
0.6, 0.7, 0.8, 0.9, 1.0, 1.5, 2.0)
## Parameter selection
test_error_rate = vector()
for(k in ALPHA) {
test_error = vector()
for(i in 1:500) {
inTrain = createDataPartition(y = data$y, p = 0.8, list = FALSE)
train = data[inTrain, ]
test = data[-inTrain, ]
BS_QC_model = BS_QC(train,
shrinkage = 0.1,
iter = 100,
tau = 0.5,
alpha = k,
h = 0.1)
iter = BS_QC_model$iter
y_pred_test = predict_BS_QC(BS_QC_model, test)$predict_y[[iter]]
y_final_test = ifelse(y_pred_test >= 0, 1, 0)
test_error[i] = mean(test$y != y_final_test)
}
index = which(k == ALPHA)
test_error_rate[index] = mean(test_error)
}
test_error_rate
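To read off the selected \(\alpha\), one can take the candidate with the smallest average test error (a one-line helper, not part of the original code):
## Candidate alpha with the lowest average test error over the 500 splits
ALPHA[which.min(test_error_rate)]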
From the results above, the \(\alpha\) selected for the simulated data of the four dimensions is:

| Dimension | \(\alpha\) |
|---|---|
| d = 5 | 0.2 |
| d = 10 | 0.2 |
| d = 20 | 0.8 |
| d = 30 | 0.9 |

Concretely: in the Measure function below, replace the alpha argument of the BS_QC call (the '?' placeholder) with the \(\alpha\) value selected above for each dataset.
Measure = function(data) {
train_error = vector()
test_error = vector()
# obj_values = list()
for(i in 1:500) {
inTrain = createDataPartition(y = data$y, p = 0.8, list = FALSE)
train = data[inTrain, ]
test = data[-inTrain, ]
BS_QC_model = BS_QC(train, shrinkage = 0.1, iter = 100, tau = 0.5,
alpha = '?', h = 0.1)
# obj_values[[i]] = BS_QC_model$obj_value
iter = BS_QC_model$iter
y_pred_train = predict_BS_QC(BS_QC_model, train)$predict_y[[iter]]
y_final_train = ifelse(y_pred_train >= 0, 1, 0)
train_error[i] = mean(train$y != y_final_train)
y_pred_test = predict_BS_QC(BS_QC_model, test)$predict_y[[iter]]
y_final_test = ifelse(y_pred_test >= 0, 1, 0)
test_error[i] = mean(test$y != y_final_test)
}
# obj_values = t(as.data.frame(obj_values))
# ave_obj_value = apply(obj_values, 2, mean)
train_error_rate = mean(train_error)
test_error_rate = mean(test_error)
result = list(# "ave_obj_value" = ave_obj_value,
"train_error" = train_error,
"test_error" = test_error,
"train_error_rate" = train_error_rate,
"test_error_rate" = test_error_rate)
return(result)
}
MEASURE = Measure(data)
objects(MEASURE)
# MEASURE$ave_obj_value
MEASURE$train_error
MEASURE$test_error
MEASURE$train_error_rate
MEASURE$test_error_rate
Plugging the four simulated datasets into the function below gives the QBC algorithm's performance on each dataset.
## Simulation
Measure = function(data) {
train_error = vector()
test_error = vector()
# obj_values = list()
for(i in 1:500) {
inTrain = createDataPartition(y = data$y, p = 0.8, list = FALSE)
train = data[inTrain, ]
test = data[-inTrain, ]
QBC_model = QBC(train, shrinkage = 0.1, iter = 100, tau = 0.5, h = 0.1)
# obj_values[[i]] = QBC_model$obj_value
iter = QBC_model$iter
y_pred_train = predict_QBC(QBC_model, train)$predict_y[[iter]]
y_final_train = ifelse(y_pred_train >= 0, 1, 0)
train_error[i] = mean(train$y != y_final_train)
y_pred_test = predict_QBC(QBC_model, test)$predict_y[[iter]]
y_final_test = ifelse(y_pred_test >= 0, 1, 0)
test_error[i] = mean(test$y != y_final_test)
}
# obj_values = t(as.data.frame(obj_values))
# ave_obj_value = apply(obj_values, 2, mean)
train_error_rate = mean(train_error)
test_error_rate = mean(test_error)
result = list(# "ave_obj_value" = ave_obj_value,
"train_error" = train_error,
"test_error" = test_error,
"train_error_rate" = train_error_rate,
"test_error_rate" = test_error_rate)
return(result)
}
MEASURE = Measure(data)
objects(MEASURE)
# MEASURE$ave_obj_value
MEASURE$train_error
MEASURE$test_error
MEASURE$train_error_rate
MEASURE$test_error_rate
Feeding the four simulated datasets into the training and prediction functions of the three classification algorithms above gives their performance.
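For example, a compact way to loop the evaluation over the four simulated datasets (a sketch only; the names data5, data10, data20 and data30 are assumed to hold the four simulated data frames generated above):
## Hypothetical wrapper: evaluate each simulated dataset with the Measure function
sim_list = list(d5 = data5, d10 = data10, d20 = data20, d30 = data30)
sim_results = lapply(sim_list, Measure)
sapply(sim_results, function(r) r$test_error_rate)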
The conclusion from the classification results in the table above: on the simulated data of all four dimensions, SQBC's classification performance is better than that of the other four classification algorithms.
## German Credit data
data = read.csv("http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data",
header = FALSE,
sep = "")
names(data) = c(paste0(rep("x", 20), 1:20), "y")
data$y = as.integer(data$y) - 1
library(magrittr)
data = sapply(data, as.integer) %>%
as.data.frame() %>%
sapply(function(x) x/max(x)) %>%
as.data.frame()
head(data)
## x1 x2 x3 x4 x5 x6 x7 x8 x9 x10 x11 x12
## 1 0.25 0.08333333 1.0 0.5 0.06344985 1.0 1.0 1.00 0.75 0.3333333 1.00 0.25
## 2 0.50 0.66666667 0.6 0.5 0.32300261 0.2 0.6 0.50 0.50 0.3333333 0.50 0.25
## 3 1.00 0.16666667 1.0 0.8 0.11376465 0.2 0.8 0.50 0.75 0.3333333 0.75 0.25
## 4 0.25 0.58333333 0.6 0.4 0.42781155 0.2 0.8 0.50 0.75 1.0000000 1.00 0.50
## 5 0.25 0.33333333 0.8 0.1 0.26432914 0.2 0.6 0.75 0.75 0.3333333 1.00 1.00
## 6 1.00 0.50000000 0.6 0.8 0.49147851 1.0 0.6 0.50 0.75 0.3333333 1.00 1.00
## x13 x14 x15 x16 x17 x18 x19 x20 y
## 1 0.8933333 1 0.6666667 0.50 0.75 0.5 1.0 0.5 0
## 2 0.2933333 1 0.6666667 0.25 0.75 0.5 0.5 0.5 1
## 3 0.6533333 1 0.6666667 0.25 0.50 1.0 0.5 0.5 0
## 4 0.6000000 1 1.0000000 0.25 0.75 1.0 0.5 0.5 0
## 5 0.7066667 1 1.0000000 0.50 0.75 1.0 0.5 0.5 1
## 6 0.4666667 1 1.0000000 0.25 0.50 1.0 1.0 0.5 0
## Credit data with noise
data1 = read.csv("http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data",
header = FALSE,
sep = "")
data1$V21 = as.integer(data1$V21) - 1
library(magrittr)
data1 = sapply(data1, as.integer) %>%
as.data.frame() %>%
sapply(function(x) x/max(x)) %>%
as.data.frame()
data2 = matrix(0, nrow = 1000, ncol = 20)
for(i in 1:20) {
data2[, i] = runif(1000, min = -1, max = 1)
}
data2 = data2 %>% as.data.frame()
data = cbind(data1[, 1:20], data2, data1[, 21])
names(data) = c(paste0(rep("x", 40), 1:40), "y")
head(data)
## x1 x2 x3 x4 x5 x6 x7 x8 x9 x10 x11 x12
## 1 0.25 0.08333333 1.0 0.5 0.06344985 1.0 1.0 1.00 0.75 0.3333333 1.00 0.25
## 2 0.50 0.66666667 0.6 0.5 0.32300261 0.2 0.6 0.50 0.50 0.3333333 0.50 0.25
## 3 1.00 0.16666667 1.0 0.8 0.11376465 0.2 0.8 0.50 0.75 0.3333333 0.75 0.25
## 4 0.25 0.58333333 0.6 0.4 0.42781155 0.2 0.8 0.50 0.75 1.0000000 1.00 0.50
## 5 0.25 0.33333333 0.8 0.1 0.26432914 0.2 0.6 0.75 0.75 0.3333333 1.00 1.00
## 6 1.00 0.50000000 0.6 0.8 0.49147851 1.0 0.6 0.50 0.75 0.3333333 1.00 1.00
## x13 x14 x15 x16 x17 x18 x19 x20 x21 x22
## 1 0.8933333 1 0.6666667 0.50 0.75 0.5 1.0 0.5 0.22844586 0.8320590
## 2 0.2933333 1 0.6666667 0.25 0.75 0.5 0.5 0.5 -0.72693190 0.9991424
## 3 0.6533333 1 0.6666667 0.25 0.50 1.0 0.5 0.5 -0.54565089 0.5143605
## 4 0.6000000 1 1.0000000 0.25 0.75 1.0 0.5 0.5 0.61987177 0.3900457
## 5 0.7066667 1 1.0000000 0.50 0.75 1.0 0.5 0.5 -0.31743021 -0.9876573
## 6 0.4666667 1 1.0000000 0.25 0.50 1.0 1.0 0.5 0.02075523 -0.6706805
## x23 x24 x25 x26 x27 x28
## 1 0.04103177 0.058608059 0.86974898 0.65794282 -0.98991730 0.6786311
## 2 0.36486475 0.489247674 0.07611107 0.04598383 -0.47078222 0.4925983
## 3 0.85110660 -0.607923863 -0.94047510 -0.27389790 0.09504785 -0.4796194
## 4 -0.33141494 -0.009270498 -0.54727874 0.93665679 -0.09901721 0.6138460
## 5 0.99397841 -0.309244839 0.61772968 -0.37018907 0.62342747 -0.9569968
## 6 0.31795809 0.202838360 0.95817897 0.24552659 -0.37657524 -0.9589107
## x29 x30 x31 x32 x33 x34
## 1 0.6345412 0.6145097 0.32813789 -0.9533325 -0.7247609 0.66494572
## 2 -0.8430762 0.8935410 -0.28545981 -0.1540411 -0.8725890 0.39317313
## 3 0.4267373 -0.1567467 -0.56647454 0.7269622 0.7041609 0.09821726
## 4 0.1607107 -0.4748961 -0.64033638 0.6147819 -0.5974904 -0.92076997
## 5 -0.3331214 0.1091735 -0.88278198 -0.4898839 0.7987866 -0.25938899
## 6 -0.5983156 0.9334539 0.09745629 -0.9336912 0.1235361 -0.10251458
## x35 x36 x37 x38 x39 x40 y
## 1 0.9724355 0.7884714 0.59811357 0.08782790 0.67639851 0.6286167 0
## 2 0.1506990 -0.8179054 0.29660407 0.34662557 0.99352918 0.7197240 1
## 3 -0.5884777 0.8461547 -0.92998418 -0.71629914 0.23189007 -0.2338860 0
## 4 0.4490941 0.3697184 -0.86069181 -0.83569311 0.07557601 0.4142512 0
## 5 -0.6703903 -0.4855065 -0.03806945 -0.96223484 -0.75331773 -0.3083017 1
## 6 -0.1828044 0.5358807 0.30982501 -0.05992841 -0.10374151 -0.6458609 0
Here again I chose 14 candidate values for the model parameter \(\alpha\); the model selects, among these 14 values, the \(\alpha\) for which the SQBC algorithm performs best.
## Parameters
ALPHA = c(0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5,
0.6, 0.7, 0.8, 0.9, 1.0, 1.5, 2.0)
test_error_rate = vector()
for(k in ALPHA) {
test_error = vector()
for(i in 1:500) {
inTrain = createDataPartition(y = data$y, p = 0.8, list = FALSE)
train = data[inTrain, ]
test = data[-inTrain, ]
BS_QC_model = BS_QC(train,
shrinkage = 0.1,
iter = 100,
tau = 0.5,
alpha = k,
h = 0.1)
iter = BS_QC_model$iter
y_pred_test = predict_BS_QC(BS_QC_model, test)$predict_y[[iter]]
y_final_test = ifelse(y_pred_test >= 0, 1, 0)
test_error[i] = mean(test$y != y_final_test)
}
index = which(k == ALPHA)
test_error_rate[index] = mean(test_error)
}
test_error_rate
From the results above, the \(\alpha\) selected by the SQBC algorithm on the two credit datasets is:

| Data | \(\alpha\) |
|---|---|
| Credit data without noise | 0.5 |
| Credit data with noise | 0.9 |

Concretely: in the Measure function below, replace the alpha argument of the BS_QC call (the '?' placeholder) with the \(\alpha\) value selected above for each dataset.
Measure = function(data) {
train_error = vector()
test_error = vector()
# obj_values = list()
for(i in 1:500) {
inTrain = createDataPartition(y = data$y, p = 0.8, list = FALSE)
train = data[inTrain, ]
test = data[-inTrain, ]
BS_QC_model = BS_QC(train, shrinkage = 0.1, iter = 100, tau = 0.5,
alpha = '?', h = 0.1)
# obj_values[[i]] = BS_QC_model$obj_value
iter = BS_QC_model$iter
y_pred_train = predict_BS_QC(BS_QC_model, train)$predict_y[[iter]]
y_final_train = ifelse(y_pred_train >= 0, 1, 0)
train_error[i] = mean(train$y != y_final_train)
y_pred_test = predict_BS_QC(BS_QC_model, test)$predict_y[[iter]]
y_final_test = ifelse(y_pred_test >= 0, 1, 0)
test_error[i] = mean(test$y != y_final_test)
}
# obj_values = t(as.data.frame(obj_values))
# ave_obj_value = apply(obj_values, 2, mean)
train_error_rate = mean(train_error)
test_error_rate = mean(test_error)
result = list(# "ave_obj_value" = ave_obj_value,
"train_error" = train_error,
"test_error" = test_error,
"train_error_rate" = train_error_rate,
"test_error_rate" = test_error_rate)
return(result)
}
MEASURE = Measure(data)
objects(MEASURE)
# MEASURE$ave_obj_value
MEASURE$train_error
MEASURE$test_error
MEASURE$train_error_rate
MEASURE$test_error_rate
Plugging the two credit datasets into the function below gives the QBC algorithm's performance on each dataset.
## Credit data
Measure = function(data) {
train_error = vector()
test_error = vector()
# obj_values = list()
for(i in 1:500) {
inTrain = createDataPartition(y = data$y, p = 0.8, list = FALSE)
train = data[inTrain, ]
test = data[-inTrain, ]
QBC_model = QBC(train, shrinkage = 0.1, iter = 100, tau = 0.5, h = 0.1)
# obj_values[[i]] = QBC_model$obj_value
iter = QBC_model$iter
y_pred_train = predict_QBC(QBC_model, train)$predict_y[[iter]]
y_final_train = ifelse(y_pred_train >= 0, 1, 0)
train_error[i] = mean(train$y != y_final_train)
y_pred_test = predict_QBC(QBC_model, test)$predict_y[[iter]]
y_final_test = ifelse(y_pred_test >= 0, 1, 0)
test_error[i] = mean(test$y != y_final_test)
}
# obj_values = t(as.data.frame(obj_values))
# ave_obj_value = apply(obj_values, 2, mean)
train_error_rate = mean(train_error)
test_error_rate = mean(test_error)
result = list(# "ave_obj_value" = ave_obj_value,
"train_error" = train_error,
"test_error" = test_error,
"train_error_rate" = train_error_rate,
"test_error_rate" = test_error_rate)
return(result)
}
MEASURE = Measure(data)
objects(MEASURE)
# MEASURE$ave_obj_value
MEASURE$train_error
MEASURE$test_error
MEASURE$train_error_rate
MEASURE$test_error_rate
Feeding the two credit datasets into the training and prediction functions of the three classification algorithms mentioned above gives their performance.
Conclusion: from the classification results in the table above, SQBC outperforms the other four classification algorithms on both credit datasets, and SQBC shows fairly good robustness to the noisy data.
## colon gene data
options(digits = 10)
colon = read.table("E:\\R\\Data\\gene.txt", header = F, sep = " ")
colon = as.data.frame(t(as.matrix(colon)))
names(colon) = c(paste0(rep("x", 2000), 1:2000), "y")
colon$y = ifelse(colon$y > 0, 1, 0)
dim(colon)
## [1] 62 2001
## golub gene data
load("E:/R/Data/GolubData.RData")
y_train = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)
y_test = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0)
y = c(y_train, y_test)
golub = as.data.frame(t(golub))
golub = sapply(golub, function(x) x*log10(2))
golub = data.frame(golub, y)
names(golub) = c(paste0(rep("x", 3571), 1:3571), "y")
row.names(golub) = NULL
dim(golub)
## [1] 72 3572
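The hard-coded y_train and y_test vectors are the class labels of the 38 training and 34 test samples in the Golub leukemia data (0 = ALL, 1 = AML).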
LOO-CV:
## Leave-One-Out(LOO) cross validation
ALPHA = c(0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5,
0.6, 0.7, 0.8, 0.9, 1.0, 1.5, 2.0)
N_LOO = vector()
for(k in ALPHA) {
set.seed(789)
folds = createFolds(y = data$y, k = dim(data)[1], list = T, returnTrain = T)
nfolds = length(folds)
n_LOO = vector()
for(i in 1:nfolds) {
train = data[folds[[i]], ]
test = data[-folds[[i]], ]
BS_QC_model = BS_QC(train,
shrinkage = 0.1,
iter = 100,
tau = 0.5,
alpha = k,
h = 0.1)
iter = BS_QC_model$iter
y_pred_test = predict_BS_QC(BS_QC_model, test)$predict_y[[iter]]
y_final_test = ifelse(y_pred_test >= 0, 1, 0)
n_LOO[i] = mean(test$y != y_final_test)
}
index = which(k == ALPHA)
N_LOO[index] = sum(n_LOO)
}
N_LOO
5-fold CV:
## 5-fold cross validation
ALPHA = c(0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5,
0.6, 0.7, 0.8, 0.9, 1.0, 1.5, 2.0)
test_error_rate = vector()
for(k in ALPHA) {
set.seed(3154)
test_error = vector()
folds = createFolds(y = data$y, k = 5, list = T, returnTrain = T)
nfolds = length(folds)
for(i in 1:nfolds) {
train = data[folds[[i]], ]
test = data[-folds[[i]], ]
BS_QC_model = BS_QC(train,
shrinkage = 0.1,
iter = 100,
tau = 0.5,
alpha = k,
h = 0.1)
iter = BS_QC_model$iter
y_pred_test = predict_BS_QC(BS_QC_model, test)$predict_y[[iter]]
y_final_test = ifelse(y_pred_test >= 0, 1, 0)
test_error[i] = mean(test$y != y_final_test)
}
index = which(k == ALPHA)
test_error_rate[index] = mean(test_error)
}
test_error_rate
10-fold CV:
## 10-fold cross validation
ALPHA = c(0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5,
0.6, 0.7, 0.8, 0.9, 1.0, 1.5, 2.0)
test_error_rate = vector()
for(k in ALPHA) {
set.seed(3154)
test_error = vector()
folds = createFolds(y = data$y, k = 10, list = T, returnTrain = T)
nfolds = length(folds)
for(i in 1:nfolds) {
train = data[folds[[i]], ]
test = data[-folds[[i]], ]
BS_QC_model = BS_QC(train,
shrinkage = 0.1,
iter = 100,
tau = 0.5,
alpha = k,
h = 0.1)
iter = BS_QC_model$iter
y_pred_test = predict_BS_QC(BS_QC_model, test)$predict_y[[iter]]
y_final_test = ifelse(y_pred_test >= 0, 1, 0)
test_error[i] = mean(test$y != y_final_test)
}
index = which(k == ALPHA)
test_error_rate[index] = mean(test_error)
}
test_error_rate
According to the results above, the parameters selected by the SQBC algorithm are:

| Method | Data | \(\alpha\) |
|---|---|---|
| LOO | Colon | 1.5 |
| LOO | Leukemia | 0.7-2.0 |
| 5-fold | Colon | 0.01 or 2.0 |
| 5-fold | Leukemia | 1.5 or 2.0 |
| 10-fold | Colon | 0.1 |
| 10-fold | Leukemia | 2.0 |
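The three blocks below evaluate SQBC on the two gene datasets with LOO, 5-fold and 10-fold cross validation; as before, the alpha = '?' placeholder stands for the value selected above.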
#########################################################################################
## Leave-One-Out(LOO) cross validation
#########################################################################################
Measure = function(data) {
n_LOO = vector()
folds = createFolds(y = data$y, k = dim(data)[1], list = TRUE, returnTrain = TRUE)
nfolds = length(folds)
for(i in 1:nfolds) {
train = data[folds[[i]], ]
test = data[-folds[[i]], ]
BS_QC_model = BS_QC(train,
shrinkage = 0.1,
iter = 100,
tau = 0.5,
alpha = '?',
h = 0.1)
iter = BS_QC_model$iter
y_pred_test = predict_BS_QC(BS_QC_model, test)$predict_y[[iter]]
y_final_test = ifelse(y_pred_test >= 0, 1, 0)
n_LOO[i] = mean(test$y != y_final_test)
}
N_LOO = sum(n_LOO)
test_error_rate = N_LOO / nfolds
result = list("N_LOO" = N_LOO,
"test_error_rate" = test_error_rate)
return(result)
}
MEASURE = Measure(data)
objects(MEASURE)
MEASURE$N_LOO
MEASURE$test_error_rate
################################################################################
## 5-fold cross validation
################################################################################
Measure = function(data) {
test_error = vector()
folds = createFolds(y = data$y, k = 5, list = T, returnTrain = T)
nfolds = length(folds)
for(i in 1:nfolds) {
train = data[folds[[i]], ]
test = data[-folds[[i]], ]
BS_QC_model = BS_QC(train,
shrinkage = 0.1,
iter = 100,
tau = 0.5,
alpha = '?',
h = 0.1)
iter = BS_QC_model$iter
y_pred_test = predict_BS_QC(BS_QC_model, test)$predict_y[[iter]]
y_final_test = ifelse(y_pred_test >= 0, 1, 0)
test_error[i] = mean(test$y != y_final_test)
}
test_error_rate = mean(test_error)
result = list("test_error" = test_error,
"test_error_rate" = test_error_rate)
return(result)
}
MEASURE = Measure(data)
MEASURE$test_error
MEASURE$test_error_rate
################################################################################
## 10-fold cross validation
################################################################################
Measure = function(data) {
test_error = vector()
folds = createFolds(y = data$y, k = 10, list = T, returnTrain = T)
nfolds = length(folds)
for(i in 1:nfolds) {
train = data[folds[[i]], ]
test = data[-folds[[i]], ]
BS_QC_model = BS_QC(train,
shrinkage = 0.1,
iter = 100,
tau = 0.5,
alpha = '?',
h = 0.1)
iter = BS_QC_model$iter
y_pred_test = predict_BS_QC(BS_QC_model, test)$predict_y[[iter]]
y_final_test = ifelse(y_pred_test >= 0, 1, 0)
test_error[i] = mean(test$y != y_final_test)
}
test_error_rate = mean(test_error)
result = list("test_error" = test_error,
"test_error_rate" = test_error_rate)
return(result)
}
MEASURE = Measure(data)
MEASURE$test_error
MEASURE$test_error_rate
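The same three cross-validation schemes are then applied to the QBC baseline: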
#######################################################################################
## Leave-One-Out(LOO) cross validation
#######################################################################################
Measure = function(data) {
n_LOO = vector()
folds = createFolds(y = data$y, k = dim(data)[1], list = TRUE, returnTrain = TRUE)
nfolds = length(folds)
for(i in 1:nfolds) {
train = data[folds[[i]], ]
test = data[-folds[[i]], ]
QBC_model = QBC(train,
shrinkage = 0.1,
iter = 100,
tau = 0.5,
h = 0.1)
iter = QBC_model$iter
y_pred_test = predict_QBC(QBC_model, test)$predict_y[[iter]]
y_final_test = ifelse(y_pred_test >= 0, 1, 0)
n_LOO[i] = mean(test$y != y_final_test)
}
N_LOO = sum(n_LOO)
test_error_rate = N_LOO / nfolds
result = list("N_LOO" = N_LOO,
"test_error_rate" = test_error_rate)
return(result)
}
MEASURE = Measure(data)
objects(MEASURE)
MEASURE$N_LOO
MEASURE$test_error_rate
################################################################################
## 5-fold cross validation
################################################################################
Measure = function(data) {
test_error = vector()
folds = createFolds(y = data$y, k = 5, list = T, returnTrain = T)
nfolds = length(folds)
for(i in 1:nfolds) {
train = data[folds[[i]], ]
test = data[-folds[[i]], ]
QBC_model = QBC(train,
shrinkage = 0.1,
iter = 100,
tau = 0.5,
h = 0.1)
iter = QBC_model$iter
y_pred_test = predict_QBC(QBC_model, test)$predict_y[[iter]]
y_final_test = ifelse(y_pred_test >= 0, 1, 0)
test_error[i] = mean(test$y != y_final_test)
}
test_error_rate = mean(test_error)
result = list("test_error" = test_error,
"test_error_rate" = test_error_rate)
return(result)
}
MEASURE = Measure(data)
objects(MEASURE)
MEASURE$test_error
MEASURE$test_error_rate
################################################################################
## 10-fold cross validation
################################################################################
Measure = function(data) {
test_error = vector()
folds = createFolds(y = data$y, k = 10, list = T, returnTrain = T)
nfolds = length(folds)
for(i in 1:nfolds) {
train = data[folds[[i]], ]
test = data[-folds[[i]], ]
QBC_model = QBC(train,
shrinkage = 0.1,
iter = 100,
tau = 0.5,
h = 0.1)
iter = QBC_model$iter
y_pred_test = predict_QBC(QBC_model, test)$predict_y[[iter]]
y_final_test = ifelse(y_pred_test >= 0, 1, 0)
test_error[i] = mean(test$y != y_final_test)
}
test_error_rate = mean(test_error)
result = list("test_error" = test_error,
"test_error_rate" = test_error_rate)
return(result)
}
MEASURE = Measure(data)
objects(MEASURE)
MEASURE$test_error
MEASURE$test_error_rate
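The same evaluation for the LogitBoost baseline (gbm with distribution = "bernoulli"):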
###################################################################################
## Leave-One-Out(LOO) cross validation
###################################################################################
Measure = function(data) {
library(caret)
library(gbm)
n_LOO = vector()
folds = createFolds(y = data$y, k = dim(data)[1], list = T, returnTrain = T)
nfolds = length(folds)
for(i in 1:nfolds) {
train = data[folds[[i]], ]
test = data[-folds[[i]], ]
LogitBoost_model = gbm(y ~ .,
distribution = "bernoulli",
n.minobsinnode = 1,
bag.fraction = 1,
data = train,
n.trees = 100,
shrinkage = 0.1)
iter = LogitBoost_model$n.trees
y_pred_test = predict(LogitBoost_model, test, iter)
y_final_test = ifelse(y_pred_test > 0.5, 1, 0)
n_LOO[i] = mean(test$y != y_final_test)
}
N_LOO = sum(n_LOO)
test_error_rate = N_LOO / nfolds
result = list("N_LOO" = N_LOO,
"test_error_rate" = test_error_rate)
return(result)
}
MEASURE = Measure(data)
objects(MEASURE)
MEASURE$N_LOO
MEASURE$test_error_rate
################################################################################
## 5-fold cross validation
################################################################################
Measure = function(data) {
library(caret)
library(gbm)
test_error = vector()
folds = createFolds(y = data$y, k = 5, list = T, returnTrain = T)
nfolds = length(folds)
for(i in 1:nfolds) {
train = data[folds[[i]], ]
test = data[-folds[[i]], ]
LogitBoost_model = gbm(y ~ .,
distribution = "bernoulli",
n.minobsinnode = 1,
bag.fraction = 1,
data = train,
n.trees = 100,
shrinkage = 0.1)
iter = LogitBoost_model$n.trees
y_pred_test = predict(LogitBoost_model, test, iter)
y_final_test = ifelse(y_pred_test > 0.5, 1, 0)
test_error[i] = mean(test$y != y_final_test)
}
test_error_rate = mean(test_error)
result = list("test_error" = test_error,
"test_error_rate" = test_error_rate)
return(result)
}
MEASURE = Measure(data)
objects(MEASURE)
MEASURE$test_error
MEASURE$test_error_rate
################################################################################
## 10-fold cross validation
################################################################################
Measure = function(data) {
library(caret)
library(gbm)
test_error = vector()
folds = createFolds(y = data$y, k = 10, list = T, returnTrain = T)
nfolds = length(folds)
for(i in 1:nfolds) {
train = data[folds[[i]], ]
test = data[-folds[[i]], ]
LogitBoost_model = gbm(y ~ .,
distribution = "bernoulli",
n.minobsinnode = 1,
bag.fraction = 1,
data = train,
n.trees = 100,
shrinkage = 0.1)
iter = LogitBoost_model$n.trees
y_pred_test = predict(LogitBoost_model, test, iter)
y_final_test = ifelse(y_pred_test > 0.5, 1, 0)
test_error[i] = mean(test$y != y_final_test)
}
test_error_rate = mean(test_error)
result = list("test_error" = test_error,
"test_error_rate" = test_error_rate)
return(result)
}
MEASURE = Measure(data)
objects(MEASURE)
MEASURE$test_error
MEASURE$test_error_rate
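The same evaluation for the L2Boost baseline (gbm with distribution = "gaussian"):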
#####################################################################################
## Leave-One-Out(LOO) cross validation
#####################################################################################
Measure = function(data) {
library(caret)
library(gbm)
n_LOO = vector()
folds = createFolds(y = data$y, k = dim(data)[1], list = T, returnTrain = T)
nfolds = length(folds)
for(i in 1:nfolds) {
train = data[folds[[i]], ]
test = data[-folds[[i]], ]
L2_Boost_model = gbm(y ~ .,
distribution = "gaussian",
n.minobsinnode = 1,
bag.fraction = 1,
data = train,
n.trees = 100,
shrinkage = 0.1)
iter = L2_Boost_model$n.trees
y_pred_test = predict(L2_Boost_model, test, iter)
y_final_test = ifelse(y_pred_test > 0.5, 1, 0)
n_LOO[i] = mean(test$y != y_final_test)
}
N_LOO = sum(n_LOO)
test_error_rate = N_LOO / nfolds
result = list("N_LOO" = N_LOO,
"test_error_rate" = test_error_rate)
return(result)
}
MEASURE = Measure(data)
objects(MEASURE)
MEASURE$N_LOO
MEASURE$test_error_rate
###################################################################################
## 5-fold cross validation
###################################################################################
Measure = function(data) {
library(caret)
library(gbm)
test_error = vector()
folds = createFolds(y = data$y, k = 5, list = T, returnTrain = T)
nfolds = length(folds)
for(i in 1:nfolds) {
train = data[folds[[i]], ]
test = data[-folds[[i]], ]
L2_Boost_model = gbm(y ~ .,
distribution = "gaussian",
n.minobsinnode = 1,
bag.fraction = 1,
data = train,
n.trees = 100,
shrinkage = 0.1)
iter = L2_Boost_model$n.trees
y_pred_test = predict(L2_Boost_model, test, iter)
y_final_test = ifelse(y_pred_test > 0.5, 1, 0)
test_error[i] = mean(test$y != y_final_test)
}
test_error_rate = mean(test_error)
result = list("test_error" = test_error,
"test_error_rate" = test_error_rate)
return(result)
}
MEASURE = Measure(data)
objects(MEASURE)
MEASURE$test_error
MEASURE$test_error_rate
####################################################################################
## 10-fold cross validation
####################################################################################
Measure = function(data) {
library(caret)
library(gbm)
test_error = vector()
folds = createFolds(y = data$y, k = 10, list = T, returnTrain = T)
nfolds = length(folds)
for(i in 1:nfolds) {
train = data[folds[[i]], ]
test = data[-folds[[i]], ]
L2_Boost_model = gbm(y ~ .,
distribution = "gaussian",
n.minobsinnode = 1,
bag.fraction = 1,
data = train,
n.trees = 100,
shrinkage = 0.1)
iter = L2_Boost_model$n.trees
y_pred_test = predict(L2_Boost_model, test, iter)
y_final_test = ifelse(y_pred_test > 0.5, 1, 0)
test_error[i] = mean(test$y != y_final_test)
}
test_error_rate = mean(test_error)
result = list("test_error" = test_error,
"test_error_rate" = test_error_rate)
return(result)
}
MEASURE = Measure(data)
objects(MEASURE)
MEASURE$test_error
MEASURE$test_error_rate
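The same evaluation for the AdaBoost baseline (gbm with distribution = "adaboost"):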
####################################################################################
## Leave-One-Out(LOO) cross validation
####################################################################################
Measure = function(data) {
library(caret)
library(gbm)
n_LOO = vector()
folds = createFolds(y = data$y, k = dim(data)[1], list = T, returnTrain = T)
nfolds = length(folds)
for(i in 1:nfolds) {
train = data[folds[[i]], ]
test = data[-folds[[i]], ]
AdaBoost_model = gbm(y ~ .,
distribution = "adaboost",
n.minobsinnode = 1,
bag.fraction = 1,
data = train,
n.trees = 100,
shrinkage = 0.1)
iter = AdaBoost_model$n.trees
y_pred_test = predict(AdaBoost_model, test, iter)
y_final_test = ifelse(y_pred_test > 0.5, 1, 0)
n_LOO[i] = mean(test$y != y_final_test)
}
N_LOO = sum(n_LOO)
test_error_rate = N_LOO / nfolds
result = list("N_LOO" = N_LOO,
"test_error_rate" = test_error_rate)
return(result)
}
MEASURE = Measure(data)
objects(MEASURE)
MEASURE$N_LOO
MEASURE$test_error_rate
################################################################################
## 5-fold cross validation
################################################################################
Measure = function(data) {
library(caret)
library(gbm)
test_error = vector()
folds = createFolds(y = data$y, k = 5, list = T, returnTrain = T)
nfolds = length(folds)
for(i in 1:nfolds) {
train = data[folds[[i]], ]
test = data[-folds[[i]], ]
AdaBoost_model = gbm(y ~ .,
distribution = "adaboost",
n.minobsinnode = 1,
bag.fraction = 1,
data = train,
n.trees = 100,
shrinkage = 0.1)
iter = AdaBoost_model$n.trees
y_pred_test = predict(AdaBoost_model, test, iter)
y_final_test = ifelse(y_pred_test > 0.5, 1, 0)
test_error[i] = mean(test$y != y_final_test)
}
test_error_rate = mean(test_error)
result = list("test_error" = test_error,
"test_error_rate" = test_error_rate)
return(result)
}
MEASURE = Measure(data)
MEASURE$test_error
MEASURE$test_error_rate
################################################################################
## 10-fold cross validation
################################################################################
Measure = function(data) {
library(caret)
library(gbm)
test_error = vector()
folds = createFolds(y = data$y, k = 10, list = T, returnTrain = T)
nfolds = length(folds)
for(i in 1:nfolds) {
train = data[folds[[i]], ]
test = data[-folds[[i]], ]
AdaBoost_model = gbm(y ~ .,
distribution = "adaboost",
n.minobsinnode = 1,
bag.fraction = 1,
data = train,
n.trees = 100,
shrinkage = 0.1)
iter = AdaBoost_model$n.trees
y_pred_test = predict(AdaBoost_model, test, iter)
y_final_test = ifelse(y_pred_test > 0.5, 1, 0)
test_error[i] = mean(test$y != y_final_test)
}
test_error_rate = mean(test_error)
result = list("test_error" = test_error,
"test_error_rate" = test_error_rate)
return(result)
}
MEASURE = Measure(data)
MEASURE$test_error
MEASURE$test_error_rate
Conclusions:
- With LOO-CV:
  - On the Colon dataset, SQBC performs better than AdaBoost, LogitBoost and QBC, but worse than L2_Boost; however, the misclassification rates of SQBC and L2_Boost are very close.
  - On the Leukemia dataset, AdaBoost and LogitBoost are the two best-performing algorithms. SQBC performs better than L2_Boost and QBC, and the results of AdaBoost and LogitBoost are very close to those of SQBC.
- With 5-fold CV and 10-fold CV, SQBC gives better results than the other four classification algorithms, i.e. lower classification error rates.