本案例使用的数据为kaggle中“Santander Customer Satisfaction”比赛的数据。此案例为不平衡二分类问题,目标为最大化auc值(ROC曲线下方面积)。竞赛题目链接为:https://www.kaggle.com/c/santander-customer-satisfaction 。目前此比赛已经结束。
此文档采用R中mlr包的smote算法来处理数据类别不平衡的问题,用Microsoft R Server(专业版R)中MicrosoftML包的rxFastForest函数进行随机森林建模。采用mlr包调用randomForest包的randomForest函数建模时,即使进行并行运算,效率依然低下,不能满足正常工作的需要;因此需要调用Microsoft R Server自带的函数:RevoScaleR包的rxDForest函数可以进行随机森林建模,但是效率远低于rxFastForest函数,因此本文档采用rxFastForest函数。由于随机森林函数效率较低,因此此文档所读取的数据为“ http://rpubs.com/yisu/xgboost_mlr_kaggle_case_oversample ” 文档中处理后的xgb_tr3、xgb_te3数据(提取了约95%的信息增益);故而本文档直接进入建模部分,不再做数据探索与处理。
1) 读取数据;
2) 并行运算:由于rxFastForest函数可以通过设置相应参数进行并行运算,因此不再调用doParallel与foreach包进行并行运算;
3) 特征选择:本文档不再处理;
4) 调参:逐步调试rxFastForest函数的参数,并多次调试,直到满意为止;
5) 集成预测结果:在每个参数的适宜范围内随机抽取参数值构建rxFastForest模型,并将多个模型进行集成,输出预测结果;本案例所用程序输出结果的AUC值为0.829533,已超过Private Leaderboard排名第一的结果。
# ---- Load data and build the model formula ----
# Import the pre-processed training and test sets.
rx_tr <- rxImport('C:/Users/Administrator/Documents/kaggle/scs_rf/rf_tr3.csv')
rx_te <- rxImport('C:/Users/Administrator/Documents/kaggle/scs_rf/rf_te3.csv')
# Dimensions of the training set. The formula below needs the column count
# and the cross-validation sections further down use rx_tr_nrow, so both
# are defined here from the imported data.
rx_tr_nrow <- nrow(rx_tr)
rx_tr_ncol <- ncol(rx_tr)
# Formula text "TARGET ~ x1 + x2 + ...", using every column except the last
# one as a predictor.
# NOTE(review): this assumes TARGET is the last column of rx_tr -- confirm.
rx_formula <- paste0(
  'TARGET ~ ',
  paste0(colnames(rx_tr)[seq_len(rx_tr_ncol - 1)], collapse = ' + ')
)
library(mlr) ## 调用smote函数
library(parallelMap) ## 并行运算
parallelStartSocket(4)
library(pROC) ## 计算auc值
library(caret) ## 十折交叉验证
library(ggplot2) ## 调参时绘图查看参数不同值对应的auc值,以确定最优参数
# ---- SMOTE tuning, round 1: coarse grid over `rate` and `nn` ----
# Every combination of the two smote() arguments to be evaluated.
grid_search <- expand.grid(
  rate = seq(5, 50, 5),
  nn = seq(5, 17, 2)
)
# AUC results: one row per grid point, one column per CV fold.
perf_rate_1 <- matrix(nrow = nrow(grid_search), ncol = 10)
set.seed(1)
# Ten folds of row indices; nrow(rx_tr) is used directly so this section
# does not depend on a separately defined row-count variable.
folds <- createFolds(y = seq_len(nrow(rx_tr)), k = 10)
for (j in seq_len(nrow(grid_search))) {
  for (i in seq_along(folds)) {
    # Split: fold i is held out, the remaining rows are used for training.
    rx_tr_1 <- rx_tr[-folds[[i]], ]
    rx_tr_2 <- rx_tr[ folds[[i]], ]
    # SMOTE is applied to the training part only.
    rx_tr_1_task <- makeClassifTask(data = rx_tr_1, target = 'TARGET')
    rx_tr_1_task_smote <- smote(
      rx_tr_1_task,
      rate = grid_search[j, 'rate'],
      nn = grid_search[j, 'nn']
    )
    rx_tr_1_2 <- getTaskData(rx_tr_1_task_smote)
    # Train on the resampled data.
    rx_tr_mod <- rxFastForest(
      formula = rx_formula,
      data = rx_tr_1_2,
      numTrees = 500,
      trainThreads = 4
    )
    # Predict on the held-out fold.
    rx_tr_pre <- rxPredict(
      rx_tr_mod,
      rx_tr_2
    )
    # Attach the true labels as numeric so roc() can consume them.
    rx_tr_pre$actual <- as.numeric(rx_tr_2$TARGET)
    # AUC of the held-out fold.
    rx_tr_roc <- roc(rx_tr_pre$actual, rx_tr_pre$Probability.1)
    perf_rate_1[j, i] <- rx_tr_roc$auc
  }
}
# Mean AUC over the ten folds for each grid point.
perf_rate_1_f <- apply(perf_rate_1, 1, mean)
# The plot maps y to the `perf` column, so it must be attached to the grid
# first (this assignment was missing and made ggplot fail on `perf`).
grid_search$perf <- perf_rate_1_f
# Plot mean AUC against rate, one panel per nn value.
ggplot(data = grid_search, aes(x = rate, y = perf)) +
  geom_point() +
  facet_wrap(facets = ~ nn, ncol = 3)
# ---- SMOTE tuning, round 2: finer grid over `rate` with nn fixed at 9 ----
set.seed(2)
# Ten folds of row indices; nrow(rx_tr) is used directly so this section
# does not depend on a separately defined row-count variable.
folds <- createFolds(y = seq_len(nrow(rx_tr)), k = 10)
grid_search <- expand.grid(
  rate = seq(2, 9, 1),
  nn = 9
)
# AUC results: one row per grid point, one column per CV fold.
perf_rate_2 <- matrix(nrow = nrow(grid_search), ncol = 10)
for (j in seq_len(nrow(grid_search))) {
  for (i in seq_along(folds)) {
    # Split: fold i is held out, the remaining rows are used for training.
    rx_tr_1 <- rx_tr[-folds[[i]], ]
    rx_tr_2 <- rx_tr[ folds[[i]], ]
    # SMOTE is applied to the training part only.
    rx_tr_1_task <- makeClassifTask(data = rx_tr_1, target = 'TARGET')
    rx_tr_1_task_smote <- smote(
      rx_tr_1_task,
      rate = grid_search[j, 'rate'],
      nn = grid_search[j, 'nn']
    )
    rx_tr_1_2 <- getTaskData(rx_tr_1_task_smote)
    # Train on the resampled data.
    rx_tr_mod <- rxFastForest(
      formula = rx_formula,
      data = rx_tr_1_2,
      numTrees = 500,
      trainThreads = 4
    )
    # Predict on the held-out fold.
    rx_tr_pre <- rxPredict(
      rx_tr_mod,
      rx_tr_2
    )
    # Attach the true labels as numeric so roc() can consume them.
    rx_tr_pre$actual <- as.numeric(rx_tr_2$TARGET)
    # AUC of the held-out fold.
    rx_tr_roc <- roc(rx_tr_pre$actual, rx_tr_pre$Probability.1)
    perf_rate_2[j, i] <- rx_tr_roc$auc
  }
}
# Mean AUC over the ten folds for each grid point.
perf_rate_2_f <- apply(perf_rate_2, 1, mean)
# Plot mean AUC against rate.
grid_search$perf <- perf_rate_2_f
ggplot(data = grid_search, aes(x = rate, y = perf)) +
  geom_point()
# ---- rxFastForest tuning: numTrees (first pass) ----
# SMOTE settings are fixed at rate = 4, nn = 9 from here on.
set.seed(3)
# Ten folds of row indices; nrow(rx_tr) is used directly so this section
# does not depend on a separately defined row-count variable.
folds <- createFolds(y = seq_len(nrow(rx_tr)), k = 10)
grid_search <- expand.grid(
  numTrees = seq(100, 1000, 100)
)
# AUC results: one row per grid point, one column per CV fold.
perf_numTrees_1 <- matrix(nrow = nrow(grid_search), ncol = 10)
for (j in seq_len(nrow(grid_search))) {
  for (i in seq_along(folds)) {
    # Split: fold i is held out, the remaining rows are used for training.
    rx_tr_1 <- rx_tr[-folds[[i]], ]
    rx_tr_2 <- rx_tr[ folds[[i]], ]
    # SMOTE is applied to the training part only.
    rx_tr_1_task <- makeClassifTask(data = rx_tr_1, target = 'TARGET')
    rx_tr_1_task_smote <- smote(
      rx_tr_1_task,
      rate = 4,
      nn = 9
    )
    rx_tr_1_2 <- getTaskData(rx_tr_1_task_smote)
    # Train with the candidate number of trees.
    rx_tr_mod <- rxFastForest(
      formula = rx_formula,
      data = rx_tr_1_2,
      numTrees = grid_search[j, 'numTrees'],
      trainThreads = 4
    )
    # Predict on the held-out fold.
    rx_tr_pre <- rxPredict(
      rx_tr_mod,
      rx_tr_2
    )
    # Attach the true labels as numeric so roc() can consume them.
    rx_tr_pre$actual <- as.numeric(rx_tr_2$TARGET)
    # AUC of the held-out fold.
    rx_tr_roc <- roc(rx_tr_pre$actual, rx_tr_pre$Probability.1)
    perf_numTrees_1[j, i] <- rx_tr_roc$auc
  }
}
# Mean AUC over the ten folds for each grid point, then plot it.
perf_numTrees_1_f <- apply(perf_numTrees_1, 1, mean)
grid_search$perf <- perf_numTrees_1_f
ggplot(data = grid_search, aes(x = numTrees, y = perf)) +
  geom_point()
# ---- rxFastForest tuning: numLeaves ----
set.seed(4)
# Ten folds of row indices; nrow(rx_tr) is used directly so this section
# does not depend on a separately defined row-count variable.
folds <- createFolds(y = seq_len(nrow(rx_tr)), k = 10)
# NOTE(review): numTrees is 600 here but 500 in the sections that follow --
# confirm which value was intended.
grid_search <- expand.grid(
  numTrees = 600,
  numLeaves = 2 ^ (5:9)
)
# AUC results: one row per grid point, one column per CV fold.
perf_numLeaves_1 <- matrix(nrow = nrow(grid_search), ncol = 10)
for (j in seq_len(nrow(grid_search))) {
  for (i in seq_along(folds)) {
    # Split: fold i is held out, the remaining rows are used for training.
    rx_tr_1 <- rx_tr[-folds[[i]], ]
    rx_tr_2 <- rx_tr[ folds[[i]], ]
    # SMOTE is applied to the training part only.
    rx_tr_1_task <- makeClassifTask(data = rx_tr_1, target = 'TARGET')
    rx_tr_1_task_smote <- smote(
      rx_tr_1_task,
      rate = 4,
      nn = 9
    )
    rx_tr_1_2 <- getTaskData(rx_tr_1_task_smote)
    # Train with the candidate settings of grid row j.
    rx_tr_mod <- rxFastForest(
      formula = rx_formula,
      data = rx_tr_1_2,
      numTrees = grid_search[j, 'numTrees'],
      numLeaves = grid_search[j, 'numLeaves'],
      trainThreads = 4
    )
    # Predict on the held-out fold.
    rx_tr_pre <- rxPredict(
      rx_tr_mod,
      rx_tr_2
    )
    # Attach the true labels as numeric so roc() can consume them.
    rx_tr_pre$actual <- as.numeric(rx_tr_2$TARGET)
    # AUC of the held-out fold.
    rx_tr_roc <- roc(rx_tr_pre$actual, rx_tr_pre$Probability.1)
    perf_numLeaves_1[j, i] <- rx_tr_roc$auc
  }
}
# Mean AUC over the ten folds for each grid point, then plot it.
perf_numLeaves_1_f <- apply(perf_numLeaves_1, 1, mean)
grid_search$perf <- perf_numLeaves_1_f
ggplot(data = grid_search, aes(x = numLeaves, y = perf)) +
  geom_point()
# ---- rxFastForest tuning: minSplit ----
set.seed(5)
# Ten folds of row indices; nrow(rx_tr) is used directly so this section
# does not depend on a separately defined row-count variable.
folds <- createFolds(y = seq_len(nrow(rx_tr)), k = 10)
grid_search <- expand.grid(
  numTrees = 500,
  numLeaves = 2^7,
  minSplit = seq(5, 30, 5)
)
# AUC results: one row per grid point, one column per CV fold.
perf_minSplit_1 <- matrix(nrow = nrow(grid_search), ncol = 10)
for (j in seq_len(nrow(grid_search))) {
  for (i in seq_along(folds)) {
    # Split: fold i is held out, the remaining rows are used for training.
    rx_tr_1 <- rx_tr[-folds[[i]], ]
    rx_tr_2 <- rx_tr[ folds[[i]], ]
    # SMOTE is applied to the training part only.
    rx_tr_1_task <- makeClassifTask(data = rx_tr_1, target = 'TARGET')
    rx_tr_1_task_smote <- smote(
      rx_tr_1_task,
      rate = 4,
      nn = 9
    )
    rx_tr_1_2 <- getTaskData(rx_tr_1_task_smote)
    # Train with the candidate settings of grid row j.
    rx_tr_mod <- rxFastForest(
      formula = rx_formula,
      data = rx_tr_1_2,
      numTrees = grid_search[j, 'numTrees'],
      numLeaves = grid_search[j, 'numLeaves'],
      minSplit = grid_search[j, 'minSplit'],
      trainThreads = 4
    )
    # Predict on the held-out fold.
    rx_tr_pre <- rxPredict(
      rx_tr_mod,
      rx_tr_2
    )
    # Attach the true labels as numeric so roc() can consume them.
    rx_tr_pre$actual <- as.numeric(rx_tr_2$TARGET)
    # AUC of the held-out fold.
    rx_tr_roc <- roc(rx_tr_pre$actual, rx_tr_pre$Probability.1)
    perf_minSplit_1[j, i] <- rx_tr_roc$auc
  }
}
# Mean AUC over the folds (ignoring missing entries), then plot it.
perf_minSplit_1_f <- apply(perf_minSplit_1, 1, mean, na.rm = TRUE)
grid_search$perf <- perf_minSplit_1_f
ggplot(data = grid_search, aes(x = minSplit, y = perf)) +
  geom_point()
# ---- rxFastForest tuning: exampleFraction ----
set.seed(6)
# Ten folds of row indices; nrow(rx_tr) is used directly so this section
# does not depend on a separately defined row-count variable.
folds <- createFolds(y = seq_len(nrow(rx_tr)), k = 10)
grid_search <- expand.grid(
  numTrees = 500,
  numLeaves = 2^7,
  minSplit = 25,
  exampleFraction = seq(.55, .9, .05)
)
# AUC results: one row per grid point, one column per CV fold.
perf_exampleFraction_1 <- matrix(nrow = nrow(grid_search), ncol = 10)
for (j in seq_len(nrow(grid_search))) {
  for (i in seq_along(folds)) {
    # Split: fold i is held out, the remaining rows are used for training.
    rx_tr_1 <- rx_tr[-folds[[i]], ]
    rx_tr_2 <- rx_tr[ folds[[i]], ]
    # SMOTE is applied to the training part only.
    rx_tr_1_task <- makeClassifTask(data = rx_tr_1, target = 'TARGET')
    rx_tr_1_task_smote <- smote(
      rx_tr_1_task,
      rate = 4,
      nn = 9
    )
    rx_tr_1_2 <- getTaskData(rx_tr_1_task_smote)
    # Train with the candidate settings of grid row j.
    rx_tr_mod <- rxFastForest(
      formula = rx_formula,
      data = rx_tr_1_2,
      numTrees = grid_search[j, 'numTrees'],
      numLeaves = grid_search[j, 'numLeaves'],
      minSplit = grid_search[j, 'minSplit'],
      exampleFraction = grid_search[j, 'exampleFraction'],
      trainThreads = 4
    )
    # Predict on the held-out fold.
    rx_tr_pre <- rxPredict(
      rx_tr_mod,
      rx_tr_2
    )
    # Attach the true labels as numeric so roc() can consume them.
    rx_tr_pre$actual <- as.numeric(rx_tr_2$TARGET)
    # AUC of the held-out fold.
    rx_tr_roc <- roc(rx_tr_pre$actual, rx_tr_pre$Probability.1)
    perf_exampleFraction_1[j, i] <- rx_tr_roc$auc
  }
}
# Mean AUC over the folds (ignoring missing entries), then plot it.
perf_exampleFraction_1_f <- apply(perf_exampleFraction_1, 1, mean, na.rm = TRUE)
grid_search$perf <- perf_exampleFraction_1_f
ggplot(data = grid_search, aes(x = exampleFraction, y = perf)) +
  geom_point()
# ---- rxFastForest tuning: featureFraction ----
set.seed(7)
# Ten folds of row indices; nrow(rx_tr) is used directly so this section
# does not depend on a separately defined row-count variable.
folds <- createFolds(y = seq_len(nrow(rx_tr)), k = 10)
grid_search <- expand.grid(
  numTrees = 500,
  numLeaves = 2^7,
  minSplit = 25,
  exampleFraction = .6,
  featureFraction = seq(.5, .9, .05)
)
# AUC results: one row per grid point, one column per CV fold.
perf_featureFraction_1 <- matrix(nrow = nrow(grid_search), ncol = 10)
for (j in seq_len(nrow(grid_search))) {
  for (i in seq_along(folds)) {
    # Split: fold i is held out, the remaining rows are used for training.
    rx_tr_1 <- rx_tr[-folds[[i]], ]
    rx_tr_2 <- rx_tr[ folds[[i]], ]
    # SMOTE is applied to the training part only.
    rx_tr_1_task <- makeClassifTask(data = rx_tr_1, target = 'TARGET')
    rx_tr_1_task_smote <- smote(
      rx_tr_1_task,
      rate = 4,
      nn = 9
    )
    rx_tr_1_2 <- getTaskData(rx_tr_1_task_smote)
    # Train with the candidate settings of grid row j.
    rx_tr_mod <- rxFastForest(
      formula = rx_formula,
      data = rx_tr_1_2,
      numTrees = grid_search[j, 'numTrees'],
      numLeaves = grid_search[j, 'numLeaves'],
      minSplit = grid_search[j, 'minSplit'],
      exampleFraction = grid_search[j, 'exampleFraction'],
      featureFraction = grid_search[j, 'featureFraction'],
      trainThreads = 4
    )
    # Predict on the held-out fold.
    rx_tr_pre <- rxPredict(
      rx_tr_mod,
      rx_tr_2
    )
    # Attach the true labels as numeric so roc() can consume them.
    rx_tr_pre$actual <- as.numeric(rx_tr_2$TARGET)
    # AUC of the held-out fold.
    rx_tr_roc <- roc(rx_tr_pre$actual, rx_tr_pre$Probability.1)
    perf_featureFraction_1[j, i] <- rx_tr_roc$auc
  }
}
# Mean AUC over the folds (ignoring missing entries), then plot it.
perf_featureFraction_1_f <- apply(perf_featureFraction_1, 1, mean, na.rm = TRUE)
grid_search$perf <- perf_featureFraction_1_f
ggplot(data = grid_search, aes(x = featureFraction, y = perf)) +
  geom_point()
# ---- rxFastForest tuning: splitFraction ----
set.seed(8)
# Ten folds of row indices; nrow(rx_tr) is used directly so this section
# does not depend on a separately defined row-count variable.
folds <- createFolds(y = seq_len(nrow(rx_tr)), k = 10)
grid_search <- expand.grid(
  numTrees = 500,
  numLeaves = 2^7,
  minSplit = 25,
  exampleFraction = .6,
  featureFraction = .85,
  splitFraction = seq(.5, .95, .05)
)
# AUC results: one row per grid point, one column per CV fold.
perf_splitFraction_1 <- matrix(nrow = nrow(grid_search), ncol = 10)
for (j in seq_len(nrow(grid_search))) {
  for (i in seq_along(folds)) {
    # Split: fold i is held out, the remaining rows are used for training.
    rx_tr_1 <- rx_tr[-folds[[i]], ]
    rx_tr_2 <- rx_tr[ folds[[i]], ]
    # SMOTE is applied to the training part only.
    rx_tr_1_task <- makeClassifTask(data = rx_tr_1, target = 'TARGET')
    rx_tr_1_task_smote <- smote(
      rx_tr_1_task,
      rate = 4,
      nn = 9
    )
    rx_tr_1_2 <- getTaskData(rx_tr_1_task_smote)
    # Train with the candidate settings of grid row j.
    rx_tr_mod <- rxFastForest(
      formula = rx_formula,
      data = rx_tr_1_2,
      numTrees = grid_search[j, 'numTrees'],
      numLeaves = grid_search[j, 'numLeaves'],
      minSplit = grid_search[j, 'minSplit'],
      exampleFraction = grid_search[j, 'exampleFraction'],
      featureFraction = grid_search[j, 'featureFraction'],
      splitFraction = grid_search[j, 'splitFraction'],
      trainThreads = 4
    )
    # Predict on the held-out fold.
    rx_tr_pre <- rxPredict(
      rx_tr_mod,
      rx_tr_2
    )
    # Attach the true labels as numeric so roc() can consume them.
    rx_tr_pre$actual <- as.numeric(rx_tr_2$TARGET)
    # AUC of the held-out fold.
    rx_tr_roc <- roc(rx_tr_pre$actual, rx_tr_pre$Probability.1)
    perf_splitFraction_1[j, i] <- rx_tr_roc$auc
  }
}
# Mean AUC over the folds (ignoring missing entries), then plot it.
perf_splitFraction_1_f <- apply(perf_splitFraction_1, 1, mean, na.rm = TRUE)
grid_search$perf <- perf_splitFraction_1_f
ggplot(data = grid_search, aes(x = splitFraction, y = perf)) +
  geom_point()
# ---- rxFastForest tuning: numBins ----
set.seed(9)
# Ten folds of row indices; nrow(rx_tr) is used directly so this section
# does not depend on a separately defined row-count variable.
folds <- createFolds(y = seq_len(nrow(rx_tr)), k = 10)
grid_search <- expand.grid(
  numTrees = 500,
  numLeaves = 2^7,
  minSplit = 25,
  exampleFraction = .6,
  featureFraction = .85,
  splitFraction = .5,
  numBins = seq(105, 505, 50)
)
# AUC results: one row per grid point, one column per CV fold.
perf_numBins_1 <- matrix(nrow = nrow(grid_search), ncol = 10)
for (j in seq_len(nrow(grid_search))) {
  for (i in seq_along(folds)) {
    # Split: fold i is held out, the remaining rows are used for training.
    rx_tr_1 <- rx_tr[-folds[[i]], ]
    rx_tr_2 <- rx_tr[ folds[[i]], ]
    # SMOTE is applied to the training part only.
    rx_tr_1_task <- makeClassifTask(data = rx_tr_1, target = 'TARGET')
    rx_tr_1_task_smote <- smote(
      rx_tr_1_task,
      rate = 4,
      nn = 9
    )
    rx_tr_1_2 <- getTaskData(rx_tr_1_task_smote)
    # Train with the candidate settings of grid row j.
    rx_tr_mod <- rxFastForest(
      formula = rx_formula,
      data = rx_tr_1_2,
      numTrees = grid_search[j, 'numTrees'],
      numLeaves = grid_search[j, 'numLeaves'],
      minSplit = grid_search[j, 'minSplit'],
      exampleFraction = grid_search[j, 'exampleFraction'],
      featureFraction = grid_search[j, 'featureFraction'],
      splitFraction = grid_search[j, 'splitFraction'],
      numBins = grid_search[j, 'numBins'],
      trainThreads = 4
    )
    # Predict on the held-out fold.
    rx_tr_pre <- rxPredict(
      rx_tr_mod,
      rx_tr_2
    )
    # Attach the true labels as numeric so roc() can consume them.
    rx_tr_pre$actual <- as.numeric(rx_tr_2$TARGET)
    # AUC of the held-out fold.
    rx_tr_roc <- roc(rx_tr_pre$actual, rx_tr_pre$Probability.1)
    perf_numBins_1[j, i] <- rx_tr_roc$auc
  }
}
# Mean AUC over the folds (ignoring missing entries), then plot it with a
# smoothed trend line.
perf_numBins_1_f <- apply(perf_numBins_1, 1, mean, na.rm = TRUE)
grid_search$perf <- perf_numBins_1_f
ggplot(data = grid_search, aes(x = numBins, y = perf)) +
  geom_point() +
  geom_smooth()
# ---- rxFastForest tuning: firstUsePenalty ----
set.seed(10)
# Ten folds of row indices; nrow(rx_tr) is used directly so this section
# does not depend on a separately defined row-count variable.
folds <- createFolds(y = seq_len(nrow(rx_tr)), k = 10)
grid_search <- expand.grid(
  numTrees = 500,
  numLeaves = 2^7,
  minSplit = 25,
  exampleFraction = .6,
  featureFraction = .85,
  splitFraction = .5,
  numBins = 350,
  firstUsePenalty = seq(0, 1, .2)
)
# AUC results: one row per grid point, one column per CV fold.
perf_firstUsePenalty_1 <- matrix(nrow = nrow(grid_search), ncol = 10)
for (j in seq_len(nrow(grid_search))) {
  for (i in seq_along(folds)) {
    # Split: fold i is held out, the remaining rows are used for training.
    rx_tr_1 <- rx_tr[-folds[[i]], ]
    rx_tr_2 <- rx_tr[ folds[[i]], ]
    # SMOTE is applied to the training part only.
    rx_tr_1_task <- makeClassifTask(data = rx_tr_1, target = 'TARGET')
    rx_tr_1_task_smote <- smote(
      rx_tr_1_task,
      rate = 4,
      nn = 9
    )
    rx_tr_1_2 <- getTaskData(rx_tr_1_task_smote)
    # Train with the candidate settings of grid row j.
    rx_tr_mod <- rxFastForest(
      formula = rx_formula,
      data = rx_tr_1_2,
      numTrees = grid_search[j, 'numTrees'],
      numLeaves = grid_search[j, 'numLeaves'],
      minSplit = grid_search[j, 'minSplit'],
      exampleFraction = grid_search[j, 'exampleFraction'],
      featureFraction = grid_search[j, 'featureFraction'],
      splitFraction = grid_search[j, 'splitFraction'],
      numBins = grid_search[j, 'numBins'],
      firstUsePenalty = grid_search[j, 'firstUsePenalty'],
      trainThreads = 4
    )
    # Predict on the held-out fold.
    rx_tr_pre <- rxPredict(
      rx_tr_mod,
      rx_tr_2
    )
    # Attach the true labels as numeric so roc() can consume them.
    rx_tr_pre$actual <- as.numeric(rx_tr_2$TARGET)
    # AUC of the held-out fold.
    rx_tr_roc <- roc(rx_tr_pre$actual, rx_tr_pre$Probability.1)
    perf_firstUsePenalty_1[j, i] <- rx_tr_roc$auc
  }
}
# Mean AUC over the folds (ignoring missing entries), then plot it with a
# smoothed trend line.
perf_firstUsePenalty_1_f <- apply(perf_firstUsePenalty_1, 1, mean, na.rm = TRUE)
grid_search$perf <- perf_firstUsePenalty_1_f
ggplot(data = grid_search, aes(x = firstUsePenalty, y = perf)) +
  geom_point() +
  geom_smooth()
# ---- rxFastForest tuning: gainConfLevel ----
set.seed(11)
# Ten folds of row indices; nrow(rx_tr) is used directly so this section
# does not depend on a separately defined row-count variable.
folds <- createFolds(y = seq_len(nrow(rx_tr)), k = 10)
# NOTE(review): firstUsePenalty is fixed at 1.2 here, which lies outside the
# 0-1 range searched in the firstUsePenalty section of this script --
# confirm that the value comes from a search not shown in this file.
grid_search <- expand.grid(
  numTrees = 500,
  numLeaves = 2^7,
  minSplit = 25,
  exampleFraction = .6,
  featureFraction = .85,
  splitFraction = .5,
  numBins = 350,
  firstUsePenalty = 1.2,
  gainConfLevel = seq(.01, .1, .01)
)
# AUC results: one row per grid point, one column per CV fold.
perf_gainConfLevel_2 <- matrix(nrow = nrow(grid_search), ncol = 10)
for (j in seq_len(nrow(grid_search))) {
  for (i in seq_along(folds)) {
    # Split: fold i is held out, the remaining rows are used for training.
    rx_tr_1 <- rx_tr[-folds[[i]], ]
    rx_tr_2 <- rx_tr[ folds[[i]], ]
    # SMOTE is applied to the training part only.
    rx_tr_1_task <- makeClassifTask(data = rx_tr_1, target = 'TARGET')
    rx_tr_1_task_smote <- smote(
      rx_tr_1_task,
      rate = 4,
      nn = 9
    )
    rx_tr_1_2 <- getTaskData(rx_tr_1_task_smote)
    # Train with the candidate settings of grid row j.
    rx_tr_mod <- rxFastForest(
      formula = rx_formula,
      data = rx_tr_1_2,
      numTrees = grid_search[j, 'numTrees'],
      numLeaves = grid_search[j, 'numLeaves'],
      minSplit = grid_search[j, 'minSplit'],
      exampleFraction = grid_search[j, 'exampleFraction'],
      featureFraction = grid_search[j, 'featureFraction'],
      splitFraction = grid_search[j, 'splitFraction'],
      numBins = grid_search[j, 'numBins'],
      firstUsePenalty = grid_search[j, 'firstUsePenalty'],
      gainConfLevel = grid_search[j, 'gainConfLevel'],
      trainThreads = 4
    )
    # Predict on the held-out fold.
    rx_tr_pre <- rxPredict(
      rx_tr_mod,
      rx_tr_2
    )
    # Attach the true labels as numeric so roc() can consume them.
    rx_tr_pre$actual <- as.numeric(rx_tr_2$TARGET)
    # AUC of the held-out fold.
    rx_tr_roc <- roc(rx_tr_pre$actual, rx_tr_pre$Probability.1)
    perf_gainConfLevel_2[j, i] <- rx_tr_roc$auc
  }
}
# Mean AUC over the folds (ignoring missing entries), then plot it with a
# smoothed trend line.
perf_gainConfLevel_2_f <- apply(perf_gainConfLevel_2, 1, mean, na.rm = TRUE)
grid_search$perf <- perf_gainConfLevel_2_f
ggplot(data = grid_search, aes(x = gainConfLevel, y = perf)) +
  geom_point() +
  geom_smooth()
# ---- rxFastForest tuning: numTrees (second pass, other settings fixed) ----
set.seed(12)
# Ten folds of row indices; nrow(rx_tr) is used directly so this section
# does not depend on a separately defined row-count variable.
folds <- createFolds(y = seq_len(nrow(rx_tr)), k = 10)
grid_search <- expand.grid(
  numTrees = seq(200, 600, 100),
  numLeaves = 2^7,
  minSplit = 25,
  exampleFraction = .6,
  featureFraction = .85,
  splitFraction = .5,
  numBins = 350,
  firstUsePenalty = 1.2,
  gainConfLevel = .05
)
# AUC results: one row per grid point, one column per CV fold.
perf_numTrees_2 <- matrix(nrow = nrow(grid_search), ncol = 10)
for (j in seq_len(nrow(grid_search))) {
  for (i in seq_along(folds)) {
    # Split: fold i is held out, the remaining rows are used for training.
    rx_tr_1 <- rx_tr[-folds[[i]], ]
    rx_tr_2 <- rx_tr[ folds[[i]], ]
    # SMOTE is applied to the training part only.
    rx_tr_1_task <- makeClassifTask(data = rx_tr_1, target = 'TARGET')
    rx_tr_1_task_smote <- smote(
      rx_tr_1_task,
      rate = 4,
      nn = 9
    )
    rx_tr_1_2 <- getTaskData(rx_tr_1_task_smote)
    # Train with the candidate settings of grid row j.
    rx_tr_mod <- rxFastForest(
      formula = rx_formula,
      data = rx_tr_1_2,
      numTrees = grid_search[j, 'numTrees'],
      numLeaves = grid_search[j, 'numLeaves'],
      minSplit = grid_search[j, 'minSplit'],
      exampleFraction = grid_search[j, 'exampleFraction'],
      featureFraction = grid_search[j, 'featureFraction'],
      splitFraction = grid_search[j, 'splitFraction'],
      numBins = grid_search[j, 'numBins'],
      firstUsePenalty = grid_search[j, 'firstUsePenalty'],
      gainConfLevel = grid_search[j, 'gainConfLevel'],
      trainThreads = 4
    )
    # Predict on the held-out fold.
    rx_tr_pre <- rxPredict(
      rx_tr_mod,
      rx_tr_2
    )
    # Attach the true labels as numeric so roc() can consume them.
    rx_tr_pre$actual <- as.numeric(rx_tr_2$TARGET)
    # AUC of the held-out fold.
    rx_tr_roc <- roc(rx_tr_pre$actual, rx_tr_pre$Probability.1)
    perf_numTrees_2[j, i] <- rx_tr_roc$auc
  }
}
# Mean AUC over the folds (ignoring missing entries), then plot it with a
# smoothed trend line.
perf_numTrees_2_f <- apply(perf_numTrees_2, 1, mean, na.rm = TRUE)
grid_search$perf <- perf_numTrees_2_f
ggplot(data = grid_search, aes(x = numTrees, y = perf)) +
  geom_point() +
  geom_smooth()
# ---- Ensemble: average 100 models with randomly perturbed settings ----
set.seed(1)
# Draw a few candidate values per parameter, then take their full cross
# product (10 * 4 * 2 * 10 * 10 * 5 * 5 * 5 * 5 = 5,000,000 rows).
grid_search <- expand.grid(
  numTrees = sample(550:650, 10, replace = FALSE),
  numLeaves = sample(124:132, 4, replace = FALSE),
  minSplit = sample(24:26, 2, replace = FALSE),
  exampleFraction = sample(550:650, 10, replace = FALSE) / 1000,
  featureFraction = sample(750:900, 10, replace = FALSE) / 1000,
  splitFraction = sample(45:55, 5, replace = FALSE) / 100,
  numBins = sample(320:380, 5, replace = FALSE),
  firstUsePenalty = sample(115:125, 5, replace = FALSE) / 100,
  gainConfLevel = sample(45:55, 5, replace = FALSE) / 1000
)
# Keep 100 randomly chosen combinations and drop the large full grid.
sample_ind <- sample(nrow(grid_search), 100, replace = FALSE)
grid_search2 <- grid_search[sample_ind, ]
rm(grid_search)
# One vector of test-set probabilities per model; preallocated.
rxfastforest.pred <- vector('list', nrow(grid_search2))
# The task wraps the full training set and does not change between
# iterations, so it is built once; smote() is still called inside the loop.
rx_tr_task <- makeClassifTask(data = rx_tr, target = 'TARGET')
for (i in seq_len(nrow(grid_search2))) {
  # SMOTE resampling of the full training set.
  rx_tr_task_smote <- smote(
    rx_tr_task,
    rate = 4,
    nn = 9
  )
  rx_tr_2 <- getTaskData(rx_tr_task_smote)
  # Train with the settings of row i.
  rx_tr_mod <- rxFastForest(
    formula = rx_formula,
    data = rx_tr_2,
    numTrees = grid_search2[i, 'numTrees'],
    numLeaves = grid_search2[i, 'numLeaves'],
    minSplit = grid_search2[i, 'minSplit'],
    exampleFraction = grid_search2[i, 'exampleFraction'],
    featureFraction = grid_search2[i, 'featureFraction'],
    splitFraction = grid_search2[i, 'splitFraction'],
    numBins = grid_search2[i, 'numBins'],
    firstUsePenalty = grid_search2[i, 'firstUsePenalty'],
    gainConfLevel = grid_search2[i, 'gainConfLevel'],
    trainThreads = 4
  )
  # Predict on the test set and keep the Probability.1 column.
  rx_tr_pre <- rxPredict(
    rx_tr_mod,
    rx_te
  )
  rxfastforest.pred[[i]] <- rx_tr_pre$Probability.1
}
# One column per model; the column count follows the number of fitted
# models instead of a hard-coded 100.
rxfastforest.pred2 <- matrix(
  unlist(rxfastforest.pred),
  ncol = length(rxfastforest.pred)
)
# Ensemble prediction: row-wise mean of the per-model probabilities.
rxfastforest.pred3 <- data.frame(prob1 = apply(rxfastforest.pred2, 1, mean))
write.csv(rxfastforest.pred3, "C:/Users/Administrator/Documents/kaggle/scs_rf/rxfastforest.pred1.csv")
# Shut down the socket workers started with parallelStartSocket(4) at the
# top of the script so no background R processes are left running.
parallelStop()