Part 0: Case Background and Modeling Approach

1. Background

The data for this case study come from the Kaggle competition “Santander Customer Satisfaction”. It is an imbalanced binary-classification problem, and the objective is to maximize the AUC (the area under the ROC curve). The competition page is https://www.kaggle.com/c/santander-customer-satisfaction . The competition has since ended.
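Since the AUC is the only evaluation metric used below, here is a minimal pROC sketch on made-up labels and scores (purely illustrative) showing the roc()/auc() pattern that the tuning loops rely on:

library(pROC)
## toy labels and predicted probabilities, for illustration only
toy_actual <- c(0, 0, 0, 1, 0, 1, 0, 1, 0, 0)
toy_prob   <- c(.1, .2, .3, .8, .4, .7, .2, .9, .1, .3)
toy_roc <- roc(toy_actual, toy_prob)  ## same call pattern as in the loops below
toy_roc$auc                           ## area under the ROC curve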

2. Modeling Approach

This document uses the smote function from R's mlr package to deal with the class imbalance, and the rxFastForest function from Microsoft R Server (shipped in the MicrosoftML package, alongside RevoScaleR) for random forest modeling. Fitting randomForest (from the randomForest package) through mlr, even with parallel execution, is still too slow for practical use. RevoScaleR's rxDForest can also build random forests, but it is far slower than rxFastForest, so rxFastForest is used throughout. Because random forest training is comparatively slow, the data read here are the already-processed xgb_tr3 and xgb_te3 data sets from the document at http://rpubs.com/yisu/xgboost_mlr_kaggle_case_oversample (features retaining roughly 95% of the total information gain); this document therefore starts directly with modeling and repeats no data exploration or preprocessing. The workflow is:
1) Read the data;
2) Parallel computation: rxFastForest parallelizes internally through its own arguments (e.g. trainThreads), so the doParallel and foreach packages are not needed;
3) Feature selection: not performed in this document;
4) Parameter tuning: tune the rxFastForest parameters one at a time, revisiting them as needed, until the results are satisfactory;
5) Ensemble the predictions: draw random parameter values from the suitable range found for each parameter, fit an rxFastForest model for each draw, and average the models' predictions; the submission produced by this script reaches an AUC of 0.829533, above the top result on the Private Leaderboard.

Part I: Reading the Data

rx_tr <- rxImport('C:/Users/Administrator/Documents/kaggle/scs_rf/rf_tr3.csv')
rx_te <- rxImport('C:/Users/Administrator/Documents/kaggle/scs_rf/rf_te3.csv')
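The mlr classification tasks created below require the target to be a factor, so if rxImport has read TARGET as numeric 0/1 it should be converted first. A small sketch (assuming the label column is named TARGET, as in the rest of the document), which also shows the class imbalance that smote is meant to correct:

rx_tr$TARGET <- factor(rx_tr$TARGET)  ## only needed if TARGET was imported as numeric
table(rx_tr$TARGET)                   ## absolute class counts
prop.table(table(rx_tr$TARGET))       ## class proportions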

Part II: Algorithm

1. Modeling Preparation

1) Model formula
rx_tr_ncol <- ncol(rx_tr)   ## TARGET is assumed to be the last column of rx_tr
rx_tr_nrow <- nrow(rx_tr)   ## used later by createFolds()
rx_formula <- paste0(
    'TARGET ~ ', 
    paste0(colnames(rx_tr)[1:(rx_tr_ncol - 1)], collapse = ' + ')
)
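As a quick sanity check (illustrative only; the exact output depends on the column names in rf_tr3.csv), the assembled string should start with "TARGET ~ " and list every column except TARGET:

rx_tr_ncol - 1                            ## number of predictors in the formula
cat(substr(rx_formula, 1, 100), '...\n')  ## beginning of the formula string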
2) Load packages
library(mlr)         ## provides the smote() function
library(parallelMap) ## parallel backend for mlr
parallelStartSocket(4)
library(pROC)        ## AUC computation
library(caret)       ## createFolds() for the 10-fold cross-validation
library(ggplot2)     ## plot the AUC against each parameter value to pick the optimum

2. Tuning the rate and nn arguments of smote (handling the class imbalance)

1) The rate and nn grid: rate is the factor by which the minority class is oversampled and nn is the number of nearest neighbours smote uses to synthesize new cases (a quick check of their effect follows the grid below)
grid_search <- expand.grid(
    rate = seq(5, 50, 5),
    nn = seq(5, 17, 2)
)
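Before running the full grid, the effect of a single rate/nn pair on the class counts can be checked by applying smote once to the whole training task; a minimal sketch (rate = 10 and nn = 5 are arbitrary illustrative values):

task_demo  <- makeClassifTask(data = rx_tr, target = 'TARGET')
task_smote <- smote(task_demo, rate = 10, nn = 5)
table(getTaskData(task_demo)$TARGET)   ## original class counts
table(getTaskData(task_smote)$TARGET)  ## minority class upsampled by roughly the chosen rate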
2) perf matrix to hold the AUC values
perf_rate_1 <- matrix(nrow = nrow(grid_search), ncol = 10) ## one column per fold of the 10-fold CV
3) 10-fold cross-validation
set.seed(1)
folds <- createFolds(y=1:rx_tr_nrow, k=10)  
4) Compute the AUC values; rxFastForest already parallelizes internally, so a plain loop is used here
for (j in 1:nrow(grid_search)){
    for (i in 1:10){
        # split the data into training and validation folds
        rx_tr_1 <- rx_tr[-folds[[i]], ]
        rx_tr_2 <- rx_tr[ folds[[i]], ]
        # SMOTE oversampling
        rx_tr_1_task <- makeClassifTask(data = rx_tr_1, target = 'TARGET')
        rx_tr_1_task_smote <- smote(
            rx_tr_1_task, 
            rate = grid_search[j, 'rate'],
            nn = grid_search[j, 'nn']
        )
        rx_tr_1_2 <- getTaskData(rx_tr_1_task_smote)
        
        # train the model
        rx_tr_mod <- rxFastForest(
            formula = rx_formula,
            data = rx_tr_1_2,
            numTrees = 500,
            trainThreads = 4
        )
        # predict on the held-out fold
        rx_tr_pre <- rxPredict(
            rx_tr_mod,
            rx_tr_2
        )
        # convert the label to numeric so the AUC can be computed
        rx_tr_pre$actual <- as.numeric(rx_tr_2$TARGET)
        
        # compute the AUC
        rx_tr_roc <- roc(rx_tr_pre$actual, rx_tr_pre$Probability.1)
        perf_rate_1[j, i] <- rx_tr_roc$auc
    }
}
perf_rate_1_f <- apply(perf_rate_1, 1, mean) ## average AUC across the 10 folds
## plot
grid_search$perf <- perf_rate_1_f
ggplot(data = grid_search, aes(x = rate, y = perf)) +
    geom_point() + 
    facet_wrap(facets = ~ nn, ncol = 3)
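Instead of reading the optimum off the plot alone, the best combination can also be pulled out directly (a small check using the objects defined above; the same idea applies to every tuning step below):

grid_search[which.max(grid_search$perf), ]  ## row with the highest mean AUC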

5) Conclusion: rate = 5, nn = 9 is best

3. Further tuning of the smote rate argument (class imbalance)

1) 10-fold cross-validation
set.seed(2)
folds <- createFolds(y=1:rx_tr_nrow, k=10)  
2) rate grid
grid_search <- expand.grid(
    rate = seq(2, 9, 1),
    nn = 9
)
3) perf matrix to hold the AUC values
perf_rate_2 <- matrix(nrow = nrow(grid_search), ncol = 10)
4) Compute the AUC values
for (j in 1:nrow(grid_search)){
    for (i in 1:10){
        # split the data into training and validation folds
        rx_tr_1 <- rx_tr[-folds[[i]], ]
        rx_tr_2 <- rx_tr[ folds[[i]], ]
        # SMOTE oversampling
        rx_tr_1_task <- makeClassifTask(data = rx_tr_1, target = 'TARGET')
        rx_tr_1_task_smote <- smote(
            rx_tr_1_task, 
            rate = grid_search[j, 'rate'],
            nn = grid_search[j, 'nn']
        )
        rx_tr_1_2 <- getTaskData(rx_tr_1_task_smote)
        
        # train the model
        rx_tr_mod <- rxFastForest(
            formula = rx_formula,
            data = rx_tr_1_2,
            numTrees = 500,
            trainThreads = 4
        )
        # predict on the held-out fold
        rx_tr_pre <- rxPredict(
            rx_tr_mod,
            rx_tr_2
        )
        # convert the label to numeric so the AUC can be computed
        rx_tr_pre$actual <- as.numeric(rx_tr_2$TARGET)
        
        # compute the AUC
        rx_tr_roc <- roc(rx_tr_pre$actual, rx_tr_pre$Probability.1)
        perf_rate_2[j, i] <- rx_tr_roc$auc
    }
}
perf_rate_2_f <- apply(perf_rate_2, 1, mean) ## average AUC across the 10 folds
## plot
grid_search$perf <- perf_rate_2_f
ggplot(data = grid_search, aes(x = rate, y = perf)) +
    geom_point()
5) Conclusion: rate = 4, nn = 9 is best

4. Tuning the numTrees argument of rxFastForest

1) 10-fold cross-validation
set.seed(3)
folds <- createFolds(y=1:rx_tr_nrow, k=10) 
2) numTrees grid
grid_search <- expand.grid(
    numTrees = seq(100, 1000, 100)
)
3) perf matrix to hold the AUC values
perf_numTrees_1 <- matrix(nrow = nrow(grid_search), ncol = 10)
4) Compute the AUC values
for (j in 1:nrow(grid_search)){
    for (i in 1:10){
        # split the data into training and validation folds
        rx_tr_1 <- rx_tr[-folds[[i]], ]
        rx_tr_2 <- rx_tr[ folds[[i]], ]
        # SMOTE oversampling
        rx_tr_1_task <- makeClassifTask(data = rx_tr_1, target = 'TARGET')
        rx_tr_1_task_smote <- smote(
            rx_tr_1_task, 
            rate = 4,
            nn = 9
        )
        rx_tr_1_2 <- getTaskData(rx_tr_1_task_smote)
        
        # train the model
        rx_tr_mod <- rxFastForest(
            formula = rx_formula,
            data = rx_tr_1_2,
            numTrees = grid_search[j, 'numTrees'],
            trainThreads = 4
        )
        # predict on the held-out fold
        rx_tr_pre <- rxPredict(
            rx_tr_mod,
            rx_tr_2
        )
        # convert the label to numeric so the AUC can be computed
        rx_tr_pre$actual <- as.numeric(rx_tr_2$TARGET)
        
        # compute the AUC
        rx_tr_roc <- roc(rx_tr_pre$actual, rx_tr_pre$Probability.1)
        perf_numTrees_1[j, i] <- rx_tr_roc$auc
    }
}
perf_numTrees_1_f <- apply(perf_numTrees_1, 1, mean)
grid_search$perf <- perf_numTrees_1_f
ggplot(data = grid_search, aes(x = numTrees, y = perf)) +
    geom_point()
5) Conclusion: numTrees = 600 is best

5. Tuning the numLeaves argument of rxFastForest

1) 10-fold cross-validation
set.seed(4)
folds <- createFolds(y=1:rx_tr_nrow, k=10) 
2) numLeaves grid
grid_search <- expand.grid(
    numTrees = 600,
    numLeaves = 2 ^ (5:9)
)
3) perf matrix to hold the AUC values
perf_numLeaves_1 <- matrix(nrow = nrow(grid_search), ncol = 10)
4) Compute the AUC values
for (j in 1:nrow(grid_search)){
    for (i in 1:10){
        # split the data into training and validation folds
        rx_tr_1 <- rx_tr[-folds[[i]], ]
        rx_tr_2 <- rx_tr[ folds[[i]], ]
        # SMOTE oversampling
        rx_tr_1_task <- makeClassifTask(data = rx_tr_1, target = 'TARGET')
        rx_tr_1_task_smote <- smote(
            rx_tr_1_task, 
            rate = 4,
            nn = 9
        )
        rx_tr_1_2 <- getTaskData(rx_tr_1_task_smote)
        
        # train the model
        rx_tr_mod <- rxFastForest(
            formula = rx_formula,
            data = rx_tr_1_2,
            numTrees = grid_search[j, 'numTrees'],
            numLeaves = grid_search[j, 'numLeaves'],
            trainThreads = 4
        )
        # predict on the held-out fold
        rx_tr_pre <- rxPredict(
            rx_tr_mod,
            rx_tr_2
        )
        # convert the label to numeric so the AUC can be computed
        rx_tr_pre$actual <- as.numeric(rx_tr_2$TARGET)
        
        # compute the AUC
        rx_tr_roc <- roc(rx_tr_pre$actual, rx_tr_pre$Probability.1)
        perf_numLeaves_1[j, i] <- rx_tr_roc$auc
    }
}
perf_numLeaves_1_f <- apply(perf_numLeaves_1, 1, mean)
grid_search$perf <- perf_numLeaves_1_f
ggplot(data = grid_search, aes(x = numLeaves, y = perf)) +
    geom_point()
5) Conclusion: numLeaves = 2^7 is best

6. Tuning the minSplit argument of rxFastForest

1) 10-fold cross-validation
set.seed(5)
folds <- createFolds(y=1:rx_tr_nrow, k=10) 
2) minSplit grid
grid_search <- expand.grid(
    numTrees = 500,
    numLeaves = 2^7,
    minSplit = seq(5, 30, 5)
)
3) perf matrix to hold the AUC values
perf_minSplit_1 <- matrix(nrow = nrow(grid_search), ncol = 10)
4) Compute the AUC values
for (j in 1:nrow(grid_search)){
    for (i in 1:10){
        # split the data into training and validation folds
        rx_tr_1 <- rx_tr[-folds[[i]], ]
        rx_tr_2 <- rx_tr[ folds[[i]], ]
        # SMOTE oversampling
        rx_tr_1_task <- makeClassifTask(data = rx_tr_1, target = 'TARGET')
        rx_tr_1_task_smote <- smote(
            rx_tr_1_task, 
            rate = 4,
            nn = 9
        )
        rx_tr_1_2 <- getTaskData(rx_tr_1_task_smote)
        
        # train the model
        rx_tr_mod <- rxFastForest(
            formula = rx_formula,
            data = rx_tr_1_2,
            numTrees = grid_search[j, 'numTrees'],
            numLeaves = grid_search[j, 'numLeaves'],
            minSplit = grid_search[j, 'minSplit'],
            trainThreads = 4
        )
        # predict on the held-out fold
        rx_tr_pre <- rxPredict(
            rx_tr_mod,
            rx_tr_2
        )
        # convert the label to numeric so the AUC can be computed
        rx_tr_pre$actual <- as.numeric(rx_tr_2$TARGET)
        
        # compute the AUC
        rx_tr_roc <- roc(rx_tr_pre$actual, rx_tr_pre$Probability.1)
        perf_minSplit_1[j, i] <- rx_tr_roc$auc
    }
}
perf_minSplit_1_f <- apply(perf_minSplit_1, 1, mean, na.rm = TRUE)
grid_search$perf <- perf_minSplit_1_f
ggplot(data = grid_search, aes(x = minSplit, y = perf)) +
    geom_point()
5) Conclusion: minSplit = 25 is best

7. Tuning the exampleFraction argument of rxFastForest

1) 10-fold cross-validation
set.seed(6)
folds <- createFolds(y=1:rx_tr_nrow, k=10) 
2) exampleFraction grid
grid_search <- expand.grid(
    numTrees = 500,
    numLeaves = 2^7,
    minSplit = 25,
    exampleFraction = seq(.55, .9, .05)
)
3) perf matrix to hold the AUC values
perf_exampleFraction_1 <- matrix(nrow = nrow(grid_search), ncol = 10)
4) Compute the AUC values
for (j in 1:nrow(grid_search)){
    for (i in 1:10){
        # split the data into training and validation folds
        rx_tr_1 <- rx_tr[-folds[[i]], ]
        rx_tr_2 <- rx_tr[ folds[[i]], ]
        # SMOTE oversampling
        rx_tr_1_task <- makeClassifTask(data = rx_tr_1, target = 'TARGET')
        rx_tr_1_task_smote <- smote(
            rx_tr_1_task, 
            rate = 4,
            nn = 9
        )
        rx_tr_1_2 <- getTaskData(rx_tr_1_task_smote)
        
        # train the model
        rx_tr_mod <- rxFastForest(
            formula = rx_formula,
            data = rx_tr_1_2,
            numTrees = grid_search[j, 'numTrees'],
            numLeaves = grid_search[j, 'numLeaves'],
            minSplit = grid_search[j, 'minSplit'],
            exampleFraction = grid_search[j, 'exampleFraction'],
            trainThreads = 4
        )
        # predict on the held-out fold
        rx_tr_pre <- rxPredict(
            rx_tr_mod,
            rx_tr_2
        )
        # convert the label to numeric so the AUC can be computed
        rx_tr_pre$actual <- as.numeric(rx_tr_2$TARGET)
        
        # compute the AUC
        rx_tr_roc <- roc(rx_tr_pre$actual, rx_tr_pre$Probability.1)
        perf_exampleFraction_1[j, i] <- rx_tr_roc$auc
    }
}
perf_exampleFraction_1_f <- apply(perf_exampleFraction_1, 1, mean, na.rm = TRUE)
grid_search$perf <- perf_exampleFraction_1_f
ggplot(data = grid_search, aes(x = exampleFraction, y = perf)) +
    geom_point()
5) Conclusion: exampleFraction = .6 is best; values in [.55, .65] do well, and the AUC varies little over the whole tested range

8. Tuning the featureFraction argument of rxFastForest

1) 10-fold cross-validation
set.seed(7)
folds <- createFolds(y=1:rx_tr_nrow, k=10) 
2) featureFraction grid
grid_search <- expand.grid(
    numTrees = 500,
    numLeaves = 2^7,
    minSplit = 25,
    exampleFraction = .6,
    featureFraction = seq(.5, .9, .05)
)
3) perf matrix to hold the AUC values
perf_featureFraction_1 <- matrix(nrow = nrow(grid_search), ncol = 10)
4) Compute the AUC values
for (j in 1:nrow(grid_search)){
    for (i in 1:10){
        # split the data into training and validation folds
        rx_tr_1 <- rx_tr[-folds[[i]], ]
        rx_tr_2 <- rx_tr[ folds[[i]], ]
        # SMOTE oversampling
        rx_tr_1_task <- makeClassifTask(data = rx_tr_1, target = 'TARGET')
        rx_tr_1_task_smote <- smote(
            rx_tr_1_task, 
            rate = 4,
            nn = 9
        )
        rx_tr_1_2 <- getTaskData(rx_tr_1_task_smote)
        
        # train the model
        rx_tr_mod <- rxFastForest(
            formula = rx_formula,
            data = rx_tr_1_2,
            numTrees = grid_search[j, 'numTrees'],
            numLeaves = grid_search[j, 'numLeaves'],
            minSplit = grid_search[j, 'minSplit'],
            exampleFraction = grid_search[j, 'exampleFraction'],
            featureFraction = grid_search[j, 'featureFraction'],
            trainThreads = 4
        )
        # predict on the held-out fold
        rx_tr_pre <- rxPredict(
            rx_tr_mod,
            rx_tr_2
        )
        # convert the label to numeric so the AUC can be computed
        rx_tr_pre$actual <- as.numeric(rx_tr_2$TARGET)
        
        # compute the AUC
        rx_tr_roc <- roc(rx_tr_pre$actual, rx_tr_pre$Probability.1)
        perf_featureFraction_1[j, i] <- rx_tr_roc$auc
    }
}
perf_featureFraction_1_f <- apply(perf_featureFraction_1, 1, mean, na.rm = TRUE)
grid_search$perf <- perf_featureFraction_1_f
ggplot(data = grid_search, aes(x = featureFraction, y = perf)) +
    geom_point()
5) Conclusion: featureFraction = .85 is best; [.75, .9] does well

9. Tuning the splitFraction argument of rxFastForest

1) 10-fold cross-validation
set.seed(8)
folds <- createFolds(y=1:rx_tr_nrow, k=10) 
2) splitFraction grid
grid_search <- expand.grid(
    numTrees = 500,
    numLeaves = 2^7,
    minSplit = 25,
    exampleFraction = .6,
    featureFraction = .85,
    splitFraction = seq(.5, .95, .05)
)
3) perf matrix to hold the AUC values
perf_splitFraction_1 <- matrix(nrow = nrow(grid_search), ncol = 10)
4) Compute the AUC values
for (j in 1:nrow(grid_search)){
    for (i in 1:10){
        # split the data into training and validation folds
        rx_tr_1 <- rx_tr[-folds[[i]], ]
        rx_tr_2 <- rx_tr[ folds[[i]], ]
        # SMOTE oversampling
        rx_tr_1_task <- makeClassifTask(data = rx_tr_1, target = 'TARGET')
        rx_tr_1_task_smote <- smote(
            rx_tr_1_task, 
            rate = 4,
            nn = 9
        )
        rx_tr_1_2 <- getTaskData(rx_tr_1_task_smote)
        
        # train the model
        rx_tr_mod <- rxFastForest(
            formula = rx_formula,
            data = rx_tr_1_2,
            numTrees = grid_search[j, 'numTrees'],
            numLeaves = grid_search[j, 'numLeaves'],
            minSplit = grid_search[j, 'minSplit'],
            exampleFraction = grid_search[j, 'exampleFraction'],
            featureFraction = grid_search[j, 'featureFraction'],
            splitFraction = grid_search[j, 'splitFraction'],
            trainThreads = 4
        )
        # predict on the held-out fold
        rx_tr_pre <- rxPredict(
            rx_tr_mod,
            rx_tr_2
        )
        # convert the label to numeric so the AUC can be computed
        rx_tr_pre$actual <- as.numeric(rx_tr_2$TARGET)
        
        # compute the AUC
        rx_tr_roc <- roc(rx_tr_pre$actual, rx_tr_pre$Probability.1)
        perf_splitFraction_1[j, i] <- rx_tr_roc$auc
    }
}
perf_splitFraction_1_f <- apply(perf_splitFraction_1, 1, mean, na.rm = TRUE)
grid_search$perf <- perf_splitFraction_1_f
ggplot(data = grid_search, aes(x = splitFraction, y = perf)) +
    geom_point()
5) Conclusion: splitFraction = .5 is best, though the differences are tiny

10. Tuning the numBins argument of rxFastForest

1) 10-fold cross-validation
set.seed(9)
folds <- createFolds(y=1:rx_tr_nrow, k=10) 
2) numBins grid
grid_search <- expand.grid(
    numTrees = 500,
    numLeaves = 2^7,
    minSplit = 25,
    exampleFraction = .6,
    featureFraction = .85,
    splitFraction = .5,
    numBins = seq(105, 505, 50)
)
3) perf matrix to hold the AUC values
perf_numBins_1 <- matrix(nrow = nrow(grid_search), ncol = 10)
4) Compute the AUC values
for (j in 1:nrow(grid_search)){
    for (i in 1:10){
        # split the data into training and validation folds
        rx_tr_1 <- rx_tr[-folds[[i]], ]
        rx_tr_2 <- rx_tr[ folds[[i]], ]
        # SMOTE oversampling
        rx_tr_1_task <- makeClassifTask(data = rx_tr_1, target = 'TARGET')
        rx_tr_1_task_smote <- smote(
            rx_tr_1_task, 
            rate = 4,
            nn = 9
        )
        rx_tr_1_2 <- getTaskData(rx_tr_1_task_smote)
        
        # train the model
        rx_tr_mod <- rxFastForest(
            formula = rx_formula,
            data = rx_tr_1_2,
            numTrees = grid_search[j, 'numTrees'],
            numLeaves = grid_search[j, 'numLeaves'],
            minSplit = grid_search[j, 'minSplit'],
            exampleFraction = grid_search[j, 'exampleFraction'],
            featureFraction = grid_search[j, 'featureFraction'],
            splitFraction = grid_search[j, 'splitFraction'],
            numBins = grid_search[j, 'numBins'],
            trainThreads = 4
        )
        # predict on the held-out fold
        rx_tr_pre <- rxPredict(
            rx_tr_mod,
            rx_tr_2
        )
        # convert the label to numeric so the AUC can be computed
        rx_tr_pre$actual <- as.numeric(rx_tr_2$TARGET)
        
        # compute the AUC
        rx_tr_roc <- roc(rx_tr_pre$actual, rx_tr_pre$Probability.1)
        perf_numBins_1[j, i] <- rx_tr_roc$auc
    }
}
perf_numBins_1_f <- apply(perf_numBins_1, 1, mean, na.rm = TRUE)
grid_search$perf <- perf_numBins_1_f
ggplot(data = grid_search, aes(x = numBins, y = perf)) +
    geom_point() + 
    geom_smooth()
5) Conclusion: numBins = 350 is best, though the differences are tiny

11. Tuning the firstUsePenalty argument of rxFastForest

1) 10-fold cross-validation
set.seed(10)
folds <- createFolds(y=1:rx_tr_nrow, k=10) 
2) firstUsePenalty grid
grid_search <- expand.grid(
    numTrees = 500,
    numLeaves = 2^7,
    minSplit = 25,
    exampleFraction = .6,
    featureFraction = .85,
    splitFraction = .5,
    numBins = 350,
    firstUsePenalty = seq(0, 1, .2)
)
3) perf matrix to hold the AUC values
perf_firstUsePenalty_1 <- matrix(nrow = nrow(grid_search), ncol = 10)
4) Compute the AUC values
for (j in 1:nrow(grid_search)){
    for (i in 1:10){
        # split the data into training and validation folds
        rx_tr_1 <- rx_tr[-folds[[i]], ]
        rx_tr_2 <- rx_tr[ folds[[i]], ]
        # SMOTE oversampling
        rx_tr_1_task <- makeClassifTask(data = rx_tr_1, target = 'TARGET')
        rx_tr_1_task_smote <- smote(
            rx_tr_1_task, 
            rate = 4,
            nn = 9
        )
        rx_tr_1_2 <- getTaskData(rx_tr_1_task_smote)
        
        # train the model
        rx_tr_mod <- rxFastForest(
            formula = rx_formula,
            data = rx_tr_1_2,
            numTrees = grid_search[j, 'numTrees'],
            numLeaves = grid_search[j, 'numLeaves'],
            minSplit = grid_search[j, 'minSplit'],
            exampleFraction = grid_search[j, 'exampleFraction'],
            featureFraction = grid_search[j, 'featureFraction'],
            splitFraction = grid_search[j, 'splitFraction'],
            numBins = grid_search[j, 'numBins'],
            firstUsePenalty = grid_search[j, 'firstUsePenalty'],
            trainThreads = 4
        )
        # predict on the held-out fold
        rx_tr_pre <- rxPredict(
            rx_tr_mod,
            rx_tr_2
        )
        # convert the label to numeric so the AUC can be computed
        rx_tr_pre$actual <- as.numeric(rx_tr_2$TARGET)
        
        # compute the AUC
        rx_tr_roc <- roc(rx_tr_pre$actual, rx_tr_pre$Probability.1)
        perf_firstUsePenalty_1[j, i] <- rx_tr_roc$auc
    }
}
perf_firstUsePenalty_1_f <- apply(perf_firstUsePenalty_1, 1, mean, na.rm = TRUE)
grid_search$perf <- perf_firstUsePenalty_1_f
ggplot(data = grid_search, aes(x = firstUsePenalty, y = perf)) +
    geom_point() + 
    geom_smooth()
5) Conclusion: firstUsePenalty = 1.2 is best

12. Tuning the gainConfLevel argument of rxFastForest

1) 10-fold cross-validation
set.seed(11)
folds <- createFolds(y=1:rx_tr_nrow, k=10) 
2) gainConfLevel grid
grid_search <- expand.grid(
    numTrees = 500,
    numLeaves = 2^7,
    minSplit = 25,
    exampleFraction = .6,
    featureFraction = .85,
    splitFraction = .5,
    numBins = 350,
    firstUsePenalty = 1.2,
    gainConfLevel = seq(.01, .1, .01)
)
3) perf matrix to hold the AUC values
perf_gainConfLevel_2 <- matrix(nrow = nrow(grid_search), ncol = 10)
4) Compute the AUC values
for (j in 1:nrow(grid_search)){
    for (i in 1:10){
        # split the data into training and validation folds
        rx_tr_1 <- rx_tr[-folds[[i]], ]
        rx_tr_2 <- rx_tr[ folds[[i]], ]
        # SMOTE oversampling
        rx_tr_1_task <- makeClassifTask(data = rx_tr_1, target = 'TARGET')
        rx_tr_1_task_smote <- smote(
            rx_tr_1_task, 
            rate = 4,
            nn = 9
        )
        rx_tr_1_2 <- getTaskData(rx_tr_1_task_smote)
        
        # train the model
        rx_tr_mod <- rxFastForest(
            formula = rx_formula,
            data = rx_tr_1_2,
            numTrees = grid_search[j, 'numTrees'],
            numLeaves = grid_search[j, 'numLeaves'],
            minSplit = grid_search[j, 'minSplit'],
            exampleFraction = grid_search[j, 'exampleFraction'],
            featureFraction = grid_search[j, 'featureFraction'],
            splitFraction = grid_search[j, 'splitFraction'],
            numBins = grid_search[j, 'numBins'],
            firstUsePenalty = grid_search[j, 'firstUsePenalty'],
            gainConfLevel = grid_search[j, 'gainConfLevel'],
            trainThreads = 4
        )
        # predict on the held-out fold
        rx_tr_pre <- rxPredict(
            rx_tr_mod,
            rx_tr_2
        )
        # convert the label to numeric so the AUC can be computed
        rx_tr_pre$actual <- as.numeric(rx_tr_2$TARGET)
        
        # compute the AUC
        rx_tr_roc <- roc(rx_tr_pre$actual, rx_tr_pre$Probability.1)
        perf_gainConfLevel_2[j, i] <- rx_tr_roc$auc
    }
}
perf_gainConfLevel_2_f <- apply(perf_gainConfLevel_2, 1, mean, na.rm = TRUE)
grid_search$perf <- perf_gainConfLevel_2_f
ggplot(data = grid_search, aes(x = gainConfLevel, y = perf)) +
    geom_point() + 
    geom_smooth()
5) Conclusion: gainConfLevel = 0.05 is best

13. Re-tuning the numTrees argument of rxFastForest

1) 10-fold cross-validation
set.seed(12)
folds <- createFolds(y=1:rx_tr_nrow, k=10) 
2) numTrees grid
grid_search <- expand.grid(
    numTrees = seq(200, 600, 100),
    numLeaves = 2^7,
    minSplit = 25,
    exampleFraction = .6,
    featureFraction = .85,
    splitFraction = .5,
    numBins = 350,
    firstUsePenalty = 1.2,
    gainConfLevel = .05
)
3) perf matrix to hold the AUC values
perf_numTrees_2 <- matrix(nrow = nrow(grid_search), ncol = 10)
4) Compute the AUC values
for (j in 1:nrow(grid_search)){
    for (i in 1:10){
        # split the data into training and validation folds
        rx_tr_1 <- rx_tr[-folds[[i]], ]
        rx_tr_2 <- rx_tr[ folds[[i]], ]
        # SMOTE oversampling
        rx_tr_1_task <- makeClassifTask(data = rx_tr_1, target = 'TARGET')
        rx_tr_1_task_smote <- smote(
            rx_tr_1_task, 
            rate = 4,
            nn = 9
        )
        rx_tr_1_2 <- getTaskData(rx_tr_1_task_smote)
        
        # train the model
        rx_tr_mod <- rxFastForest(
            formula = rx_formula,
            data = rx_tr_1_2,
            numTrees = grid_search[j, 'numTrees'],
            numLeaves = grid_search[j, 'numLeaves'],
            minSplit = grid_search[j, 'minSplit'],
            exampleFraction = grid_search[j, 'exampleFraction'],
            featureFraction = grid_search[j, 'featureFraction'],
            splitFraction = grid_search[j, 'splitFraction'],
            numBins = grid_search[j, 'numBins'],
            firstUsePenalty = grid_search[j, 'firstUsePenalty'],
            gainConfLevel = grid_search[j, 'gainConfLevel'],
            trainThreads = 4
        )
        # predict on the held-out fold
        rx_tr_pre <- rxPredict(
            rx_tr_mod,
            rx_tr_2
        )
        # convert the label to numeric so the AUC can be computed
        rx_tr_pre$actual <- as.numeric(rx_tr_2$TARGET)
        
        # compute the AUC
        rx_tr_roc <- roc(rx_tr_pre$actual, rx_tr_pre$Probability.1)
        perf_numTrees_2[j, i] <- rx_tr_roc$auc
    }
}
perf_numTrees_2_f <- apply(perf_numTrees_2, 1, mean, na.rm = TRUE)
grid_search$perf <- perf_numTrees_2_f
ggplot(data = grid_search, aes(x = numTrees, y = perf)) +
    geom_point() + 
    geom_smooth()
5) Conclusion: numTrees = 600 is best

Overall conclusion: the selected parameter values no longer change, so tuning stops here

Part III: Ensemble Learning

0) Parameters: sample candidate values around the optima found above, then draw 100 random combinations from the full grid
set.seed(1)
grid_search <- expand.grid(
    numTrees=sample(550:650, 10, replace = FALSE),
    numLeaves = sample(124:132, 4, replace = FALSE),
    minSplit = sample(24:26, 2, replace = FALSE),
    exampleFraction = sample(550:650, 10, replace = FALSE) / 1000,
    featureFraction = sample(750:900, 10, replace = FALSE) / 1000,
    splitFraction = sample(45:55, 5, replace = FALSE) / 100,
    numBins = sample(320:380, 5, replace = FALSE),
    firstUsePenalty = sample(115:125, 5, replace = FALSE) / 100,
    gainConfLevel = sample(45:55, 5, replace = FALSE) / 1000
)
sample_ind <- sample(dim(grid_search)[1], 100, replace = FALSE)

grid_search2 <- grid_search[sample_ind, ]
rm(grid_search)
1) Container for the prediction results
rxfastforest.pred <- list()
2) Training
for (i in 1:nrow(grid_search2)){
    
    # SMOTE oversampling of the full training set
    rx_tr_task <- makeClassifTask(data = rx_tr, target = 'TARGET')
    rx_tr_task_smote <- smote(
        rx_tr_task, 
        rate = 4,
        nn = 9
    )
    rx_tr_2 <- getTaskData(rx_tr_task_smote)
    
    # train the model
    rx_tr_mod <- rxFastForest(
        formula = rx_formula,
        data = rx_tr_2,
        numTrees = grid_search2[i, 'numTrees'],
        numLeaves = grid_search2[i, 'numLeaves'],
        minSplit = grid_search2[i, 'minSplit'],
        exampleFraction = grid_search2[i, 'exampleFraction'],
        featureFraction = grid_search2[i, 'featureFraction'],
        splitFraction = grid_search2[i, 'splitFraction'],
        numBins = grid_search2[i, 'numBins'],
        firstUsePenalty = grid_search2[i, 'firstUsePenalty'],
        gainConfLevel = grid_search2[i, 'gainConfLevel'],
        trainThreads = 4
    )
    # predict on the test set
    rx_tr_pre <- rxPredict(
        rx_tr_mod,
        rx_te
    )
    rxfastforest.pred[[i]] <- rx_tr_pre$Probability.1
}
3) Results: average the predicted probabilities of the 100 models
rxfastforest.pred2 <- matrix(unlist(rxfastforest.pred), ncol = 100)
rxfastforest.pred3 <- data.frame(prob1 = apply(rxfastforest.pred2, 1, mean))
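To gauge how much the 100 models disagree before averaging, the row-wise spread of their predictions can be inspected (an optional check using the matrix built above):

summary(apply(rxfastforest.pred2, 1, sd))  ## per-customer standard deviation across the 100 models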
4) Output
write.csv(rxfastforest.pred3, "C:/Users/Administrator/Documents/kaggle/scs_rf/rxfastforest.pred1.csv")
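The file above holds only the averaged probability column (prob1). For an actual Kaggle submission the probabilities would still have to be paired with the test-set ID column and written out as ID/TARGET; a minimal sketch, assuming a vector of test IDs called te_id is available (it is not created anywhere in this document) and using a hypothetical output path:

submission <- data.frame(ID = te_id, TARGET = rxfastforest.pred3$prob1)  ## te_id is assumed, not defined above
write.csv(submission, "C:/Users/Administrator/Documents/kaggle/scs_rf/submission.csv", row.names = FALSE)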