本案例使用的数据为kaggle中“Santander Customer Satisfaction”比赛的数据。此案例为不平衡二分类问题,目标为最大化auc值(ROC曲线下方面积)。竞赛题目链接为:https://www.kaggle.com/c/santander-customer-satisfaction 。目前此比赛已经结束。
本文档采用微软开源的lightgbm算法进行分类,运行速度极快,超过xgboost算法与rxFastForest算法。
1) 读取数据;
2) 并行运算:由于lightgbm包可以通过设置相应参数进行并行运算,因此不再调用doParallel与foreach包进行并行运算;
3) 特征选择:使用mlr包计算各特征的信息增益,并保留累计信息增益占比达到99%的特征;
4) 调参:逐步调试lgb.cv函数的参数,并多次调试,直到满意为止;
5) 集成预测结果:在每个参数的适宜范围内随机抽取参数值构建lightgbm模型,并将多个模型进行集成,输出预测结果;本案例所用程序输出结果的AUC值为0.832023,超过了Private Leaderboard排名第一的结果(0.829072)。
# JVM heap limit (8 GB) needed later by feature selection. It has to be set
# before the packages are loaded, otherwise it has no effect.
options(java.parameters = "-Xmx8g")
library(readr)
# Read the raw training and test sets.
lgb_tr1 <- read_csv(
  file = "C:/Users/Administrator/Documents/kaggle/scs_lgb/train.csv"
)
lgb_te1 <- read_csv(
  file = "C:/Users/Administrator/Documents/kaggle/scs_lgb/test.csv"
)
library(dplyr)
library(mlr)
library(parallelMap)
# Start a two-worker parallelMap socket cluster.
parallelStartSocket(2)
# Per-column summary of the raw training data, shown in the data viewer.
summarizeColumns(lgb_tr1) %>% View()
# Mean-impute missing values in integer and numeric columns. The training and
# test sets are imputed separately, each from its own columns.
imp_tr1 <- impute(
  obj = as.data.frame(lgb_tr1),
  classes = list(integer = imputeMean(), numeric = imputeMean())
)
imp_te1 <- impute(
  obj = as.data.frame(lgb_te1),
  classes = list(integer = imputeMean(), numeric = imputeMean())
)
# Per-column summary after imputation.
summarizeColumns(imp_tr1$data) %>% View()
# Class counts of the target in the raw training data.
table(lgb_tr1$TARGET)
# Drop constant columns from each imputed set independently.
lgb_tr2 <- removeConstantFeatures(imp_tr1$data)
lgb_te2 <- removeConstantFeatures(imp_te1$data)
# Columns that survived in BOTH sets, in training-column order. Plain character
# vectors are used on purpose: routing the names through data.frame() can turn
# them into factors on R < 4.0, and c(<factor>) then yields integer codes that
# would silently select the wrong columns.
shared_columns <- intersect(colnames(lgb_tr2), colnames(lgb_te2))
# The first shared column is dropped, as before.
# NOTE(review): presumably this is the row identifier -- confirm.
# `[-1]` also behaves when only one column is shared, unlike `2:n`.
shared_features <- shared_columns[-1]
# Keep the label aside so it can be re-attached as the LAST training column.
TARGET <- data.frame(TARGET = lgb_tr2$TARGET)
lgb_te2 <- lgb_te2[, shared_features, drop = FALSE]
lgb_tr2 <- cbind(lgb_tr2[, shared_features, drop = FALSE], TARGET)
library(lightgbm)
library(ggplot2)
# Coarse search over the positive-class weight multiplier on the data before
# feature selection, scored by 10-fold stratified CV AUC.
grid_search <- expand.grid(
  weight = seq(1, 30, 2)
)
lgb_rate_1 <- numeric(length = nrow(grid_search))
for (i in seq_len(nrow(grid_search))) {
  # Use the grid VALUE as the multiplier. The loop index (1..15) was used
  # before, so the AUCs plotted against weight = 1, 3, ..., 29 had really been
  # computed with multipliers 1, 2, ..., 15.
  pos_multiplier <- grid_search$weight[i]
  # Case weights TARGET * multiplier + 1, normalised to sum to 1.
  lgb_weight <- lgb_tr2$TARGET * pos_multiplier + 1
  lgb_weight <- lgb_weight / sum(lgb_weight)
  # NOTE(review): only the first 300 columns are used as features; confirm this
  # is intentional and that TARGET (appended last) lies beyond column 300.
  lgb_train <- lgb.Dataset(
    data = data.matrix(lgb_tr2[, 1:300]),
    label = lgb_tr2$TARGET,
    free_raw_data = FALSE,
    weight = lgb_weight
  )
  # Parameters
  params <- list(
    objective = 'binary',
    metric = 'auc'
  )
  # Cross-validation
  lgb_tr2_mod <- lgb.cv(
    params,
    data = lgb_train,
    nrounds = 300,
    stratified = TRUE,
    nfold = 10,
    learning_rate = .1,
    num_threads = 2,
    early_stopping_rounds = 10
  )
  # AUC of the last recorded CV iteration.
  # NOTE(review): with early stopping this is presumably not the best
  # iteration -- confirm whether the model's best score should be used instead.
  lgb_rate_1[i] <- tail(unlist(lgb_tr2_mod$record_evals$valid$auc$eval), 1)
}
grid_search$perf <- lgb_rate_1
ggplot(grid_search, aes(x = weight, y = perf)) +
  geom_point()
# Convert the label to a factor and wrap the data in an mlr classification task.
lgb_tr2$TARGET <- factor(lgb_tr2$TARGET)
lgb.task <- makeClassifTask(data = lgb_tr2, target = 'TARGET')
# Oversample at rate 9 (this calls mlr's oversample(), not smote(), despite the
# variable name).
lgb.task.smote <- oversample(lgb.task, rate = 9)
# Score every feature by information gain, timing the computation.
fv_time <- system.time(
  fv <- generateFilterValuesData(
    lgb.task.smote,
    method = 'information.gain'
  )
)
# Release the socket workers started with parallelStartSocket(): nothing later
# in the script goes through parallelMap (LightGBM uses its own num_threads),
# and the cluster was previously never shut down.
parallelStop()
library(ggvis)
# Plot the filter scores with both plotting helpers.
plotFilterValues(fv)
plotFilterValuesGGVIS(fv)
# Rank features by information gain (highest first) and attach each feature's
# cumulative share of the total gain.
fv_data2 <- arrange(fv$data, desc(information.gain))
fv_data2 <- mutate(
  fv_data2,
  info_gain_cul = cumsum(information.gain) / sum(information.gain)
)
# Keep the top-ranked features whose cumulative share does not exceed 99%.
fv_data2_filter <- filter(fv_data2, info_gain_cul <= 0.99)
dim(fv_data2_filter)
fv_feature <- fv_data2_filter$name
# Reduced training set (selected features, then TARGET last) and test set.
lgb_tr3 <- lgb_tr2[, c(fv_feature, 'TARGET')]
lgb_te3 <- lgb_te2[, fv_feature]
# Write both sets to disk, then read them back with rxImport().
write_csv(lgb_tr3, 'C:/users/Administrator/Documents/kaggle/scs_lgb/lgb_tr3.csv')
write_csv(lgb_te3, 'C:/users/Administrator/Documents/kaggle/scs_lgb/lgb_te3.csv')
lgb_tr <- rxImport('C:/Users/Administrator/Documents/kaggle/scs_lgb/lgb_tr3.csv')
lgb_te <- rxImport('C:/Users/Administrator/Documents/kaggle/scs_lgb/lgb_te3.csv')
# Search the positive-class weight multiplier (1..30) on the reduced feature
# set, scored by 10-fold stratified CV AUC.
grid_search <- expand.grid(weight = 1:30)
perf_weight_1 <- numeric(nrow(grid_search))
for (i in seq_len(nrow(grid_search))) {
  # The grid is 1:30, so the loop index equals the weight value of row i.
  # Case weights TARGET * i + 1, normalised to sum to 1.
  lgb_weight <- lgb_tr$TARGET * i + 1
  lgb_weight <- lgb_weight / sum(lgb_weight)
  # The Dataset is rebuilt on every pass because the weights change.
  lgb_train <- lgb.Dataset(
    data = data.matrix(lgb_tr[, 1:137]),
    label = lgb_tr$TARGET,
    weight = lgb_weight,
    free_raw_data = FALSE
  )
  # Parameters
  params <- list(objective = 'binary', metric = 'auc')
  # Cross-validation
  lgb_tr_mod <- lgb.cv(
    params = params,
    data = lgb_train,
    nrounds = 300,
    nfold = 10,
    stratified = TRUE,
    learning_rate = .1,
    num_threads = 2,
    early_stopping_rounds = 10
  )
  # Record the AUC of the last CV iteration that was evaluated.
  perf_weight_1[i] <- tail(unlist(lgb_tr_mod$record_evals$valid$auc$eval), 1)
}
grid_search$perf <- perf_weight_1
ggplot(data = grid_search, mapping = aes(x = weight, y = perf)) +
  geom_point() +
  geom_smooth()
# Search learning_rate over 2^-8 ... 2^-1, scored by 10-fold stratified CV AUC.
grid_search <- expand.grid(learning_rate = 2 ^ (-(8:1)))
perf_learning_rate_1 <- numeric(nrow(grid_search))
for (i in seq_len(nrow(grid_search))) {
  # Case weights TARGET * 2 + 1, normalised to sum to 1.
  lgb_weight <- lgb_tr$TARGET * 2 + 1
  lgb_weight <- lgb_weight / sum(lgb_weight)
  # A fresh Dataset is built for every grid row.
  lgb_train <- lgb.Dataset(
    data = data.matrix(lgb_tr[, 1:137]),
    label = lgb_tr$TARGET,
    weight = lgb_weight,
    free_raw_data = FALSE
  )
  # Fixed objective/metric followed by every column of grid row i
  # (here: learning_rate).
  params <- c(
    list(objective = 'binary', metric = 'auc'),
    as.list(grid_search[i, , drop = FALSE])
  )
  # Cross-validation
  lgb_tr_mod <- lgb.cv(
    params = params,
    data = lgb_train,
    nrounds = 300,
    nfold = 10,
    stratified = TRUE,
    num_threads = 2,
    early_stopping_rounds = 10
  )
  # Record the AUC of the last CV iteration that was evaluated.
  perf_learning_rate_1[i] <- tail(
    unlist(lgb_tr_mod$record_evals$valid$auc$eval), 1
  )
}
grid_search$perf <- perf_learning_rate_1
ggplot(data = grid_search, mapping = aes(x = learning_rate, y = perf)) +
  geom_point() +
  geom_smooth()
# Search num_leaves (50..1000, step 50) with learning_rate fixed at .125,
# scored by 10-fold stratified CV AUC.
grid_search <- expand.grid(
  learning_rate = .125,
  num_leaves = seq(50, 1000, 50)
)
perf_num_leaves_1 <- numeric(nrow(grid_search))
for (i in seq_len(nrow(grid_search))) {
  # Case weights TARGET * 2 + 1, normalised to sum to 1.
  lgb_weight <- lgb_tr$TARGET * 2 + 1
  lgb_weight <- lgb_weight / sum(lgb_weight)
  # A fresh Dataset is built for every grid row.
  lgb_train <- lgb.Dataset(
    data = data.matrix(lgb_tr[, 1:137]),
    label = lgb_tr$TARGET,
    weight = lgb_weight,
    free_raw_data = FALSE
  )
  # Fixed objective/metric followed by every column of grid row i
  # (learning_rate, num_leaves).
  params <- c(
    list(objective = 'binary', metric = 'auc'),
    as.list(grid_search[i, , drop = FALSE])
  )
  # Cross-validation
  lgb_tr_mod <- lgb.cv(
    params = params,
    data = lgb_train,
    nrounds = 300,
    nfold = 10,
    stratified = TRUE,
    num_threads = 2,
    early_stopping_rounds = 10
  )
  # Record the AUC of the last CV iteration that was evaluated.
  perf_num_leaves_1[i] <- tail(
    unlist(lgb_tr_mod$record_evals$valid$auc$eval), 1
  )
}
grid_search$perf <- perf_num_leaves_1
ggplot(data = grid_search, mapping = aes(x = num_leaves, y = perf)) +
  geom_point() +
  geom_smooth()
# Search min_data_in_leaf over 2^1 ... 2^7 with learning_rate and num_leaves
# fixed, scored by 10-fold stratified CV AUC.
grid_search <- expand.grid(
  learning_rate = .125,
  num_leaves = 600,
  min_data_in_leaf = 2 ^ (1:7)
)
perf_min_data_in_leaf_1 <- numeric(nrow(grid_search))
for (i in seq_len(nrow(grid_search))) {
  # Case weights TARGET * 2 + 1, normalised to sum to 1.
  lgb_weight <- lgb_tr$TARGET * 2 + 1
  lgb_weight <- lgb_weight / sum(lgb_weight)
  # A fresh Dataset is built for every grid row.
  lgb_train <- lgb.Dataset(
    data = data.matrix(lgb_tr[, 1:137]),
    label = lgb_tr$TARGET,
    weight = lgb_weight,
    free_raw_data = FALSE
  )
  # Fixed objective/metric followed by every column of grid row i
  # (learning_rate, num_leaves, min_data_in_leaf).
  params <- c(
    list(objective = 'binary', metric = 'auc'),
    as.list(grid_search[i, , drop = FALSE])
  )
  # Cross-validation
  lgb_tr_mod <- lgb.cv(
    params = params,
    data = lgb_train,
    nrounds = 300,
    nfold = 10,
    stratified = TRUE,
    num_threads = 2,
    early_stopping_rounds = 10
  )
  # Record the AUC of the last CV iteration that was evaluated.
  perf_min_data_in_leaf_1[i] <- tail(
    unlist(lgb_tr_mod$record_evals$valid$auc$eval), 1
  )
}
grid_search$perf <- perf_min_data_in_leaf_1
ggplot(data = grid_search, mapping = aes(x = min_data_in_leaf, y = perf)) +
  geom_point() +
  geom_smooth()
# Coarse search of max_bin over 2^5 ... 2^10 with learning_rate and num_leaves
# fixed, scored by 10-fold stratified CV AUC.
grid_search <- expand.grid(
  learning_rate = .125,
  num_leaves = 600,
  max_bin = 2 ^ (5:10)
)
perf_max_bin_1 <- numeric(nrow(grid_search))
for (i in seq_len(nrow(grid_search))) {
  # Case weights TARGET * 2 + 1, normalised to sum to 1.
  lgb_weight <- lgb_tr$TARGET * 2 + 1
  lgb_weight <- lgb_weight / sum(lgb_weight)
  # A fresh Dataset is built for every grid row.
  lgb_train <- lgb.Dataset(
    data = data.matrix(lgb_tr[, 1:137]),
    label = lgb_tr$TARGET,
    weight = lgb_weight,
    free_raw_data = FALSE
  )
  # Fixed objective/metric followed by every column of grid row i
  # (learning_rate, num_leaves, max_bin).
  params <- c(
    list(objective = 'binary', metric = 'auc'),
    as.list(grid_search[i, , drop = FALSE])
  )
  # Cross-validation
  lgb_tr_mod <- lgb.cv(
    params = params,
    data = lgb_train,
    nrounds = 300,
    nfold = 10,
    stratified = TRUE,
    num_threads = 2,
    early_stopping_rounds = 10
  )
  # Record the AUC of the last CV iteration that was evaluated.
  perf_max_bin_1[i] <- tail(unlist(lgb_tr_mod$record_evals$valid$auc$eval), 1)
}
grid_search$perf <- perf_max_bin_1
ggplot(data = grid_search, mapping = aes(x = max_bin, y = perf)) +
  geom_point() +
  geom_smooth()
# Finer search of max_bin over 30, 40, ..., 120 with learning_rate and
# num_leaves fixed, scored by 10-fold stratified CV AUC.
grid_search <- expand.grid(
  learning_rate = .125,
  num_leaves = 600,
  max_bin = 10 * (3:12)
)
perf_max_bin_2 <- numeric(nrow(grid_search))
for (i in seq_len(nrow(grid_search))) {
  # Case weights TARGET * 2 + 1, normalised to sum to 1.
  lgb_weight <- lgb_tr$TARGET * 2 + 1
  lgb_weight <- lgb_weight / sum(lgb_weight)
  # A fresh Dataset is built for every grid row.
  lgb_train <- lgb.Dataset(
    data = data.matrix(lgb_tr[, 1:137]),
    label = lgb_tr$TARGET,
    weight = lgb_weight,
    free_raw_data = FALSE
  )
  # Fixed objective/metric followed by every column of grid row i
  # (learning_rate, num_leaves, max_bin).
  params <- c(
    list(objective = 'binary', metric = 'auc'),
    as.list(grid_search[i, , drop = FALSE])
  )
  # Cross-validation
  lgb_tr_mod <- lgb.cv(
    params = params,
    data = lgb_train,
    nrounds = 300,
    nfold = 10,
    stratified = TRUE,
    num_threads = 2,
    early_stopping_rounds = 10
  )
  # Record the AUC of the last CV iteration that was evaluated.
  perf_max_bin_2[i] <- tail(unlist(lgb_tr_mod$record_evals$valid$auc$eval), 1)
}
grid_search$perf <- perf_max_bin_2
ggplot(data = grid_search, mapping = aes(x = max_bin, y = perf)) +
  geom_point() +
  geom_smooth()
# Search min_data_in_bin over 2^1 ... 2^9 with the earlier parameters fixed,
# scored by 10-fold stratified CV AUC.
grid_search <- expand.grid(
  learning_rate = .125, num_leaves = 600, max_bin = 30,
  min_data_in_bin = 2 ^ (1:9)
)
perf_min_data_in_bin_1 <- numeric(nrow(grid_search))
for (i in seq_len(nrow(grid_search))) {
  # Case weights TARGET * 2 + 1, normalised to sum to 1.
  lgb_weight <- lgb_tr$TARGET * 2 + 1
  lgb_weight <- lgb_weight / sum(lgb_weight)
  # A fresh Dataset is built for every grid row.
  lgb_train <- lgb.Dataset(
    data = data.matrix(lgb_tr[, 1:137]),
    label = lgb_tr$TARGET,
    weight = lgb_weight,
    free_raw_data = FALSE
  )
  # Fixed objective/metric followed by every column of grid row i
  # (learning_rate, num_leaves, max_bin, min_data_in_bin).
  params <- c(
    list(objective = 'binary', metric = 'auc'),
    as.list(grid_search[i, , drop = FALSE])
  )
  # Cross-validation
  lgb_tr_mod <- lgb.cv(
    params = params,
    data = lgb_train,
    nrounds = 300,
    nfold = 10,
    stratified = TRUE,
    num_threads = 2,
    early_stopping_rounds = 10
  )
  # Record the AUC of the last CV iteration that was evaluated.
  perf_min_data_in_bin_1[i] <- tail(
    unlist(lgb_tr_mod$record_evals$valid$auc$eval), 1
  )
}
grid_search$perf <- perf_min_data_in_bin_1
ggplot(data = grid_search, mapping = aes(x = min_data_in_bin, y = perf)) +
  geom_point() +
  geom_smooth()
# Search feature_fraction over .50, .52, ..., 1 with the earlier parameters
# fixed, scored by 10-fold stratified CV AUC.
grid_search <- expand.grid(
  learning_rate = .125, num_leaves = 600, max_bin = 30, min_data_in_bin = 64,
  feature_fraction = seq(.5, 1, .02)
)
perf_feature_fraction_1 <- numeric(nrow(grid_search))
for (i in seq_len(nrow(grid_search))) {
  # Case weights TARGET * 2 + 1, normalised to sum to 1.
  lgb_weight <- lgb_tr$TARGET * 2 + 1
  lgb_weight <- lgb_weight / sum(lgb_weight)
  # A fresh Dataset is built for every grid row.
  lgb_train <- lgb.Dataset(
    data = data.matrix(lgb_tr[, 1:137]),
    label = lgb_tr$TARGET,
    weight = lgb_weight,
    free_raw_data = FALSE
  )
  # Fixed objective/metric followed by every column of grid row i
  # (learning_rate, num_leaves, max_bin, min_data_in_bin, feature_fraction).
  params <- c(
    list(objective = 'binary', metric = 'auc'),
    as.list(grid_search[i, , drop = FALSE])
  )
  # Cross-validation
  lgb_tr_mod <- lgb.cv(
    params = params,
    data = lgb_train,
    nrounds = 300,
    nfold = 10,
    stratified = TRUE,
    num_threads = 2,
    early_stopping_rounds = 10
  )
  # Record the AUC of the last CV iteration that was evaluated.
  perf_feature_fraction_1[i] <- tail(
    unlist(lgb_tr_mod$record_evals$valid$auc$eval), 1
  )
}
grid_search$perf <- perf_feature_fraction_1
ggplot(data = grid_search, mapping = aes(x = feature_fraction, y = perf)) +
  geom_point() +
  geom_smooth()
# Search min_sum_hessian over 0, .001, ..., .02 with the earlier parameters
# fixed, scored by 10-fold stratified CV AUC.
grid_search <- expand.grid(
  learning_rate = .125, num_leaves = 600, max_bin = 30, min_data_in_bin = 64,
  feature_fraction = .64,
  min_sum_hessian = seq(0, .02, .001)
)
perf_min_sum_hessian_1 <- numeric(nrow(grid_search))
for (i in seq_len(nrow(grid_search))) {
  # Case weights TARGET * 2 + 1, normalised to sum to 1.
  lgb_weight <- lgb_tr$TARGET * 2 + 1
  lgb_weight <- lgb_weight / sum(lgb_weight)
  # A fresh Dataset is built for every grid row.
  lgb_train <- lgb.Dataset(
    data = data.matrix(lgb_tr[, 1:137]),
    label = lgb_tr$TARGET,
    weight = lgb_weight,
    free_raw_data = FALSE
  )
  # Fixed objective/metric followed by every column of grid row i
  # (learning_rate ... feature_fraction, min_sum_hessian).
  params <- c(
    list(objective = 'binary', metric = 'auc'),
    as.list(grid_search[i, , drop = FALSE])
  )
  # Cross-validation
  lgb_tr_mod <- lgb.cv(
    params = params,
    data = lgb_train,
    nrounds = 300,
    nfold = 10,
    stratified = TRUE,
    num_threads = 2,
    early_stopping_rounds = 10
  )
  # Record the AUC of the last CV iteration that was evaluated.
  perf_min_sum_hessian_1[i] <- tail(
    unlist(lgb_tr_mod$record_evals$valid$auc$eval), 1
  )
}
grid_search$perf <- perf_min_sum_hessian_1
ggplot(data = grid_search, mapping = aes(x = min_sum_hessian, y = perf)) +
  geom_point() +
  geom_smooth()
# Joint search of lambda_l1 x lambda_l2 (each 0, .002, ..., .01; 36 pairs)
# with the earlier parameters fixed, scored by 10-fold stratified CV AUC.
grid_search <- expand.grid(
  learning_rate = .125, num_leaves = 600, max_bin = 30, min_data_in_bin = 64,
  feature_fraction = .64, min_sum_hessian = .004,
  lambda_l1 = seq(0, .01, .002),
  lambda_l2 = seq(0, .01, .002)
)
perf_lamda_1 <- numeric(nrow(grid_search))
for (i in seq_len(nrow(grid_search))) {
  # Case weights TARGET * 2 + 1, normalised to sum to 1.
  lgb_weight <- lgb_tr$TARGET * 2 + 1
  lgb_weight <- lgb_weight / sum(lgb_weight)
  # A fresh Dataset is built for every grid row.
  lgb_train <- lgb.Dataset(
    data = data.matrix(lgb_tr[, 1:137]),
    label = lgb_tr$TARGET,
    weight = lgb_weight,
    free_raw_data = FALSE
  )
  # Fixed objective/metric followed by every column of grid row i
  # (learning_rate ... min_sum_hessian, lambda_l1, lambda_l2).
  params <- c(
    list(objective = 'binary', metric = 'auc'),
    as.list(grid_search[i, , drop = FALSE])
  )
  # Cross-validation
  lgb_tr_mod <- lgb.cv(
    params = params,
    data = lgb_train,
    nrounds = 300,
    nfold = 10,
    stratified = TRUE,
    num_threads = 2,
    early_stopping_rounds = 10
  )
  # Record the AUC of the last CV iteration that was evaluated.
  perf_lamda_1[i] <- tail(unlist(lgb_tr_mod$record_evals$valid$auc$eval), 1)
}
grid_search$perf <- perf_lamda_1
# One panel per lambda_l2 value, AUC against lambda_l1 within each panel.
ggplot(data = grid_search, mapping = aes(x = lambda_l1, y = perf)) +
  geom_point() +
  facet_wrap(~ lambda_l2, nrow = 5)
# Search drop_rate over 0, .1, ..., 1 with the earlier parameters fixed,
# scored by 10-fold stratified CV AUC.
# NOTE(review): LightGBM documents drop_rate as a DART-only parameter and
# `params` never sets `boosting` -- confirm this search has any effect.
grid_search <- expand.grid(
  learning_rate = .125, num_leaves = 600, max_bin = 30, min_data_in_bin = 64,
  feature_fraction = .64, min_sum_hessian = .004,
  lambda_l1 = .002, lambda_l2 = .008,
  drop_rate = seq(0, 1, .1)
)
perf_drop_rate_1 <- numeric(nrow(grid_search))
for (i in seq_len(nrow(grid_search))) {
  # Case weights TARGET * 2 + 1, normalised to sum to 1.
  lgb_weight <- lgb_tr$TARGET * 2 + 1
  lgb_weight <- lgb_weight / sum(lgb_weight)
  # A fresh Dataset is built for every grid row.
  lgb_train <- lgb.Dataset(
    data = data.matrix(lgb_tr[, 1:137]),
    label = lgb_tr$TARGET,
    weight = lgb_weight,
    free_raw_data = FALSE
  )
  # Fixed objective/metric followed by every column of grid row i
  # (learning_rate ... lambda_l2, drop_rate).
  params <- c(
    list(objective = 'binary', metric = 'auc'),
    as.list(grid_search[i, , drop = FALSE])
  )
  # Cross-validation
  lgb_tr_mod <- lgb.cv(
    params = params,
    data = lgb_train,
    nrounds = 300,
    nfold = 10,
    stratified = TRUE,
    num_threads = 2,
    early_stopping_rounds = 10
  )
  # Record the AUC of the last CV iteration that was evaluated.
  perf_drop_rate_1[i] <- tail(
    unlist(lgb_tr_mod$record_evals$valid$auc$eval), 1
  )
}
grid_search$perf <- perf_drop_rate_1
ggplot(data = grid_search, mapping = aes(x = drop_rate, y = perf)) +
  geom_point() +
  geom_smooth()
# Search max_drop over 1, 3, ..., 9 with the earlier parameters fixed,
# scored by 10-fold stratified CV AUC.
# NOTE(review): LightGBM documents max_drop as a DART-only parameter and
# `params` never sets `boosting` -- confirm this search has any effect.
grid_search <- expand.grid(
  learning_rate = .125, num_leaves = 600, max_bin = 30, min_data_in_bin = 64,
  feature_fraction = .64, min_sum_hessian = .004,
  lambda_l1 = .002, lambda_l2 = .008, drop_rate = .3,
  max_drop = seq(1, 10, 2)
)
perf_max_drop_1 <- numeric(nrow(grid_search))
for (i in seq_len(nrow(grid_search))) {
  # Case weights TARGET * 2 + 1, normalised to sum to 1.
  lgb_weight <- lgb_tr$TARGET * 2 + 1
  lgb_weight <- lgb_weight / sum(lgb_weight)
  # A fresh Dataset is built for every grid row.
  lgb_train <- lgb.Dataset(
    data = data.matrix(lgb_tr[, 1:137]),
    label = lgb_tr$TARGET,
    weight = lgb_weight,
    free_raw_data = FALSE
  )
  # Fixed objective/metric followed by every column of grid row i
  # (learning_rate ... drop_rate, max_drop).
  params <- c(
    list(objective = 'binary', metric = 'auc'),
    as.list(grid_search[i, , drop = FALSE])
  )
  # Cross-validation
  lgb_tr_mod <- lgb.cv(
    params = params,
    data = lgb_train,
    nrounds = 300,
    nfold = 10,
    stratified = TRUE,
    num_threads = 2,
    early_stopping_rounds = 10
  )
  # Record the AUC of the last CV iteration that was evaluated.
  perf_max_drop_1[i] <- tail(unlist(lgb_tr_mod$record_evals$valid$auc$eval), 1)
}
grid_search$perf <- perf_max_drop_1
ggplot(data = grid_search, mapping = aes(x = max_drop, y = perf)) +
  geom_point() +
  geom_smooth()
# Second pass over the positive-class weight multiplier (1..20), with all other
# parameters fixed at the values chosen so far; scored by 10-fold stratified
# CV AUC.
grid_search <- expand.grid(
  learning_rate = .125,
  num_leaves = 600,
  max_bin = 30,
  min_data_in_bin = 64,
  feature_fraction = .64,
  min_sum_hessian = .004,
  lambda_l1 = .002,
  lambda_l2 = .008,
  drop_rate = .3,
  max_drop = 5
)
# The multipliers actually evaluated. The result vector is sized from this
# grid; it was previously sized from the one-row parameter grid and silently
# grown inside the loop.
weight_grid <- 1:20
perf_weight_2 <- numeric(length = length(weight_grid))
for (i in seq_along(weight_grid)) {
  # Case weights TARGET * multiplier + 1, normalised to sum to 1.
  lgb_weight <- lgb_tr$TARGET * weight_grid[i] + 1
  lgb_weight <- lgb_weight / sum(lgb_weight)
  lgb_train <- lgb.Dataset(
    data = data.matrix(lgb_tr[, 1:137]),
    label = lgb_tr$TARGET,
    free_raw_data = FALSE,
    weight = lgb_weight
  )
  # Fixed objective/metric plus the single row of tuned parameters.
  params <- c(
    list(objective = 'binary', metric = 'auc'),
    as.list(grid_search[1, , drop = FALSE])
  )
  # Cross-validation. The extra `learning_rate = .1` argument that used to be
  # passed here conflicted with learning_rate = .125 in `params`; it is removed
  # so the tuned value is the only one supplied.
  lgb_tr_mod <- lgb.cv(
    params,
    data = lgb_train,
    nrounds = 300,
    stratified = TRUE,
    nfold = 10,
    num_threads = 2,
    early_stopping_rounds = 10
  )
  # Record the AUC of the last CV iteration that was evaluated.
  perf_weight_2[i] <- tail(unlist(lgb_tr_mod$record_evals$valid$auc$eval), 1)
}
ggplot(
  data.frame(num = weight_grid, perf = perf_weight_2),
  aes(x = num, y = perf)
) +
  geom_point() +
  geom_smooth()
# Second pass: search learning_rate over .05, .06, ..., .5 with the other
# parameters fixed, scored by 10-fold stratified CV AUC.
grid_search <- expand.grid(
  learning_rate = seq(.05, .5, .01),
  num_leaves = 600, max_bin = 30, min_data_in_bin = 64,
  feature_fraction = .64, min_sum_hessian = .004,
  lambda_l1 = .002, lambda_l2 = .008, drop_rate = .3, max_drop = 5
)
perf_learning_rate_1 <- numeric(nrow(grid_search))
for (i in seq_len(nrow(grid_search))) {
  # Case weights TARGET * 8 + 1, normalised to sum to 1.
  lgb_weight <- lgb_tr$TARGET * 8 + 1
  lgb_weight <- lgb_weight / sum(lgb_weight)
  # A fresh Dataset is built for every grid row.
  lgb_train <- lgb.Dataset(
    data = data.matrix(lgb_tr[, 1:137]),
    label = lgb_tr$TARGET,
    weight = lgb_weight,
    free_raw_data = FALSE
  )
  # Fixed objective/metric followed by all ten columns of grid row i
  # (learning_rate ... max_drop, in grid order).
  params <- c(
    list(objective = 'binary', metric = 'auc'),
    as.list(grid_search[i, , drop = FALSE])
  )
  # Cross-validation
  lgb_tr_mod <- lgb.cv(
    params = params,
    data = lgb_train,
    nrounds = 300,
    nfold = 10,
    stratified = TRUE,
    num_threads = 2,
    early_stopping_rounds = 10
  )
  # Record the AUC of the last CV iteration that was evaluated.
  perf_learning_rate_1[i] <- tail(
    unlist(lgb_tr_mod$record_evals$valid$auc$eval), 1
  )
}
grid_search$perf <- perf_learning_rate_1
ggplot(data = grid_search, mapping = aes(x = learning_rate, y = perf)) +
  geom_point() +
  geom_smooth()
# Second pass: search num_leaves over 50, 100, ..., 800 with the other
# parameters fixed, scored by 10-fold stratified CV AUC.
grid_search <- expand.grid(
  learning_rate = .2,
  num_leaves = seq(50, 800, 50),
  max_bin = 30, min_data_in_bin = 64,
  feature_fraction = .64, min_sum_hessian = .004,
  lambda_l1 = .002, lambda_l2 = .008, drop_rate = .3, max_drop = 5
)
perf_num_leaves_1 <- numeric(nrow(grid_search))
for (i in seq_len(nrow(grid_search))) {
  # Case weights TARGET * 8 + 1, normalised to sum to 1.
  lgb_weight <- lgb_tr$TARGET * 8 + 1
  lgb_weight <- lgb_weight / sum(lgb_weight)
  # A fresh Dataset is built for every grid row.
  lgb_train <- lgb.Dataset(
    data = data.matrix(lgb_tr[, 1:137]),
    label = lgb_tr$TARGET,
    weight = lgb_weight,
    free_raw_data = FALSE
  )
  # Fixed objective/metric followed by all ten columns of grid row i
  # (learning_rate ... max_drop, in grid order).
  params <- c(
    list(objective = 'binary', metric = 'auc'),
    as.list(grid_search[i, , drop = FALSE])
  )
  # Cross-validation
  lgb_tr_mod <- lgb.cv(
    params = params,
    data = lgb_train,
    nrounds = 300,
    nfold = 10,
    stratified = TRUE,
    num_threads = 2,
    early_stopping_rounds = 10
  )
  # Record the AUC of the last CV iteration that was evaluated.
  perf_num_leaves_1[i] <- tail(
    unlist(lgb_tr_mod$record_evals$valid$auc$eval), 1
  )
}
grid_search$perf <- perf_num_leaves_1
ggplot(data = grid_search, mapping = aes(x = num_leaves, y = perf)) +
  geom_point() +
  geom_smooth()
# Second pass: search max_bin over 30, 40, ..., 150 with the other parameters
# fixed, scored by 10-fold stratified CV AUC.
grid_search <- expand.grid(
  learning_rate = .2, num_leaves = 300,
  max_bin = seq(30, 150, 10),
  min_data_in_bin = 64,
  feature_fraction = .64, min_sum_hessian = .004,
  lambda_l1 = .002, lambda_l2 = .008, drop_rate = .3, max_drop = 5
)
perf_max_bin_1 <- numeric(nrow(grid_search))
for (i in seq_len(nrow(grid_search))) {
  # Case weights TARGET * 8 + 1, normalised to sum to 1.
  lgb_weight <- lgb_tr$TARGET * 8 + 1
  lgb_weight <- lgb_weight / sum(lgb_weight)
  # A fresh Dataset is built for every grid row.
  lgb_train <- lgb.Dataset(
    data = data.matrix(lgb_tr[, 1:137]),
    label = lgb_tr$TARGET,
    weight = lgb_weight,
    free_raw_data = FALSE
  )
  # Fixed objective/metric followed by all ten columns of grid row i
  # (learning_rate ... max_drop, in grid order).
  params <- c(
    list(objective = 'binary', metric = 'auc'),
    as.list(grid_search[i, , drop = FALSE])
  )
  # Cross-validation
  lgb_tr_mod <- lgb.cv(
    params = params,
    data = lgb_train,
    nrounds = 300,
    nfold = 10,
    stratified = TRUE,
    num_threads = 2,
    early_stopping_rounds = 10
  )
  # Record the AUC of the last CV iteration that was evaluated.
  perf_max_bin_1[i] <- tail(unlist(lgb_tr_mod$record_evals$valid$auc$eval), 1)
}
grid_search$perf <- perf_max_bin_1
ggplot(data = grid_search, mapping = aes(x = max_bin, y = perf)) +
  geom_point() +
  geom_smooth()
# Second pass: search min_data_in_bin over 20, 25, ..., 100 with the other
# parameters fixed, scored by 10-fold stratified CV AUC.
grid_search <- expand.grid(
  learning_rate = .2, num_leaves = 300, max_bin = 120,
  min_data_in_bin = seq(20, 100, 5),
  feature_fraction = .64, min_sum_hessian = .004,
  lambda_l1 = .002, lambda_l2 = .008, drop_rate = .3, max_drop = 5
)
perf_min_data_in_bin_1 <- numeric(nrow(grid_search))
for (i in seq_len(nrow(grid_search))) {
  # Case weights TARGET * 8 + 1, normalised to sum to 1.
  lgb_weight <- lgb_tr$TARGET * 8 + 1
  lgb_weight <- lgb_weight / sum(lgb_weight)
  # A fresh Dataset is built for every grid row.
  lgb_train <- lgb.Dataset(
    data = data.matrix(lgb_tr[, 1:137]),
    label = lgb_tr$TARGET,
    weight = lgb_weight,
    free_raw_data = FALSE
  )
  # Fixed objective/metric followed by all ten columns of grid row i
  # (learning_rate ... max_drop, in grid order).
  params <- c(
    list(objective = 'binary', metric = 'auc'),
    as.list(grid_search[i, , drop = FALSE])
  )
  # Cross-validation
  lgb_tr_mod <- lgb.cv(
    params = params,
    data = lgb_train,
    nrounds = 300,
    nfold = 10,
    stratified = TRUE,
    num_threads = 2,
    early_stopping_rounds = 10
  )
  # Record the AUC of the last CV iteration that was evaluated.
  perf_min_data_in_bin_1[i] <- tail(
    unlist(lgb_tr_mod$record_evals$valid$auc$eval), 1
  )
}
grid_search$perf <- perf_min_data_in_bin_1
ggplot(data = grid_search, mapping = aes(x = min_data_in_bin, y = perf)) +
  geom_point() +
  geom_smooth()
# Second pass, feature_fraction step. Every grid column holds one value here
# (feature_fraction = .5), so exactly one configuration is cross-validated.
grid_search <- expand.grid(
  learning_rate = .2, num_leaves = 300, max_bin = 120, min_data_in_bin = 20,
  feature_fraction = .5,
  min_sum_hessian = .004,
  lambda_l1 = .002, lambda_l2 = .008, drop_rate = .3, max_drop = 5
)
perf_feature_fraction_1 <- numeric(nrow(grid_search))
for (i in seq_len(nrow(grid_search))) {
  # Case weights TARGET * 8 + 1, normalised to sum to 1.
  lgb_weight <- lgb_tr$TARGET * 8 + 1
  lgb_weight <- lgb_weight / sum(lgb_weight)
  # A fresh Dataset is built for every grid row.
  lgb_train <- lgb.Dataset(
    data = data.matrix(lgb_tr[, 1:137]),
    label = lgb_tr$TARGET,
    weight = lgb_weight,
    free_raw_data = FALSE
  )
  # Fixed objective/metric followed by all ten columns of grid row i
  # (learning_rate ... max_drop, in grid order).
  params <- c(
    list(objective = 'binary', metric = 'auc'),
    as.list(grid_search[i, , drop = FALSE])
  )
  # Cross-validation
  lgb_tr_mod <- lgb.cv(
    params = params,
    data = lgb_train,
    nrounds = 300,
    nfold = 10,
    stratified = TRUE,
    num_threads = 2,
    early_stopping_rounds = 10
  )
  # Record the AUC of the last CV iteration that was evaluated.
  perf_feature_fraction_1[i] <- tail(
    unlist(lgb_tr_mod$record_evals$valid$auc$eval), 1
  )
}
grid_search$perf <- perf_feature_fraction_1
ggplot(data = grid_search, mapping = aes(x = feature_fraction, y = perf)) +
  geom_point() +
  geom_smooth()
# Second pass, min_sum_hessian step. Every grid column holds one value here
# (min_sum_hessian = 0), so exactly one configuration is cross-validated.
grid_search <- expand.grid(
  learning_rate = .2, num_leaves = 300, max_bin = 120, min_data_in_bin = 20,
  feature_fraction = .5,
  min_sum_hessian = 0,
  lambda_l1 = .002, lambda_l2 = .008, drop_rate = .3, max_drop = 5
)
perf_min_sum_hessian_1 <- numeric(nrow(grid_search))
for (i in seq_len(nrow(grid_search))) {
  # Case weights TARGET * 8 + 1, normalised to sum to 1.
  lgb_weight <- lgb_tr$TARGET * 8 + 1
  lgb_weight <- lgb_weight / sum(lgb_weight)
  # A fresh Dataset is built for every grid row.
  lgb_train <- lgb.Dataset(
    data = data.matrix(lgb_tr[, 1:137]),
    label = lgb_tr$TARGET,
    weight = lgb_weight,
    free_raw_data = FALSE
  )
  # Fixed objective/metric followed by all ten columns of grid row i
  # (learning_rate ... max_drop, in grid order).
  params <- c(
    list(objective = 'binary', metric = 'auc'),
    as.list(grid_search[i, , drop = FALSE])
  )
  # Cross-validation
  lgb_tr_mod <- lgb.cv(
    params = params,
    data = lgb_train,
    nrounds = 300,
    nfold = 10,
    stratified = TRUE,
    num_threads = 2,
    early_stopping_rounds = 10
  )
  # Record the AUC of the last CV iteration that was evaluated.
  perf_min_sum_hessian_1[i] <- tail(
    unlist(lgb_tr_mod$record_evals$valid$auc$eval), 1
  )
}
grid_search$perf <- perf_min_sum_hessian_1
ggplot(data = grid_search, mapping = aes(x = min_sum_hessian, y = perf)) +
  geom_point() +
  geom_smooth()
# Second pass: joint search of lambda_l1 x lambda_l2 (each 0, .002, ..., .01;
# 36 pairs) with the other parameters fixed, scored by 10-fold stratified
# CV AUC.
grid_search <- expand.grid(
  learning_rate = .2,
  num_leaves = 300,
  max_bin = 120,
  min_data_in_bin = 20,
  feature_fraction = .5,
  min_sum_hessian = 0,
  lambda_l1 = seq(0, .01, .002),
  lambda_l2 = seq(0, .01, .002),
  drop_rate = .3,
  max_drop = 5
)
perf_lambda_1 <- numeric(length = nrow(grid_search))
for (i in seq_len(nrow(grid_search))) {
  # Case weights TARGET * 8 + 1, normalised to sum to 1.
  lgb_weight <- (lgb_tr$TARGET * 8 + 1) / sum(lgb_tr$TARGET * 8 + 1)
  lgb_train <- lgb.Dataset(
    data = data.matrix(lgb_tr[, 1:137]),
    label = lgb_tr$TARGET,
    free_raw_data = FALSE,
    weight = lgb_weight
  )
  # Parameters
  params <- list(
    objective = 'binary',
    metric = 'auc',
    learning_rate = grid_search[i, 'learning_rate'],
    num_leaves = grid_search[i, 'num_leaves'],
    max_bin = grid_search[i, 'max_bin'],
    min_data_in_bin = grid_search[i, 'min_data_in_bin'],
    feature_fraction = grid_search[i, 'feature_fraction'],
    min_sum_hessian = grid_search[i, 'min_sum_hessian'],
    lambda_l1 = grid_search[i, 'lambda_l1'],
    lambda_l2 = grid_search[i, 'lambda_l2'],
    drop_rate = grid_search[i, 'drop_rate'],
    max_drop = grid_search[i, 'max_drop']
  )
  # Cross-validation
  lgb_tr_mod <- lgb.cv(
    params,
    data = lgb_train,
    nrounds = 300,
    stratified = TRUE,
    nfold = 10,
    num_threads = 2,
    early_stopping_rounds = 10
  )
  # Record the AUC of the last CV iteration that was evaluated.
  perf_lambda_1[i] <- tail(unlist(lgb_tr_mod$record_evals$valid$auc$eval), 1)
}
# Attach THIS search's results. The misspelled `perf_lamda_1` used before is
# the result vector of the first-pass lambda search (same length), so the plot
# silently showed stale scores.
grid_search$perf <- perf_lambda_1
# One panel per lambda_l2 value, AUC against lambda_l1 within each panel.
ggplot(data = grid_search, aes(x = lambda_l1, y = perf)) +
  geom_point() +
  facet_wrap(~ lambda_l2, nrow = 5)
# Second pass: search drop_rate over 0, .05, ..., .5 with the other parameters
# fixed, scored by 10-fold stratified CV AUC.
# NOTE(review): LightGBM documents drop_rate as a DART-only parameter and
# `params` never sets `boosting` -- confirm this search has any effect.
grid_search <- expand.grid(
  learning_rate = .2, num_leaves = 300, max_bin = 120, min_data_in_bin = 20,
  feature_fraction = .5, min_sum_hessian = 0,
  lambda_l1 = .002, lambda_l2 = .01,
  drop_rate = seq(0, .5, .05),
  max_drop = 5
)
perf_drop_rate_1 <- numeric(nrow(grid_search))
for (i in seq_len(nrow(grid_search))) {
  # Case weights TARGET * 8 + 1, normalised to sum to 1.
  lgb_weight <- lgb_tr$TARGET * 8 + 1
  lgb_weight <- lgb_weight / sum(lgb_weight)
  # A fresh Dataset is built for every grid row.
  lgb_train <- lgb.Dataset(
    data = data.matrix(lgb_tr[, 1:137]),
    label = lgb_tr$TARGET,
    weight = lgb_weight,
    free_raw_data = FALSE
  )
  # Fixed objective/metric followed by all ten columns of grid row i
  # (learning_rate ... max_drop, in grid order).
  params <- c(
    list(objective = 'binary', metric = 'auc'),
    as.list(grid_search[i, , drop = FALSE])
  )
  # Cross-validation
  lgb_tr_mod <- lgb.cv(
    params = params,
    data = lgb_train,
    nrounds = 300,
    nfold = 10,
    stratified = TRUE,
    num_threads = 2,
    early_stopping_rounds = 10
  )
  # Record the AUC of the last CV iteration that was evaluated.
  perf_drop_rate_1[i] <- tail(
    unlist(lgb_tr_mod$record_evals$valid$auc$eval), 1
  )
}
grid_search$perf <- perf_drop_rate_1
ggplot(data = grid_search, mapping = aes(x = drop_rate, y = perf)) +
  geom_point()
# Second pass: search max_drop over 19, 21, ..., 29 with the other parameters
# fixed, scored by 10-fold stratified CV AUC.
# NOTE(review): LightGBM documents max_drop as a DART-only parameter and
# `params` never sets `boosting` -- confirm this search has any effect.
grid_search <- expand.grid(
  learning_rate = .2, num_leaves = 300, max_bin = 120, min_data_in_bin = 20,
  feature_fraction = .5, min_sum_hessian = 0,
  lambda_l1 = .002, lambda_l2 = .01, drop_rate = .3,
  max_drop = seq(19, 29, 2)
)
perf_max_drop_1 <- numeric(nrow(grid_search))
for (i in seq_len(nrow(grid_search))) {
  # Case weights TARGET * 8 + 1, normalised to sum to 1.
  lgb_weight <- lgb_tr$TARGET * 8 + 1
  lgb_weight <- lgb_weight / sum(lgb_weight)
  # A fresh Dataset is built for every grid row.
  lgb_train <- lgb.Dataset(
    data = data.matrix(lgb_tr[, 1:137]),
    label = lgb_tr$TARGET,
    weight = lgb_weight,
    free_raw_data = FALSE
  )
  # Fixed objective/metric followed by all ten columns of grid row i
  # (learning_rate ... max_drop, in grid order).
  params <- c(
    list(objective = 'binary', metric = 'auc'),
    as.list(grid_search[i, , drop = FALSE])
  )
  # Cross-validation
  lgb_tr_mod <- lgb.cv(
    params = params,
    data = lgb_train,
    nrounds = 300,
    nfold = 10,
    stratified = TRUE,
    num_threads = 2,
    early_stopping_rounds = 10
  )
  # Record the AUC of the last CV iteration that was evaluated.
  perf_max_drop_1[i] <- tail(unlist(lgb_tr_mod$record_evals$valid$auc$eval), 1)
}
grid_search$perf <- perf_max_drop_1
ggplot(data = grid_search, mapping = aes(x = max_drop, y = perf)) +
  geom_point()
# Ensemble: draw random parameter combinations around the tuned values, fit one
# LightGBM model per combination, and average the predicted probabilities.
set.seed(1)
grid_search <- expand.grid(
  # Divided by 1000 so the learning rate lies in 0.115-0.125. The previous
  # `/ 100` produced 1.15-1.25, far outside the 0.05-0.5 range explored during
  # tuning. NOTE(review): confirm the intended centre of this range.
  learning_rate = sample(115:125, 10, replace = FALSE) / 1000,
  num_leaves = sample(250:350, 10, replace = FALSE),
  max_bin = sample(115:125, 5, replace = FALSE),
  min_data_in_bin = sample(18:22, replace = FALSE),
  feature_fraction = c(.5, .62),
  min_sum_hessian = 0,
  lambda_l1 = .002,
  lambda_l2 = c(.008, .009, .01),
  drop_rate = sample(126:134, 4, replace = FALSE) / 1000,
  max_drop = c(23, 27, 29)
)
# Number of models in the ensemble; used for sampling and for sizing the
# prediction list, replacing the hard-coded 100 / `ncol = 100`.
n_models <- 100
sample_ind <- sample(nrow(grid_search), n_models, replace = FALSE)
grid_search2 <- grid_search[sample_ind, ]
rm(grid_search)
# Case weights TARGET * 8 + 1, normalised to sum to 1.
lgb_weight <- (lgb_tr$TARGET * 8 + 1) / sum(lgb_tr$TARGET * 8 + 1)
# The test matrix is the same for every model, so it is converted once.
lgb_te_matrix <- data.matrix(lgb_te)
lgb.pred <- vector("list", nrow(grid_search2))
for (i in seq_len(nrow(grid_search2))) {
  # Build a fresh Dataset per model, as the tuning loops do: max_bin and
  # min_data_in_bin vary between rows, and LightGBM lists both as Dataset
  # parameters, so a Dataset shared across models may keep the binning of the
  # first model.
  lgb_train <- lgb.Dataset(
    data = data.matrix(lgb_tr[, 1:137]),
    label = lgb_tr$TARGET,
    free_raw_data = FALSE,
    weight = lgb_weight
  )
  # Fixed objective/metric followed by all ten sampled parameters of row i
  # (learning_rate ... max_drop, in grid order).
  params <- c(
    list(objective = 'binary', metric = 'auc'),
    as.list(grid_search2[i, , drop = FALSE])
  )
  # Model
  # NOTE(review): no validation set is passed, so what early_stopping_rounds
  # monitors here is unclear -- confirm.
  lgb_mod <- lightgbm(
    params = params,
    data = lgb_train,
    nrounds = 300,
    early_stopping_rounds = 10,
    num_threads = 2
  )
  # Prediction
  lgb.pred[[i]] <- predict(lgb_mod, lgb_te_matrix)
}
# One column per model, one row per test observation; average across models.
lgb.pred2 <- do.call(cbind, lgb.pred)
lgb.pred3 <- data.frame(prob1 = rowMeans(lgb.pred2))
write.csv(lgb.pred3, "C:/Users/Administrator/Documents/kaggle/scs_lgb/lgb.pred1.csv")