# Refer to the Kaggle KKBox churn prediction challenge for the problem description and data.
library(data.table)
library(Matrix)
library(randomForest)
library(dplyr)
library(pROC)
# Fix the RNG seed so that later random steps (e.g. the random forest fit)
# are repeatable between runs.
set.seed(23567)
# Directory holding the input CSV files. The trailing slash is required
# because file names are appended with paste0().
PATH <- "E:/kkbox/datav2/"

# Read one comma-separated file located under PATH with data.table::fread.
# Empty fields are read as NA and character columns are turned into factors.
#   file_name: name of the CSV file inside PATH.
# Returns the table produced by fread.
read_kkbox_csv <- function(file_name) {
  fread(
    paste0(PATH, file_name),
    sep = ",",
    na.strings = "",
    stringsAsFactors = TRUE
  )
}

cat("Load data\n")
train <- read_kkbox_csv("train.csv")
# NOTE(review): `test` is read from train_v2.csv and is later scored against
# its own is_churn labels, i.e. it is used as a labelled hold-out set --
# confirm this is the intended file.
test <- read_kkbox_csv("train_v2.csv")
transactions <- read_kkbox_csv("transactions.csv")
transactions_v2 <- read_kkbox_csv("transactions_v2.csv")
members <- read_kkbox_csv("members_v3.csv")
submission <- read_kkbox_csv("sample_submission_v2.csv")
#user_logs <- dplyr::tbl(my_db, "user_log_used_v2")
# Stack the train, test and submission tables into one table, tagging every
# row with the table it came from in `sample`.
train[, sample := "train"]
test[, sample := "test"]
submission[, sample := "submission"]
# Labels of the submission rows are unknown.
submission[, is_churn := NA]
data <- rbind(train, test, submission)
# is_duplicate is 1 when the msno occurs on more than one row of the stacked
# table and 0 otherwise.
data[, is_duplicate := as.numeric(.N > 1), by = msno]
# is_member is TRUE when the msno is present in the members table.
data[, is_member := msno %in% members$msno]
# Reduce size of transactions: keep only rows whose msno occurs in `data`.
# Each file is filtered before stacking so the unfiltered union of both
# files never has to be held in memory.
known_msno <- levels(data$msno)
transactions <- rbind(
  transactions[msno %in% known_msno],
  transactions_v2[msno %in% known_msno]
)
# Number of transaction rows per user.
transactions[, n_transactions := .N, by = msno]
# add variable "payment_diff": list price minus the amount actually paid
transactions[, payment_diff := plan_list_price - actual_amount_paid]
# Collapse to one row per user by taking the per-user mean of each column.
# (A version without averaging was tried, because averaging turns categorical
# variables such as payment_method_id into continuous ones, which does not
# make sense; however, picking one representative transaction row per user
# proved difficult.)
# Columns are selected by name rather than by position so that a change in
# the CSV column order cannot silently average the wrong columns. These are
# the columns the model formula uses; NOTE(review): they are assumed to be
# the ones previously picked by the positional index c(2:6, 9:11).
mean_cols <- c(
  "payment_method_id",
  "payment_plan_days",
  "plan_list_price",
  "actual_amount_paid",
  "is_auto_renew",
  "is_cancel",
  "n_transactions",
  "payment_diff"
)
transactions <-
  transactions[, lapply(.SD, mean, na.rm = TRUE), by = msno, .SDcols = mean_cols]
# merge data with transactions (left join: every row of `data` is kept)
data <- merge(data, transactions, by = "msno", all.x = TRUE)
# merge data with members (left join as well)
data <- merge(data, members, by = "msno", all.x = TRUE)
# Replace the gender factor by its numeric level codes. The earlier call
# as.numeric(gender, labels = c(0, 1)) produced exactly these codes, because
# as.numeric() ignores a `labels` argument; the values are therefore kept
# unchanged here and only the misleading argument is dropped.
data[, gender := as.numeric(gender)]
# Missing values in the transaction and member predictors are replaced by the
# sentinel 666666. Columns are addressed by name instead of the positional
# range 6:17, which breaks as soon as a column is added or reordered.
# NOTE(review): the list is assumed to match what columns 6:17 held.
na_fill_cols <- c(
  "payment_method_id",
  "payment_plan_days",
  "plan_list_price",
  "actual_amount_paid",
  "is_auto_renew",
  "is_cancel",
  "n_transactions",
  "payment_diff",
  "city",
  "bd",
  "gender",
  "registered_via"
)
for (col in na_fill_cols) {
  set(data, i = which(is.na(data[[col]])), j = col, value = 666666)
}
# The response must be a factor for the classification model fitted later.
data[, is_churn := as.factor(is_churn)]
# Row masks for the three origins of the stacked table.
idc_train <- data$sample == "train"
idc_test <- data$sample == "test"
idc_submission <- data$sample == "submission"
# The all-variables formula below sometimes works and sometimes does not:
# randomForest_fit <- randomForest(is_churn ~. - msno - sample - registration_init_time, data = data, subset = idc_train, mtry = 5, sampsize = 1000, ntree = 50, importance = T)
# Running 500 trees, or dropping some variables, was also tried; some of
# these settings are feasible, but the session usually crashes instead.
# This is the version that was eventually submitted.
#
# Fit a classification forest on the training rows only. The price
# difference predictor is named `payment_diff` (the column created from the
# transactions table); the formula previously referred to a non-existent
# `payment_price_diff`, which makes the model frame lookup fail.
randomForest_fit <- randomForest(
  is_churn ~ is_duplicate + is_member + payment_method_id +
    payment_plan_days + plan_list_price + actual_amount_paid +
    is_auto_renew + is_cancel + n_transactions + payment_diff +
    city + bd + gender + registered_via,
  data = data,
  subset = idc_train,
  mtry = 5,
  sampsize = 1000,
  ntree = 50,
  importance = TRUE
)
# Plot the variable importance measures of the fitted forest.
varImpPlot(randomForest_fit)
# Save the modelling table as it stands before prediction columns are added.
write.table(data, file = "ver6.CSV", sep = ",", row.names = FALSE)

# Observed labels of the train and test rows.
actual_train <- data$is_churn[idc_train]
actual_test <- data$is_churn[idc_test]

# Predicted class for every row, followed by the misclassification rate on
# the train rows and on the test rows.
data$is_churn_hat <- predict(randomForest_fit, newdata = data)
mean(actual_train != data$is_churn_hat[idc_train])
mean(actual_test != data$is_churn_hat[idc_test])

# Predicted class probabilities; the second column is stored as the churn
# probability.
class_probs <- predict(randomForest_fit, newdata = data, type = "prob")
data$is_churn_prob <- class_probs[, 2]
prob_train <- data$is_churn_prob[idc_train]
prob_test <- data$is_churn_prob[idc_test]

# ROC curves, then the areas under them, for train and test rows.
pROC::plot.roc(actual_train, prob_train)
pROC::plot.roc(actual_test, prob_test)
pROC::auc(actual_train, prob_train)
pROC::auc(actual_test, prob_test)
# Pull the predicted churn probability of the submission rows.
submission_rows <- data$sample == "submission"
data_submission <- data[submission_rows, c("msno", "is_churn_prob")]
# Join on character keys so that differing factor levels cannot interfere.
data_submission$msno <- as.character(data_submission$msno)
submission$msno <- as.character(submission$msno)
# Attach the probabilities to the submission template (keeping its row
# order), store them in is_churn and drop the helper columns.
submission_final <- submission %>%
  left_join(data_submission, by = "msno") %>%
  mutate(is_churn = is_churn_prob) %>%
  select(-is_churn_prob, -sample)
write.csv(submission_final, file = "submission.csv", quote = FALSE, row.names = FALSE)