First, following the baseline code shared on Kaggle, the raw member, transaction, and related files are merged and cleaned into a single data set named dat.
library(data.table)
library(Matrix)
library(randomForest)
library(Rmisc)      # provides multiplot(), used for the panel layout below
library(dplyr)
library(ggplot2)
PATH <- "C:/Users/user/Documents/106Course/Practicum in Applied Psy/data_new/"
train <- fread(paste0(PATH, "train.csv"),sep = ",",na.strings = "")
test <- fread(paste0(PATH, "train_v2.csv"),sep = ",",na.strings = "")   # labels for a later month, used here as an internal test set
transactions <- fread(paste0(PATH, "transactions.csv"),sep = ",",na.strings = "")
transactions_v2 <- fread(paste0(PATH, "transactions_v2.csv"),sep = ",",na.strings = "")
members <- fread(paste0(PATH, "members_v3.csv"),sep = ",",na.strings = "")
submission <-fread(paste0(PATH, "sample_submission_v2.csv"),sep = ",",na.strings = "")
train$sample <- "train"
test$sample <- "test"
submission$sample <- "submission"
submission$is_churn <- NA
dat <- rbind(train, test, submission)
dat[, is_duplicate := as.numeric(duplicated(msno) | duplicated(msno, fromLast = TRUE))]
dat[, is_member := msno %in% members$msno]
# gender is stored as "male"/"female"; recode to integer codes (female = 1, male = 2, missing = 0)
members[, gender := as.numeric(factor(gender, levels = c("female", "male")))]
members$gender[is.na(members$gender)] <- 0
members[,":="(reg_fulldate = members$registration_init_time
,registration_init_time = as.Date(as.character(registration_init_time), '%Y%m%d'))]
members[,":="(reg_year = year(registration_init_time)
,reg_month = month(registration_init_time)
,reg_mday = mday(registration_init_time)
,reg_wday = wday(registration_init_time))]
members <- subset(members, select = -registration_init_time)
dat <- merge(dat, members, by = "msno", all.x = TRUE)
#Reduce size of transactions a bit
trans <- rbind(transactions, transactions_v2)
trans <- trans[msno %in% unique(dat$msno)]
#Get amount of transactions per user
trans[, n_transactions := .N, by = msno]
#Get difference between plan price and payment amount
trans[, payment_price_diff := plan_list_price - actual_amount_paid]
#Aggregate by user, get mean of columns
#Remove the transaction dates
trans <-
trans[, lapply(.SD, mean, na.rm = TRUE), by = msno, .SDcols = names(trans)[c(2:6, 9:11)]]
#Merge data and transactions
dat <- merge(dat, trans, by = "msno", all.x = TRUE)
setwd("C:/Users/user/Documents/106Course/Practicum in Applied Psy/data_create")
write.csv(dat,"dat20171129.csv")
Next, each variable in dat is inspected for anything notable that might be related to whether a user churns.
p1 <- train %>%
ggplot(aes(is_churn, fill = is_churn)) +
geom_bar() +
theme(legend.position = "none")
p2 <- members %>%
ggplot(aes(gender, fill = gender)) +
geom_bar() +
theme(legend.position = "none")
p3 <- members %>%
ggplot(aes(registered_via, fill = registered_via)) +
geom_bar() +
theme(legend.position = "none") +
scale_y_sqrt()
p4 <- members %>%
ggplot(aes(city, fill = city)) +
geom_bar() +
theme(legend.position = "none") +
scale_y_sqrt()
p5 <- members %>%
filter(bd > 0 & bd < 100) %>%
ggplot(aes(bd)) +
geom_density(fill = "red", bw = 1)
# arrange the five panels with multiplot() from the Rmisc package
layout <- matrix(c(1,1,2,2,3,3,4,4,4,5,5,5),2,6,byrow=TRUE)
multiplot(p1, p2, p3, p4, p5, layout=layout)
tb2_3 <- table(dat[,c(2,3)], useNA = "ifany")
colnames(tb2_3) <- c("No Duplicate","Duplicate")
rownames(tb2_3) <- c("No Churn","Churn","NA")
plot(tb2_3, col=topo.colors(length(tb2_3)),
main=paste0("Plot of table of [",colnames(dat)[2],"] and [",colnames(dat)[3],"]"))
tb2_4 <- table(dat[,c(2,4)], useNA = "ifany")
rownames(tb2_4) <- c("No Churn","Churn","NA")
plot(tb2_4, col=topo.colors(length(tb2_4)),
main=paste0("Plot of table of [",colnames(dat)[2],"] and [",colnames(dat)[4],"]"))
dd <- na.omit(dat[,c(2,5)])
dd$bd2 <- as.factor((dd$bd >=10) & (dd$bd <=70))
tb2_5 <- table(dd$is_churn,dd$bd2)
colnames(tb2_5) <- c("Unusual","Usual")
rownames(tb2_5) <- c("No Churn","Churn")
plot(tb2_5, col=topo.colors(length(tb2_5)),
main=paste0("Plot of table of [",colnames(dat)[2],"] and [",colnames(dat)[5],"]"))
The bd field (user age) contains many values that cannot be real ages, so ages from 10 to 70 are assumed to be genuine and everything outside this range is treated as noise; a new variable, bd2, distinguishes the two.
tb2_7 <- table(dat[,c(2,7)], useNA = "ifany")
colnames(tb2_7) <- paste0("Method ",colnames(tb2_7))
rownames(tb2_7) <- c("No Churn","Churn","NA")
plot(tb2_7, col=topo.colors(length(tb2_7)),
main=paste0("Plot of table of [",colnames(dat)[2],"] and [",colnames(dat)[7],"]"))
dat$n_transactions <- as.numeric(dat$n_transactions)
nt_cut <- cut(dat$n_transactions, breaks=c(0,8,16,24,32,Inf))
tb2_25 <- table(dat$is_churn, nt_cut, useNA = "ifany")
rownames(tb2_25) <- c("No Churn","Churn","NA")
plot(tb2_25, col=topo.colors(length(tb2_25)),
main=paste0("Plot of table of [",colnames(dat)[2],"] and [",colnames(dat)[25],"]"))
The transaction count is binned into a few groups.
A logistic regression model is then fitted with the variables that looked interesting above: duplicate status, city, registration channel, whether the age is plausible (bd2), and the binned transaction count.
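The datIN3, datIN4, datIN5, datIN7, and datIN25 objects used below are per-variable subsets of dat assembled in earlier steps that are not shown here. As a self-contained sketch (hypothetical names datIN_sketch and lr_sketch, reusing the bd2 rule and transaction-count bins defined above), the same model frame could also be built directly from dat:
datIN_sketch <- na.omit(data.frame(
  is_churn       = dat$is_churn,
  is_duplicate   = dat$is_duplicate,
  city           = dat$city,
  bd2            = factor(dat$bd >= 10 & dat$bd <= 70),
  registered_via = dat$registered_via,
  n_cut          = cut(dat$n_transactions, breaks = c(0, 8, 16, 24, 32, Inf))
))
lr_sketch <- glm(is_churn ~ is_duplicate + city + bd2 + registered_via + n_cut,
                 family = binomial(link = "logit"), data = datIN_sketch)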
datIN3 <- datIN3[datIN5$V27,]; datIN4 <- datIN4[datIN5$V27,]
datIN7 <- datIN7[datIN5$V27,]; datIN25 <- datIN25[datIN5$V27,]
datIN <- data.frame(datIN3$is_churn, datIN3$is_duplicate, datIN4$city,
datIN5$bd2, datIN7$registered_via, datIN25$n_cut)
colnames(datIN) <- c("is_churn","is_duplicate","city","bd2","registered_via","n_cut")
rm(datIN3); rm(datIN4); rm(datIN5); rm(datIN7); rm(datIN25)
lrIN <- glm(is_churn ~ is_duplicate + city + bd2 + registered_via + n_cut,
family=binomial(link='logit'), data=datIN)
summary(lrIN)
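Logistic regression coefficients are on the log-odds scale; exponentiating them gives odds ratios, which are easier to interpret. A quick check on the fitted model:
# odds ratios with 95% Wald confidence intervals
exp(cbind(OR = coef(lrIN), confint.default(lrIN)))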
The model's predictions are then compared against the test data with the code below to compute the model's error rate and AUC.
# fitted churn probabilities from the logistic model, thresholded at 0.5
prob_lrIN <- predict.glm(lrIN, type="response", na.action=na.omit)
hat_lrIN <- as.numeric(prob_lrIN >= 0.5)
# misclassification rate on the train and internal test rows
# (idc_train and idc_test are the sample indicators defined below)
na.omit(dat$is_churn[idc_train] != hat_lrIN[idc_train]) %>% mean
na.omit(dat$is_churn[idc_test] != hat_lrIN[idc_test]) %>% mean
# AUC on the train and internal test rows
pROC::auc(dat$is_churn[idc_train], prob_lrIN[idc_train])
pROC::auc(dat$is_churn[idc_test], prob_lrIN[idc_test])
The model's error rate and AUC:
| Data | Error rate | AUC |
|---|---|---|
| Train | 0.1094594 | 0.5986 |
| Test | 0.1314478 | 0.5944 |
This does not look like a good model.
Next, prediction models are built with randomForest. The models include the variables found interesting above (duplicate status, city, registration channel, whether the age is plausible, transaction count) and, following kernels shared by other Kaggle participants, several variables that may also play a role: payment_plan_days, plan_list_price, actual_amount_paid, is_auto_renew, is_cancel, and payment_price_diff. In total, ten models with different parameters or variable sets were built; seven of them are reported below.
# treat churn as a factor so randomForest fits a classification forest
dat$is_churn <- as.factor(as.numeric(dat$is_churn))
# logical indices for the train, internal test, and submission portions of dat
idc_train <- dat$sample == "train"
idc_test <- dat$sample == "test"
idc_submission <- dat$sample == "submission"
randomForest_fit1 <- randomForest(is_churn ~. - msno - sample - gender - is_member - reg_fulldate -
reg_year - reg_month - reg_mday - reg_wday, data = dat,
subset = idc_train, mtry= 3, sampsize = 10000, ntree = 100,
na.action = na.omit, importance = T)
varImpPlot(randomForest_fit1)
randomForest_fit2 <- randomForest(is_churn ~. - msno - sample - gender - is_member - reg_fulldate -
reg_year - reg_month - reg_mday - reg_wday, data = dat,
subset = idc_train, mtry= 3, sampsize = 12000, ntree = 250,
na.action = na.omit, importance = T)
varImpPlot(randomForest_fit2)
randomForest_fit3 <- randomForest(is_churn ~. - msno - sample - gender - is_member - reg_fulldate -
reg_year - reg_month - reg_mday - reg_wday, data = dat,
subset = idc_train, mtry= 3, sampsize = 20000, ntree = 500,
na.action = na.omit, importance = T)
varImpPlot(randomForest_fit3)
randomForest_fit4 <- randomForest(is_churn ~is_duplicate + is_member + city + bd + registered_via +
payment_plan_days + plan_list_price + actual_amount_paid + is_auto_renew +
is_cancel + n_transactions + payment_price_diff,
data = dat, subset = idc_train, mtry=3, sampsize=20000, ntree=1000,
na.action = na.omit, importance = T)
varImpPlot(randomForest_fit4)
randomForest_fit5 <- randomForest(is_churn ~is_duplicate + is_member + city + bd + registered_via +
payment_plan_days + plan_list_price + actual_amount_paid + is_auto_renew +
is_cancel + n_transactions + payment_price_diff,
data = dat, subset = idc_train, mtry=3, sampsize=22000, ntree=2000,
na.action = na.omit, importance = T)
varImpPlot(randomForest_fit5)
randomForest_fit6 <- randomForest(is_churn ~is_duplicate + is_member + city + bd + registered_via +
payment_plan_days + plan_list_price + actual_amount_paid + is_auto_renew +
is_cancel + n_transactions + payment_price_diff,
data = dat, subset = idc_train, mtry=3, sampsize=32000, ntree=2000,
na.action = na.omit, importance = T)
varImpPlot(randomForest_fit6)
randomForest_fit7 <- randomForest(is_churn ~is_duplicate + is_member + city + bd + registered_via +
payment_plan_days + plan_list_price + actual_amount_paid + is_auto_renew +
is_cancel + n_transactions + payment_price_diff,
data = dat, subset = idc_train, mtry=3, sampsize=32000, ntree=3000,
na.action = na.omit, importance = T)
varImpPlot(randomForest_fit7)
Each model's predictions are then compared against the test data with the code below to compute its error rate and AUC.
# randomForest_fit stands for each of randomForest_fit1 ... randomForest_fit7 in turn
dat$is_churn_hat <- predict(randomForest_fit, newdata = dat)
na.omit(dat$is_churn[idc_train] != dat$is_churn_hat[idc_train]) %>% mean
na.omit(dat$is_churn[idc_test] != dat$is_churn_hat[idc_test]) %>% mean
dat$is_churn_prob <- predict(randomForest_fit, newdata = dat, type = "prob")[, 2]
pROC::auc(dat$is_churn[idc_train], dat$is_churn_prob[idc_train])
pROC::auc(dat$is_churn[idc_test], dat$is_churn_prob[idc_test])
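For convenience, the same evaluation can be wrapped in a small helper and run over all seven forests at once. A minimal sketch (hypothetical fits list and eval_fit helper; it assumes, as the code above does, that the predictors in dat are complete for the rows being scored):
fits <- list(fit1 = randomForest_fit1, fit2 = randomForest_fit2, fit3 = randomForest_fit3,
             fit4 = randomForest_fit4, fit5 = randomForest_fit5, fit6 = randomForest_fit6,
             fit7 = randomForest_fit7)
eval_fit <- function(fit) {
  hat  <- predict(fit, newdata = dat)                      # predicted class
  prob <- predict(fit, newdata = dat, type = "prob")[, 2]  # predicted churn probability
  c(train_err = mean(na.omit(dat$is_churn[idc_train] != hat[idc_train])),
    test_err  = mean(na.omit(dat$is_churn[idc_test]  != hat[idc_test])),
    train_auc = as.numeric(pROC::auc(dat$is_churn[idc_train], prob[idc_train])),
    test_auc  = as.numeric(pROC::auc(dat$is_churn[idc_test],  prob[idc_test])))
}
round(t(sapply(fits, eval_fit)), 4)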
The error rate and AUC of each model:
| Model | Data | Error rate | AUC |
|---|---|---|---|
| randomForest_fit1 | Train | 0.0612225 | 0.6959 |
| randomForest_fit1 | Test | 0.0801128 | 0.6910 |
| randomForest_fit2 | Train | 0.0548993 | 0.7891 |
| randomForest_fit2 | Test | 0.0706217 | 0.7803 |
| randomForest_fit3 | Train | 0.0496651 | 0.8553 |
| randomForest_fit3 | Test | 0.0699842 | 0.8472 |
| randomForest_fit4 | Train | 0.0348995 | 0.9011 |
| randomForest_fit4 | Test | 0.0512554 | 0.8895 |
| randomForest_fit5 | Train | 0.0401405 | 0.8856 |
| randomForest_fit5 | Test | 0.0712003 | 0.8699 |
| randomForest_fit6 | Train | 0.0391494 | 0.8991 |
| randomForest_fit6 | Test | 0.0585564 | 0.8701 |
| randomForest_fit7 | Train | 0.0410136 | 0.8749 |
| randomForest_fit7 | Test | 0.0785122 | 0.8221 |
Finally, model 4 (randomForest_fit4), which shows the best predictive performance, is selected and its predictions are exported.
The output has two columns: the user id (msno) and the predicted probability that the user will churn.
# is_churn_prob should hold the probabilities from the selected model (randomForest_fit4)
dat_submission <- dat[dat$sample == "submission", c("msno", "is_churn_prob")]
dat_submission$msno <- as.character(dat_submission$msno)
submission$msno <- as.character(submission$msno)
submission_final <- left_join(submission, dat_submission, by = "msno")
submission_final$is_churn <- submission_final$is_churn_prob
submission_final$is_churn_prob <- NULL
submission_final$sample <- NULL
# users without a prediction (missing features) default to a churn probability of 0
submission_final$is_churn[is.na(submission_final$is_churn)] <- 0
write.csv(submission_final, file = "submission.csv", quote = FALSE, row.names = FALSE)
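The competition is scored by log loss, so it is worth estimating it locally on the internal test rows before submitting. A minimal sketch, assuming dat$is_churn_prob still holds the selected model's predicted churn probabilities:
# local log loss on the internal test rows; probabilities are clipped away from 0 and 1
eps <- 1e-15
ll <- na.omit(data.frame(y = as.numeric(as.character(dat$is_churn[idc_test])),
                         p = pmin(pmax(dat$is_churn_prob[idc_test], eps), 1 - eps)))
-mean(ll$y * log(ll$p) + (1 - ll$y) * log(1 - ll$p))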
After this file was submitted to Kaggle, the log loss and rank on the Public and Private Leaderboards were:
| Leaderboard | Log loss | Rank |
|---|---|---|
| Public | 0.29559 | 465 |
| Private | 0.28747 | 464 |