Initial Data Preparation

Following the baseline code shared on Kaggle, the raw member and transaction files are first merged with the label files into a single data set named dat.

Load the packages that will be used

library(data.table)
library(Matrix)
library(randomForest)
library(dplyr)
library(ggplot2)

Read in the raw data

PATH <- "C:/Users/user/Documents/106Course/Practicum in Applied Psy/data_new/"

# train.csv: original churn labels, used as the training set
train <- fread(paste0(PATH, "train.csv"), sep = ",", na.strings = "")
# train_v2.csv: the later label file, used here as the test set
test <- fread(paste0(PATH, "train_v2.csv"), sep = ",", na.strings = "")
transactions <- fread(paste0(PATH, "transactions.csv"), sep = ",", na.strings = "")
transactions_v2 <- fread(paste0(PATH, "transactions_v2.csv"), sep = ",", na.strings = "")
members <- fread(paste0(PATH, "members_v3.csv"), sep = ",", na.strings = "")
submission <- fread(paste0(PATH, "sample_submission_v2.csv"), sep = ",", na.strings = "")

Combine train, test, and the submission file into dat

train$sample <- "train"
test$sample <- "test"
submission$sample <- "submission"

submission$is_churn <- NA
dat <- rbind(train, test, submission)
# flag user ids that appear more than once, and ids present in the members table
dat[, is_duplicate := as.numeric(duplicated(msno) | duplicated(msno, fromLast = TRUE))]
dat[, is_member := msno %in% members$msno]

Clean the members data and merge it into dat

# gender is stored as text ("male"/"female"); recode it to a numeric code and set missing to 0
members[, gender := as.numeric(factor(gender))]
members$gender[is.na(members$gender)] <- 0

# keep the raw yyyymmdd registration date, then derive calendar features from it
members[, ":="(reg_fulldate = registration_init_time,
               registration_init_time = as.Date(as.character(registration_init_time), "%Y%m%d"))]
members[, ":="(reg_year = year(registration_init_time),
               reg_month = month(registration_init_time),
               reg_mday = mday(registration_init_time),
               reg_wday = wday(registration_init_time))]
members <- subset(members, select = -registration_init_time)

dat <- merge(dat, members, by = "msno", all.x = TRUE)

Clean the transactions data and merge it into dat

#Reduce the size of transactions: keep only users that appear in dat
trans <- rbind(transactions, transactions_v2)
trans <- trans[msno %in% unique(dat$msno)]

#Get amount of transactions per user
trans[, n_transactions := .N, by = msno]

#Get difference between plan price and payment amount
trans[, payment_price_diff := plan_list_price - actual_amount_paid]

#Aggregate by user, get mean of columns
#Remove the transaction dates
trans <-
  trans[, lapply(.SD, mean, na.rm = TRUE), by = msno, .SDcols = names(trans)[c(2:6, 9:11)]]
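
Selecting .SDcols by position is fragile if the raw column order ever changes. A name-based variant of the same aggregation (a sketch, assuming the standard Kaggle transactions column names) would be:

num_cols <- c("payment_method_id", "payment_plan_days", "plan_list_price",
              "actual_amount_paid", "is_auto_renew", "is_cancel",
              "n_transactions", "payment_price_diff")
trans <- trans[, lapply(.SD, mean, na.rm = TRUE), by = msno, .SDcols = num_cols]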

#Merge data and transactions
dat <- merge(dat, trans, by = "msno", all.x = TRUE)

Write dat out as a csv file so it can be reloaded later

setwd("C:/Users/user/Documents/106Course/Practicum in Applied Psy/data_create")
write.csv(dat,"dat20171129.csv")
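
For a file of this size, data.table's fwrite is a much faster drop-in alternative to write.csv; a one-line sketch:

fwrite(dat, "dat20171129.csv")  # unlike write.csv, fwrite does not add a row-name column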

Data Exploration

Look through the variables in dat for anything notable that might be related to whether a user churns.

First, draw some descriptive plots with ggplot2

is_churn

p1 <- train %>%
  ggplot(aes(is_churn, fill = is_churn)) +
  geom_bar() +
  theme(legend.position = "none")

p2 <- members %>%
  ggplot(aes(gender, fill = gender)) +
  geom_bar() +
  theme(legend.position = "none")

p3 <- members %>%
  ggplot(aes(registered_via, fill = registered_via)) +
  geom_bar() +
  theme(legend.position = "none") +
  scale_y_sqrt()

p4 <- members %>%
  ggplot(aes(city, fill = city)) +
  geom_bar() +
  theme(legend.position = "none") +
  scale_y_sqrt()

p5 <- members %>%
  filter(bd > 0 & bd < 100) %>%
  ggplot(aes(bd)) +
  geom_density(fill = "red", bw = 1)

# multiplot() is the multi-panel helper from the "Cookbook for R"; it is not part of ggplot2
layout <- matrix(c(1,1,2,2,3,3,4,4,4,5,5,5), 2, 6, byrow = TRUE)
multiplot(p1, p2, p3, p4, p5, layout = layout)
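
If the multiplot() helper is not available, a minimal alternative (assuming the gridExtra package is installed) arranges the same five plots with grid.arrange:

library(gridExtra)
grid.arrange(p1, p2, p3, p4, p5,
             layout_matrix = matrix(c(1,1,2,2,3,3,4,4,4,5,5,5), 2, 6, byrow = TRUE))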

Next, cross-tabulate each variable with is_churn and plot the tables to see how they relate to churn.

is_duplicate

tb2_3 <- table(dat[,c(2,3)])
colnames(tb2_3) <- c("No Duplicate","Duplicate")
rownames(tb2_3) <- c("No Churn","Churn","NA")
plot(tb2_3, col=topo.colors(length(tb2_3)), 
     main=paste0("Plot of table of [",colnames(dat)[2],"] and [",colnames(dat)[3],"]"))

city

tb2_4 <- table(dat[,c(2,4)])
rownames(tb2_4) <- c("No Churn","Churn","NA")
plot(tb2_4, col=topo.colors(length(tb2_4)), 
     main=paste0("Plot of table of [",colnames(dat)[2],"] and [",colnames(dat)[4],"]"))

bd (User age)

dd <- na.omit(dat[,c(2,5)])
dd$bd2 <- as.factor((dd$bd >=10) & (dd$bd <=70))
tb2_5 <- table(dd$is_churn,dd$bd2)
colnames(tb2_5) <- c("Unusual","Usual")
rownames(tb2_5) <- c("No Churn","Churn","NA")
plot(tb2_5, col=topo.colors(length(tb2_5)), 
     main=paste0("Plot of table of [",colnames(dat)[2],"] and [",colnames(dat)[5],"]"))

The user age (bd) field contains many values that are clearly not real ages, so ages between 10 and 70 are assumed to be genuine and everything else is treated as noise; a new variable bd2 is created to distinguish the two.

registered_via

tb2_7 <- table(dat[,c(2,7)])
colnames(tb2_7) <- paste0("Method ",colnames(tb2_7))
rownames(tb2_7) <- c("No Churn","Churn","NA")
plot(tb2_7, col=topo.colors(length(tb2_7)), 
     main=paste0("Plot of table of [",colnames(dat)[2],"] and [",colnames(dat)[7],"]"))

n_transactions

dat$n_transactions <- as.numeric(dat$n_transactions)
nt_cut <- cut(dat$n_transactions, breaks=c(0,8,16,24,32,Inf))
tb2_25 <- table(dat$is_churn, nt_cut)
rownames(tb2_25) <- c("No Churn","Churn","NA")
plot(tb2_25, col=topo.colors(length(tb2_25)), 
     main=paste0("Plot of table of [",colnames(dat)[2],"] and [",colnames(dat)[25],"]"))

This bins the number of transactions into several groups (the binned variable enters the later models as n_cut).

Model Building

Logistic Regression

A logistic regression model is fitted with the variables that looked interesting earlier: duplicate status, city, registration channel, whether the age looks genuine (bd2), and the binned transaction count (n_cut).

Data alignment

# datIN3, datIN4, datIN5, datIN7, and datIN25 appear to be per-variable subsets of dat
# (columns 3, 4, 5, 7, and 25) prepared in steps not shown here; datIN5$V27 seems to mark
# the rows kept after removing missing values, so the other pieces are aligned to it first
datIN3 <- datIN3[datIN5$V27,]; datIN4 <- datIN4[datIN5$V27,]
datIN7 <- datIN7[datIN5$V27,]; datIN25 <- datIN25[datIN5$V27,]

datIN <- data.frame(datIN3$is_churn, datIN3$is_duplicate, datIN4$city, 
                    datIN5$bd2, datIN7$registered_via, datIN25$n_cut)
colnames(datIN) <- c("is_churn","is_duplicate","city","bd2","registered_via","n_cut")

rm(datIN3); rm(datIN4); rm(datIN5); rm(datIN7); rm(datIN25)
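
Because the datIN pieces above come from steps that are not shown, an equivalent construction straight from dat (a sketch, assuming dat still holds the columns used in the exploration step and deriving bd2 and n_cut with the same rules) would be:

datIN <- data.frame(
  is_churn       = dat$is_churn,
  is_duplicate   = dat$is_duplicate,
  city           = dat$city,
  bd2            = factor(dat$bd >= 10 & dat$bd <= 70),   # same rule as in the bd plot above
  registered_via = dat$registered_via,
  n_cut          = cut(dat$n_transactions, breaks = c(0, 8, 16, 24, 32, Inf))
)
datIN <- datIN[complete.cases(datIN), ]   # keep only rows with no missing values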

Fit the logistic regression model

lrIN <- glm(is_churn ~ is_duplicate + city + bd2 + registered_via + n_cut,
            family=binomial(link='logit'), data=datIN)
summary(lrIN)

Model performance

The model's predictions are compared against the training and test labels with the code below, and the error rate and AUC are computed.

# note: the idc_train / idc_test indexing below assumes one prediction per row of dat;
# if rows were dropped when fitting on datIN, predict on suitably prepared newdata instead
# so that the lengths line up
prob_lrIN <- predict.glm(lrIN, type="response", na.action=na.omit)

hat_lrIN <- prob_lrIN
hat_lrIN[which(prob_lrIN >=0.5)] <- 1
hat_lrIN[which(prob_lrIN <0.5)] <- 0

na.omit(dat$is_churn[idc_train] != hat_lrIN[idc_train]) %>% mean
na.omit(dat$is_churn[idc_test] != hat_lrIN[idc_test]) %>% mean

pROC::auc(dat$is_churn[idc_train], prob_lrIN[idc_train])
pROC::auc(dat$is_churn[idc_test], prob_lrIN[idc_test])

The model's error rate and AUC are:

Data    Error rate   AUC
Train   0.1094594    0.5986
Test    0.1314478    0.5944

It does not look like a good model.

randomForest

Next, randomForest is used to build the prediction models. They include the variables that looked interesting earlier (duplicate status, city, registration channel, age, and number of transactions) plus, following other people's kernels on Kaggle, several variables that may also play a role: payment_plan_days, plan_list_price, actual_amount_paid, is_auto_renew, is_cancel, and payment_price_diff. A total of ten models with different parameters or variable sets were built, seven of which are reported below.

Recode the outcome and define the sample indices

# treat churn as a factor so randomForest performs classification
dat$is_churn <- as.numeric(dat$is_churn)
dat$is_churn <- as.factor(dat$is_churn)
# row indices for the three samples
idc_train <- dat$sample == "train"
idc_test <- dat$sample == "test"
idc_submission <- dat$sample == "submission"

Model 1

randomForest_fit1 <- randomForest(is_churn ~. - msno - sample - gender - is_member - reg_fulldate -
                                 reg_year - reg_month - reg_mday - reg_wday, data = dat, 
                                 subset = idc_train, mtry= 3, sampsize = 10000, ntree = 100, 
                                 na.action = na.omit, importance = T)
varImpPlot(randomForest_fit1)

Model 2

randomForest_fit2 <- randomForest(is_churn ~. - msno - sample - gender - is_member - reg_fulldate -
                                 reg_year - reg_month - reg_mday - reg_wday, data = dat, 
                                 subset = idc_train, mtry= 3, sampsize = 12000, ntree = 250, 
                                 na.action = na.omit, importance = T)
varImpPlot(randomForest_fit2)

Model 3

randomForest_fit3 <- randomForest(is_churn ~. - msno - sample - gender - is_member - reg_fulldate -
                                 reg_year - reg_month - reg_mday - reg_wday, data = dat, 
                                 subset = idc_train, mtry= 3, sampsize = 20000, ntree = 500, 
                                 na.action = na.omit, importance = T)
varImpPlot(randomForest_fit3)

Model 4

randomForest_fit4 <- randomForest(is_churn ~is_duplicate + is_member + city + bd + registered_via + 
                                    payment_plan_days + plan_list_price + actual_amount_paid + is_auto_renew +
                                    is_cancel + n_transactions + payment_price_diff,
                                  data = dat, subset = idc_train, mtry=3, sampsize=20000, ntree=1000, 
                                  na.action = na.omit, importance = T)
varImpPlot(randomForest_fit4)

Model 5

randomForest_fit5 <- randomForest(is_churn ~is_duplicate + is_member + city + bd + registered_via + 
                                    payment_plan_days + plan_list_price + actual_amount_paid + is_auto_renew +
                                    is_cancel + n_transactions + payment_price_diff,
                                  data = dat, subset = idc_train, mtry=3, sampsize=22000, ntree=2000, 
                                  na.action = na.omit, importance = T)
varImpPlot(randomForest_fit5)

Model 6

randomForest_fit6 <- randomForest(is_churn ~is_duplicate + is_member + city + bd + registered_via + 
                                    payment_plan_days + plan_list_price + actual_amount_paid + is_auto_renew +
                                    is_cancel + n_transactions + payment_price_diff,
                                  data = dat, subset = idc_train, mtry=3, sampsize=32000, ntree=2000, 
                                  na.action = na.omit, importance = T)
varImpPlot(randomForest_fit6)

Model 7

randomForest_fit7 <- randomForest(is_churn ~is_duplicate + is_member + city + bd + registered_via + 
                                    payment_plan_days + plan_list_price + actual_amount_paid + is_auto_renew +
                                    is_cancel + n_transactions + payment_price_diff,
                                  data = dat, subset = idc_train, mtry=3, sampsize=32000, ntree=3000, 
                                  na.action = na.omit, importance = T)
varImpPlot(randomForest_fit7)

Model performance

Each model's predictions are compared against the training and test labels with the code below (randomForest_fit stands for whichever fitted model is being evaluated), and the error rate and AUC are computed.

# replace randomForest_fit with the fitted object to evaluate, e.g. randomForest_fit4
dat$is_churn_hat <- predict(randomForest_fit, newdata = dat)

na.omit(dat$is_churn[idc_train] != dat$is_churn_hat[idc_train]) %>% mean
na.omit(dat$is_churn[idc_test] != dat$is_churn_hat[idc_test]) %>% mean

dat$is_churn_prob <- predict(randomForest_fit, newdata = dat, type = "prob")[, 2]

pROC::auc(dat$is_churn[idc_train], dat$is_churn_prob[idc_train])
pROC::auc(dat$is_churn[idc_test], dat$is_churn_prob[idc_test])
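
To avoid repeating the block above by hand, a small loop (a sketch, assuming the seven fits are kept in a named list) evaluates every model in turn under the same assumptions as the code above:

fits <- list(fit1 = randomForest_fit1, fit2 = randomForest_fit2, fit3 = randomForest_fit3,
             fit4 = randomForest_fit4, fit5 = randomForest_fit5, fit6 = randomForest_fit6,
             fit7 = randomForest_fit7)
results <- lapply(fits, function(fit) {
  hat  <- predict(fit, newdata = dat)                      # class predictions
  prob <- predict(fit, newdata = dat, type = "prob")[, 2]  # churn probabilities
  c(err_train = mean(na.omit(dat$is_churn[idc_train] != hat[idc_train])),
    err_test  = mean(na.omit(dat$is_churn[idc_test]  != hat[idc_test])),
    auc_train = as.numeric(pROC::auc(dat$is_churn[idc_train], prob[idc_train])),
    auc_test  = as.numeric(pROC::auc(dat$is_churn[idc_test],  prob[idc_test])))
})
do.call(rbind, results)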

The error rate and AUC of each model are:

Model              Data   Error rate   AUC
randomForest_fit1  Train  0.0612225    0.6959
randomForest_fit1  Test   0.0801128    0.6910
randomForest_fit2  Train  0.0548993    0.7891
randomForest_fit2  Test   0.0706217    0.7803
randomForest_fit3  Train  0.0496651    0.8553
randomForest_fit3  Test   0.0699842    0.8472
randomForest_fit4  Train  0.0348995    0.9011
randomForest_fit4  Test   0.0512554    0.8895
randomForest_fit5  Train  0.0401405    0.8856
randomForest_fit5  Test   0.0712003    0.8699
randomForest_fit6  Train  0.0391494    0.8991
randomForest_fit6  Test   0.0585564    0.8701
randomForest_fit7  Train  0.0410136    0.8749
randomForest_fit7  Test   0.0785122    0.8221

Model 4 appears to have the best predictive performance, so it is selected and its predictions are exported.

Exporting and Submitting the Predictions

The output has two columns: the user id (msno) and the predicted probability that the user churns.

dat_submission <- dat[dat$sample == "submission", c("msno", "is_churn_prob")]
dat_submission$msno <- as.character(dat_submission$msno)

submission$msno <- as.character(submission$msno)
submission_final <- left_join(submission, dat_submission, by = "msno")
submission_final$is_churn <- submission_final$is_churn_prob
submission_final$is_churn_prob <- NULL
submission_final$sample <- NULL
submission_final$is_churn[is.na(submission_final$is_churn)] <- 0   # users without a prediction get 0

write.csv(submission_final, file = "submission.csv", quote = FALSE, row.names = FALSE)

After this file was submitted to Kaggle, the log loss and ranking on the Public and Private Leaderboards were:

Leaderboard   Log loss   Rank
Public        0.29559    465
Private       0.28747    464

Future Work

  1. Handle missing values better, for example by replacing them with a special sentinel value (see the sketch below)
  2. Use the user_logs data and try more candidate variables
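
As a starting point for item 1, a minimal sketch (assuming dat still holds the feature columns listed below, and using -1 as an arbitrary sentinel value):

# Replace NA in the numeric feature columns with a sentinel so that models can treat
# "missing" as its own signal instead of dropping those rows.
num_features <- c("city", "bd", "registered_via", "n_transactions",
                  "payment_plan_days", "plan_list_price", "actual_amount_paid",
                  "is_auto_renew", "is_cancel", "payment_price_diff")
for (col in num_features) {
  sentinel <- if (is.integer(dat[[col]])) -1L else -1   # match the column's storage type
  set(dat, which(is.na(dat[[col]])), col, sentinel)     # data.table::set, by reference
}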