This contest hold by Taiwan Post Co.
All contestants will get 4 data bases. Then they can choose one to analysis. I choose the contract data this time. It contain the contract start time and end time data and other detail information.
There is the variable list: Data size : 46365 obs of 28 variables.
I will use survival analysis to know which variables will effect the contract length.
And the variable CC12 shows whether the user shipping to post office by themselves or not.
I will use the classificate method to analysis it.
Import the data and label them at first.
#read the data
data <- read.csv("CC (1).csv",header = F)
colnames(data) <- c("CC01","CC02","CC03","CC04","CC05","CC06","CC07","CC08","CC09","CC10","CC11","CC12","CC13","CC14","CC15","CC16","C17","CC18","CC19","CC20","CC21","CC22","CC23","CC24","CC25","CC26","CC27","CC28")
I choose the useful variables, cleaning the data then add some new variables for analysis.
This data contain too many NAs. So I only clean the important variables. I reserved: 01 02 03 05 12 14 23.
library(tidyverse)
#fix the CC01 wrong data
data$CC01[777] <- 910222
data$CC01[2220] <- 931008
data$CC01[9516] <- 890913
data$CC01[11433] <- 971202
data$CC01[15187] <- 890913
data$CC01[16453] <- 930710
data$CC01[17097] <- 971202
data$CC01[18799] <- 031007
data$CC01[29188] <- 950204
data$CC01[29812] <- 910828
data$CC01[31592] <- 920227
data$CC01[32302] <- 940701
data$CC01[45390] <- 931006
#CC02 trans the 10101 to 9991231
for(i in c(1:length(data$CC02))){
if(data$CC02[i] == 10101){
data$CC02[i] <- 9991231
}
}
#trans date to AD date form
data$CC01 <- data$CC01+19110000
data$CC02 <- data$CC02+19110000
library(lubridate)
#Delete the data before 1999-01-02
j <- 1
num_03 <- vector()
for(i in c(1:length(data$CC01))){
if(data$CC01[i] < 19910102){
num_03[j] <- i
j <- j+1
}
}
data <- data[-num_03,]
#Delete the L>R data
j <- 1
num_05 <- vector()
for(i in c(1:length(data$CC01))){
if(data$CC01[i] > data$CC02[i]){
num_05[j] <- i
j <- j+1
}
}
data <- data[-num_05,]
#Only differentiate is Taiwanese or not. 1 means Taiwanese.
for(i in c(1:length(data$CC01))){
if(data$CC03[i] < 3){
data$CC03[i] = 1
}
else{
data$CC03[i] = 0
}
}
#Careers
for(i in c(1:length(data$CC01))){
if(is.na(data$CC05[i])){
data$CC05[i] = 0
}
}
#Industries
for(i in c(1:length(data$CC01))){
if(is.na(data$CC06[i])){
data$CC06[i] = 0
}
}
for(i in c(1:length(data$CC01))){
if(data$CC06[i]==26){
data$CC06[i] = 99
}
}
# Shipping to post office by themselves (1 = yes; 0 = no)
#NA delete
j <- 1
num_06 <- vector()
for(i in c(1:length(data$CC01))){
if(is.na(data$CC12[i])){
num_06[j] <- i
j <- j+1
}
}
data <- data[-num_06,]
#trans 2 to 0
for(i in c(1:length(data$CC01))){
if(data$CC12[i] == 2){
data$CC12[i] <- 0
}
}
#Checkout method (1 = Bookkeeping ; 0 = payment on the spot)
#NA delete
j <- 1
num_06 <- vector()
for(i in c(1:length(data$CC01))){
if(is.na(data$CC14[i])){
num_06[j] <- i
j <- j+1
}
}
data <- data[-num_06,]
#delete 0
j <- 1
num_06 <- vector()
for(i in c(1:length(data$CC01))){
if(data$CC14[i] == 0){
num_06[j] <- i
j <- j+1
}
}
data <- data[-num_06,]
#trans 2 to 0
for(i in c(1:length(data$CC01))){
if(data$CC14[i] == 2){
data$CC14[i] <- 0
}
}
#Project bargaining (0 = No ;1 = yes).
# NA delete
j <- 1
num_06 <- vector()
for(i in c(1:length(data$CC01))){
if(is.na(data$CC23[i])){
num_06[j] <- i
j <- j+1
}
}
data <- data[-num_06,]
Since the 05 06 are the classification data for career and industry data. We will use the specific variable, so we extense the variables.
for(i in c(1:26)){
data[,28+i] <- 0
for(j in c(1:length(data$CC01))){
if(data$CC05[j]==i){
data[j,28+i]=1
}
}
}
colnames(data)[29:54] <- c("CC0501","CC0502","CC0503","CC0504","CC0505","CC0506","CC0507","CC0508","CC0509","CC0510","CC0511","CC0512","CC0513","CC0514","CC0515","CC0516","CC0517","CC0518","CC0519","CC0520","CC0521","CC0522","CC0523","CC0524","CC0525","CC0526")
data$CC0599 <- 0
for(i in c(1:length(data$CC01))){
if(data$CC05[i]==99){
data$CC0599[i] <- 1
}
}
for(i in c(1:15)){
data[,55+i] <- 0
for(j in c(1:length(data$CC01))){
if(data$CC06[j]==i){
data[j,55+i]=1
}
}
}
colnames(data)[56:70] <- c("CC0601","CC0602","CC0603","CC0604","CC0605","CC0606","CC0607","CC0608","CC0609","CC0610","CC0611","CC0612","CC0613","CC0614","CC0615")
data$CC0699 <- 0
for(i in c(1:length(data$CC01))){
if(data$CC06[i]==99){
data$CC0699[i] <- 1
}
}
I will need some special variables for bulid the cox model. So I trans the time data form, then construct new variable like “status(cencored or not)” , “survival object”.
library(survival)
#creat time data
data$CC02b <- data$CC02
for(i in c(1:length(data$CC01))){
if(data$CC02b[i] > 20190304){
data$CC02b[i] = 20190304
}
}
data$time <- ymd(data$CC02b)-ymd(data$CC01)
data$time1 <- as.numeric(data$time)
#creat status data : 1=censored; 2=dead
data$status <- 1
for(i in c(1:length(data$CC01))){
if(data$CC02b[i] != 20190304){
data$status[i] <- 2
}
}
## Add survival object. status == 2 is death
data$SurvObj <- with(data, Surv(time, status == 2))
# Useful data
data_cox <- data.frame(data[,c(3,12,14,23,29:71,73:76)])
Show the data_cox:
In this chapter, I will plot the cencering data,build the cox model and check whether it pass some test.
Plot the cencering data plot.
## Kaplan-Meier estimator. The "log-log" confidence interval is preferred.
km.as.one <- survfit(SurvObj ~ 1, data = data, conf.type = "log-log")
## Plotting without any specification
plot(km.as.one)
Use Log Rank Test to check which variables are significant effect for survival function.
library(coin)
logrank_test(SurvObj~as.factor(CC03),data_cox)
##
## Asymptotic Two-Sample Logrank Test
##
## data: SurvObj by as.factor(CC03) (0, 1)
## Z = 16.019, p-value < 2.2e-16
## alternative hypothesis: true theta is not equal to 1
logrank_test(SurvObj~as.factor(CC12),data_cox)
##
## Asymptotic Two-Sample Logrank Test
##
## data: SurvObj by as.factor(CC12) (0, 1)
## Z = 14.067, p-value < 2.2e-16
## alternative hypothesis: true theta is not equal to 1
logrank_test(SurvObj~as.factor(CC14),data_cox)
##
## Asymptotic Two-Sample Logrank Test
##
## data: SurvObj by as.factor(CC14) (0, 1)
## Z = -17.224, p-value < 2.2e-16
## alternative hypothesis: true theta is not equal to 1
logrank_test(SurvObj~as.factor(CC23),data_cox)
##
## Asymptotic Two-Sample Logrank Test
##
## data: SurvObj by as.factor(CC23) (0, 1)
## Z = 87.462, p-value < 2.2e-16
## alternative hypothesis: true theta is not equal to 1
It is clearly that four variables are significant effect for contract length.
# Fit Cox regression: age, sex, Karnofsky performance score, wt loss
res.cox1 <- coxph(SurvObj ~ .-time-time1-status, data = data_cox)
summary(res.cox1)
## Call:
## coxph(formula = SurvObj ~ . - time - time1 - status, data = data_cox)
##
## n= 45440, number of events= 1044
##
## coef exp(coef) se(coef) z Pr(>|z|)
## CC03 9.440e-01 2.570e+00 1.587e-01 5.949 2.70e-09 ***
## CC12 -1.457e-01 8.644e-01 7.232e-02 -2.015 0.0439 *
## CC14 -4.638e-01 6.289e-01 7.167e-02 -6.472 9.68e-11 ***
## CC23 5.983e+00 3.965e+02 2.616e-01 22.865 < 2e-16 ***
## CC0501 1.267e+01 3.166e+05 1.045e+03 0.012 0.9903
## CC0502 1.194e+01 1.532e+05 1.045e+03 0.011 0.9909
## CC0503 1.221e+01 2.009e+05 1.045e+03 0.012 0.9907
## CC0504 1.181e+01 1.341e+05 1.045e+03 0.011 0.9910
## CC0505 1.137e+01 8.678e+04 1.045e+03 0.011 0.9913
## CC0506 1.242e+01 2.470e+05 1.045e+03 0.012 0.9905
## CC0507 1.160e+01 1.091e+05 1.045e+03 0.011 0.9911
## CC0508 1.247e+01 2.614e+05 1.045e+03 0.012 0.9905
## CC0509 1.089e+01 5.363e+04 1.045e+03 0.010 0.9917
## CC0510 -4.513e+00 1.097e-02 4.686e+03 -0.001 0.9992
## CC0511 1.234e+01 2.287e+05 1.045e+03 0.012 0.9906
## CC0512 -4.330e+00 1.317e-02 2.425e+03 -0.002 0.9986
## CC0513 1.109e+01 6.576e+04 1.045e+03 0.011 0.9915
## CC0514 1.075e+01 4.642e+04 1.045e+03 0.010 0.9918
## CC0515 1.170e+01 1.204e+05 1.045e+03 0.011 0.9911
## CC0516 1.232e+01 2.248e+05 1.045e+03 0.012 0.9906
## CC0517 1.042e+01 3.360e+04 1.045e+03 0.010 0.9920
## CC0518 -7.024e+00 8.903e-04 1.533e+05 0.000 1.0000
## CC0519 1.111e+01 6.658e+04 1.045e+03 0.011 0.9915
## CC0520 -4.420e+00 1.204e-02 5.134e+03 -0.001 0.9993
## CC0521 -6.888e+00 1.020e-03 2.606e+05 0.000 1.0000
## CC0522 NA NA 0.000e+00 NA NA
## CC0523 1.388e+01 1.064e+06 1.045e+03 0.013 0.9894
## CC0524 1.138e+01 8.712e+04 1.045e+03 0.011 0.9913
## CC0525 1.162e+01 1.109e+05 1.045e+03 0.011 0.9911
## CC0526 1.259e+01 2.940e+05 1.045e+03 0.012 0.9904
## CC0599 1.180e+01 1.334e+05 1.045e+03 0.011 0.9910
## CC0601 1.253e+01 2.776e+05 9.804e+02 0.013 0.9898
## CC0602 -4.832e+00 7.968e-03 6.974e+04 0.000 0.9999
## CC0603 1.392e+01 1.108e+06 9.804e+02 0.014 0.9887
## CC0604 1.324e+01 5.609e+05 9.804e+02 0.014 0.9892
## CC0605 1.302e+01 4.533e+05 9.804e+02 0.013 0.9894
## CC0606 1.368e+01 8.691e+05 9.804e+02 0.014 0.9889
## CC0607 1.414e+01 1.389e+06 9.804e+02 0.014 0.9885
## CC0608 1.295e+01 4.208e+05 9.804e+02 0.013 0.9895
## CC0609 1.385e+01 1.035e+06 9.804e+02 0.014 0.9887
## CC0610 1.213e+01 1.860e+05 9.804e+02 0.012 0.9901
## CC0611 1.408e+01 1.304e+06 9.804e+02 0.014 0.9885
## CC0612 1.394e+01 1.134e+06 9.804e+02 0.014 0.9887
## CC0613 1.317e+01 5.251e+05 9.804e+02 0.013 0.9893
## CC0614 1.212e+01 1.834e+05 9.804e+02 0.012 0.9901
## CC0615 1.391e+01 1.094e+06 9.804e+02 0.014 0.9887
## CC0699 1.409e+01 1.321e+06 9.804e+02 0.014 0.9885
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## CC03 2.570e+00 3.891e-01 1.8832 3.5079
## CC12 8.644e-01 1.157e+00 0.7501 0.9960
## CC14 6.289e-01 1.590e+00 0.5465 0.7237
## CC23 3.965e+02 2.522e-03 237.4049 662.0974
## CC0501 3.166e+05 3.159e-06 0.0000 Inf
## CC0502 1.532e+05 6.527e-06 0.0000 Inf
## CC0503 2.009e+05 4.977e-06 0.0000 Inf
## CC0504 1.341e+05 7.457e-06 0.0000 Inf
## CC0505 8.678e+04 1.152e-05 0.0000 Inf
## CC0506 2.470e+05 4.049e-06 0.0000 Inf
## CC0507 1.091e+05 9.163e-06 0.0000 Inf
## CC0508 2.614e+05 3.826e-06 0.0000 Inf
## CC0509 5.363e+04 1.865e-05 0.0000 Inf
## CC0510 1.097e-02 9.117e+01 0.0000 Inf
## CC0511 2.287e+05 4.373e-06 0.0000 Inf
## CC0512 1.317e-02 7.591e+01 0.0000 Inf
## CC0513 6.576e+04 1.521e-05 0.0000 Inf
## CC0514 4.642e+04 2.154e-05 0.0000 Inf
## CC0515 1.204e+05 8.308e-06 0.0000 Inf
## CC0516 2.248e+05 4.448e-06 0.0000 Inf
## CC0517 3.360e+04 2.976e-05 0.0000 Inf
## CC0518 8.903e-04 1.123e+03 0.0000 Inf
## CC0519 6.658e+04 1.502e-05 0.0000 Inf
## CC0520 1.204e-02 8.309e+01 0.0000 Inf
## CC0521 1.020e-03 9.804e+02 0.0000 Inf
## CC0522 NA NA NA NA
## CC0523 1.064e+06 9.401e-07 0.0000 Inf
## CC0524 8.712e+04 1.148e-05 0.0000 Inf
## CC0525 1.109e+05 9.014e-06 0.0000 Inf
## CC0526 2.940e+05 3.402e-06 0.0000 Inf
## CC0599 1.334e+05 7.496e-06 0.0000 Inf
## CC0601 2.776e+05 3.602e-06 0.0000 Inf
## CC0602 7.968e-03 1.255e+02 0.0000 Inf
## CC0603 1.108e+06 9.023e-07 0.0000 Inf
## CC0604 5.609e+05 1.783e-06 0.0000 Inf
## CC0605 4.533e+05 2.206e-06 0.0000 Inf
## CC0606 8.691e+05 1.151e-06 0.0000 Inf
## CC0607 1.389e+06 7.201e-07 0.0000 Inf
## CC0608 4.208e+05 2.376e-06 0.0000 Inf
## CC0609 1.035e+06 9.663e-07 0.0000 Inf
## CC0610 1.860e+05 5.377e-06 0.0000 Inf
## CC0611 1.304e+06 7.667e-07 0.0000 Inf
## CC0612 1.134e+06 8.821e-07 0.0000 Inf
## CC0613 5.251e+05 1.904e-06 0.0000 Inf
## CC0614 1.834e+05 5.453e-06 0.0000 Inf
## CC0615 1.094e+06 9.138e-07 0.0000 Inf
## CC0699 1.321e+06 7.570e-07 0.0000 Inf
##
## Concordance= 0.959 (se = 0.002 )
## Likelihood ratio test= 4756 on 46 df, p=<2e-16
## Wald test = 53.5 on 46 df, p=0.2
## Score (logrank) test = 8753 on 46 df, p=<2e-16
num <- vector()
j = 1
for(i in c(1:length(data_cox$CC03))){
if(data_cox$CC23[i] == 0){
num[j] <- i
j <- j+1
}
}
data_00 <- data[num,]
data_01 <- data[-num,]
library(My.stepwise)
my.variable.list <- c("CC03", "CC12", "CC14", "CC0504","CC0508","CC0511","CC0516","CC0608","CC0610")
#My.stepwise.coxph(Time = "time", Status = "status", variable.list = my.variable.list,
#data = data_00, sle = 0.25, sls = 0.25)
res.cox <- coxph(SurvObj ~ CC23, data = data_cox)
summary(res.cox)
## Call:
## coxph(formula = SurvObj ~ CC23, data = data_cox)
##
## n= 45440, number of events= 1044
##
## coef exp(coef) se(coef) z Pr(>|z|)
## CC23 6.3443 569.2620 0.2601 24.39 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## CC23 569.3 0.001757 341.9 947.8
##
## Concordance= 0.94 (se = 0.002 )
## Likelihood ratio test= 4429 on 1 df, p=<2e-16
## Wald test = 595 on 1 df, p=<2e-16
## Score (logrank) test = 8339 on 1 df, p=<2e-16
This result shows that the CC23 is the key for contrat length.
# Check for violation of proportional hazard (constant HR over time)
(res.zph1 <- cox.zph(res.cox))
## rho chisq p
## CC23 0.0564 3.32 0.0684
It under the \(H_0\), means this Cox model conform the hypothesis of proportional hazard.
The CC14 is a variable which shows whether user shipping to post office by themselves or not. This means if user shows 0 in CC14, it will lead office to cost more money on transport.
#2:8 data cut
set.seed(1)
set = sample(1:nrow(data_cox),0.8*nrow(data_cox))
train = data_cox[set,]
test = data_cox[-set,]
#logistic regression
model1<-glm(formula=CC12~.-time-time1-SurvObj-status,data=train,family=binomial(link="logit"),na.action=na.exclude) #Let Logistic model assign to model1
#summary(model1)
pred = predict(model1,newdata = test)
confus.matrix <- table(test$CC12,pred)
sum(diag(confus.matrix))/sum(confus.matrix)
## [1] 0.01364437
Under the full logistic model, the accuracy is tragic.
library(rpart)
library(rpart.plot)
data1 <- data_cox[,-(48:51)]
#2:8 data cut
set.seed(1)
set = sample(1:nrow(data1),0.8*nrow(data1))
train = data1[set,]
test = data1[-set,]
tree.model<- rpart(CC12~.,data1)
prp(tree.model,faclen=0,fallen.leaves=TRUE)
pred = predict(tree.model,newdata = test)
confus.matrix <- table(test$CC12,pred)
sum(diag(confus.matrix))/sum(confus.matrix)
## [1] 0.4489437
In the Decision Tree model , we have the accuracy: 0.4489437.