This contest hold by Taiwan Post Co.
All contestants will get 4 data bases. Then they can choose one to analysis. I choose the contract data this time. It contain the contract start time and end time data and other detail information.
There is the variable list： Alt text Data size : 46365 obs of 28 variables.
I will use survival analysis to know which variables will effect the contract length.
And the variable CC12 shows whether the user shipping to post office by themselves or not.
I will use the classificate method to analysis it.

1.Load data

Import the data and label them at first.

#read the data
data <- read.csv("CC (1).csv",header = F)
colnames(data) <- c("CC01","CC02","CC03","CC04","CC05","CC06","CC07","CC08","CC09","CC10","CC11","CC12","CC13","CC14","CC15","CC16","C17","CC18","CC19","CC20","CC21","CC22","CC23","CC24","CC25","CC26","CC27","CC28")

2.Data setting

I choose the useful variables, cleaning the data then add some new variables for analysis.

2.1 Data Cleaning

This data contain too many NAs. So I only clean the important variables. I reserved: 01 02 03 05 12 14 23.

CC01 CC02

library(tidyverse)
#fix the CC01 wrong data
data$CC01[777] <- 910222
data$CC01[2220] <- 931008
data$CC01[9516] <- 890913
data$CC01[11433] <- 971202
data$CC01[15187] <- 890913
data$CC01[16453] <- 930710
data$CC01[17097] <- 971202
data$CC01[18799] <- 031007
data$CC01[29188] <- 950204
data$CC01[29812] <- 910828
data$CC01[31592] <- 920227
data$CC01[32302] <- 940701
data$CC01[45390] <- 931006


#CC02 trans the 10101 to 9991231
for(i in c(1:length(data$CC02))){
  if(data$CC02[i] == 10101){
    data$CC02[i] <- 9991231
  }
}
#trans date to AD date form
data$CC01 <- data$CC01+19110000
data$CC02 <- data$CC02+19110000


library(lubridate)
#Delete the data before 1999-01-02
j <- 1
num_03 <- vector()
for(i in c(1:length(data$CC01))){
  if(data$CC01[i] < 19910102){
    num_03[j] <- i
    j <- j+1
  }
}
data <- data[-num_03,]

#Delete the L>R data 
j <- 1
num_05 <- vector()
for(i in c(1:length(data$CC01))){
  if(data$CC01[i] > data$CC02[i]){
    num_05[j] <- i
    j <- j+1
  }
}
data <- data[-num_05,]

CC03

#Only differentiate is Taiwanese or not. 1 means Taiwanese.
for(i in c(1:length(data$CC01))){
  if(data$CC03[i] < 3){
    data$CC03[i] = 1
  }
  else{
    data$CC03[i] = 0
  }
}

CC05

#Careers
for(i in c(1:length(data$CC01))){
  if(is.na(data$CC05[i])){
     data$CC05[i] = 0
  }
}

CC06

#Industries
for(i in c(1:length(data$CC01))){
  if(is.na(data$CC06[i])){
     data$CC06[i] = 0
  }
}
for(i in c(1:length(data$CC01))){
  if(data$CC06[i]==26){
     data$CC06[i] = 99
  }
}

CC12

# Shipping to post office by themselves (1 = yes; 0 = no)
#NA delete
j <- 1
num_06 <- vector()
for(i in c(1:length(data$CC01))){
  if(is.na(data$CC12[i])){
    num_06[j] <- i
    j <- j+1
  }
}
data <- data[-num_06,]

#trans 2 to 0
for(i in c(1:length(data$CC01))){
  if(data$CC12[i] == 2){
    data$CC12[i] <- 0
  }
}

CC14

#Checkout method (1 = Bookkeeping ; 0 = payment on the spot)
#NA delete
j <- 1
num_06 <- vector()
for(i in c(1:length(data$CC01))){
  if(is.na(data$CC14[i])){
    num_06[j] <- i
    j <- j+1
  }
}
data <- data[-num_06,]

#delete 0
j <- 1
num_06 <- vector()
for(i in c(1:length(data$CC01))){
  if(data$CC14[i] == 0){
    num_06[j] <- i
    j <- j+1
  }
}
data <- data[-num_06,]

#trans 2 to 0
for(i in c(1:length(data$CC01))){
  if(data$CC14[i] == 2){
     data$CC14[i] <-  0
  }
}

CC23

#Project bargaining (0 = No ;1 = yes).
# NA delete
j <- 1
num_06 <- vector()
for(i in c(1:length(data$CC01))){
  if(is.na(data$CC23[i])){
    num_06[j] <- i
    j <- j+1
  }
}
data <- data[-num_06,]

2.2 variable extension

Since the 05 06 are the classification data for career and industry data. We will use the specific variable, so we extense the variables.

cc05 extension

for(i in c(1:26)){
  data[,28+i] <- 0
  for(j in c(1:length(data$CC01))){
    if(data$CC05[j]==i){
    data[j,28+i]=1
    }
  }
  
}
colnames(data)[29:54] <- c("CC0501","CC0502","CC0503","CC0504","CC0505","CC0506","CC0507","CC0508","CC0509","CC0510","CC0511","CC0512","CC0513","CC0514","CC0515","CC0516","CC0517","CC0518","CC0519","CC0520","CC0521","CC0522","CC0523","CC0524","CC0525","CC0526")
data$CC0599 <- 0
for(i in c(1:length(data$CC01))){
  if(data$CC05[i]==99){
    data$CC0599[i] <- 1
  }
}

cc06 extension

for(i in c(1:15)){
  data[,55+i] <- 0
  for(j in c(1:length(data$CC01))){
    if(data$CC06[j]==i){
    data[j,55+i]=1
    }
  }
  
}
colnames(data)[56:70] <- c("CC0601","CC0602","CC0603","CC0604","CC0605","CC0606","CC0607","CC0608","CC0609","CC0610","CC0611","CC0612","CC0613","CC0614","CC0615")
data$CC0699 <- 0
for(i in c(1:length(data$CC01))){
  if(data$CC06[i]==99){
    data$CC0699[i] <- 1
  }
}

2.3 Create a survival object

I will need some special variables for bulid the cox model. So I trans the time data form, then construct new variable like “status(cencored or not)” , “survival object”.

library(survival)
#creat time data
data$CC02b <- data$CC02

for(i in c(1:length(data$CC01))){
  if(data$CC02b[i] > 20190304){
    data$CC02b[i] = 20190304
  }
}

data$time <- ymd(data$CC02b)-ymd(data$CC01)
data$time1 <- as.numeric(data$time)

#creat status data : 1=censored; 2=dead
data$status <- 1
for(i in c(1:length(data$CC01))){
  if(data$CC02b[i] != 20190304){
    data$status[i] <-  2
  }
}

## Add survival object. status == 2 is death
data$SurvObj <- with(data, Surv(time, status == 2))

# Useful data
data_cox <- data.frame(data[,c(3,12,14,23,29:71,73:76)])

Show the data_cox: Alt text

3. Survival Analysis

In this chapter, I will plot the cencering data,build the cox model and check whether it pass some test.

3.1 Kaplan-Meier estimator

Plot the cencering data plot.

## Kaplan-Meier estimator. The "log-log" confidence interval is preferred.
km.as.one <- survfit(SurvObj ~ 1, data = data, conf.type = "log-log")

## Plotting without any specification
plot(km.as.one)

3.2 Log rank test

Use Log Rank Test to check which variables are significant effect for survival function.

library(coin)
logrank_test(SurvObj~as.factor(CC03),data_cox)

## 
##  Asymptotic Two-Sample Logrank Test
## 
## data:  SurvObj by as.factor(CC03) (0, 1)
## Z = 16.019, p-value < 2.2e-16
## alternative hypothesis: true theta is not equal to 1

logrank_test(SurvObj~as.factor(CC12),data_cox)

## 
##  Asymptotic Two-Sample Logrank Test
## 
## data:  SurvObj by as.factor(CC12) (0, 1)
## Z = 14.067, p-value < 2.2e-16
## alternative hypothesis: true theta is not equal to 1

logrank_test(SurvObj~as.factor(CC14),data_cox)

## 
##  Asymptotic Two-Sample Logrank Test
## 
## data:  SurvObj by as.factor(CC14) (0, 1)
## Z = -17.224, p-value < 2.2e-16
## alternative hypothesis: true theta is not equal to 1

logrank_test(SurvObj~as.factor(CC23),data_cox)

## 
##  Asymptotic Two-Sample Logrank Test
## 
## data:  SurvObj by as.factor(CC23) (0, 1)
## Z = 87.462, p-value < 2.2e-16
## alternative hypothesis: true theta is not equal to 1

It is clearly that four variables are significant effect for contract length.

3.3 Cox Model

3.3.1 Fit the full Cox regression

# Fit Cox regression: age, sex, Karnofsky performance score, wt loss
res.cox1 <- coxph(SurvObj ~ .-time-time1-status, data =  data_cox)
summary(res.cox1)

## Call:
## coxph(formula = SurvObj ~ . - time - time1 - status, data = data_cox)
## 
##   n= 45440, number of events= 1044 
## 
##              coef  exp(coef)   se(coef)      z Pr(>|z|)    
## CC03    9.440e-01  2.570e+00  1.587e-01  5.949 2.70e-09 ***
## CC12   -1.457e-01  8.644e-01  7.232e-02 -2.015   0.0439 *  
## CC14   -4.638e-01  6.289e-01  7.167e-02 -6.472 9.68e-11 ***
## CC23    5.983e+00  3.965e+02  2.616e-01 22.865  < 2e-16 ***
## CC0501  1.267e+01  3.166e+05  1.045e+03  0.012   0.9903    
## CC0502  1.194e+01  1.532e+05  1.045e+03  0.011   0.9909    
## CC0503  1.221e+01  2.009e+05  1.045e+03  0.012   0.9907    
## CC0504  1.181e+01  1.341e+05  1.045e+03  0.011   0.9910    
## CC0505  1.137e+01  8.678e+04  1.045e+03  0.011   0.9913    
## CC0506  1.242e+01  2.470e+05  1.045e+03  0.012   0.9905    
## CC0507  1.160e+01  1.091e+05  1.045e+03  0.011   0.9911    
## CC0508  1.247e+01  2.614e+05  1.045e+03  0.012   0.9905    
## CC0509  1.089e+01  5.363e+04  1.045e+03  0.010   0.9917    
## CC0510 -4.513e+00  1.097e-02  4.686e+03 -0.001   0.9992    
## CC0511  1.234e+01  2.287e+05  1.045e+03  0.012   0.9906    
## CC0512 -4.330e+00  1.317e-02  2.425e+03 -0.002   0.9986    
## CC0513  1.109e+01  6.576e+04  1.045e+03  0.011   0.9915    
## CC0514  1.075e+01  4.642e+04  1.045e+03  0.010   0.9918    
## CC0515  1.170e+01  1.204e+05  1.045e+03  0.011   0.9911    
## CC0516  1.232e+01  2.248e+05  1.045e+03  0.012   0.9906    
## CC0517  1.042e+01  3.360e+04  1.045e+03  0.010   0.9920    
## CC0518 -7.024e+00  8.903e-04  1.533e+05  0.000   1.0000    
## CC0519  1.111e+01  6.658e+04  1.045e+03  0.011   0.9915    
## CC0520 -4.420e+00  1.204e-02  5.134e+03 -0.001   0.9993    
## CC0521 -6.888e+00  1.020e-03  2.606e+05  0.000   1.0000    
## CC0522         NA         NA  0.000e+00     NA       NA    
## CC0523  1.388e+01  1.064e+06  1.045e+03  0.013   0.9894    
## CC0524  1.138e+01  8.712e+04  1.045e+03  0.011   0.9913    
## CC0525  1.162e+01  1.109e+05  1.045e+03  0.011   0.9911    
## CC0526  1.259e+01  2.940e+05  1.045e+03  0.012   0.9904    
## CC0599  1.180e+01  1.334e+05  1.045e+03  0.011   0.9910    
## CC0601  1.253e+01  2.776e+05  9.804e+02  0.013   0.9898    
## CC0602 -4.832e+00  7.968e-03  6.974e+04  0.000   0.9999    
## CC0603  1.392e+01  1.108e+06  9.804e+02  0.014   0.9887    
## CC0604  1.324e+01  5.609e+05  9.804e+02  0.014   0.9892    
## CC0605  1.302e+01  4.533e+05  9.804e+02  0.013   0.9894    
## CC0606  1.368e+01  8.691e+05  9.804e+02  0.014   0.9889    
## CC0607  1.414e+01  1.389e+06  9.804e+02  0.014   0.9885    
## CC0608  1.295e+01  4.208e+05  9.804e+02  0.013   0.9895    
## CC0609  1.385e+01  1.035e+06  9.804e+02  0.014   0.9887    
## CC0610  1.213e+01  1.860e+05  9.804e+02  0.012   0.9901    
## CC0611  1.408e+01  1.304e+06  9.804e+02  0.014   0.9885    
## CC0612  1.394e+01  1.134e+06  9.804e+02  0.014   0.9887    
## CC0613  1.317e+01  5.251e+05  9.804e+02  0.013   0.9893    
## CC0614  1.212e+01  1.834e+05  9.804e+02  0.012   0.9901    
## CC0615  1.391e+01  1.094e+06  9.804e+02  0.014   0.9887    
## CC0699  1.409e+01  1.321e+06  9.804e+02  0.014   0.9885    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##        exp(coef) exp(-coef) lower .95 upper .95
## CC03   2.570e+00  3.891e-01    1.8832    3.5079
## CC12   8.644e-01  1.157e+00    0.7501    0.9960
## CC14   6.289e-01  1.590e+00    0.5465    0.7237
## CC23   3.965e+02  2.522e-03  237.4049  662.0974
## CC0501 3.166e+05  3.159e-06    0.0000       Inf
## CC0502 1.532e+05  6.527e-06    0.0000       Inf
## CC0503 2.009e+05  4.977e-06    0.0000       Inf
## CC0504 1.341e+05  7.457e-06    0.0000       Inf
## CC0505 8.678e+04  1.152e-05    0.0000       Inf
## CC0506 2.470e+05  4.049e-06    0.0000       Inf
## CC0507 1.091e+05  9.163e-06    0.0000       Inf
## CC0508 2.614e+05  3.826e-06    0.0000       Inf
## CC0509 5.363e+04  1.865e-05    0.0000       Inf
## CC0510 1.097e-02  9.117e+01    0.0000       Inf
## CC0511 2.287e+05  4.373e-06    0.0000       Inf
## CC0512 1.317e-02  7.591e+01    0.0000       Inf
## CC0513 6.576e+04  1.521e-05    0.0000       Inf
## CC0514 4.642e+04  2.154e-05    0.0000       Inf
## CC0515 1.204e+05  8.308e-06    0.0000       Inf
## CC0516 2.248e+05  4.448e-06    0.0000       Inf
## CC0517 3.360e+04  2.976e-05    0.0000       Inf
## CC0518 8.903e-04  1.123e+03    0.0000       Inf
## CC0519 6.658e+04  1.502e-05    0.0000       Inf
## CC0520 1.204e-02  8.309e+01    0.0000       Inf
## CC0521 1.020e-03  9.804e+02    0.0000       Inf
## CC0522        NA         NA        NA        NA
## CC0523 1.064e+06  9.401e-07    0.0000       Inf
## CC0524 8.712e+04  1.148e-05    0.0000       Inf
## CC0525 1.109e+05  9.014e-06    0.0000       Inf
## CC0526 2.940e+05  3.402e-06    0.0000       Inf
## CC0599 1.334e+05  7.496e-06    0.0000       Inf
## CC0601 2.776e+05  3.602e-06    0.0000       Inf
## CC0602 7.968e-03  1.255e+02    0.0000       Inf
## CC0603 1.108e+06  9.023e-07    0.0000       Inf
## CC0604 5.609e+05  1.783e-06    0.0000       Inf
## CC0605 4.533e+05  2.206e-06    0.0000       Inf
## CC0606 8.691e+05  1.151e-06    0.0000       Inf
## CC0607 1.389e+06  7.201e-07    0.0000       Inf
## CC0608 4.208e+05  2.376e-06    0.0000       Inf
## CC0609 1.035e+06  9.663e-07    0.0000       Inf
## CC0610 1.860e+05  5.377e-06    0.0000       Inf
## CC0611 1.304e+06  7.667e-07    0.0000       Inf
## CC0612 1.134e+06  8.821e-07    0.0000       Inf
## CC0613 5.251e+05  1.904e-06    0.0000       Inf
## CC0614 1.834e+05  5.453e-06    0.0000       Inf
## CC0615 1.094e+06  9.138e-07    0.0000       Inf
## CC0699 1.321e+06  7.570e-07    0.0000       Inf
## 
## Concordance= 0.959  (se = 0.002 )
## Likelihood ratio test= 4756  on 46 df,   p=<2e-16
## Wald test            = 53.5  on 46 df,   p=0.2
## Score (logrank) test = 8753  on 46 df,   p=<2e-16

3.3.2 Cox stepwise

num <- vector()
j = 1
for(i in c(1:length(data_cox$CC03))){
  if(data_cox$CC23[i] == 0){
  num[j] <- i
  j <- j+1
  }
}
data_00 <- data[num,]
data_01 <- data[-num,]
library(My.stepwise)
my.variable.list <- c("CC03", "CC12", "CC14", "CC0504","CC0508","CC0511","CC0516","CC0608","CC0610")
#My.stepwise.coxph(Time = "time", Status = "status", variable.list = my.variable.list,
#data = data_00, sle = 0.25, sls = 0.25)
res.cox <- coxph(SurvObj ~ CC23, data =  data_cox)
summary(res.cox)

## Call:
## coxph(formula = SurvObj ~ CC23, data = data_cox)
## 
##   n= 45440, number of events= 1044 
## 
##          coef exp(coef) se(coef)     z Pr(>|z|)    
## CC23   6.3443  569.2620   0.2601 24.39   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##      exp(coef) exp(-coef) lower .95 upper .95
## CC23     569.3   0.001757     341.9     947.8
## 
## Concordance= 0.94  (se = 0.002 )
## Likelihood ratio test= 4429  on 1 df,   p=<2e-16
## Wald test            = 595  on 1 df,   p=<2e-16
## Score (logrank) test = 8339  on 1 df,   p=<2e-16

This result shows that the CC23 is the key for contrat length.

3.3.3 Check hypothesis

# Check for violation of proportional hazard (constant HR over time)
(res.zph1 <- cox.zph(res.cox))

##         rho chisq      p
## CC23 0.0564  3.32 0.0684

It under the \(H_0\), means this Cox model conform the hypothesis of proportional hazard.

4 Classification Analysis

The CC14 is a variable which shows whether user shipping to post office by themselves or not. This means if user shows 0 in CC14, it will lead office to cost more money on transport.

4.1 Logistic refression

#2:8 data cut
set.seed(1)
set = sample(1:nrow(data_cox),0.8*nrow(data_cox))
train = data_cox[set,]
test = data_cox[-set,]

#logistic regression
model1<-glm(formula=CC12~.-time-time1-SurvObj-status,data=train,family=binomial(link="logit"),na.action=na.exclude) #Let Logistic model assign to model1

#summary(model1)
pred = predict(model1,newdata = test)

confus.matrix <- table(test$CC12,pred)
sum(diag(confus.matrix))/sum(confus.matrix)

## [1] 0.01364437

Under the full logistic model, the accuracy is tragic.

4.2 Decision Tree

library(rpart)
library(rpart.plot)
data1 <- data_cox[,-(48:51)]

#2:8 data cut
set.seed(1)
set = sample(1:nrow(data1),0.8*nrow(data1))
train = data1[set,]
test = data1[-set,]

tree.model<- rpart(CC12~.,data1)

prp(tree.model,faclen=0,fallen.leaves=TRUE)

pred = predict(tree.model,newdata = test)

confus.matrix <- table(test$CC12,pred)
sum(diag(confus.matrix))/sum(confus.matrix)

## [1] 0.4489437

In the Decision Tree model , we have the accuracy: 0.4489437.

The Taiwan Post Big Data Contest