1.Set the WD and read data# (1)Read data

setwd("~/Documents/MSXF/产品业务分析/逾期率分析/data/markov_chain")
roll_rate = read.csv("20160524_rollrate.csv",stringsAsFactors=F)
abnorm_contract = read.table("abnorm_contract",stringsAsFactors=F)

(2)delete the abnorm contract number

roll_rate = roll_rate[!(roll_rate[,"no_contract"] %in% abnorm_contract[,1]),]
rm(abnorm_contract)

(3)Make the type right####

roll_rate[,"checking_time"] = as.Date(roll_rate[,"checking_time"])
#roll_rate[,"first_stmt_date"] = as.Date(roll_rate[,"first_stmt_date"])
roll_rate[,"loan_code"]  = as.character(roll_rate[,"loan_code"])

(4)Null will be replaced by NA, Replace NA with 0

roll_rate[,"term"] = as.numeric(roll_rate[,"term"])
roll_rate[,"dpd_curr"] = as.numeric(roll_rate[,"dpd_curr"])
roll_rate[,"dpd_curr"][is.na(roll_rate[,"dpd_curr"])] = 0
roll_rate[,"max_dpd_hist"] = as.numeric(roll_rate[,"max_dpd_hist"])
roll_rate[,"max_dpd_hist"][is.na(roll_rate[,"max_dpd_hist"])] = 0

2.Add two colunmn - status and status1###

##(1)Add status column represents the current status -- status
roll_rate[,"status"] = rep(NA,nrow(roll_rate))
roll_rate[32> roll_rate[,"dpd_curr"] &  roll_rate[,"dpd_curr"]>=10,"status"] = "M1"
roll_rate[62> roll_rate[,"dpd_curr"] &  roll_rate[,"dpd_curr"]>=32,"status"] = "M2"
roll_rate[91> roll_rate[,"dpd_curr"] &  roll_rate[,"dpd_curr"]>=62,"status"] = "M3"
roll_rate[roll_rate[,"dpd_curr"]>=91,"status"] = "M3+"
roll_rate[roll_rate[,"dpd_curr"]<10,"status"] = "M0"
unique(roll_rate[,"status"])
## [1] "M0"  "M3"  "M1"  "M2"  "M3+"
table(roll_rate[,"status"])
## 
##     M0     M1     M2     M3    M3+ 
## 279125  20099   6181   3571   5902

(2)Add status column represents the next status – status1

roll_rate[,"status1"] = roll_rate[,"status"]
roll_rate[,"term1"] = roll_rate[,"term"]-1

(3)Left join and make the status1 is the next status

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(lubridate)
roll_rate = left_join(roll_rate[,1:12],roll_rate[,c(1,13,14)],by=c("no_contract"="no_contract","term"="term1"))

get data function

get_data_v = function(data,loan_code,vintage){
  data = data[which(data$loan_code==loan_code &data$vintage==vintage),]
  return(data)
}
#
head(get_data_v(roll_rate,1101,4))
##           no_contract checking_time     first_stmt_date       apply_no
## 206  2015510725007643    2016-05-23 2016-01-27 09:20:42 20151226444221
## 280  2015522725007652    2016-05-23 2016-01-27 11:06:51 20151227444925
## 535  2015220524007649    2016-05-23 2016-01-27 10:25:07 20151226444397
## 683  2015612427007647    2016-05-23 2016-01-27 10:12:59 20151226444317
## 1299 2015130602007645    2016-05-23 2016-01-27 09:35:09 20151219424810
## 1300 2016513101007721    2016-05-23 2016-02-03 09:14:57 20160102459222
##      loan_code vintage term system dpd_curr max_dpd_hist total_repay
## 206       1101       4    0   core       -1           -1        0.00
## 280       1101       4    4   core        0            0     2880.64
## 535       1101       4    3   core        1            1     1815.54
## 683       1101       4    0   core       -1           -1        0.00
## 1299      1101       4    1   core        2            2      754.11
## 1300      1101       4    0   core        0            0     3000.15
##      status status1
## 206      M0      M0
## 280      M0    <NA>
## 535      M0      M0
## 683      M0      M0
## 1299     M0      M0
## 1300     M0      M0

4.Calculate the overdue rate of all customers (term0-term4) 4.1 term0 - term4

ma_term = function(data,term){
  ma1 = table(data$status[which(data$term==term)],data$status1[which(data$term==term)])
  return(ma1)
}
ma_term(data = get_data_v(roll_rate,1101,4),term=0)
##     
##        M0   M1
##   M0 2103  379

Markov Function

Notice If the month changes, the alpha might change

mac_chain = function(data,loan_code,vintage,alpha){
  data = get_data_v(data,loan_code,vintage)
  if(vintage<4){
    print("vintage must greater than 3")
    break
  }else{
    #data
    data = data[which(data$vintage==vintage ),]
    #Alpha 
    alpha = alpha
    #Prepare F1#
    F1 = matrix(c(0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0),byrow=T,ncol=5,nrow=4,
                dimnames=list(c("M0","M1","M2","M3"),c("M0","M1","M2","M3","M3+")))
    #Prepare F2#
    F2 = matrix(c(0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0),byrow=T,ncol=5,nrow=4,
                dimnames=list(c("M0","M1","M2","M3"),c("M0","M1","M2","M3","M3+")))
    #Prepare F3#
    F3 = matrix(c(0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1),byrow=T,ncol=5,nrow=4,
                dimnames=list(c("M0","M1","M2","M3"),c("M0","M1","M2","M3","M3+")))
    ###Term:1###
    T1_6 = as.data.frame.matrix(ma_term(data,1)/rowSums(ma_term(data,1)))
    T1_6[,c("M3","M3+")] = 0 ; T1_6[c("M2","M3"),] = 0
    ###Term:2###
    T2_6 = as.data.frame.matrix(ma_term(data,2)/rowSums(ma_term(data,2)))
    T2_6[,"M3+"] = 0; T2_6[c("M3"),] = 0
    ###Term:3###
    T3_6 = as.data.frame.matrix(ma_term(data,3)/rowSums(ma_term(data,3)))
    T3_6 = T3_6[-5,];T3_6
    
    ######AVG(T1,T2,T3) for the 3 terms######
    T_com = (T1_6+T2_6+T3_6)/3
    ######AVG(F1,F2,F3) for the 3######
    F_com = (F1+F2+F3)/3
    
    ###calculate the M###
    M_com = (T_com-alpha*F_com)/(1-alpha); M_com ; M_com["M3+",] = c(0,0,0,0,1)
    ##Standardize##
    M_com = M_com / rowSums(M_com)
    M_com = as.matrix(M_com)}
  return(M_com)
}
#
mac_chain(roll_rate,1101,4,0.2397)
##            M0          M1         M2        M3      M3+
## M0  0.9947963 0.005203658 0.00000000 0.0000000 0.000000
## M1  0.3151146 0.291170731 0.39371467 0.0000000 0.000000
## M2  0.1169567 0.074708049 0.04259217 0.7657430 0.000000
## M3  0.1171131 0.018017401 0.01801740 0.0270261 0.819826
## M3+ 0.0000000 0.000000000 0.00000000 0.0000000 1.000000

Write up a prediction function

Notice If the month changes, the number M_com must change

pred_roll_rate = function(data,loan_code,vintage,months,alpha){
  M_com = mac_chain(data,loan_code,vintage,alpha)
  data = get_data_v(data,loan_code,vintage)
  data1 = c(table(data$status[which(data$term == max(data$term))]))
  n=1
  while(n<months){
    n = n + 1
    data1 = data1 %*% M_com
  }
  #rownames(data1) = toString(paste0("vintage_",vintage,sep=""))
  rownames(data1) = format(Sys.Date() %m-% months(vintage),format = "%Y-%m")
  return(data1)
}
###
pred_roll_rate(roll_rate,1101,vintage=6,months=6,alpha=0.2397)
##               M0     M1       M2       M3      M3+
## 2015-11 1244.368 130.12 84.92985 75.40859 1508.173

Use/Apply vintage must be greater than 3

pred_roll_rate(roll_rate,1101,vintage=4,months=6,alpha=0.2397)
##               M0       M1       M2      M3      M3+
## 2016-01 1955.341 17.10147 8.576941 9.39647 407.5838
pred_roll_rate(roll_rate,1101,vintage=5,months=6,alpha=0.2397)
##               M0       M1       M2       M3      M3+
## 2015-12 464.1791 34.83368 19.34029 16.46109 301.1858
pred_roll_rate(roll_rate,1101,vintage=6,months=6,alpha=0.2397)
##               M0     M1       M2       M3      M3+
## 2015-11 1244.368 130.12 84.92985 75.40859 1508.173

For vintage less than 3

T_term=function(data,term){
  if(term==0){
    temp = as.data.frame.matrix(ma_term(data,0)/rowSums(ma_term(data,0)))
    temp[,c("M2","M3","M3+")] = 0 ; temp[c("M1","M2","M3"),] = 0 ; temp[c("M3+"),] = c(0,0,0,0,0)
    return(temp)
  }else if(term==1){
    temp = as.data.frame.matrix(ma_term(data,1)/rowSums(ma_term(data,1)))
    temp[,c("M3","M3+")] = 0 ; temp[c("M2","M3"),] = 0 ; temp[c("M3+"),] = c(0,0,0,0,0)
    return(temp)
  }else if(term==2){
    temp = as.data.frame.matrix(ma_term(data,2)/rowSums(ma_term(data,2)))
    temp[c("M3","M3+"),] = 0
    if(!("M3+"%in%colnames(temp))){
      temp[,"M3+"] = 0
    }
    #temp[3,c("M3+")] = 0
    return(temp)
  }else if(term==3){
    temp = as.data.frame.matrix(ma_term(data,3)/rowSums(ma_term(data,3)))
    temp[3,c("M3+")] = 0
    return(temp)
  }else{
    print("term must be less than 4");
    break;
  }
}

Predition:vintage1~3, term1–>term4

#vintage 1:
pred_roll_rate_1_3 = function(data,loan_code,vintage,alpha,months){
  if(loan_code!=1101){
    print("loan_code must be 1101")
  }else{
  M_com  = (mac_chain(data,loan_code,4,alpha) + mac_chain(data,loan_code,5,alpha) + mac_chain(data,loan_code,6,alpha))/3
  if(vintage==1){
    data1 = get_data_v(data,loan_code,1)
    T1 = T_term(get_data_v(data,loan_code,1),0)
    a2 = c(1,0,0,0,0)%*%as.matrix(alpha*T1+(1-alpha)*M_com)
    num = length(which(data1$term==0))
    n = 1
    while(n<months){
      n = n + 1
      a2 = a2 %*% M_com
    }
    vin1_pre = num*a2
    rownames(vin1_pre) = format(Sys.Date() %m-% months(1),format = "%Y-%m")
    result = vin1_pre
  }
  ##vintage 2:
  else if(vintage==2){
    data2 = get_data_v(data,loan_code,2)
    T1 = T_term(data2,0)
    a2 = c(1,0,0,0,0)%*%as.matrix(alpha*T1+(1-alpha)*M_com)
    #a2 
    T2 = T_term(data2,1)
    a3 = a2%*%as.matrix(alpha*T2+(1-alpha)*M_com)
    num = length(which(data2$term==0))
    months=6
    n = 1
    while(n<months){
      n = n + 1
      a3 = a3 %*% M_com
    }
    vin2_pre = num*a3 
    rownames(vin2_pre) = format(Sys.Date() %m-% months(2),format = "%Y-%m")
    result = vin2_pre
  }
  ##vintage 3:
  else if(vintage==3){
    data3 = get_data_v(data,loan_code,3)
    T1 = T_term(data3,0)
    a2 = c(1,0,0,0,0)%*%as.matrix(alpha*T1+(1-alpha)*M_com)
    #a3
    T2 = T_term(data3,1)
    a3 = a2%*%as.matrix(alpha*T2+(1-alpha)*M_com)
    #a4
    T3 = T_term(data3,2)
    a4 = a3%*%as.matrix(alpha*T3+(1-alpha)*M_com)
    num = length(which(data3$term==0))
    n = 1
    while(n<months){
      n = n + 1
      a4 = a4 %*% M_com
    }
    vin3_pre = num*a4
    rownames(vin3_pre) = format(Sys.Date() %m-% months(3),format = "%Y-%m")
    result = vin3_pre
  }else{
    print("vintage must be less than 4")
    break
  }}
  return(result)
}

vintage: 1 ~ 3

pred_roll_rate_1_3(roll_rate,1101,1,0.2397,months=6)
##              M0       M1     M2       M3      M3+
## 2016-04 12624.9 772.6451 412.61 340.0277 980.8195
pred_roll_rate_1_3(roll_rate,1101,2,0.2397,months=6)
##               M0       M1       M2       M3      M3+
## 2016-03 5187.182 317.8564 170.0601 140.7051 569.1965
pred_roll_rate_1_3(roll_rate,1101,3,0.2397,months=6)
##               M0      M1       M2       M3      M3+
## 2016-02 2969.612 182.004 97.40465 80.63737 414.3425

Combine 6 vintage into a dataframe

pred_roll_rate_com = rbind(pred_roll_rate_1_3(roll_rate,1101,1,0.2397,months=6),pred_roll_rate_1_3(roll_rate,1101,2,0.2397,months=6),pred_roll_rate_1_3(roll_rate,1101,3,0.2397,months=6),pred_roll_rate(roll_rate,1101,vintage=4,months=6,alpha=0.2397),pred_roll_rate(roll_rate,1101,vintage=5,months=6,alpha=0.2397),pred_roll_rate(roll_rate,1101,vintage=6,months=6,alpha=0.2397))
pred_roll_rate_com = as.data.frame(pred_roll_rate_com)
###Inser the total row and colums
pred_roll_rate_com $total = rowSums(pred_roll_rate_com )
pred_roll_rate_com ["total",] = colSums(pred_roll_rate_com )
##rate
pred_roll_rate_com_r  = as.data.frame(pred_roll_rate_com/rowSums(pred_roll_rate_com[,-6]))