# data_normalization_using_vector_forall  20180212 by LangYuan
###################################################################################
# 版本更新说明
# 1.补全了缺失的1分钟数据,使得后续处理更方便简洁
# 2.可以参数化控制输出前m分钟和后n分钟的回归数据(1<=m,n<=5)
###################################################################################
# state_flag数据说明
# 0 数据缺失
# 1 其他正常交易时间
# 2 早盘开盘交易时间:09:00(大连、上海、郑州交易所)
# 3 早盘开盘交易时间:09:15(中金所国债) 
# 4 早盘开盘交易时间:09:30(中金所股指)
# 5 早盘小节休息后开始交易时间:10:30
# 6 下午开盘交易时间:13:00(中金所)
# 7 下午开盘交易时间:13:30(大连、上海、郑州交易所)
# 8 夜盘开始交易时间:21:00(上期所,郑商所,大商所)
###################################################################################

# Record the wall-clock start time so the total run time can be reported later
t_start <- Sys.time()

library(xlsx)
## Loading required package: rJava
## Loading required package: xlsxjars
library(stringr)
# Switch to the working directory that holds the data_5min/ and data_1min/ folders
setwd("C:\\Users\\Administrator\\Documents\\实习\\测试01_5min最后一根1min与下一根")
# Collect the names of all files found in each of the two input folders
filenames_5min <- list.files("data_5min")
filenames_1min <- list.files("data_1min")
# Single-file examples kept for reference:
# filename_1min = "JT888_1min.csv"
# filename_5min = "JT888_5min.csv"
# Prefix every file name with its folder to obtain a relative path
filepath_5min <- paste0("./data_5min/", filenames_5min)
filepath_1min <- paste0("./data_1min/", filenames_1min)
# Number of files to process (taken from the 1-minute folder)
looplen <- length(filenames_1min)

## Parameters
# How many trailing minutes of the previous 5-minute bar are used to regress
# the following 5-minute bar; usually 1 or 5
m <- 5
# How many leading minutes of the following 5-minute bar are used in the
# regression; 2 is chosen here for a special requirement
n <- 2


# Process every 1-minute / 5-minute file pair.
#
# For each pair (the i-th entry of filepath_1min and filepath_5min) this loop:
#   1. expands every 5-minute bar into five 1-minute slots, filling each slot
#      from the 1-minute file when a bar with that timestamp exists and with
#      NA otherwise (written to "<prefix>_all.csv");
#   2. aggregates the last m minutes of every 5-minute bar and the first n
#      minutes of the following bar into one row (written to
#      "<prefix>_reg.csv").
# The file header states 1 <= m, n <= 5; values outside that range would index
# outside the five slots of a bar, so they are rejected up front.
stopifnot(
  is.numeric(m), length(m) == 1, m >= 1, m <= 5,
  is.numeric(n), length(n) == 1, n >= 1, n <= 5
)

# seq_len() instead of 1:looplen so that an empty folder means zero iterations
for (iii in seq_len(looplen))
{
  # Read the raw bars. stringsAsFactors = FALSE keeps text columns as plain
  # strings on every R version (the default only changed in R 4.0).
  kdata_1min <- read.csv(filepath_1min[iii], stringsAsFactors = FALSE)
  kdata_5min <- read.csv(filepath_5min[iii], stringsAsFactors = FALSE)

  # Overwrite the first row's trade date with the second row's; the original
  # author's note describes this as a workaround for a small data bug.
  kdata_1min$plotdate.td[1] <- kdata_1min$plotdate.td[2]
  kdata_5min$plotdate.td[1] <- kdata_5min$plotdate.td[2]

  # Columns of the 1-minute file that are copied into the expanded table
  begtime_1min <- as.character(kdata_1min$X..时间)
  openpri_1min <- kdata_1min$开盘价
  highpri_1min <- kdata_1min$最高价
  lowpri_1min <- kdata_1min$最低价
  closepri_1min <- kdata_1min$收盘价
  volamo_1min <- kdata_1min$成交量
  storeamo_1min <- kdata_1min$持仓量

  # Only the timestamp and trade date of the 5-minute file are needed
  begtime_5min <- as.character(kdata_5min$X..时间)
  truedate_5min <- kdata_5min$plotdate.td
  data_len_5min <- length(kdata_5min$开盘价)

  # Nothing can be built from an empty 5-minute file; skip it instead of
  # failing later with a "negative length" error.
  if (data_len_5min < 1) {
    warning("No 5-minute bars in ", filepath_5min[iii], "; file skipped.",
            call. = FALSE)
    next
  }

  # Storage for the expanded table: five 1-minute slots per 5-minute bar
  newdata_len <- data_len_5min * 5
  begtime_this <- character(newdata_len)
  openpri_this <- rep(NA, newdata_len)
  highpri_this <- rep(NA, newdata_len)
  lowpri_this <- rep(NA, newdata_len)
  closepri_this <- rep(NA, newdata_len)
  volamo_this <- rep(NA, newdata_len)
  storeamo_this <- rep(NA, newdata_len)
  truedate_this <- rep(NA, newdata_len)
  state_flag <- numeric(newdata_len)

  # Position of the LAST 1-minute slot of every 5-minute bar (5, 10, 15, ...)
  pointer_index <- seq_len(data_len_5min) * 5

  # The 1-minute timestamps are derived from the 5-minute timestamp by
  # replacing its 16th character (the last minute digit) with digit + offset.
  minute_prefix <- substr(begtime_5min, 1, 15)
  minute_digit <- as.numeric(substr(begtime_5min, 16, 16))

  # Build the complete table, one minute offset (0..4) at a time
  for (jj in 0:4) {
    time_search_1min <- paste0(minute_prefix, as.character(minute_digit + jj))

    # Row of the 1-minute file for every wanted timestamp, NA when absent.
    # match() resolves each element independently, so repeated timestamps are
    # handled and missing minutes fall out naturally as NA values below.
    match_index_1min <- match(time_search_1min, begtime_1min)

    # Slot inside the expanded table for this offset of every bar
    new_allindex <- pointer_index + jj - 4

    begtime_this[new_allindex] <- time_search_1min
    truedate_this[new_allindex] <- truedate_5min
    # Indexing with NA yields NA, which marks the missing minutes
    openpri_this[new_allindex] <- openpri_1min[match_index_1min]
    highpri_this[new_allindex] <- highpri_1min[match_index_1min]
    lowpri_this[new_allindex] <- lowpri_1min[match_index_1min]
    closepri_this[new_allindex] <- closepri_1min[match_index_1min]
    volamo_this[new_allindex] <- volamo_1min[match_index_1min]
    storeamo_this[new_allindex] <- storeamo_1min[match_index_1min]
    # 1 = a 1-minute bar was found, 0 = data missing
    state_flag[new_allindex] <- as.numeric(!is.na(match_index_1min))
  }

  data_5min_all <- data.frame(
    "X..时间" = begtime_this,
    "开盘价" = openpri_this,
    "最高价" = highpri_this,
    "最低价" = lowpri_this,
    "收盘价" = closepri_this,
    "成交量" = volamo_this,
    "持仓量" = storeamo_this,
    "truedate" = truedate_this,
    "state_flag" = state_flag,
    stringsAsFactors = FALSE
  )

  # ---- Regression table: last m minutes of a bar vs. first n of the next ----

  # "last" side: open of the first used minute, close / open interest of the
  # final minute, high / low / volume aggregated over the m used minutes.
  openpri_last <- openpri_this[pointer_index - m + 1]
  closepri_last <- closepri_this[pointer_index]
  storeamo_last <- storeamo_this[pointer_index]
  highpri_last <- highpri_this[pointer_index]
  lowpri_last <- lowpri_this[pointer_index]
  volamo_last <- volamo_this[pointer_index]
  # Walk back over the remaining m - 1 minutes (no iterations when m == 1).
  # pmax / pmin / + propagate NA, so any missing minute makes the value NA.
  for (kk in seq_len(m - 1)) {
    highpri_last <- pmax(highpri_last, highpri_this[pointer_index - kk])
    lowpri_last <- pmin(lowpri_last, lowpri_this[pointer_index - kk])
    volamo_last <- volamo_last + volamo_this[pointer_index - kk]
  }
  truedate_last <- truedate_this[pointer_index]

  # "next" side: every bar except the final one has a following bar.
  # seq_len() keeps this empty (rather than c(1, 0)) when there is one bar.
  pointer_neindex <- pointer_index[seq_len(data_len_5min - 1)]
  openpri_next <- openpri_this[pointer_neindex + 1]
  closepri_next <- closepri_this[pointer_neindex + n]
  storeamo_next <- storeamo_this[pointer_neindex + n]
  highpri_next <- highpri_this[pointer_neindex + 1]
  lowpri_next <- lowpri_this[pointer_neindex + 1]
  volamo_next <- volamo_this[pointer_neindex + 1]
  # Minutes 2..n of the following bar (no iterations when n == 1)
  for (ll in seq_len(n - 1) + 1) {
    highpri_next <- pmax(highpri_next, highpri_this[pointer_neindex + ll])
    lowpri_next <- pmin(lowpri_next, lowpri_this[pointer_neindex + ll])
    volamo_next <- volamo_next + volamo_this[pointer_neindex + ll]
  }
  # The final bar has no successor: pad every "next" column with NA
  openpri_next <- c(openpri_next, NA)
  closepri_next <- c(closepri_next, NA)
  storeamo_next <- c(storeamo_next, NA)
  highpri_next <- c(highpri_next, NA)
  lowpri_next <- c(lowpri_next, NA)
  volamo_next <- c(volamo_next, NA)
  truedate_next <- c(truedate_this[pointer_neindex + 1], NA)

  # state_flag of a regression row, keyed on the HH:MM at which the following
  # bar starts:
  # 0 data missing
  # 1 any other regular trading time
  # 2 morning open 09:00 (Dalian, Shanghai, Zhengzhou exchanges)
  # 3 morning open 09:15 (CFFEX treasury bond futures)
  # 4 morning open 09:30 (CFFEX stock index futures)
  # 5 resumption after the morning break 10:30
  # 6 afternoon open 13:00 (CFFEX)
  # 7 afternoon open 13:30 (Dalian, Shanghai, Zhengzhou exchanges)
  # 8 night session open 21:00 (SHFE, ZCE, DCE)
  session_open_flag <- c(
    "09:00" = 2, "09:15" = 3, "09:30" = 4, "10:30" = 5,
    "13:00" = 6, "13:30" = 7, "21:00" = 8
  )
  # '99:99' is a placeholder for the final row, which has no following bar
  time_next_hm <- c(substr(begtime_this[pointer_neindex + 1], 12, 16), '99:99')
  # A row lacks data when either side's volume is NA
  index_0 <- is.na(volamo_last) | is.na(volamo_next)
  # Times not listed in the table look up as NA and become flag 1
  state_flag_next <- unname(session_open_flag[time_next_hm])
  state_flag_next[is.na(state_flag_next)] <- 1
  state_flag_next[index_0] <- 0

  data_for_reg <- data.frame(
    # The start timestamp of the earlier 5-minute bar identifies the row
    "X..时间" = begtime_5min,
    "开盘价_last" = openpri_last,
    "最高价_last" = highpri_last,
    "最低价_last" = lowpri_last,
    "收盘价_last" = closepri_last,
    "成交量_last" = volamo_last,
    "持仓量_last" = storeamo_last,
    "truedate_last" = truedate_last,
    "开盘价_next" = openpri_next,
    "最高价_next" = highpri_next,
    "最低价_next" = lowpri_next,
    "收盘价_next" = closepri_next,
    "成交量_next" = volamo_next,
    "持仓量_next" = storeamo_next,
    "truedate_next" = truedate_next,
    "state_flag" = state_flag_next,
    stringsAsFactors = FALSE
  )

  # Output names use the first five characters of the 1-minute file name
  file_prefix <- substr(filenames_1min[iii], 1, 5)
  outname1 <- paste0(file_prefix, '_all.csv')
  outname2 <- paste0(file_prefix, '_reg.csv')
  # Write both tables as CSV into the working directory
  write.csv(data_5min_all, outname1)
  write.csv(data_for_reg, outname2)
  # Excel export was abandoned (the original note reports an overflow error):
  # write.xlsx(data_5min_1min,"data_processed.xlsx")
  # Earlier single-file CSV export kept for reference:
  # write.csv(data_5min_1min,'JT888_processed.csv')
}
# Report the elapsed run time (relies on t_start recorded at the top of the script)
t_end <- Sys.time()
t_end - t_start
# Sample output from an earlier run:
## Time difference of 6.860276 mins