# data_normalization_using_vector_forall 20180212 by LangYuan
###################################################################################
# 版本更新说明
# 1.补全了缺失的1分钟数据,使得后续处理更方便简洁
# 2.可以参数化控制输出前m分钟和后n分钟的回归数据(1<=m,n<=5)
###################################################################################
# state_flag数据说明
# 0 数据缺失
# 1 其他正常交易时间
# 2 早盘开盘交易时间:09:00(大连、上海、郑州交易所)
# 3 早盘开盘交易时间:09:15(中金所国债)
# 4 早盘开盘交易时间:09:30(中金所股指)
# 5 早盘小节休息后开始交易时间:10:30
# 6 下午开盘交易时间:13:00(中金所)
# 7 下午开盘交易时间:13:30(大连、上海、郑州交易所)
# 8 夜盘开始交易时间:21:00(上期所,郑商所,大商所)
###################################################################################
# Start the wall-clock timer for the whole run.
t_start <- Sys.time()
library(xlsx)
## Loading required package: rJava
## Loading required package: xlsxjars
library(stringr)
# Set the working directory; it must contain the data_1min/ and data_5min/
# input folders. Output CSV files are written here as well.
setwd("C:\\Users\\Administrator\\Documents\\实习\\测试01_5min最后一根1min与下一根")
# List the input files of both folders (names only, no directory part).
filenames_5min <- list.files("data_5min")
filenames_1min <- list.files("data_1min")
# Single-file examples kept for reference:
# filename_1min = "JT888_1min.csv"
# filename_5min = "JT888_5min.csv"
# The i-th 1-minute file is processed together with the i-th 5-minute file,
# so both folders must hold the same number of files; otherwise instruments
# would be paired with the wrong counterpart.
if (length(filenames_1min) != length(filenames_5min)) {
  stop(
    "data_1min has ", length(filenames_1min), " file(s) but data_5min has ",
    length(filenames_5min), "; the two folders must match one-to-one.",
    call. = FALSE
  )
}
# Build the relative paths used for reading.
filepath_5min <- paste0("./data_5min/", filenames_5min)
filepath_1min <- paste0("./data_1min/", filenames_1min)
looplen <- length(filenames_1min)
## Parameters
# Number of trailing minutes of the previous 5-minute bar that are used to
# regress the next 5-minute bar; usually 1 or 5.
m <- 5
# Number of leading minutes of the following 5-minute bar that are used in the
# regression; set to 2 here for a specific requirement.
n <- 2
# Process every instrument: the i-th 1-minute file is combined with the i-th
# 5-minute file. For each pair two CSV files are written:
#   <prefix>_all.csv  every 5-minute bar expanded into five 1-minute slots,
#                     with NA prices and state_flag 0 where the 1-minute bar
#                     is missing (state_flag 1 otherwise);
#   <prefix>_reg.csv  one row per 5-minute bar: the last m minutes of that bar
#                     ("_last") next to the first n minutes of the following
#                     bar ("_next").
# The header of this file documents 1 <= m, n <= 5; enforce it once up front.
stopifnot(length(m) == 1, m %in% 1:5, length(n) == 1, n %in% 1:5)

# Session opening times (HH:MM of the first minute of the following bar) and
# the state_flag value each one maps to:
#   2 = 09:00, 3 = 09:15, 4 = 09:30, 5 = 10:30, 6 = 13:00, 7 = 13:30, 8 = 21:00
session_open_hm <- c("09:00", "09:15", "09:30", "10:30", "13:00", "13:30", "21:00")
session_open_flag <- c(2, 3, 4, 5, 6, 7, 8)

# seq_len() so that an empty input folder means zero iterations.
for (iii in seq_len(looplen)) {
  # Read the paired inputs. stringsAsFactors is set explicitly: on R < 4.0 the
  # default is TRUE, and a factor plotdate.td column would be copied below as
  # integer level codes instead of its values.
  kdata_1min <- read.csv(filepath_1min[iii], stringsAsFactors = FALSE)
  kdata_5min <- read.csv(filepath_5min[iii], stringsAsFactors = FALSE)

  # Work around a known glitch in the data: the trading date of the first row
  # is overwritten with the one of the second row.
  kdata_1min$plotdate.td[1] <- kdata_1min$plotdate.td[2]
  kdata_5min$plotdate.td[1] <- kdata_5min$plotdate.td[2]

  begtime_1min <- as.character(kdata_1min$X..时间)
  begtime_5min <- as.character(kdata_5min$X..时间)
  truedate_5min <- kdata_5min$plotdate.td
  n_bars <- nrow(kdata_5min)

  # ---- Complete 1-minute table -------------------------------------------
  # Slot 5 * (i - 1) + j + 1 holds minute j (0..4) of 5-minute bar i.
  pointer_index <- seq_len(n_bars) * 5 # last slot of every 5-minute bar
  slot_bar <- rep(seq_len(n_bars), each = 5)
  slot_offset <- rep(0:4, times = n_bars)

  # Timestamp of each slot: the first 15 characters of the 5-minute timestamp
  # followed by its 16th character (the last minute digit) plus 0..4.
  begtime_this <- paste0(
    substr(begtime_5min, 1, 15)[slot_bar],
    as.numeric(substr(begtime_5min, 16, 16))[slot_bar] + slot_offset
  )
  truedate_this <- truedate_5min[slot_bar]

  # Exact lookup of each slot in the 1-minute data. Slots without a 1-minute
  # bar get NA here, so indexing with it yields NA for every price column.
  src_row <- match(begtime_this, begtime_1min)
  openpri_this <- kdata_1min$开盘价[src_row]
  highpri_this <- kdata_1min$最高价[src_row]
  lowpri_this <- kdata_1min$最低价[src_row]
  closepri_this <- kdata_1min$收盘价[src_row]
  volamo_this <- kdata_1min$成交量[src_row]
  storeamo_this <- kdata_1min$持仓量[src_row]
  state_flag <- as.numeric(!is.na(src_row)) # 1 = data present, 0 = missing

  data_5min_all <- data.frame(
    "X..时间" = begtime_this,
    "开盘价" = openpri_this,
    "最高价" = highpri_this,
    "最低价" = lowpri_this,
    "收盘价" = closepri_this,
    "成交量" = volamo_this,
    "持仓量" = storeamo_this,
    "truedate" = truedate_this,
    "state_flag" = state_flag,
    stringsAsFactors = FALSE
  )

  # ---- Regression table (last m minutes vs. next n minutes) ---------------
  # "last" side: open of the first of the m minutes, close and open interest
  # of the final minute, high/low/volume aggregated over all m minutes.
  openpri_last <- openpri_this[pointer_index - m + 1]
  closepri_last <- closepri_this[pointer_index]
  storeamo_last <- storeamo_this[pointer_index]
  highpri_last <- highpri_this[pointer_index]
  lowpri_last <- lowpri_this[pointer_index]
  volamo_last <- volamo_this[pointer_index]
  for (kk in seq_len(m - 1)) {
    highpri_last <- pmax(highpri_last, highpri_this[pointer_index - kk])
    lowpri_last <- pmin(lowpri_last, lowpri_this[pointer_index - kk])
    volamo_last <- volamo_last + volamo_this[pointer_index - kk]
  }
  truedate_last <- truedate_this[pointer_index]

  # "next" side: first slot of the following bar. The final bar has no
  # successor, so its index is NA and every "_next" value becomes NA.
  next_first <- pointer_index + 1
  next_first[n_bars] <- NA
  next_last <- next_first + (n - 1)
  openpri_next <- openpri_this[next_first]
  closepri_next <- closepri_this[next_last]
  storeamo_next <- storeamo_this[next_last]
  highpri_next <- highpri_this[next_first]
  lowpri_next <- lowpri_this[next_first]
  volamo_next <- volamo_this[next_first]
  for (ll in seq_len(n - 1)) {
    highpri_next <- pmax(highpri_next, highpri_this[next_first + ll])
    lowpri_next <- pmin(lowpri_next, lowpri_this[next_first + ll])
    volamo_next <- volamo_next + volamo_this[next_first + ll]
  }
  truedate_next <- truedate_this[next_first]

  # ---- state_flag of each regression row ----------------------------------
  # 0 = data missing on either side, 1 = ordinary trading time,
  # 2..8 = the following bar starts at a session opening time (table above).
  time_next_hm <- substr(begtime_this[next_first], 12, 16)
  time_next_hm[n_bars] <- "99:99" # sentinel: no following bar
  is_missing <- is.na(volamo_last) | is.na(volamo_next)
  open_flag <- session_open_flag[match(time_next_hm, session_open_hm)]
  is_open <- !is.na(open_flag)
  state_flag_next <- rep(1, n_bars)
  state_flag_next[is_open] <- open_flag[is_open]
  state_flag_next[is_missing] <- 0 # missing data overrides everything else

  data_for_reg <- data.frame(
    # The row is labelled with the start time of the preceding 5-minute bar.
    "X..时间" = begtime_5min,
    "开盘价_last" = openpri_last,
    "最高价_last" = highpri_last,
    "最低价_last" = lowpri_last,
    "收盘价_last" = closepri_last,
    "成交量_last" = volamo_last,
    "持仓量_last" = storeamo_last,
    "truedate_last" = truedate_last,
    "开盘价_next" = openpri_next,
    "最高价_next" = highpri_next,
    "最低价_next" = lowpri_next,
    "收盘价_next" = closepri_next,
    "成交量_next" = volamo_next,
    "持仓量_next" = storeamo_next,
    "truedate_next" = truedate_next,
    "state_flag" = state_flag_next,
    stringsAsFactors = FALSE
  )

  # Write both tables as CSV; the output name starts with the first five
  # characters of the 1-minute file name.
  out_prefix <- substr(filenames_1min[iii], 1, 5)
  write.csv(data_5min_all, paste0(out_prefix, "_all.csv"))
  write.csv(data_for_reg, paste0(out_prefix, "_reg.csv"))
  # Excel export was tried and dropped: write.xlsx failed with an overflow
  # error on this data.
  # write.xlsx(data_5min_1min,"data_processed.xlsx")
  # write.csv(data_5min_1min,'JT888_processed.csv')
}
# Report the total run time. print() is explicit so the value is also shown
# when the script is run through source(), which does not auto-print.
t_end <- Sys.time()
print(t_end - t_start)
## Time difference of 6.860276 mins