suppressPackageStartupMessages(require('BBmisc'))

## 读取程序包
pkg <- c('devtools', 'tidyverse', 'timetk', 'lubridate', 'plyr', 'dplyr', 'magrittr', 'purrr', 'stringr', 'reshape', 'formattable', 'microbenchmark', 'knitr', 'kableExtra', 'VIM', 'mice', 'miceAdds', 'mi', 'mitools', 'Amelia', 'missForest', 'Hmisc', 'DMwR', 'imputeTS', 'tidyimpute', 'mtsdi', 'xts', 'forecast', 'marima', 'missMDA')

suppressAll(lib(pkg))
funs <- c('convertOHLC.R')
l_ply(funs, function(x) source(paste0('./function/', x)))

algo <- c('interpolation', 'locf', 'mean', 'random', 'kalman', 'ma')
rm(pkg, funs)

1 简介

1.1 介绍弥补数据

由于在科研binary.com Interview Question I - Interday High Frequency Trading Models Comparison测试高频率量化交易时,从fxcm/MarketData下载的数据并不完整1binary.com 面试试题 I - 单变量数据缺失值管理尝试弥补缺失值不果,单变量无法辨认开市价、最高价、最低价和闭市价之间的关系。

1.2 impueTS程序包

imputeTS - Time Series Missing Value Imputation in R讲述miceAmeliamissMDAVIM都是多变量弥补数据程序包,而imputeTS乃单变量弥补数据程序包,不过程序包中的seadec()函数乃弥补季节性数据。

Simple Imputation Imputation Plots & Statistics Datasets
na.locf na.interpolation plotNA.distribution tsAirgap
na.mean na.kalman plotNA.distributionBar tsAirgapComplete
na.random na.ma plotNA.gapsize tsHeating
na.replace na.seadec plotNA.imputations tsHeatingComplete
na.remove na.seasplit statsNA tsNH4
tsNH4Complete

Table 1: General Overview imputeTS package

Function Option Description
na.interpolation linear Imputation by Linear Interpolation
spline Imputation by Spline Interpolation
stine Imputation by Stineman Interpolation
na.kalman StructTS Imputation by Structural Model & Kalman Smoothing
auto.arima Imputation by ARIMA State Space Representation & Kalman Sm.
na.locf locf Imputation by Last Observation Carried Forward
nocb Imputation by Next Observation Carried Backward
na.ma simple Missing Value Imputation by Simple Moving Average
linear Missing Value Imputation by Linear Weighted Moving Average
exponential Missing Value Imputation by Exponential Weighted Moving Average
na.mean mean MissingValue Imputation by Mean Value
median Missing Value Imputation by Median Value
mode Missing Value Imputation by Mode Value
na.random Missing Value Imputation by Random Sample
na.replace Replace Missing Values by a Defined Value
na.seadec Seasonally Decomposed Missing Value Imputation
na.seasplit Seasonally Splitted Missing Value Imputation
na.remove Remove Missing Values

Table 3: Overview Imputation Algorithms

1.3 Amelia程序包

Amelia II: A Program for Missing Data介绍Amelia程序包,而AMELIA II - A Program for Missing Data教导如何使用该程序包。Error in as.POSIXct.numeric(value) : ‘origin’ must be supplied #18显示时间变量无法弥补,故此对于Amelia缺失值,僕得省略掉时间变量,仅设置价格变量为缺失值而已。

1.4 其它程序包

mice程序包可以使用lm函数将弥补数据线型化,tidyr程序包中有个fill()函数可以。而dendextend::na_locf()会比zoo::na.locf()高效率,不过弥补数据时会遇到一些参数问题。

2 数据

2.1 读取数据

2.1.1 1分钟数据

和之前的单变量一样,首先僕随机导入每分钟为1个时间单位的数据。

Error in optim(init[mask], getLike, method = "L-BFGS-B", lower = rep(0, : L-BFGS-B needs finite values of 'fn'
17. optim(init[mask], getLike, method = "L-BFGS-B", lower = rep(0, np + 1L), upper = rep(Inf, np + 1L), control = optim.control)
16. StructTS(data, ...)
15. na.kalman(data, ...)
14. apply.base.algorithm(data, algorithm = algorithm, ...)
13. .f(.x[[i]], ...)
12. map(., na.seadec, algorithm = x)
11. function_list[[i]](value)
10. freduce(value, `_function_list`)
9. `_fseq`(`_lhs`)
8. eval(quote(`_fseq`(`_lhs`)), env, env)
7. eval(quote(`_fseq`(`_lhs`)), env, env)
6. withVisible(eval(quote(`_fseq`(`_lhs`)), env, env))
5. data_m1_NA %>% dplyr::select(starts_with("Ask"), starts_with("Bid")) %>% map(na.seadec, algorithm = x) %>% as.tibble
4. FUN(X[[i]], ...)
3. lapply(pieces, .fun, ...)
2. structure(lapply(pieces, .fun, ...), dim = dim(pieces))
1. llply(algo, function(x) { data_m1_NA %>% dplyr::select(starts_with("Ask"), starts_with("Bid")) %>% map(na.seadec, algorithm = x) %>% as.tibble })

由于频频出现错误信息#imputeTS/issues/26,于此僕使用sort(sample(length(fls), 1))随机筛选1个文件。

pth <- 'C:/Users/scibr/Documents/GitHub/scibrokes/real-time-fxcm/data/USDJPY/'
fls <- list.files(pth, pattern = '^Y[0-9]{4}W[1-9]{1,2}_m1.rds$')

## 1分钟数据
## 由于频频出现错误信息,于此僕使用sort(sample(length(fls), 1))随机筛选4个文件。
data_m1 <- llply(fls[sort(sample(length(fls), 1))], function(x) {
    y <- readRDS(paste0(pth, x)) %>% 
      dplyr::rename(index = DateTime) %>% 
      mutate(index = index %>% mdy_hms %>% 
               .POSIXct(tz = 'Europe/Athens') %>% 
               force_tz())
    
    yw <- x %>% str_extract_all('Y[0-9]{4}W[0-9]{1,2}') %>% 
      str_split_fixed('[A-Z]{1}', 3) %>% .[,-1]
    
    nch <- y$index[1] %>% substr(nchar(.)+2, nchar(.)+3)
    y %<>% mutate(
      year = as.numeric(yw[1]), week = as.numeric(yw[2]), 
      nch = nch, index = if_else(
        nch == '23', index + hours(1), index)) %>% 
      dplyr::select(-nch)
    }) %>% bind_rows %>% tbl_df %>% arrange(index)
dim(data_m1)
## [1] 7149   11
data_m1
## # A tibble: 7,149 x 11
##    index               BidOpen BidHigh BidLow BidClose AskOpen AskHigh
##    <dttm>                <dbl>   <dbl>  <dbl>    <dbl>   <dbl>   <dbl>
##  1 2016-10-17 00:00:00    104.    104.   104.     104.    104.    104.
##  2 2016-10-17 00:01:00    104.    104.   104.     104.    104.    104.
##  3 2016-10-17 00:02:00    104.    104.   104.     104.    104.    104.
##  4 2016-10-17 00:03:00    104.    104.   104.     104.    104.    104.
##  5 2016-10-17 00:04:00    104.    104.   104.     104.    104.    104.
##  6 2016-10-17 00:05:00    104.    104.   104.     104.    104.    104.
##  7 2016-10-17 00:06:00    104.    104.   104.     104.    104.    104.
##  8 2016-10-17 00:07:00    104.    104.   104.     104.    104.    104.
##  9 2016-10-17 00:08:00    104.    104.   104.     104.    104.    104.
## 10 2016-10-17 00:09:00    104.    104.   104.     104.    104.    104.
## # ... with 7,139 more rows, and 4 more variables: AskLow <dbl>,
## #   AskClose <dbl>, year <dbl>, week <dbl>
## 检验原始数据是否存在偏差。
data_m1 %<>% mutate(
  bias.open = if_else(AskOpen>AskHigh|AskOpen<AskLow, 1, 0), 
  bias.high = if_else(AskHigh<AskOpen|AskHigh<AskLow|AskHigh<AskClose, 1, 0), 
  bias.low = if_else(AskLow>AskOpen|AskLow>AskHigh|AskLow>AskClose, 1, 0), 
  bias.close = if_else(AskClose>AskHigh|AskClose<AskLow, 1, 0))

data_m1 %>% 
  dplyr::filter(bias.open==1|bias.high==1|bias.low==1|bias.close==1) %>% 
  kable(caption = 'Bias Imputation') %>% 
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
  scroll_box(width = '100%', height = '400px')
Bias Imputation
index BidOpen BidHigh BidLow BidClose AskOpen AskHigh AskLow AskClose year week bias.open bias.high bias.low bias.close

2.1.2 Tick数据转为1分钟数据

接着,导入Tick数据2,并且转为每分钟为1时间单位。

pth <- 'C:/Users/scibr/Documents/GitHub/scibrokes/real-time-fxcm/data/USDJPY/'
fls <- list.files(pth, pattern = '^Y[0-9]{4}W[1-9]{1,2}.rds$')

## Tick数据转为1分钟数据
## 由于频频出现错误信息,于此僕使用sort(sample(length(fls), 1))随机筛选2个文件。
data_tm1 <- llply(fls[sort(sample(length(fls), 1))], function(x) {
    y <- readRDS(paste0(pth, x)) %>% 
      convertOHLC(combine = TRUE)
    
    yw <- x %>% str_extract_all('Y[0-9]{4}W[0-9]{1,2}') %>% 
      str_split_fixed('[A-Z]{1}', 3) %>% .[,-1]
    y %<>% mutate(
      year = as.numeric(yw[1]), week = as.numeric(yw[2]), .)
    }) %>% bind_rows %>% tbl_df %>% arrange(index)
dim(data_tm1)
## [1] 1628   11
data_tm1
## # A tibble: 1,628 x 11
##    index               BidOpen BidHigh BidLow BidClose AskOpen AskHigh
##    <dttm>                <dbl>   <dbl>  <dbl>    <dbl>   <dbl>   <dbl>
##  1 2017-08-07 00:02:00    111.    111.   111.     111.    111.    111.
##  2 2017-08-07 00:03:00    111.    111.   111.     111.    111.    111.
##  3 2017-08-07 00:04:00    111.    111.   111.     111.    111.    111.
##  4 2017-08-07 00:05:00    111.    111.   111.     111.    111.    111.
##  5 2017-08-07 00:06:00    111.    111.   111.     111.    111.    111.
##  6 2017-08-07 00:07:00    111.    111.   111.     111.    111.    111.
##  7 2017-08-07 00:08:00    111.    111.   111.     111.    111.    111.
##  8 2017-08-07 00:09:00    111.    111.   111.     111.    111.    111.
##  9 2017-08-07 00:11:00    111.    111.   111.     111.    111.    111.
## 10 2017-08-07 00:17:00    111.    111.   111.     111.    111.    111.
## # ... with 1,618 more rows, and 4 more variables: AskLow <dbl>,
## #   AskClose <dbl>, year <dbl>, week <dbl>
## 检验原始数据是否存在偏差。
data_tm1 %<>% mutate(
  bias.open = if_else(AskOpen>AskHigh|AskOpen<AskLow, 1, 0), 
  bias.high = if_else(AskHigh<AskOpen|AskHigh<AskLow|AskHigh<AskClose, 1, 0), 
  bias.low = if_else(AskLow>AskOpen|AskLow>AskHigh|AskLow>AskClose, 1, 0), 
  bias.close = if_else(AskClose>AskHigh|AskClose<AskLow, 1, 0))

data_tm1 %>% 
  dplyr::filter(bias.open==1|bias.high==1|bias.low==1|bias.close==1) %>% 
  kable(caption = 'Bias Imputation') %>% 
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
  scroll_box(width = '100%', height = '400px')
Bias Imputation
index BidOpen BidHigh BidLow BidClose AskOpen AskHigh AskLow AskClose year week bias.open bias.high bias.low bias.close

2.2 设置缺失值

2.2.1 1分钟数据

现在尝试随机设置缺失值。

data_m1_NA <- data_m1 %>% 
  dplyr::select(index, BidOpen, BidHigh, BidLow, BidClose, AskOpen, AskHigh, AskLow,  AskClose) %>% 
  prodNA(noNA = 0.01)
data_m1_NA
## # A tibble: 7,149 x 9
##    index               BidOpen BidHigh BidLow BidClose AskOpen AskHigh
##    <dttm>                <dbl>   <dbl>  <dbl>    <dbl>   <dbl>   <dbl>
##  1 2016-10-17 00:00:00    104.    104.   104.     104.    104.    104.
##  2 2016-10-17 00:01:00    104.    104.   104.     104.    104.    104.
##  3 2016-10-17 00:02:00    104.    104.   104.     104.    104.    104.
##  4 2016-10-17 00:03:00    104.    104.   104.     104.    104.    104.
##  5 2016-10-17 00:04:00    104.    104.   104.     104.    104.    104.
##  6 2016-10-17 00:05:00    104.    104.   104.     104.    104.    104.
##  7 2016-10-17 00:06:00    104.    104.   104.     104.    104.    104.
##  8 2016-10-17 00:07:00    104.    104.   104.     104.    104.    104.
##  9 2016-10-17 00:08:00    104.    104.   104.     104.    104.    104.
## 10 2016-10-17 00:09:00    104.    104.   104.     104.    104.    104.
## # ... with 7,139 more rows, and 2 more variables: AskLow <dbl>,
## #   AskClose <dbl>
data_m1_NA %>% md.pattern

##      AskClose BidHigh BidLow AskOpen index AskLow BidOpen BidClose AskHigh
## 6528        1       1      1       1     1      1       1        1       1
## 79          1       1      1       1     1      1       1        1       0
## 78          1       1      1       1     1      1       1        0       1
## 72          1       1      1       1     1      1       0        1       1
## 1           1       1      1       1     1      1       0        1       0
## 2           1       1      1       1     1      1       0        0       1
## 71          1       1      1       1     1      0       1        1       1
## 66          1       1      1       1     0      1       1        1       1
## 62          1       1      1       0     1      1       1        1       1
## 1           1       1      1       0     1      1       1        1       0
## 1           1       1      1       0     1      0       1        1       1
## 1           1       1      1       0     1      0       1        0       1
## 1           1       1      1       0     0      1       1        1       1
## 62          1       1      0       1     1      1       1        1       1
## 1           1       1      0       1     1      1       1        1       0
## 1           1       1      0       1     0      1       1        1       1
## 1           1       1      0       0     1      1       1        1       1
## 56          1       0      1       1     1      1       1        1       1
## 1           1       0      1       1     1      1       1        1       0
## 1           1       0      1       1     1      1       1        0       1
## 1           1       0      1       1     1      1       0        1       1
## 1           1       0      1       1     0      1       1        1       1
## 1           1       0      0       1     1      1       1        1       1
## 54          0       1      1       1     1      1       1        1       1
## 1           0       1      1       1     1      1       1        1       0
## 1           0       1      1       1     1      1       1        0       1
## 2           0       1      1       1     1      0       1        1       1
## 1           0       1      1       1     0      1       1        1       1
## 1           0       1      1       0     1      1       1        1       1
##            60      61     66      68    70     75      76       83      84
##         
## 6528   0
## 79     1
## 78     1
## 72     1
## 1      2
## 2      2
## 71     1
## 66     1
## 62     1
## 1      2
## 1      2
## 1      3
## 1      2
## 62     1
## 1      2
## 1      2
## 1      2
## 56     1
## 1      2
## 1      2
## 1      2
## 1      2
## 1      2
## 54     1
## 1      2
## 1      2
## 2      2
## 1      2
## 1      2
##      643
data_m1_NA %>% md.pairs
## $rr
##          index BidOpen BidHigh BidLow BidClose AskOpen AskHigh AskLow
## index     7079    7003    7019   7014     6996    7012    6995   7004
## BidOpen   7003    7073    7013   7007     6992    7005    6990   6998
## BidHigh   7019    7013    7088   7023     7006    7020    7005   7013
## BidLow    7014    7007    7023   7083     7000    7016    7000   7008
## BidClose  6996    6992    7006   7000     7066    6999    6982   6992
## AskOpen   7012    7005    7020   7016     6999    7081    6998   7008
## AskHigh   6995    6990    7005   7000     6982    6998    7065   6990
## AskLow    7004    6998    7013   7008     6992    7008    6990   7074
## AskClose  7020    7013    7028   7023     7007    7022    7006   7016
##          AskClose
## index        7020
## BidOpen      7013
## BidHigh      7028
## BidLow       7023
## BidClose     7007
## AskOpen      7022
## AskHigh      7006
## AskLow       7016
## AskClose     7089
## 
## $rm
##          index BidOpen BidHigh BidLow BidClose AskOpen AskHigh AskLow
## index        0      76      60     65       83      67      84     75
## BidOpen     70       0      60     66       81      68      83     75
## BidHigh     69      75       0     65       82      68      83     75
## BidLow      69      76      60      0       83      67      83     75
## BidClose    70      74      60     66        0      67      84     74
## AskOpen     69      76      61     65       82       0      83     73
## AskHigh     70      75      60     65       83      67       0     75
## AskLow      70      76      61     66       82      66      84      0
## AskClose    69      76      61     66       82      67      83     73
##          AskClose
## index          59
## BidOpen        60
## BidHigh        60
## BidLow         60
## BidClose       59
## AskOpen        59
## AskHigh        59
## AskLow         58
## AskClose        0
## 
## $mr
##          index BidOpen BidHigh BidLow BidClose AskOpen AskHigh AskLow
## index        0      70      69     69       70      69      70     70
## BidOpen     76       0      75     76       74      76      75     76
## BidHigh     60      60       0     60       60      61      60     61
## BidLow      65      66      65      0       66      65      65     66
## BidClose    83      81      82     83        0      82      83     82
## AskOpen     67      68      68     67       67       0      67     66
## AskHigh     84      83      83     83       84      83       0     84
## AskLow      75      75      75     75       74      73      75      0
## AskClose    59      60      60     60       59      59      59     58
##          AskClose
## index          69
## BidOpen        76
## BidHigh        61
## BidLow         66
## BidClose       82
## AskOpen        67
## AskHigh        83
## AskLow         73
## AskClose        0
## 
## $mm
##          index BidOpen BidHigh BidLow BidClose AskOpen AskHigh AskLow
## index       70       0       1      1        0       1       0      0
## BidOpen      0      76       1      0        2       0       1      0
## BidHigh      1       1      61      1        1       0       1      0
## BidLow       1       0       1     66        0       1       1      0
## BidClose     0       2       1      0       83       1       0      1
## AskOpen      1       0       0      1        1      68       1      2
## AskHigh      0       1       1      1        0       1      84      0
## AskLow       0       0       0      0        1       2       0     75
## AskClose     1       0       0      0        1       1       1      2
##          AskClose
## index           1
## BidOpen         0
## BidHigh         0
## BidLow          0
## BidClose        1
## AskOpen         1
## AskHigh         1
## AskLow          2
## AskClose       60

2.2.2 Tick数据转为1分钟数据

data_tm1_NA <- data_tm1 %>% 
  dplyr::select(index, BidOpen, BidHigh, BidLow, BidClose, AskOpen, AskHigh, AskLow,  AskClose) %>% 
  prodNA(noNA = 0.01)
data_tm1_NA
## # A tibble: 1,628 x 9
##    index               BidOpen BidHigh BidLow BidClose AskOpen AskHigh
##    <dttm>                <dbl>   <dbl>  <dbl>    <dbl>   <dbl>   <dbl>
##  1 2017-08-07 00:02:00    111.    111.   111.     111.    111.    111.
##  2 2017-08-07 00:03:00    111.    111.   111.     111.    111.    111.
##  3 2017-08-07 00:04:00    111.    111.   111.     111.    111.    111.
##  4 2017-08-07 00:05:00    111.    111.   111.     111.    111.    111.
##  5 2017-08-07 00:06:00    111.    111.   111.     111.    111.    111.
##  6 2017-08-07 00:07:00    111.    111.   111.     111.    111.    111.
##  7 2017-08-07 00:08:00    111.    111.   111.     111.    111.    111.
##  8 2017-08-07 00:09:00    111.    111.   111.     111.    111.    111.
##  9 2017-08-07 00:11:00    111.    111.   111.     111.    111.    111.
## 10 2017-08-07 00:17:00    111.    111.    NA      111.    111.    111.
## # ... with 1,618 more rows, and 2 more variables: AskLow <dbl>,
## #   AskClose <dbl>
data_tm1_NA %>% md.pattern

##      BidHigh BidClose AskLow BidOpen BidLow AskHigh index AskClose AskOpen
## 1485       1        1      1       1      1       1     1        1       1
## 18         1        1      1       1      1       1     1        1       0
## 18         1        1      1       1      1       1     1        0       1
## 17         1        1      1       1      1       1     0        1       1
## 15         1        1      1       1      1       0     1        1       1
## 1          1        1      1       1      1       0     0        1       1
## 16         1        1      1       1      0       1     1        1       1
## 1          1        1      1       1      0       1     1        1       0
## 17         1        1      1       0      1       1     1        1       1
## 16         1        1      0       1      1       1     1        1       1
## 12         1        0      1       1      1       1     1        1       1
## 11         0        1      1       1      1       1     1        1       1
## 1          0        1      1       1      1       0     1        1       1
##           12       12     16      17     17      17    18       18      19
##         
## 1485   0
## 18     1
## 18     1
## 17     1
## 15     1
## 1      2
## 16     1
## 1      2
## 17     1
## 16     1
## 12     1
## 11     1
## 1      2
##      146
data_tm1_NA %>% md.pairs
## $rr
##          index BidOpen BidHigh BidLow BidClose AskOpen AskHigh AskLow
## index     1610    1593    1598   1593     1598    1591    1594   1594
## BidOpen   1593    1611    1599   1594     1599    1592    1594   1595
## BidHigh   1598    1599    1616   1599     1604    1597    1600   1600
## BidLow    1593    1594    1599   1611     1599    1593    1594   1595
## BidClose  1598    1599    1604   1599     1616    1597    1599   1600
## AskOpen   1591    1592    1597   1593     1597    1609    1592   1593
## AskHigh   1594    1594    1600   1594     1599    1592    1611   1595
## AskLow    1594    1595    1600   1595     1600    1593    1595   1612
## AskClose  1592    1593    1598   1593     1598    1591    1593   1594
##          AskClose
## index        1592
## BidOpen      1593
## BidHigh      1598
## BidLow       1593
## BidClose     1598
## AskOpen      1591
## AskHigh      1593
## AskLow       1594
## AskClose     1610
## 
## $rm
##          index BidOpen BidHigh BidLow BidClose AskOpen AskHigh AskLow
## index        0      17      12     17       12      19      16     16
## BidOpen     18       0      12     17       12      19      17     16
## BidHigh     18      17       0     17       12      19      16     16
## BidLow      18      17      12      0       12      18      17     16
## BidClose    18      17      12     17        0      19      17     16
## AskOpen     18      17      12     16       12       0      17     16
## AskHigh     17      17      11     17       12      19       0     16
## AskLow      18      17      12     17       12      19      17      0
## AskClose    18      17      12     17       12      19      17     16
##          AskClose
## index          18
## BidOpen        18
## BidHigh        18
## BidLow         18
## BidClose       18
## AskOpen        18
## AskHigh        18
## AskLow         18
## AskClose        0
## 
## $mr
##          index BidOpen BidHigh BidLow BidClose AskOpen AskHigh AskLow
## index        0      18      18     18       18      18      17     18
## BidOpen     17       0      17     17       17      17      17     17
## BidHigh     12      12       0     12       12      12      11     12
## BidLow      17      17      17      0       17      16      17     17
## BidClose    12      12      12     12        0      12      12     12
## AskOpen     19      19      19     18       19       0      19     19
## AskHigh     16      17      16     17       17      17       0     17
## AskLow      16      16      16     16       16      16      16      0
## AskClose    18      18      18     18       18      18      18     18
##          AskClose
## index          18
## BidOpen        17
## BidHigh        12
## BidLow         17
## BidClose       12
## AskOpen        19
## AskHigh        17
## AskLow         16
## AskClose        0
## 
## $mm
##          index BidOpen BidHigh BidLow BidClose AskOpen AskHigh AskLow
## index       18       0       0      0        0       0       1      0
## BidOpen      0      17       0      0        0       0       0      0
## BidHigh      0       0      12      0        0       0       1      0
## BidLow       0       0       0     17        0       1       0      0
## BidClose     0       0       0      0       12       0       0      0
## AskOpen      0       0       0      1        0      19       0      0
## AskHigh      1       0       1      0        0       0      17      0
## AskLow       0       0       0      0        0       0       0     16
## AskClose     0       0       0      0        0       0       0      0
##          AskClose
## index           0
## BidOpen         0
## BidHigh         0
## BidLow          0
## BidClose        0
## AskOpen         0
## AskHigh         0
## AskLow          0
## AskClose       18

3 统计模式

3.1 弥补缺失值

tttt <- data_m1_NA[-1] %>% amelia
## -- Imputation 1 --
## 
##   1  2
## 
## -- Imputation 2 --
## 
##   1  2
## 
## -- Imputation 3 --
## 
##   1  2
## 
## -- Imputation 4 --
## 
##   1  2
## 
## -- Imputation 5 --
## 
##   1  2
llply(tttt$imputations, function(x) {
    x %>% mutate(
  VA = if_else(AskOpen <= AskHigh & AskOpen >= AskLow & 
               AskClose <= AskHigh & AskClose >= AskLow & 
               AskHigh >= AskLow, 1, 0), 
  VB = if_else(BidOpen <= BidHigh & BidOpen >= BidLow & 
               BidClose <= BidHigh & BidClose >= BidLow & 
               BidHigh >= BidLow, 1, 0)) %>% 
  dplyr::filter(VA == 0|VB == 0)
})
## $imp1
## # A tibble: 114 x 10
##    BidOpen BidHigh BidLow BidClose AskOpen AskHigh AskLow AskClose    VA
##      <dbl>   <dbl>  <dbl>    <dbl>   <dbl>   <dbl>  <dbl>    <dbl> <dbl>
##  1    104.    104.   104.     104.    104.    104.   104.     104.     0
##  2    104.    104.   104.     104.    104.    104.   104.     104.     0
##  3    104.    104.   104.     104.    104.    104.   104.     104.     1
##  4    104.    104.   104.     104.    104.    104.   104.     104.     1
##  5    104.    104.   104.     104.    104.    104.   104.     104.     0
##  6    104.    104.   104.     104.    104.    104.   104.     104.     0
##  7    104.    104.   104.     104.    104.    104.   104.     104.     1
##  8    104.    104.   104.     104.    104.    104.   104.     104.     1
##  9    104.    104.   104.     104.    104.    104.   104.     104.     1
## 10    104.    104.   104.     104.    104.    104.   104.     104.     1
## # ... with 104 more rows, and 1 more variable: VB <dbl>
## 
## $imp2
## # A tibble: 106 x 10
##    BidOpen BidHigh BidLow BidClose AskOpen AskHigh AskLow AskClose    VA
##      <dbl>   <dbl>  <dbl>    <dbl>   <dbl>   <dbl>  <dbl>    <dbl> <dbl>
##  1    104.    104.   104.     104.    104.    104.   104.     104.     0
##  2    104.    104.   104.     104.    104.    104.   104.     104.     0
##  3    104.    104.   104.     104.    104.    104.   104.     104.     1
##  4    104.    104.   104.     104.    104.    104.   104.     104.     0
##  5    104.    104.   104.     104.    104.    104.   104.     104.     1
##  6    104.    104.   104.     104.    104.    104.   104.     104.     1
##  7    104.    104.   104.     104.    104.    104.   104.     104.     1
##  8    104.    104.   104.     104.    104.    104.   104.     104.     1
##  9    104.    104.   104.     104.    104.    104.   104.     104.     1
## 10    104.    104.   104.     104.    104.    104.   104.     104.     1
## # ... with 96 more rows, and 1 more variable: VB <dbl>
## 
## $imp3
## # A tibble: 107 x 10
##    BidOpen BidHigh BidLow BidClose AskOpen AskHigh AskLow AskClose    VA
##      <dbl>   <dbl>  <dbl>    <dbl>   <dbl>   <dbl>  <dbl>    <dbl> <dbl>
##  1    104.    104.   104.     104.    104.    104.   104.     104.     0
##  2    104.    104.   104.     104.    104.    104.   104.     104.     0
##  3    104.    104.   104.     104.    104.    104.   104.     104.     0
##  4    104.    104.   104.     104.    104.    104.   104.     104.     0
##  5    104.    104.   104.     104.    104.    104.   104.     104.     1
##  6    104.    104.   104.     104.    104.    104.   104.     104.     0
##  7    104.    104.   104.     104.    104.    104.   104.     104.     0
##  8    104.    104.   104.     104.    104.    104.   104.     104.     1
##  9    104.    104.   104.     104.    104.    104.   104.     104.     1
## 10    104.    104.   104.     104.    104.    104.   104.     104.     1
## # ... with 97 more rows, and 1 more variable: VB <dbl>
## 
## $imp4
## # A tibble: 96 x 10
##    BidOpen BidHigh BidLow BidClose AskOpen AskHigh AskLow AskClose    VA
##      <dbl>   <dbl>  <dbl>    <dbl>   <dbl>   <dbl>  <dbl>    <dbl> <dbl>
##  1    104.    104.   104.     104.    104.    104.   104.     104.     0
##  2    104.    104.   104.     104.    104.    104.   104.     104.     0
##  3    104.    104.   104.     104.    104.    104.   104.     104.     0
##  4    104.    104.   104.     104.    104.    104.   104.     104.     1
##  5    104.    104.   104.     104.    104.    104.   104.     104.     1
##  6    104.    104.   104.     104.    104.    104.   104.     104.     1
##  7    104.    104.   104.     104.    104.    104.   104.     104.     1
##  8    104.    104.   104.     104.    104.    104.   104.     104.     1
##  9    104.    104.   104.     104.    104.    104.   104.     104.     0
## 10    104     104.   104.     104.    104.    104.   104.     104.     0
## # ... with 86 more rows, and 1 more variable: VB <dbl>
## 
## $imp5
## # A tibble: 95 x 10
##    BidOpen BidHigh BidLow BidClose AskOpen AskHigh AskLow AskClose    VA
##      <dbl>   <dbl>  <dbl>    <dbl>   <dbl>   <dbl>  <dbl>    <dbl> <dbl>
##  1    104.    104.   104.     104.    104.    104.   104.     104.     0
##  2    104.    104.   104.     104.    104.    104.   104.     104.     1
##  3    104.    104.   104.     104.    104.    104.   104.     104.     0
##  4    104.    104.   104.     104.    104.    104.   104.     104.     1
##  5    104.    104.   104.     104.    104.    104.   104.     104.     0
##  6    104.    104.   104.     104.    104.    104.   104.     104.     1
##  7    104.    104.   104.     104.    104.    104.   104.     104.     1
##  8    104.    104.   104.     104.    104.    104.   104.     104.     1
##  9    104.    104.   104.     104.    104.    104.   104.     104.     1
## 10    104.    104.   104.     104.    104.    104.   104.     104.     1
## # ... with 85 more rows, and 1 more variable: VB <dbl>

经过测试以上数据,结果发现amelia也是单变量数据弥补。

注释:单变量弥补的数据将会与之前单变量预测数据一样,就是出现偏差,例如:

  • 开市价高于最高价
  • 开市价低于最低价
  • 最高价低于开市价
  • 最高价低于最低价
  • 最高价低于闭市价
  • 最低价高于开市价
  • 最低价高于最高价
  • 最低价高于闭市价
  • 闭市价高于最高价
  • 闭市价低于最低价

3.2 1% 缺失值

3.2.1 1分钟数据

以下使用imputeTS::na.seadec()弥补1%数据缺失值。

data_m1_NA <- data_m1 %>% 
  dplyr::select(BidOpen, BidHigh, BidLow, BidClose, 
                AskOpen, AskHigh, AskLow,  AskClose) %>% 
  prodNA(noNA = 0.01) %>% 
  cbind(data_m1[1], .) %>% tbl_df

data_m1_1_impTS <- llply(algo, function(x) {
  data_m1_NA %>% 
    dplyr::select(starts_with('Ask'), starts_with('Bid')) %>% 
    map(na.seadec, algorithm = x) %>% as.tibble
  })
names(data_m1_1_impTS) <- algo
data_m1_1_impTS %<>% ldply %>% tbl_df

data_m1_1_impTS %<>% mutate(
  bias.open = if_else(AskOpen>AskHigh|AskOpen<AskLow, 1, 0), 
  bias.high = if_else(AskHigh<AskOpen|AskHigh<AskLow|AskHigh<AskClose, 1, 0), 
  bias.low = if_else(AskLow>AskOpen|AskLow>AskHigh|AskLow>AskClose, 1, 0), 
  bias.close = if_else(AskClose>AskHigh|AskClose<AskLow, 1, 0))

data_m1_1_impTS %>% 
  dplyr::filter(bias.open==1|bias.high==1|bias.low==1|bias.close==1)
## # A tibble: 702 x 13
##    .id   AskOpen AskHigh AskLow AskClose BidOpen BidHigh BidLow BidClose
##    <chr>   <dbl>   <dbl>  <dbl>    <dbl>   <dbl>   <dbl>  <dbl>    <dbl>
##  1 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  2 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  3 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  4 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  5 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  6 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  7 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  8 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  9 inte~    104.    104.   104.     104.    104.    104.   104.     104.
## 10 inte~    104.    104.   104.     104.    104.    104.   104.     104.
## # ... with 692 more rows, and 4 more variables: bias.open <dbl>,
## #   bias.high <dbl>, bias.low <dbl>, bias.close <dbl>
data_m1_1_impTS %<>% 
  ddply(.(.id), summarise, 
        AskOpen = mean((AskOpen - data_m1$AskOpen)^2), 
        AskHigh = mean((AskHigh - data_m1$AskHigh)^2), 
        AskLow = mean((AskLow - data_m1$AskLow)^2), 
        AskClose = mean((AskClose - data_m1$AskClose)^2), 
        Mean.HLC = (AskHigh + AskLow + AskClose)/3, 
        Mean.OHLC = (AskOpen + AskHigh + AskLow + AskClose)/4, 
        bias.open = sum(bias.open)/length(bias.open), 
        bias.high = sum(bias.high)/length(bias.high), 
        bias.low = sum(bias.low)/length(bias.low), 
        bias.close = sum(bias.close)/length(bias.close)) %>% tbl_df

data_m1_1_impTS %>% 
  kable(caption = 'MSE') %>% 
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
  scroll_box(width = '100%')#, height = '400px')
MSE
.id AskOpen AskHigh AskLow AskClose Mean.HLC Mean.OHLC bias.open bias.high bias.low bias.close
interpolation 0.0000006 0.0000008 0.0000009 0.0000013 0.0000010 0.0000009 0.0048958 0.0062946 0.0048958 0.0062946
kalman 0.0000006 0.0000008 0.0000009 0.0000013 0.0000010 0.0000009 0.0050357 0.0064345 0.0050357 0.0064345
locf 0.0000014 0.0000031 0.0000018 0.0000036 0.0000029 0.0000025 0.0037768 0.0053154 0.0043363 0.0058749
ma 0.0000007 0.0000014 0.0000012 0.0000013 0.0000013 0.0000012 0.0079731 0.0088124 0.0067142 0.0086725
mean 0.0005507 0.0005905 0.0004618 0.0006969 0.0005831 0.0005750 0.0194433 0.0179046 0.0205623 0.0194433
random 0.0024076 0.0006038 0.0023222 0.0008296 0.0012519 0.0015408 0.0145475 0.0093719 0.0195832 0.0145475

以下使用Amelia::amelia()弥补1%数据缺失值。

data_m1_1_amelia <- data_m1_NA %>% 
  amelia %>% 
  .$imputations %>% 
  ldply %>% tbl_df
## -- Imputation 1 --
## 
##   1  2
## 
## -- Imputation 2 --
## 
##   1  2
## 
## -- Imputation 3 --
## 
##   1  2
## 
## -- Imputation 4 --
## 
##   1  2
## 
## -- Imputation 5 --
## 
##   1  2
data_m1_1_amelia %>% anyNA
## [1] FALSE
data_m1_1_amelia %<>% mutate(
  bias.open = if_else(AskOpen>AskHigh|AskOpen<AskLow, 1, 0), 
  bias.high = if_else(AskHigh<AskOpen|AskHigh<AskLow|AskHigh<AskClose, 1, 0), 
  bias.low = if_else(AskLow>AskOpen|AskLow>AskHigh|AskLow>AskClose, 1, 0), 
  bias.close = if_else(AskClose>AskHigh|AskClose<AskLow, 1, 0))

data_m1_1_amelia %>% 
  dplyr::filter(bias.open==1|bias.high==1|bias.low==1|bias.close==1)
## # A tibble: 237 x 14
##    .id   index               BidOpen BidHigh BidLow BidClose AskOpen
##    <chr> <dttm>                <dbl>   <dbl>  <dbl>    <dbl>   <dbl>
##  1 imp1  2016-10-17 00:37:00    104.    104.   104.     104.    104.
##  2 imp1  2016-10-17 02:44:00    104.    104.   104.     104.    104.
##  3 imp1  2016-10-17 02:46:00    104.    104.   104.     104.    104.
##  4 imp1  2016-10-17 07:04:00    104.    104.   104.     104.    104.
##  5 imp1  2016-10-17 12:23:00    104.    104.   104.     104.    104.
##  6 imp1  2016-10-17 16:06:00    104.    104.   104.     104.    104.
##  7 imp1  2016-10-17 17:35:00    104.    104.   104.     104.    104.
##  8 imp1  2016-10-17 19:19:00    104.    104.   104.     104.    104.
##  9 imp1  2016-10-17 23:38:00    104.    104.   104.     104.    104.
## 10 imp1  2016-10-18 00:27:00    104.    104.   104.     104.    104.
## # ... with 227 more rows, and 7 more variables: AskHigh <dbl>,
## #   AskLow <dbl>, AskClose <dbl>, bias.open <dbl>, bias.high <dbl>,
## #   bias.low <dbl>, bias.close <dbl>
data_m1_1_amelia %<>% 
  ddply(.(.id), summarise, 
        AskOpen = mean((AskOpen - data_m1$AskOpen)^2), 
        AskHigh = mean((AskHigh - data_m1$AskHigh)^2), 
        AskLow = mean((AskLow - data_m1$AskLow)^2), 
        AskClose = mean((AskClose - data_m1$AskClose)^2), 
        Mean.HLC = (AskHigh + AskLow + AskClose)/3, 
        Mean.OHLC = (AskOpen + AskHigh + AskLow + AskClose)/4, 
        bias.open = sum(bias.open)/length(bias.open), 
        bias.high = sum(bias.high)/length(bias.high), 
        bias.low = sum(bias.low)/length(bias.low), 
        bias.close = sum(bias.close)/length(bias.close)) %>% tbl_df

data_m1_1_amelia %>% 
  kable(caption = 'MSE') %>% 
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
  scroll_box(width = '100%')#, height = '400px')
MSE
.id AskOpen AskHigh AskLow AskClose Mean.HLC Mean.OHLC bias.open bias.high bias.low bias.close
imp1 0 1e-07 1e-07 3e-07 1e-07 1e-07 0.0039166 0.0034970 0.0030774 0.0029375
imp2 0 1e-07 0e+00 2e-07 1e-07 1e-07 0.0032172 0.0034970 0.0029375 0.0036369
imp3 0 0e+00 0e+00 2e-07 1e-07 1e-07 0.0036369 0.0033571 0.0030774 0.0032172
imp4 0 1e-07 0e+00 2e-07 1e-07 1e-07 0.0037768 0.0036369 0.0029375 0.0032172
imp5 0 0e+00 0e+00 2e-07 1e-07 1e-07 0.0043363 0.0043363 0.0034970 0.0040565

以下使用tidyr::fill()弥补1%数据缺失值。

data_m1_1_tidyr <- data_m1_NA %>% 
  fill(BidOpen, BidHigh, BidLow, BidClose, 
       AskOpen, AskHigh, AskLow, AskClose) %>% #default direction down
  fill(BidOpen, BidHigh, BidLow, BidClose, 
       AskOpen, AskHigh, AskLow, AskClose, .direction = 'up')

data_m1_1_tidyr %>% anyNA
## [1] FALSE
data_m1_1_tidyr %<>% mutate(
  bias.open = if_else(AskOpen>AskHigh|AskOpen<AskLow, 1, 0), 
  bias.high = if_else(AskHigh<AskOpen|AskHigh<AskLow|AskHigh<AskClose, 1, 0), 
  bias.low = if_else(AskLow>AskOpen|AskLow>AskHigh|AskLow>AskClose, 1, 0), 
  bias.close = if_else(AskClose>AskHigh|AskClose<AskLow, 1, 0))

data_m1_1_tidyr %>% 
  dplyr::filter(bias.open==1|bias.high==1|bias.low==1|bias.close==1)
## # A tibble: 69 x 13
##    index               BidOpen BidHigh BidLow BidClose AskOpen AskHigh
##    <dttm>                <dbl>   <dbl>  <dbl>    <dbl>   <dbl>   <dbl>
##  1 2016-10-17 02:25:00    104.    104.   104.     104.    104.    104.
##  2 2016-10-17 02:34:00    104.    104.   104.     104.    104.    104.
##  3 2016-10-17 03:52:00    104.    104.   104.     104.    104.    104.
##  4 2016-10-17 04:04:00    104.    104.   104.     104.    104.    104.
##  5 2016-10-17 04:46:00    104.    104.   104.     104.    104.    104.
##  6 2016-10-17 08:00:00    104.    104.   104.     104.    104.    104.
##  7 2016-10-17 10:16:00    104.    104.   104.     104.    104.    104.
##  8 2016-10-17 12:23:00    104.    104.   104.     104.    104.    104.
##  9 2016-10-17 14:11:00    104.    104.   104.     104.    104.    104.
## 10 2016-10-17 16:58:00    104.    104.   104.     104.    104.    104.
## # ... with 59 more rows, and 6 more variables: AskLow <dbl>,
## #   AskClose <dbl>, bias.open <dbl>, bias.high <dbl>, bias.low <dbl>,
## #   bias.close <dbl>
data_m1_1_tidyr %<>% 
  summarise(
    AskOpen = mean((AskOpen - data_m1$AskOpen)^2), 
    AskHigh = mean((AskHigh - data_m1$AskHigh)^2), 
    AskLow = mean((AskLow - data_m1$AskLow)^2), 
    AskClose = mean((AskClose - data_m1$AskClose)^2), 
    Mean.HLC = (AskHigh + AskLow + AskClose)/3, 
    Mean.OHLC = (AskOpen + AskHigh + AskLow + AskClose)/4, 
    bias.open = sum(bias.open)/length(bias.open), 
    bias.high = sum(bias.high)/length(bias.high), 
    bias.low = sum(bias.low)/length(bias.low), 
    bias.close = sum(bias.close)/length(bias.close)) %>% tbl_df

data_m1_1_tidyr %>% 
  kable(caption = 'MSE') %>% 
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
  scroll_box(width = '100%')#, height = '400px')
MSE
AskOpen AskHigh AskLow AskClose Mean.HLC Mean.OHLC bias.open bias.high bias.low bias.close
1.4e-06 3.1e-06 1.8e-06 3.6e-06 2.9e-06 2.5e-06 0.0037768 0.0053154 0.0043363 0.0058749

3.2.2 Tick数据转为1分钟数据

以下使用imputeTS::na.seadec()弥补1%数据缺失值。

data_tm1_NA <- data_tm1 %>% 
  dplyr::select(BidOpen, BidHigh, BidLow, BidClose, 
                AskOpen, AskHigh, AskLow,  AskClose) %>% 
  prodNA(noNA = 0.01) %>% 
  cbind(data_tm1[1], .) %>% tbl_df

data_tm1_1_impTS <- llply(algo, function(x) {
  data_tm1_NA %>% 
    dplyr::select(starts_with('Ask'), starts_with('Bid')) %>% 
    map(na.seadec, algorithm = x) %>% as.tibble
  })
names(data_tm1_1_impTS) <- algo
data_tm1_1_impTS %<>% ldply %>% tbl_df

data_tm1_1_impTS %<>% mutate(
  bias.open = if_else(AskOpen>AskHigh|AskOpen<AskLow, 1, 0), 
  bias.high = if_else(AskHigh<AskOpen|AskHigh<AskLow|AskHigh<AskClose, 1, 0), 
  bias.low = if_else(AskLow>AskOpen|AskLow>AskHigh|AskLow>AskClose, 1, 0), 
  bias.close = if_else(AskClose>AskHigh|AskClose<AskLow, 1, 0))

data_tm1_1_impTS %>% 
  dplyr::filter(bias.open==1|bias.high==1|bias.low==1|bias.close==1)
## # A tibble: 143 x 13
##    .id   AskOpen AskHigh AskLow AskClose BidOpen BidHigh BidLow BidClose
##    <chr>   <dbl>   <dbl>  <dbl>    <dbl>   <dbl>   <dbl>  <dbl>    <dbl>
##  1 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  2 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  3 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  4 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  5 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  6 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  7 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  8 locf     111.    111.   111.     111.    111.    111.   111.     111.
##  9 locf     111.    111.   111.     111.    111.    111.   111.     111.
## 10 locf     111.    111.   111.     111.    111.    111.   111.     111.
## # ... with 133 more rows, and 4 more variables: bias.open <dbl>,
## #   bias.high <dbl>, bias.low <dbl>, bias.close <dbl>
data_tm1_1_impTS %<>% 
  ddply(.(.id), summarise, 
        AskOpen = mean((AskOpen - data_m1$AskOpen)^2), 
        AskHigh = mean((AskHigh - data_m1$AskHigh)^2), 
        AskLow = mean((AskLow - data_m1$AskLow)^2), 
        AskClose = mean((AskClose - data_m1$AskClose)^2), 
        Mean.HLC = (AskHigh + AskLow + AskClose)/3, 
        Mean.OHLC = (AskOpen + AskHigh + AskLow + AskClose)/4, 
        bias.open = sum(bias.open)/length(bias.open), 
        bias.high = sum(bias.high)/length(bias.high), 
        bias.low = sum(bias.low)/length(bias.low), 
        bias.close = sum(bias.close)/length(bias.close)) %>% tbl_df

data_tm1_1_impTS %>% 
  kable(caption = 'MSE') %>% 
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
  scroll_box(width = '100%')#, height = '400px')
MSE
.id AskOpen AskHigh AskLow AskClose Mean.HLC Mean.OHLC bias.open bias.high bias.low bias.close
interpolation 48.06478 48.03953 48.09285 48.06702 48.06647 48.06605 0.0024570 0.0012285 0.0030713 0.0018428
kalman 48.06484 48.03953 48.09285 48.06702 48.06646 48.06606 0.0030713 0.0024570 0.0030713 0.0024570
locf 48.06471 48.03943 48.09250 48.06706 48.06633 48.06592 0.0079853 0.0067568 0.0030713 0.0024570
ma 48.06507 48.03947 48.09286 48.06706 48.06647 48.06612 0.0055283 0.0055283 0.0042998 0.0055283
mean 48.06836 48.03966 48.09137 48.06870 48.06658 48.06702 0.0165848 0.0184275 0.0128993 0.0159705
random 48.06803 48.03820 48.10323 48.06020 48.06721 48.06742 0.0221130 0.0178133 0.0233415 0.0196560

以下使用Amelia::amelia()弥补1%数据缺失值。

data_tm1_1_amelia <- data_tm1_NA %>% 
  amelia %>% 
  .$imputations %>% 
  ldply %>% tbl_df
## -- Imputation 1 --
## 
##   1  2
## 
## -- Imputation 2 --
## 
##   1  2
## 
## -- Imputation 3 --
## 
##   1  2
## 
## -- Imputation 4 --
## 
##   1  2
## 
## -- Imputation 5 --
## 
##   1  2
data_tm1_1_amelia %>% anyNA
## [1] FALSE
data_tm1_1_amelia %<>% mutate(
  bias.open = if_else(AskOpen>AskHigh|AskOpen<AskLow, 1, 0), 
  bias.high = if_else(AskHigh<AskOpen|AskHigh<AskLow|AskHigh<AskClose, 1, 0), 
  bias.low = if_else(AskLow>AskOpen|AskLow>AskHigh|AskLow>AskClose, 1, 0), 
  bias.close = if_else(AskClose>AskHigh|AskClose<AskLow, 1, 0))

data_tm1_1_amelia %>% 
  dplyr::filter(bias.open==1|bias.high==1|bias.low==1|bias.close==1)
## # A tibble: 67 x 14
##    .id   index               BidOpen BidHigh BidLow BidClose AskOpen
##    <chr> <dttm>                <dbl>   <dbl>  <dbl>    <dbl>   <dbl>
##  1 imp1  2017-08-07 03:37:00    111.    111.   111.     111.    111.
##  2 imp1  2017-08-07 04:59:00    111.    111.   111.     111.    111.
##  3 imp1  2017-08-07 05:33:00    111.    111.   111.     111.    111.
##  4 imp1  2017-08-07 07:13:00    111.    111.   111.     111.    111.
##  5 imp1  2017-08-07 11:32:00    111.    111.   111.     111.    111.
##  6 imp1  2017-08-07 13:46:00    111.    111.   111.     111.    111.
##  7 imp1  2017-08-07 18:02:00    111.    111.   111.     111.    111.
##  8 imp1  2017-08-07 18:38:00    111.    111.   111.     111.    111.
##  9 imp1  2017-08-07 20:20:00    111.    111.   111.     111.    111.
## 10 imp1  2017-08-07 20:25:00    111.    111.   111.     111.    111.
## # ... with 57 more rows, and 7 more variables: AskHigh <dbl>,
## #   AskLow <dbl>, AskClose <dbl>, bias.open <dbl>, bias.high <dbl>,
## #   bias.low <dbl>, bias.close <dbl>
data_tm1_1_amelia %<>% 
  ddply(.(.id), summarise, 
        AskOpen = mean((AskOpen - data_m1$AskOpen)^2), 
        AskHigh = mean((AskHigh - data_m1$AskHigh)^2), 
        AskLow = mean((AskLow - data_m1$AskLow)^2), 
        AskClose = mean((AskClose - data_m1$AskClose)^2), 
        Mean.HLC = (AskHigh + AskLow + AskClose)/3, 
        Mean.OHLC = (AskOpen + AskHigh + AskLow + AskClose)/4, 
        bias.open = sum(bias.open)/length(bias.open), 
        bias.high = sum(bias.high)/length(bias.high), 
        bias.low = sum(bias.low)/length(bias.low), 
        bias.close = sum(bias.close)/length(bias.close)) %>% tbl_df

data_tm1_1_amelia %>% 
  kable(caption = 'MSE') %>% 
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
  scroll_box(width = '100%')#, height = '400px')
MSE
.id AskOpen AskHigh AskLow AskClose Mean.HLC Mean.OHLC bias.open bias.high bias.low bias.close
imp1 48.06482 48.03931 48.09267 48.06713 48.06637 48.06598 0.0055283 0.0055283 0.0030713 0.0030713
imp2 48.06482 48.03929 48.09266 48.06683 48.06626 48.06590 0.0055283 0.0042998 0.0055283 0.0049140
imp3 48.06477 48.03933 48.09253 48.06716 48.06634 48.06595 0.0036855 0.0036855 0.0024570 0.0024570
imp4 48.06483 48.03946 48.09270 48.06697 48.06638 48.06599 0.0036855 0.0030713 0.0042998 0.0036855
imp5 48.06493 48.03940 48.09263 48.06691 48.06631 48.06597 0.0049140 0.0042998 0.0049140 0.0042998

以下使用tidyr::fill()弥补1%数据缺失值。

data_tm1_1_tidyr <- data_tm1_NA %>% 
  fill(BidOpen, BidHigh, BidLow, BidClose, 
       AskOpen, AskHigh, AskLow, AskClose) %>% #default direction down
  fill(BidOpen, BidHigh, BidLow, BidClose, 
       AskOpen, AskHigh, AskLow, AskClose, .direction = 'up')

data_tm1_1_tidyr %>% anyNA
## [1] FALSE
data_tm1_1_tidyr %<>% mutate(
  bias.open = if_else(AskOpen>AskHigh|AskOpen<AskLow, 1, 0), 
  bias.high = if_else(AskHigh<AskOpen|AskHigh<AskLow|AskHigh<AskClose, 1, 0), 
  bias.low = if_else(AskLow>AskOpen|AskLow>AskHigh|AskLow>AskClose, 1, 0), 
  bias.close = if_else(AskClose>AskHigh|AskClose<AskLow, 1, 0))

data_tm1_1_tidyr %>% 
  dplyr::filter(bias.open==1|bias.high==1|bias.low==1|bias.close==1)
## # A tibble: 16 x 13
##    index               BidOpen BidHigh BidLow BidClose AskOpen AskHigh
##    <dttm>                <dbl>   <dbl>  <dbl>    <dbl>   <dbl>   <dbl>
##  1 2017-08-07 01:05:00    111.    111.   111.     111.    111.    111.
##  2 2017-08-07 01:26:00    111.    111.   111.     111.    111.    111.
##  3 2017-08-07 04:59:00    111.    111.   111.     111.    111.    111.
##  4 2017-08-07 05:14:00    111.    111.   111.     111.    111.    111.
##  5 2017-08-07 09:31:00    111.    111.   111.     111.    111.    111.
##  6 2017-08-07 11:53:00    111.    111.   111.     111.    111.    111.
##  7 2017-08-07 13:06:00    111.    111.   111.     111.    111.    111.
##  8 2017-08-07 16:49:00    111.    111.   111.     111.    111.    111.
##  9 2017-08-07 18:02:00    111.    111.   111.     111.    111.    111.
## 10 2017-08-07 18:07:00    111.    111.   111.     111.    111.    111.
## 11 2017-08-07 18:38:00    111.    111.   111.     111.    111.    111.
## 12 2017-08-07 18:53:00    111.    111.   111.     111.    111.    111.
## 13 2017-08-07 20:47:00    111.    111.   111.     111.    111.    111.
## 14 2017-08-07 23:18:00    111.    111.   111.     111.    111.    111.
## 15 2017-08-08 02:07:00    111.    111.   111.     111.    111.    111.
## 16 2017-08-08 03:28:00    111.    111.   111.     111.    111.    111.
## # ... with 6 more variables: AskLow <dbl>, AskClose <dbl>,
## #   bias.open <dbl>, bias.high <dbl>, bias.low <dbl>, bias.close <dbl>
data_tm1_1_tidyr %<>% 
  summarise(
    AskOpen = mean((AskOpen - data_m1$AskOpen)^2), 
    AskHigh = mean((AskHigh - data_m1$AskHigh)^2), 
    AskLow = mean((AskLow - data_m1$AskLow)^2), 
    AskClose = mean((AskClose - data_m1$AskClose)^2), 
    Mean.HLC = (AskHigh + AskLow + AskClose)/3, 
    Mean.OHLC = (AskOpen + AskHigh + AskLow + AskClose)/4, 
    bias.open = sum(bias.open)/length(bias.open), 
    bias.high = sum(bias.high)/length(bias.high), 
    bias.low = sum(bias.low)/length(bias.low), 
    bias.close = sum(bias.close)/length(bias.close)) %>% tbl_df

data_tm1_1_tidyr %>% 
  kable(caption = 'MSE') %>% 
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
  scroll_box(width = '100%')#, height = '400px')
MSE
AskOpen AskHigh AskLow AskClose Mean.HLC Mean.OHLC bias.open bias.high bias.low bias.close
48.06471 48.03943 48.0925 48.06706 48.06633 48.06592 0.0079853 0.0067568 0.0030713 0.002457

3.3 10% 缺失值

3.3.1 1分钟数据

以下使用imputeTS::na.seadec()弥补10%数据缺失值。

data_m1_NA <- data_m1 %>% 
  dplyr::select(BidOpen, BidHigh, BidLow, BidClose, 
                AskOpen, AskHigh, AskLow,  AskClose) %>% 
  prodNA(noNA = 0.1) %>% 
  cbind(data_m1[1], .) %>% tbl_df

data_m1_10_impTS <- llply(algo, function(x) {
  data_m1_NA %>% 
    dplyr::select(starts_with('Ask'), starts_with('Bid')) %>% 
    map(na.seadec, algorithm = x) %>% as.tibble
  })
names(data_m1_10_impTS) <- algo
data_m1_10_impTS %<>% ldply %>% tbl_df

data_m1_10_impTS %<>% mutate(
  bias.open = if_else(AskOpen>AskHigh|AskOpen<AskLow, 1, 0), 
  bias.high = if_else(AskHigh<AskOpen|AskHigh<AskLow|AskHigh<AskClose, 1, 0), 
  bias.low = if_else(AskLow>AskOpen|AskLow>AskHigh|AskLow>AskClose, 1, 0), 
  bias.close = if_else(AskClose>AskHigh|AskClose<AskLow, 1, 0))

data_m1_10_impTS %>% 
  dplyr::filter(bias.open==1|bias.high==1|bias.low==1|bias.close==1)
## # A tibble: 6,105 x 13
##    .id   AskOpen AskHigh AskLow AskClose BidOpen BidHigh BidLow BidClose
##    <chr>   <dbl>   <dbl>  <dbl>    <dbl>   <dbl>   <dbl>  <dbl>    <dbl>
##  1 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  2 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  3 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  4 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  5 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  6 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  7 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  8 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  9 inte~    104.    104.   104.     104.    104.    104.   104.     104.
## 10 inte~    104.    104.   104.     104.    104.    104.   104.     104.
## # ... with 6,095 more rows, and 4 more variables: bias.open <dbl>,
## #   bias.high <dbl>, bias.low <dbl>, bias.close <dbl>
data_m1_10_impTS %<>% 
  ddply(.(.id), summarise, 
        AskOpen = mean((AskOpen - data_m1$AskOpen)^2), 
        AskHigh = mean((AskHigh - data_m1$AskHigh)^2), 
        AskLow = mean((AskLow - data_m1$AskLow)^2), 
        AskClose = mean((AskClose - data_m1$AskClose)^2), 
        Mean.HLC = (AskHigh + AskLow + AskClose)/3, 
        Mean.OHLC = (AskOpen + AskHigh + AskLow + AskClose)/4, 
        bias.open = sum(bias.open)/length(bias.open), 
        bias.high = sum(bias.high)/length(bias.high), 
        bias.low = sum(bias.low)/length(bias.low), 
        bias.close = sum(bias.close)/length(bias.close)) %>% tbl_df

data_m1_10_impTS %>% 
  kable(caption = 'MSE 10% 缺失值') %>% 
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
  scroll_box(width = '100%')#, height = '400px')
MSE 10% 缺失
.id AskOpen AskHigh AskLow AskClose Mean.HLC Mean.OHLC bias.open bias.high bias.low bias.close
interpolation 0.0000090 0.0000071 0.0000075 0.0000095 0.0000080 0.0000083 0.0495174 0.0479787 0.0492377 0.0496573
kalman 0.0000090 0.0000071 0.0000075 0.0000096 0.0000081 0.0000083 0.0513359 0.0500769 0.0509162 0.0517555
locf 0.0000190 0.0000160 0.0000167 0.0000208 0.0000178 0.0000181 0.0530144 0.0507763 0.0471395 0.0464401
ma 0.0000108 0.0000100 0.0000107 0.0000129 0.0000112 0.0000111 0.0672821 0.0636453 0.0614072 0.0665827
mean 0.0057660 0.0063513 0.0060217 0.0066473 0.0063401 0.0061966 0.1719122 0.1493915 0.1814240 0.1751294
random 0.0077868 0.0226350 0.0087245 0.0291799 0.0201798 0.0170816 0.1092460 0.0870052 0.1375017 0.1214156

以下使用Amelia::amelia()弥补10%数据缺失值。

data_m1_10_amelia <- data_m1_NA %>% 
  amelia %>% 
  .$imputations %>% 
  ldply %>% tbl_df
## -- Imputation 1 --
## 
##   1  2
## 
## -- Imputation 2 --
## 
##   1  2
## 
## -- Imputation 3 --
## 
##   1  2
## 
## -- Imputation 4 --
## 
##   1  2
## 
## -- Imputation 5 --
## 
##   1  2
data_m1_10_amelia %>% anyNA
## [1] FALSE
data_m1_10_amelia %<>% mutate(
  bias.open = if_else(AskOpen>AskHigh|AskOpen<AskLow, 1, 0), 
  bias.high = if_else(AskHigh<AskOpen|AskHigh<AskLow|AskHigh<AskClose, 1, 0), 
  bias.low = if_else(AskLow>AskOpen|AskLow>AskHigh|AskLow>AskClose, 1, 0), 
  bias.close = if_else(AskClose>AskHigh|AskClose<AskLow, 1, 0))

data_m1_10_amelia %>% 
  dplyr::filter(bias.open==1|bias.high==1|bias.low==1|bias.close==1)
## # A tibble: 2,601 x 14
##    .id   index               BidOpen BidHigh BidLow BidClose AskOpen
##    <chr> <dttm>                <dbl>   <dbl>  <dbl>    <dbl>   <dbl>
##  1 imp1  2016-10-17 00:01:00    104.    104.   104.     104.    104.
##  2 imp1  2016-10-17 00:02:00    104.    104.   104.     104.    104.
##  3 imp1  2016-10-17 00:11:00    104.    104.   104.     104.    104.
##  4 imp1  2016-10-17 00:12:00    104.    104.   104.     104.    104.
##  5 imp1  2016-10-17 00:13:00    104.    104.   104.     104.    104.
##  6 imp1  2016-10-17 00:16:00    104.    104.   104.     104.    104.
##  7 imp1  2016-10-17 00:22:00    104.    104.   104.     104.    104.
##  8 imp1  2016-10-17 00:30:00    104.    104.   104.     104.    104.
##  9 imp1  2016-10-17 00:36:00    104.    104.   104.     104.    104.
## 10 imp1  2016-10-17 00:39:00    104.    104.   104.     104.    104.
## # ... with 2,591 more rows, and 7 more variables: AskHigh <dbl>,
## #   AskLow <dbl>, AskClose <dbl>, bias.open <dbl>, bias.high <dbl>,
## #   bias.low <dbl>, bias.close <dbl>
data_m1_10_amelia %<>% 
  ddply(.(.id), summarise, 
        AskOpen = mean((AskOpen - data_m1$AskOpen)^2), 
        AskHigh = mean((AskHigh - data_m1$AskHigh)^2), 
        AskLow = mean((AskLow - data_m1$AskLow)^2), 
        AskClose = mean((AskClose - data_m1$AskClose)^2), 
        Mean.HLC = (AskHigh + AskLow + AskClose)/3, 
        Mean.OHLC = (AskOpen + AskHigh + AskLow + AskClose)/4, 
        bias.open = sum(bias.open)/length(bias.open), 
        bias.high = sum(bias.high)/length(bias.high), 
        bias.low = sum(bias.low)/length(bias.low), 
        bias.close = sum(bias.close)/length(bias.close)) %>% tbl_df

data_m1_10_amelia %>% 
  kable(caption = 'MSE') %>% 
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
  scroll_box(width = '100%')#, height = '400px')
MSE
.id AskOpen AskHigh AskLow AskClose Mean.HLC Mean.OHLC bias.open bias.high bias.low bias.close
imp1 1.0e-06 1.3e-06 1.1e-06 1.1e-06 1.2e-06 1.1e-06 0.0430829 0.0442020 0.0348300 0.0386068
imp2 1.1e-06 9.0e-07 1.1e-06 9.0e-07 1.0e-06 1.0e-06 0.0432228 0.0395860 0.0372080 0.0365086
imp3 9.0e-07 1.2e-06 1.1e-06 1.0e-06 1.1e-06 1.1e-06 0.0440621 0.0432228 0.0370681 0.0381872
imp4 1.2e-06 1.1e-06 1.1e-06 1.0e-06 1.1e-06 1.1e-06 0.0402854 0.0415443 0.0345503 0.0379074
imp5 1.1e-06 1.1e-06 1.1e-06 9.0e-07 1.0e-06 1.1e-06 0.0416842 0.0401455 0.0348300 0.0359491

以下使用tidyr::fill()弥补10%数据缺失值。

data_m1_10_tidyr <- data_m1_NA %>% 
  fill(BidOpen, BidHigh, BidLow, BidClose, 
       AskOpen, AskHigh, AskLow, AskClose) %>% #default direction down
  fill(BidOpen, BidHigh, BidLow, BidClose, 
       AskOpen, AskHigh, AskLow, AskClose, .direction = 'up')

data_m1_10_tidyr %>% anyNA
## [1] FALSE
data_m1_10_tidyr %<>% mutate(
  bias.open = if_else(AskOpen>AskHigh|AskOpen<AskLow, 1, 0), 
  bias.high = if_else(AskHigh<AskOpen|AskHigh<AskLow|AskHigh<AskClose, 1, 0), 
  bias.low = if_else(AskLow>AskOpen|AskLow>AskHigh|AskLow>AskClose, 1, 0), 
  bias.close = if_else(AskClose>AskHigh|AskClose<AskLow, 1, 0))

data_m1_10_tidyr %>% 
  dplyr::filter(bias.open==1|bias.high==1|bias.low==1|bias.close==1)
## # A tibble: 674 x 13
##    index               BidOpen BidHigh BidLow BidClose AskOpen AskHigh
##    <dttm>                <dbl>   <dbl>  <dbl>    <dbl>   <dbl>   <dbl>
##  1 2016-10-17 00:01:00    104.    104.   104.     104.    104.    104.
##  2 2016-10-17 00:31:00    104.    104.   104.     104.    104.    104.
##  3 2016-10-17 01:20:00    104.    104.   104.     104.    104.    104.
##  4 2016-10-17 01:30:00    104.    104.   104.     104.    104.    104.
##  5 2016-10-17 01:37:00    104.    104.   104.     104.    104.    104.
##  6 2016-10-17 01:40:00    104.    104.   104.     104.    104.    104.
##  7 2016-10-17 01:49:00    104.    104.   104.     104.    104.    104.
##  8 2016-10-17 01:57:00    104.    104.   104.     104.    104.    104.
##  9 2016-10-17 02:19:00    104.    104.   104.     104.    104.    104.
## 10 2016-10-17 02:25:00    104.    104.   104.     104.    104.    104.
## # ... with 664 more rows, and 6 more variables: AskLow <dbl>,
## #   AskClose <dbl>, bias.open <dbl>, bias.high <dbl>, bias.low <dbl>,
## #   bias.close <dbl>
data_m1_10_tidyr %<>% 
  summarise(
    AskOpen = mean((AskOpen - data_m1$AskOpen)^2), 
    AskHigh = mean((AskHigh - data_m1$AskHigh)^2), 
    AskLow = mean((AskLow - data_m1$AskLow)^2), 
    AskClose = mean((AskClose - data_m1$AskClose)^2), 
    Mean.HLC = (AskHigh + AskLow + AskClose)/3, 
    Mean.OHLC = (AskOpen + AskHigh + AskLow + AskClose)/4, 
    bias.open = sum(bias.open)/length(bias.open), 
    bias.high = sum(bias.high)/length(bias.high), 
    bias.low = sum(bias.low)/length(bias.low), 
    bias.close = sum(bias.close)/length(bias.close)) %>% tbl_df

data_m1_10_tidyr %>% 
  kable(caption = 'MSE') %>% 
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
  scroll_box(width = '100%')#, height = '400px')
MSE
AskOpen AskHigh AskLow AskClose Mean.HLC Mean.OHLC bias.open bias.high bias.low bias.close
1.9e-05 1.6e-05 1.67e-05 2.08e-05 1.78e-05 1.81e-05 0.0530144 0.0507763 0.0471395 0.0464401

3.3.2 Tick数据转为1分钟数据

以下使用imputeTS::na.seadec()弥补10%数据缺失值。

data_tm1_NA <- data_tm1 %>% 
  dplyr::select(BidOpen, BidHigh, BidLow, BidClose, 
                AskOpen, AskHigh, AskLow,  AskClose) %>% 
  prodNA(noNA = 0.1) %>% 
  cbind(data_tm1[1], .) %>% tbl_df

data_tm1_10_impTS <- llply(algo, function(x) {
  data_tm1_NA %>% 
    dplyr::select(starts_with('Ask'), starts_with('Bid')) %>% 
    map(na.seadec, algorithm = x) %>% as.tibble
  })
names(data_tm1_10_impTS) <- algo
data_tm1_10_impTS %<>% ldply %>% tbl_df

data_tm1_10_impTS %<>% mutate(
  bias.open = if_else(AskOpen>AskHigh|AskOpen<AskLow, 1, 0), 
  bias.high = if_else(AskHigh<AskOpen|AskHigh<AskLow|AskHigh<AskClose, 1, 0), 
  bias.low = if_else(AskLow>AskOpen|AskLow>AskHigh|AskLow>AskClose, 1, 0), 
  bias.close = if_else(AskClose>AskHigh|AskClose<AskLow, 1, 0))

data_tm1_10_impTS %>% 
  dplyr::filter(bias.open==1|bias.high==1|bias.low==1|bias.close==1)
## # A tibble: 1,411 x 13
##    .id   AskOpen AskHigh AskLow AskClose BidOpen BidHigh BidLow BidClose
##    <chr>   <dbl>   <dbl>  <dbl>    <dbl>   <dbl>   <dbl>  <dbl>    <dbl>
##  1 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  2 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  3 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  4 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  5 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  6 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  7 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  8 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  9 inte~    111.    111.   111.     111.    111.    111.   111.     111.
## 10 inte~    111.    111.   111.     111.    111.    111.   111.     111.
## # ... with 1,401 more rows, and 4 more variables: bias.open <dbl>,
## #   bias.high <dbl>, bias.low <dbl>, bias.close <dbl>
data_tm1_10_impTS %<>% 
  ddply(.(.id), summarise, 
        AskOpen = mean((AskOpen - data_m1$AskOpen)^2), 
        AskHigh = mean((AskHigh - data_m1$AskHigh)^2), 
        AskLow = mean((AskLow - data_m1$AskLow)^2), 
        AskClose = mean((AskClose - data_m1$AskClose)^2), 
        Mean.HLC = (AskHigh + AskLow + AskClose)/3, 
        Mean.OHLC = (AskOpen + AskHigh + AskLow + AskClose)/4, 
        bias.open = sum(bias.open)/length(bias.open), 
        bias.high = sum(bias.high)/length(bias.high), 
        bias.low = sum(bias.low)/length(bias.low), 
        bias.close = sum(bias.close)/length(bias.close)) %>% tbl_df

data_tm1_10_impTS %>% 
  kable(caption = 'MSE 10% 缺失值') %>% 
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
  scroll_box(width = '100%')#, height = '400px')
MSE 10% 缺失
.id AskOpen AskHigh AskLow AskClose Mean.HLC Mean.OHLC bias.open bias.high bias.low bias.close
interpolation 48.06382 48.03829 48.09319 48.06827 48.06658 48.06589 0.0546683 0.0485258 0.0454545 0.0423833
kalman 48.06387 48.03828 48.09319 48.06821 48.06656 48.06589 0.0528256 0.0491400 0.0448403 0.0448403
locf 48.06388 48.04010 48.09277 48.06879 48.06722 48.06639 0.0589681 0.0485258 0.0558968 0.0472973
ma 48.06389 48.03796 48.09338 48.06803 48.06646 48.06581 0.0626536 0.0540541 0.0540541 0.0552826
mean 48.07145 48.03286 48.10848 48.06614 48.06916 48.06973 0.1726044 0.1603194 0.1572482 0.1664619
random 47.91482 48.20938 48.19080 48.04397 48.14805 48.08974 0.1848894 0.1025799 0.2291155 0.1633907

以下使用Amelia::amelia()弥补10%数据缺失值。

data_tm1_10_amelia <- data_tm1_NA %>% 
  amelia %>% 
  .$imputations %>% 
  ldply %>% tbl_df
## -- Imputation 1 --
## 
##   1  2  3  4
## 
## -- Imputation 2 --
## 
##   1  2  3
## 
## -- Imputation 3 --
## 
##   1  2  3
## 
## -- Imputation 4 --
## 
##   1  2  3
## 
## -- Imputation 5 --
## 
##   1  2  3
data_tm1_10_amelia %>% anyNA
## [1] FALSE
data_tm1_10_amelia %<>% mutate(
  bias.open = if_else(AskOpen>AskHigh|AskOpen<AskLow, 1, 0), 
  bias.high = if_else(AskHigh<AskOpen|AskHigh<AskLow|AskHigh<AskClose, 1, 0), 
  bias.low = if_else(AskLow>AskOpen|AskLow>AskHigh|AskLow>AskClose, 1, 0), 
  bias.close = if_else(AskClose>AskHigh|AskClose<AskLow, 1, 0))

data_tm1_10_amelia %>% 
  dplyr::filter(bias.open==1|bias.high==1|bias.low==1|bias.close==1)
## # A tibble: 627 x 14
##    .id   index               BidOpen BidHigh BidLow BidClose AskOpen
##    <chr> <dttm>                <dbl>   <dbl>  <dbl>    <dbl>   <dbl>
##  1 imp1  2017-08-07 00:44:00    111.    111.   111.     111.    111.
##  2 imp1  2017-08-07 00:45:00    111.    111.   111.     111.    111.
##  3 imp1  2017-08-07 00:57:00    111.    111.   111.     111.    111.
##  4 imp1  2017-08-07 01:07:00    111.    111.   111.     111.    111.
##  5 imp1  2017-08-07 01:18:00    111.    111.   111.     111.    111.
##  6 imp1  2017-08-07 01:21:00    111.    111.   111.     111.    111.
##  7 imp1  2017-08-07 01:33:00    111.    111.   111.     111.    111.
##  8 imp1  2017-08-07 01:36:00    111.    111.   111.     111.    111.
##  9 imp1  2017-08-07 02:07:00    111.    111.   111.     111.    111.
## 10 imp1  2017-08-07 02:09:00    111.    111.   111.     111.    111.
## # ... with 617 more rows, and 7 more variables: AskHigh <dbl>,
## #   AskLow <dbl>, AskClose <dbl>, bias.open <dbl>, bias.high <dbl>,
## #   bias.low <dbl>, bias.close <dbl>
data_tm1_10_amelia %<>% 
  ddply(.(.id), summarise, 
        AskOpen = mean((AskOpen - data_m1$AskOpen)^2), 
        AskHigh = mean((AskHigh - data_m1$AskHigh)^2), 
        AskLow = mean((AskLow - data_m1$AskLow)^2), 
        AskClose = mean((AskClose - data_m1$AskClose)^2), 
        Mean.HLC = (AskHigh + AskLow + AskClose)/3, 
        Mean.OHLC = (AskOpen + AskHigh + AskLow + AskClose)/4, 
        bias.open = sum(bias.open)/length(bias.open), 
        bias.high = sum(bias.high)/length(bias.high), 
        bias.low = sum(bias.low)/length(bias.low), 
        bias.close = sum(bias.close)/length(bias.close)) %>% tbl_df

data_tm1_10_amelia %>% 
  kable(caption = 'MSE') %>% 
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
  scroll_box(width = '100%')#, height = '400px')
MSE
.id AskOpen AskHigh AskLow AskClose Mean.HLC Mean.OHLC bias.open bias.high bias.low bias.close
imp1 48.06460 48.03961 48.09248 48.06715 48.06642 48.06596 0.0479115 0.0337838 0.0411548 0.0307125
imp2 48.06486 48.03910 48.09271 48.06702 48.06627 48.06592 0.0423833 0.0393120 0.0337838 0.0337838
imp3 48.06492 48.03909 48.09336 48.06684 48.06643 48.06605 0.0472973 0.0411548 0.0509828 0.0485258
imp4 48.06482 48.03913 48.09300 48.06658 48.06624 48.06588 0.0466830 0.0386978 0.0417690 0.0350123
imp5 48.06506 48.03883 48.09322 48.06684 48.06630 48.06599 0.0565111 0.0466830 0.0448403 0.0399263

以下使用tidyr::fill()弥补10%数据缺失值。

data_tm1_10_tidyr <- data_tm1_NA %>% 
  fill(BidOpen, BidHigh, BidLow, BidClose, 
       AskOpen, AskHigh, AskLow, AskClose) %>% #default direction down
  fill(BidOpen, BidHigh, BidLow, BidClose, 
       AskOpen, AskHigh, AskLow, AskClose, .direction = 'up')

data_tm1_10_tidyr %>% anyNA
## [1] FALSE
data_tm1_10_tidyr %<>% mutate(
  bias.open = if_else(AskOpen>AskHigh|AskOpen<AskLow, 1, 0), 
  bias.high = if_else(AskHigh<AskOpen|AskHigh<AskLow|AskHigh<AskClose, 1, 0), 
  bias.low = if_else(AskLow>AskOpen|AskLow>AskHigh|AskLow>AskClose, 1, 0), 
  bias.close = if_else(AskClose>AskHigh|AskClose<AskLow, 1, 0))

data_tm1_10_tidyr %>% 
  dplyr::filter(bias.open==1|bias.high==1|bias.low==1|bias.close==1)
## # A tibble: 164 x 13
##    index               BidOpen BidHigh BidLow BidClose AskOpen AskHigh
##    <dttm>                <dbl>   <dbl>  <dbl>    <dbl>   <dbl>   <dbl>
##  1 2017-08-07 00:05:00    111.    111.   111.     111.    111.    111.
##  2 2017-08-07 00:17:00    111.    111.   111.     111.    111.    111.
##  3 2017-08-07 00:44:00    111.    111.   111.     111.    111.    111.
##  4 2017-08-07 00:45:00    111.    111.   111.     111.    111.    111.
##  5 2017-08-07 01:03:00    111.    111.   111.     111.    111.    111.
##  6 2017-08-07 01:04:00    111.    111.   111.     111.    111.    111.
##  7 2017-08-07 01:11:00    111.    111.   111.     111.    111.    111.
##  8 2017-08-07 01:21:00    111.    111.   111.     111.    111.    111.
##  9 2017-08-07 02:09:00    111.    111.   111.     111.    111.    111.
## 10 2017-08-07 02:32:00    111.    111.   111.     111.    111.    111.
## # ... with 154 more rows, and 6 more variables: AskLow <dbl>,
## #   AskClose <dbl>, bias.open <dbl>, bias.high <dbl>, bias.low <dbl>,
## #   bias.close <dbl>
data_tm1_10_tidyr %<>% 
  summarise(
    AskOpen = mean((AskOpen - data_m1$AskOpen)^2), 
    AskHigh = mean((AskHigh - data_m1$AskHigh)^2), 
    AskLow = mean((AskLow - data_m1$AskLow)^2), 
    AskClose = mean((AskClose - data_m1$AskClose)^2), 
    Mean.HLC = (AskHigh + AskLow + AskClose)/3, 
    Mean.OHLC = (AskOpen + AskHigh + AskLow + AskClose)/4, 
    bias.open = sum(bias.open)/length(bias.open), 
    bias.high = sum(bias.high)/length(bias.high), 
    bias.low = sum(bias.low)/length(bias.low), 
    bias.close = sum(bias.close)/length(bias.close)) %>% tbl_df

data_tm1_10_tidyr %>% 
  kable(caption = 'MSE') %>% 
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
  scroll_box(width = '100%')#, height = '400px')
MSE
AskOpen AskHigh AskLow AskClose Mean.HLC Mean.OHLC bias.open bias.high bias.low bias.close
48.06388 48.0401 48.09277 48.06879 48.06722 48.06639 0.0589681 0.0485258 0.0558968 0.0472973

3.4 20% 缺失值

3.4.1 1分钟数据

以下使用imputeTS::na.seadec()弥补20%数据缺失值。

data_m1_NA <- data_m1 %>% 
  dplyr::select(BidOpen, BidHigh, BidLow, BidClose, 
                AskOpen, AskHigh, AskLow,  AskClose) %>% 
  prodNA(noNA = 0.2) %>% 
  cbind(data_m1[1], .) %>% tbl_df

data_m1_20_impTS <- llply(algo, function(x) {
  data_m1_NA %>% 
    dplyr::select(starts_with('Ask'), starts_with('Bid')) %>% 
    map(na.seadec, algorithm = x) %>% as.tibble
  })
names(data_m1_20_impTS) <- algo
data_m1_20_impTS %<>% ldply %>% tbl_df

data_m1_20_impTS %<>% mutate(
  bias.open = if_else(AskOpen>AskHigh|AskOpen<AskLow, 1, 0), 
  bias.high = if_else(AskHigh<AskOpen|AskHigh<AskLow|AskHigh<AskClose, 1, 0), 
  bias.low = if_else(AskLow>AskOpen|AskLow>AskHigh|AskLow>AskClose, 1, 0), 
  bias.close = if_else(AskClose>AskHigh|AskClose<AskLow, 1, 0))

data_m1_20_impTS %>% 
  dplyr::filter(bias.open==1|bias.high==1|bias.low==1|bias.close==1)
## # A tibble: 11,975 x 13
##    .id   AskOpen AskHigh AskLow AskClose BidOpen BidHigh BidLow BidClose
##    <chr>   <dbl>   <dbl>  <dbl>    <dbl>   <dbl>   <dbl>  <dbl>    <dbl>
##  1 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  2 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  3 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  4 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  5 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  6 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  7 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  8 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  9 inte~    104.    104.   104.     104.    104.    104.   104.     104.
## 10 inte~    104.    104.   104.     104.    104.    104.   104.     104.
## # ... with 11,965 more rows, and 4 more variables: bias.open <dbl>,
## #   bias.high <dbl>, bias.low <dbl>, bias.close <dbl>
data_m1_20_impTS %<>% 
  ddply(.(.id), summarise, 
        AskOpen = mean((AskOpen - data_m1$AskOpen)^2), 
        AskHigh = mean((AskHigh - data_m1$AskHigh)^2), 
        AskLow = mean((AskLow - data_m1$AskLow)^2), 
        AskClose = mean((AskClose - data_m1$AskClose)^2), 
        Mean.HLC = (AskHigh + AskLow + AskClose)/3, 
        Mean.OHLC = (AskOpen + AskHigh + AskLow + AskClose)/4, 
        bias.open = sum(bias.open)/length(bias.open), 
        bias.high = sum(bias.high)/length(bias.high), 
        bias.low = sum(bias.low)/length(bias.low), 
        bias.close = sum(bias.close)/length(bias.close)) %>% tbl_df

data_m1_20_impTS %>% 
  kable(caption = 'MSE 20% 缺失值') %>% 
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
  scroll_box(width = '100%')#, height = '400px')
MSE 20% 缺失
.id AskOpen AskHigh AskLow AskClose Mean.HLC Mean.OHLC bias.open bias.high bias.low bias.close
interpolation 0.0000229 0.0000146 0.0000154 0.0000201 0.0000167 0.0000182 0.1054693 0.0988950 0.0949783 0.0948384
kalman 0.0000229 0.0000146 0.0000154 0.0000202 0.0000167 0.0000183 0.1088264 0.1022521 0.0976360 0.0986152
locf 0.0000526 0.0000327 0.0000319 0.0000438 0.0000362 0.0000403 0.1105050 0.0969366 0.0981956 0.0916212
ma 0.0000275 0.0000192 0.0000198 0.0000245 0.0000212 0.0000228 0.1292488 0.1209959 0.1112044 0.1222549
mean 0.0123887 0.0117783 0.0116303 0.0126437 0.0120174 0.0121103 0.3066163 0.2652119 0.2957057 0.3014408
random 0.0308892 0.0318163 0.0327874 0.0353737 0.0333258 0.0327167 0.3551546 0.3561337 0.3119317 0.3477409

以下使用Amelia::amelia()弥20%数据缺失值。

data_m1_20_amelia <- data_m1_NA %>% 
  amelia %>% 
  .$imputations %>% 
  ldply %>% tbl_df
## -- Imputation 1 --
## 
##   1  2  3
## 
## -- Imputation 2 --
## 
##   1  2  3
## 
## -- Imputation 3 --
## 
##   1  2  3
## 
## -- Imputation 4 --
## 
##   1  2  3
## 
## -- Imputation 5 --
## 
##   1  2  3
data_m1_20_amelia %>% anyNA
## [1] FALSE
data_m1_20_amelia %<>% mutate(
  bias.open = if_else(AskOpen>AskHigh|AskOpen<AskLow, 1, 0), 
  bias.high = if_else(AskHigh<AskOpen|AskHigh<AskLow|AskHigh<AskClose, 1, 0), 
  bias.low = if_else(AskLow>AskOpen|AskLow>AskHigh|AskLow>AskClose, 1, 0), 
  bias.close = if_else(AskClose>AskHigh|AskClose<AskLow, 1, 0))

data_m1_20_amelia %>% 
  dplyr::filter(bias.open==1|bias.high==1|bias.low==1|bias.close==1)
## # A tibble: 5,306 x 14
##    .id   index               BidOpen BidHigh BidLow BidClose AskOpen
##    <chr> <dttm>                <dbl>   <dbl>  <dbl>    <dbl>   <dbl>
##  1 imp1  2016-10-17 00:00:00    104.    104.   104.     104.    104.
##  2 imp1  2016-10-17 00:10:00    104.    104.   104.     104.    104.
##  3 imp1  2016-10-17 00:13:00    104.    104.   104.     104.    104.
##  4 imp1  2016-10-17 00:16:00    104.    104.   104.     104.    104.
##  5 imp1  2016-10-17 00:17:00    104.    104.   104.     104.    104.
##  6 imp1  2016-10-17 00:27:00    104.    104.   104.     104.    104.
##  7 imp1  2016-10-17 00:33:00    104.    104.   104.     104.    104.
##  8 imp1  2016-10-17 00:38:00    104.    104.   104.     104.    104.
##  9 imp1  2016-10-17 00:39:00    104.    104.   104.     104.    104.
## 10 imp1  2016-10-17 00:41:00    104.    104.   104.     104.    104.
## # ... with 5,296 more rows, and 7 more variables: AskHigh <dbl>,
## #   AskLow <dbl>, AskClose <dbl>, bias.open <dbl>, bias.high <dbl>,
## #   bias.low <dbl>, bias.close <dbl>
data_m1_20_amelia %<>% 
  ddply(.(.id), summarise, 
        AskOpen = mean((AskOpen - data_m1$AskOpen)^2), 
        AskHigh = mean((AskHigh - data_m1$AskHigh)^2), 
        AskLow = mean((AskLow - data_m1$AskLow)^2), 
        AskClose = mean((AskClose - data_m1$AskClose)^2), 
        Mean.HLC = (AskHigh + AskLow + AskClose)/3, 
        Mean.OHLC = (AskOpen + AskHigh + AskLow + AskClose)/4, 
        bias.open = sum(bias.open)/length(bias.open), 
        bias.high = sum(bias.high)/length(bias.high), 
        bias.low = sum(bias.low)/length(bias.low), 
        bias.close = sum(bias.close)/length(bias.close)) %>% tbl_df

data_m1_20_amelia %>% 
  kable(caption = 'MSE') %>% 
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
  scroll_box(width = '100%')#, height = '400px')
MSE
.id AskOpen AskHigh AskLow AskClose Mean.HLC Mean.OHLC bias.open bias.high bias.low bias.close
imp1 5.1e-06 3.9e-06 4.4e-06 3.9e-06 4.1e-06 4.3e-06 0.0969366 0.0854665 0.0762344 0.0698000
imp2 5.0e-06 3.9e-06 3.6e-06 3.7e-06 3.7e-06 4.0e-06 0.0952581 0.0830885 0.0769338 0.0706393
imp3 4.6e-06 3.5e-06 4.5e-06 3.8e-06 3.9e-06 4.1e-06 0.0951182 0.0811302 0.0819695 0.0742761
imp4 4.4e-06 3.9e-06 4.0e-06 3.5e-06 3.8e-06 3.9e-06 0.0952581 0.0853266 0.0769338 0.0730172
imp5 4.2e-06 3.8e-06 3.7e-06 3.4e-06 3.6e-06 3.8e-06 0.0959575 0.0844873 0.0770737 0.0706393

以下使用tidyr::fill()弥补20%数据缺失值。

data_m1_20_tidyr <- data_m1_NA %>% 
  fill(BidOpen, BidHigh, BidLow, BidClose, 
       AskOpen, AskHigh, AskLow, AskClose) %>% #default direction down
  fill(BidOpen, BidHigh, BidLow, BidClose, 
       AskOpen, AskHigh, AskLow, AskClose, .direction = 'up')

data_m1_20_tidyr %>% anyNA
## [1] FALSE
data_m1_20_tidyr %<>% mutate(
  bias.open = if_else(AskOpen>AskHigh|AskOpen<AskLow, 1, 0), 
  bias.high = if_else(AskHigh<AskOpen|AskHigh<AskLow|AskHigh<AskClose, 1, 0), 
  bias.low = if_else(AskLow>AskOpen|AskLow>AskHigh|AskLow>AskClose, 1, 0), 
  bias.close = if_else(AskClose>AskHigh|AskClose<AskLow, 1, 0))

data_m1_20_tidyr %>% 
  dplyr::filter(bias.open==1|bias.high==1|bias.low==1|bias.close==1)
## # A tibble: 1,314 x 13
##    index               BidOpen BidHigh BidLow BidClose AskOpen AskHigh
##    <dttm>                <dbl>   <dbl>  <dbl>    <dbl>   <dbl>   <dbl>
##  1 2016-10-17 00:00:00    104.    104.   104.     104.    104.    104.
##  2 2016-10-17 00:03:00    104.    104.   104.     104.    104.    104.
##  3 2016-10-17 00:17:00    104.    104.   104.     104.    104.    104.
##  4 2016-10-17 00:19:00    104.    104.   104.     104.    104.    104.
##  5 2016-10-17 00:20:00    104.    104.   104.     104.    104.    104.
##  6 2016-10-17 00:21:00    104.    104.   104.     104.    104.    104.
##  7 2016-10-17 00:31:00    104.    104.   104.     104.    104.    104.
##  8 2016-10-17 00:33:00    104.    104.   104.     104.    104.    104.
##  9 2016-10-17 00:42:00    104.    104.   104.     104.    104.    104.
## 10 2016-10-17 00:55:00    104.    104.   104.     104.    104.    104.
## # ... with 1,304 more rows, and 6 more variables: AskLow <dbl>,
## #   AskClose <dbl>, bias.open <dbl>, bias.high <dbl>, bias.low <dbl>,
## #   bias.close <dbl>
data_m1_20_tidyr %<>% 
  summarise(
    AskOpen = mean((AskOpen - data_m1$AskOpen)^2), 
    AskHigh = mean((AskHigh - data_m1$AskHigh)^2), 
    AskLow = mean((AskLow - data_m1$AskLow)^2), 
    AskClose = mean((AskClose - data_m1$AskClose)^2), 
    Mean.HLC = (AskHigh + AskLow + AskClose)/3, 
    Mean.OHLC = (AskOpen + AskHigh + AskLow + AskClose)/4, 
    bias.open = sum(bias.open)/length(bias.open), 
    bias.high = sum(bias.high)/length(bias.high), 
    bias.low = sum(bias.low)/length(bias.low), 
    bias.close = sum(bias.close)/length(bias.close)) %>% tbl_df

data_m1_20_tidyr %>% 
  kable(caption = 'MSE') %>% 
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
  scroll_box(width = '100%')#, height = '400px')
MSE
AskOpen AskHigh AskLow AskClose Mean.HLC Mean.OHLC bias.open bias.high bias.low bias.close
5.26e-05 3.27e-05 3.19e-05 4.38e-05 3.62e-05 4.03e-05 0.110505 0.0969366 0.0981956 0.0916212

3.4.2 Tick数据转为1分钟数据

以下使用imputeTS::na.seadec()弥补20%数据缺失值。

data_tm1_NA <- data_tm1 %>% 
  dplyr::select(BidOpen, BidHigh, BidLow, BidClose, 
                AskOpen, AskHigh, AskLow,  AskClose) %>% 
  prodNA(noNA = 0.2) %>% 
  cbind(data_tm1[1], .) %>% tbl_df

data_tm1_20_impTS <- llply(algo, function(x) {
  data_tm1_NA %>% 
    dplyr::select(starts_with('Ask'), starts_with('Bid')) %>% 
    map(na.seadec, algorithm = x) %>% as.tibble
  })
names(data_tm1_20_impTS) <- algo
data_tm1_20_impTS %<>% ldply %>% tbl_df

data_tm1_20_impTS %<>% mutate(
  bias.open = if_else(AskOpen>AskHigh|AskOpen<AskLow, 1, 0), 
  bias.high = if_else(AskHigh<AskOpen|AskHigh<AskLow|AskHigh<AskClose, 1, 0), 
  bias.low = if_else(AskLow>AskOpen|AskLow>AskHigh|AskLow>AskClose, 1, 0), 
  bias.close = if_else(AskClose>AskHigh|AskClose<AskLow, 1, 0))

data_tm1_20_impTS %>% 
  dplyr::filter(bias.open==1|bias.high==1|bias.low==1|bias.close==1)
## # A tibble: 2,538 x 13
##    .id   AskOpen AskHigh AskLow AskClose BidOpen BidHigh BidLow BidClose
##    <chr>   <dbl>   <dbl>  <dbl>    <dbl>   <dbl>   <dbl>  <dbl>    <dbl>
##  1 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  2 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  3 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  4 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  5 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  6 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  7 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  8 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  9 inte~    111.    111.   111.     111.    111.    111.   111.     111.
## 10 inte~    111.    111.   111.     111.    111.    111.   111.     111.
## # ... with 2,528 more rows, and 4 more variables: bias.open <dbl>,
## #   bias.high <dbl>, bias.low <dbl>, bias.close <dbl>
data_tm1_20_impTS %<>% 
  ddply(.(.id), summarise, 
        AskOpen = mean((AskOpen - data_m1$AskOpen)^2), 
        AskHigh = mean((AskHigh - data_m1$AskHigh)^2), 
        AskLow = mean((AskLow - data_m1$AskLow)^2), 
        AskClose = mean((AskClose - data_m1$AskClose)^2), 
        Mean.HLC = (AskHigh + AskLow + AskClose)/3, 
        Mean.OHLC = (AskOpen + AskHigh + AskLow + AskClose)/4, 
        bias.open = sum(bias.open)/length(bias.open), 
        bias.high = sum(bias.high)/length(bias.high), 
        bias.low = sum(bias.low)/length(bias.low), 
        bias.close = sum(bias.close)/length(bias.close)) %>% tbl_df

data_tm1_20_impTS %>% 
  kable(caption = 'MSE 20% 缺失值') %>% 
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
  scroll_box(width = '100%')#, height = '400px')
MSE 20% 缺失
.id AskOpen AskHigh AskLow AskClose Mean.HLC Mean.OHLC bias.open bias.high bias.low bias.close
interpolation 48.06431 48.03889 48.09359 48.06921 48.06723 48.06650 0.0872236 0.0896806 0.0755528 0.0853808
kalman 48.06438 48.03886 48.09359 48.06910 48.06718 48.06648 0.0927518 0.0945946 0.0816953 0.0933661
locf 48.06368 48.03972 48.09354 48.06721 48.06683 48.06604 0.1068796 0.0976658 0.0970516 0.0952088
ma 48.06454 48.03813 48.09355 48.06860 48.06676 48.06620 0.1087224 0.1068796 0.0939803 0.1081081
mean 48.07973 48.02832 48.08681 48.08731 48.06748 48.07054 0.2782555 0.2721130 0.2395577 0.2800983
random 48.45310 48.01260 48.05931 47.86129 47.97773 48.09657 0.3366093 0.3218673 0.3126536 0.3359951

以下使用Amelia::amelia()弥补20%数据缺失值。

data_tm1_20_amelia <- data_tm1_NA %>% 
  amelia %>% 
  .$imputations %>% 
  ldply %>% tbl_df
## -- Imputation 1 --
## 
##   1  2  3  4  5  6  7  8
## 
## -- Imputation 2 --
## 
##   1  2  3  4  5  6  7  8  9 10 11 12
## 
## -- Imputation 3 --
## 
##   1  2  3  4  5
## 
## -- Imputation 4 --
## 
##   1  2  3  4
## 
## -- Imputation 5 --
## 
##   1  2  3  4  5  6  7
data_tm1_20_amelia %>% anyNA
## [1] FALSE
data_tm1_20_amelia %<>% mutate(
  bias.open = if_else(AskOpen>AskHigh|AskOpen<AskLow, 1, 0), 
  bias.high = if_else(AskHigh<AskOpen|AskHigh<AskLow|AskHigh<AskClose, 1, 0), 
  bias.low = if_else(AskLow>AskOpen|AskLow>AskHigh|AskLow>AskClose, 1, 0), 
  bias.close = if_else(AskClose>AskHigh|AskClose<AskLow, 1, 0))

data_tm1_20_amelia %>% 
  dplyr::filter(bias.open==1|bias.high==1|bias.low==1|bias.close==1)
## # A tibble: 1,293 x 14
##    .id   index               BidOpen BidHigh BidLow BidClose AskOpen
##    <chr> <dttm>                <dbl>   <dbl>  <dbl>    <dbl>   <dbl>
##  1 imp1  2017-08-07 00:24:00    111.    111.   111.     111.    111.
##  2 imp1  2017-08-07 00:41:00    111.    111.   111.     111.    111.
##  3 imp1  2017-08-07 00:44:00    111.    111.   111.     111.    111.
##  4 imp1  2017-08-07 00:50:00    111.    111.   111.     111.    111.
##  5 imp1  2017-08-07 00:51:00    111.    111.   111.     111.    111.
##  6 imp1  2017-08-07 01:01:00    111.    111.   111.     111.    111.
##  7 imp1  2017-08-07 01:19:00    111.    111.   111.     111.    111.
##  8 imp1  2017-08-07 01:20:00    111.    111.   111.     111.    111.
##  9 imp1  2017-08-07 01:22:00    111.    111.   111.     111.    111.
## 10 imp1  2017-08-07 01:26:00    111.    111.   111.     111.    111.
## # ... with 1,283 more rows, and 7 more variables: AskHigh <dbl>,
## #   AskLow <dbl>, AskClose <dbl>, bias.open <dbl>, bias.high <dbl>,
## #   bias.low <dbl>, bias.close <dbl>
data_tm1_20_amelia %<>% 
  ddply(.(.id), summarise, 
        AskOpen = mean((AskOpen - data_m1$AskOpen)^2), 
        AskHigh = mean((AskHigh - data_m1$AskHigh)^2), 
        AskLow = mean((AskLow - data_m1$AskLow)^2), 
        AskClose = mean((AskClose - data_m1$AskClose)^2), 
        Mean.HLC = (AskHigh + AskLow + AskClose)/3, 
        Mean.OHLC = (AskOpen + AskHigh + AskLow + AskClose)/4, 
        bias.open = sum(bias.open)/length(bias.open), 
        bias.high = sum(bias.high)/length(bias.high), 
        bias.low = sum(bias.low)/length(bias.low), 
        bias.close = sum(bias.close)/length(bias.close)) %>% tbl_df

data_tm1_20_amelia %>% 
  kable(caption = 'MSE') %>% 
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
  scroll_box(width = '100%')#, height = '400px')
MSE
.id AskOpen AskHigh AskLow AskClose Mean.HLC Mean.OHLC bias.open bias.high bias.low bias.close
imp1 48.06460 48.03932 48.09376 48.06750 48.06686 48.06629 0.1056511 0.0970516 0.0970516 0.0982801
imp2 48.06493 48.03925 48.09323 48.06796 48.06682 48.06635 0.0921376 0.0884521 0.0823096 0.0902948
imp3 48.06402 48.04021 48.09313 48.06711 48.06682 48.06612 0.0902948 0.0743243 0.0939803 0.0859951
imp4 48.06378 48.03896 48.09348 48.06752 48.06665 48.06593 0.0921376 0.0804668 0.0878378 0.0853808
imp5 48.06468 48.03944 48.09354 48.06753 48.06684 48.06630 0.1007371 0.0786241 0.0853808 0.0687961

以下使用tidyr::fill()弥补20%数据缺失值。

data_tm1_20_tidyr <- data_tm1_NA %>% 
  fill(BidOpen, BidHigh, BidLow, BidClose, 
       AskOpen, AskHigh, AskLow, AskClose) %>% #default direction down
  fill(BidOpen, BidHigh, BidLow, BidClose, 
       AskOpen, AskHigh, AskLow, AskClose, .direction = 'up')

data_tm1_20_tidyr %>% anyNA
## [1] FALSE
data_tm1_20_tidyr %<>% mutate(
  bias.open = if_else(AskOpen>AskHigh|AskOpen<AskLow, 1, 0), 
  bias.high = if_else(AskHigh<AskOpen|AskHigh<AskLow|AskHigh<AskClose, 1, 0), 
  bias.low = if_else(AskLow>AskOpen|AskLow>AskHigh|AskLow>AskClose, 1, 0), 
  bias.close = if_else(AskClose>AskHigh|AskClose<AskLow, 1, 0))

data_tm1_20_tidyr %>% 
  dplyr::filter(bias.open==1|bias.high==1|bias.low==1|bias.close==1)
## # A tibble: 291 x 13
##    index               BidOpen BidHigh BidLow BidClose AskOpen AskHigh
##    <dttm>                <dbl>   <dbl>  <dbl>    <dbl>   <dbl>   <dbl>
##  1 2017-08-07 00:02:00    111.    111.   111.     111.    111.    111.
##  2 2017-08-07 00:03:00    111.    111.   111.     111.    111.    111.
##  3 2017-08-07 00:04:00    111.    111.   111.     111.    111.    111.
##  4 2017-08-07 00:05:00    111.    111.   111.     111.    111.    111.
##  5 2017-08-07 00:17:00    111.    111.   111.     111.    111.    111.
##  6 2017-08-07 00:22:00    111.    111.   111.     111.    111.    111.
##  7 2017-08-07 00:24:00    111.    111.   111.     111.    111.    111.
##  8 2017-08-07 00:41:00    111.    111.   111.     111.    111.    111.
##  9 2017-08-07 00:44:00    111.    111.   111.     111.    111.    111.
## 10 2017-08-07 01:00:00    111.    111.   111.     111.    111.    111.
## # ... with 281 more rows, and 6 more variables: AskLow <dbl>,
## #   AskClose <dbl>, bias.open <dbl>, bias.high <dbl>, bias.low <dbl>,
## #   bias.close <dbl>
data_tm1_20_tidyr %<>% 
  summarise(
    AskOpen = mean((AskOpen - data_m1$AskOpen)^2), 
    AskHigh = mean((AskHigh - data_m1$AskHigh)^2), 
    AskLow = mean((AskLow - data_m1$AskLow)^2), 
    AskClose = mean((AskClose - data_m1$AskClose)^2), 
    Mean.HLC = (AskHigh + AskLow + AskClose)/3, 
    Mean.OHLC = (AskOpen + AskHigh + AskLow + AskClose)/4, 
    bias.open = sum(bias.open)/length(bias.open), 
    bias.high = sum(bias.high)/length(bias.high), 
    bias.low = sum(bias.low)/length(bias.low), 
    bias.close = sum(bias.close)/length(bias.close)) %>% tbl_df

data_tm1_20_tidyr %>% 
  kable(caption = 'MSE') %>% 
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
  scroll_box(width = '100%')#, height = '400px')
MSE
AskOpen AskHigh AskLow AskClose Mean.HLC Mean.OHLC bias.open bias.high bias.low bias.close
48.06368 48.03972 48.09354 48.06721 48.06683 48.06604 0.1068796 0.0976658 0.0970516 0.0952088

3.5 30% 缺失值

3.5.1 1分钟数据

以下使用imputeTS::na.seadec()弥补30%数据缺失值。

data_m1_NA <- data_m1 %>% 
  dplyr::select(BidOpen, BidHigh, BidLow, BidClose, 
                AskOpen, AskHigh, AskLow,  AskClose) %>% 
  prodNA(noNA = 0.3) %>% 
  cbind(data_m1[1], .) %>% tbl_df

data_m1_30_impTS <- llply(algo, function(x) {
  data_m1_NA %>% 
    dplyr::select(starts_with('Ask'), starts_with('Bid')) %>% 
    map(na.seadec, algorithm = x) %>% as.tibble
  })
names(data_m1_30_impTS) <- algo
data_m1_30_impTS %<>% ldply %>% tbl_df

data_m1_30_impTS %<>% mutate(
  bias.open = if_else(AskOpen>AskHigh|AskOpen<AskLow, 1, 0), 
  bias.high = if_else(AskHigh<AskOpen|AskHigh<AskLow|AskHigh<AskClose, 1, 0), 
  bias.low = if_else(AskLow>AskOpen|AskLow>AskHigh|AskLow>AskClose, 1, 0), 
  bias.close = if_else(AskClose>AskHigh|AskClose<AskLow, 1, 0))

data_m1_30_impTS %>% 
  dplyr::filter(bias.open==1|bias.high==1|bias.low==1|bias.close==1)
## # A tibble: 16,045 x 13
##    .id   AskOpen AskHigh AskLow AskClose BidOpen BidHigh BidLow BidClose
##    <chr>   <dbl>   <dbl>  <dbl>    <dbl>   <dbl>   <dbl>  <dbl>    <dbl>
##  1 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  2 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  3 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  4 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  5 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  6 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  7 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  8 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  9 inte~    104.    104.   104.     104.    104.    104.   104.     104.
## 10 inte~    104.    104.   104.     104.    104.    104.   104.     104.
## # ... with 16,035 more rows, and 4 more variables: bias.open <dbl>,
## #   bias.high <dbl>, bias.low <dbl>, bias.close <dbl>
data_m1_30_impTS %<>% 
  ddply(.(.id), summarise, 
        AskOpen = mean((AskOpen - data_m1$AskOpen)^2), 
        AskHigh = mean((AskHigh - data_m1$AskHigh)^2), 
        AskLow = mean((AskLow - data_m1$AskLow)^2), 
        AskClose = mean((AskClose - data_m1$AskClose)^2), 
        Mean.HLC = (AskHigh + AskLow + AskClose)/3, 
        Mean.OHLC = (AskOpen + AskHigh + AskLow + AskClose)/4, 
        bias.open = sum(bias.open)/length(bias.open), 
        bias.high = sum(bias.high)/length(bias.high), 
        bias.low = sum(bias.low)/length(bias.low), 
        bias.close = sum(bias.close)/length(bias.close)) %>% tbl_df

data_m1_30_impTS %>% 
  kable(caption = 'MSE 30% 缺失值') %>% 
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
  scroll_box(width = '100%')#, height = '400px')
MSE 30% 缺失
.id AskOpen AskHigh AskLow AskClose Mean.HLC Mean.OHLC bias.open bias.high bias.low bias.close
interpolation 0.0000374 0.0000247 0.0000282 0.0000342 0.0000290 0.0000311 0.1573647 0.1435166 0.1454749 0.1502308
kalman 0.0000373 0.0000247 0.0000282 0.0000344 0.0000291 0.0000312 0.1603021 0.1458945 0.1481326 0.1537278
locf 0.0000830 0.0000646 0.0000672 0.0000785 0.0000701 0.0000733 0.1658973 0.1559659 0.1465939 0.1577843
ma 0.0000425 0.0000321 0.0000351 0.0000407 0.0000359 0.0000376 0.1805847 0.1612813 0.1653378 0.1772276
mean 0.0190881 0.0174648 0.0182534 0.0190274 0.0182485 0.0184584 0.4018744 0.3536159 0.3729193 0.4076095
random 0.0891399 0.0503076 0.0991078 0.0534894 0.0676349 0.0730112 0.4879004 0.5048258 0.3189257 0.3983774

以下使用Amelia::amelia()弥补30%数据缺失值。

data_m1_30_amelia <- data_m1_NA %>% 
  amelia %>% 
  .$imputations %>% 
  ldply %>% tbl_df
## -- Imputation 1 --
## 
##   1  2  3  4
## 
## -- Imputation 2 --
## 
##   1  2  3  4
## 
## -- Imputation 3 --
## 
##   1  2  3  4
## 
## -- Imputation 4 --
## 
##   1  2  3  4
## 
## -- Imputation 5 --
## 
##   1  2  3  4
data_m1_30_amelia %>% anyNA
## [1] FALSE
data_m1_30_amelia %<>% mutate(
  bias.open = if_else(AskOpen>AskHigh|AskOpen<AskLow, 1, 0), 
  bias.high = if_else(AskHigh<AskOpen|AskHigh<AskLow|AskHigh<AskClose, 1, 0), 
  bias.low = if_else(AskLow>AskOpen|AskLow>AskHigh|AskLow>AskClose, 1, 0), 
  bias.close = if_else(AskClose>AskHigh|AskClose<AskLow, 1, 0))

data_m1_30_amelia %>% 
  dplyr::filter(bias.open==1|bias.high==1|bias.low==1|bias.close==1)
## # A tibble: 7,961 x 14
##    .id   index               BidOpen BidHigh BidLow BidClose AskOpen
##    <chr> <dttm>                <dbl>   <dbl>  <dbl>    <dbl>   <dbl>
##  1 imp1  2016-10-17 00:00:00    104.    104.   104.     104.    104.
##  2 imp1  2016-10-17 00:03:00    104.    104.   104.     104.    104.
##  3 imp1  2016-10-17 00:07:00    104.    104.   104.     104.    104.
##  4 imp1  2016-10-17 00:08:00    104.    104.   104.     104.    104.
##  5 imp1  2016-10-17 00:09:00    104.    104.   104.     104.    104.
##  6 imp1  2016-10-17 00:10:00    104.    104.   104.     104.    104.
##  7 imp1  2016-10-17 00:11:00    104.    104.   104.     104.    104.
##  8 imp1  2016-10-17 00:16:00    104.    104.   104.     104.    104.
##  9 imp1  2016-10-17 00:17:00    104.    104.   104.     104.    104.
## 10 imp1  2016-10-17 00:23:00    104.    104.   104.     104.    104.
## # ... with 7,951 more rows, and 7 more variables: AskHigh <dbl>,
## #   AskLow <dbl>, AskClose <dbl>, bias.open <dbl>, bias.high <dbl>,
## #   bias.low <dbl>, bias.close <dbl>
data_m1_30_amelia %<>% 
  ddply(.(.id), summarise, 
        AskOpen = mean((AskOpen - data_m1$AskOpen)^2), 
        AskHigh = mean((AskHigh - data_m1$AskHigh)^2), 
        AskLow = mean((AskLow - data_m1$AskLow)^2), 
        AskClose = mean((AskClose - data_m1$AskClose)^2), 
        Mean.HLC = (AskHigh + AskLow + AskClose)/3, 
        Mean.OHLC = (AskOpen + AskHigh + AskLow + AskClose)/4, 
        bias.open = sum(bias.open)/length(bias.open), 
        bias.high = sum(bias.high)/length(bias.high), 
        bias.low = sum(bias.low)/length(bias.low), 
        bias.close = sum(bias.close)/length(bias.close)) %>% tbl_df

data_m1_30_amelia %>% 
  kable(caption = 'MSE') %>% 
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
  scroll_box(width = '100%')#, height = '400px')
MSE
.id AskOpen AskHigh AskLow AskClose Mean.HLC Mean.OHLC bias.open bias.high bias.low bias.close
imp1 1.12e-05 8.6e-06 9.7e-06 8.2e-06 8.8e-06 9.4e-06 0.1486921 0.1268709 0.1289691 0.1187579
imp2 1.06e-05 7.7e-06 9.2e-06 8.0e-06 8.3e-06 8.9e-06 0.1419779 0.1198769 0.1221150 0.1113442
imp3 1.05e-05 7.9e-06 9.4e-06 8.0e-06 8.4e-06 9.0e-06 0.1437963 0.1226745 0.1253322 0.1134424
imp4 1.12e-05 8.9e-06 9.2e-06 8.1e-06 8.8e-06 9.4e-06 0.1421178 0.1278500 0.1281298 0.1249126
imp5 1.17e-05 8.3e-06 9.6e-06 8.2e-06 8.7e-06 9.4e-06 0.1407190 0.1202965 0.1254721 0.1179186

以下使用tidyr::fill()弥补30%数据缺失值。

data_m1_30_tidyr <- data_m1_NA %>% 
  fill(BidOpen, BidHigh, BidLow, BidClose, 
       AskOpen, AskHigh, AskLow, AskClose) %>% #default direction down
  fill(BidOpen, BidHigh, BidLow, BidClose, 
       AskOpen, AskHigh, AskLow, AskClose, .direction = 'up')

data_m1_30_tidyr %>% anyNA
## [1] FALSE
data_m1_30_tidyr %<>% mutate(
  bias.open = if_else(AskOpen>AskHigh|AskOpen<AskLow, 1, 0), 
  bias.high = if_else(AskHigh<AskOpen|AskHigh<AskLow|AskHigh<AskClose, 1, 0), 
  bias.low = if_else(AskLow>AskOpen|AskLow>AskHigh|AskLow>AskClose, 1, 0), 
  bias.close = if_else(AskClose>AskHigh|AskClose<AskLow, 1, 0))

data_m1_30_tidyr %>% 
  dplyr::filter(bias.open==1|bias.high==1|bias.low==1|bias.close==1)
## # A tibble: 1,929 x 13
##    index               BidOpen BidHigh BidLow BidClose AskOpen AskHigh
##    <dttm>                <dbl>   <dbl>  <dbl>    <dbl>   <dbl>   <dbl>
##  1 2016-10-17 00:05:00    104.    104.   104.     104.    104.    104.
##  2 2016-10-17 00:09:00    104.    104.   104.     104.    104.    104.
##  3 2016-10-17 00:10:00    104.    104.   104.     104.    104.    104.
##  4 2016-10-17 00:11:00    104.    104.   104.     104.    104.    104.
##  5 2016-10-17 00:19:00    104.    104.   104.     104.    104.    104.
##  6 2016-10-17 00:23:00    104.    104.   104.     104.    104.    104.
##  7 2016-10-17 00:38:00    104.    104.   104.     104.    104.    104.
##  8 2016-10-17 00:39:00    104.    104.   104.     104.    104.    104.
##  9 2016-10-17 00:42:00    104.    104.   104.     104.    104.    104.
## 10 2016-10-17 00:44:00    104.    104.   104.     104.    104.    104.
## # ... with 1,919 more rows, and 6 more variables: AskLow <dbl>,
## #   AskClose <dbl>, bias.open <dbl>, bias.high <dbl>, bias.low <dbl>,
## #   bias.close <dbl>
data_m1_30_tidyr %<>% 
  summarise(
    AskOpen = mean((AskOpen - data_m1$AskOpen)^2), 
    AskHigh = mean((AskHigh - data_m1$AskHigh)^2), 
    AskLow = mean((AskLow - data_m1$AskLow)^2), 
    AskClose = mean((AskClose - data_m1$AskClose)^2), 
    Mean.HLC = (AskHigh + AskLow + AskClose)/3, 
    Mean.OHLC = (AskOpen + AskHigh + AskLow + AskClose)/4, 
    bias.open = sum(bias.open)/length(bias.open), 
    bias.high = sum(bias.high)/length(bias.high), 
    bias.low = sum(bias.low)/length(bias.low), 
    bias.close = sum(bias.close)/length(bias.close)) %>% tbl_df

data_m1_30_tidyr %>% 
  kable(caption = 'MSE') %>% 
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
  scroll_box(width = '100%')#, height = '400px')
MSE
AskOpen AskHigh AskLow AskClose Mean.HLC Mean.OHLC bias.open bias.high bias.low bias.close
8.3e-05 6.46e-05 6.72e-05 7.85e-05 7.01e-05 7.33e-05 0.1658973 0.1559659 0.1465939 0.1577843

3.5.2 Tick数据转为1分钟数据

以下使用imputeTS::na.seadec()弥补30%数据缺失值。

data_tm1_NA <- data_tm1 %>% 
  dplyr::select(BidOpen, BidHigh, BidLow, BidClose, 
                AskOpen, AskHigh, AskLow,  AskClose) %>% 
  prodNA(noNA = 0.3) %>% 
  cbind(data_tm1[1], .) %>% tbl_df

data_tm1_30_impTS <- llply(algo, function(x) {
  data_tm1_NA %>% 
    dplyr::select(starts_with('Ask'), starts_with('Bid')) %>% 
    map(na.seadec, algorithm = x) %>% as.tibble
  })
names(data_tm1_30_impTS) <- algo
data_tm1_30_impTS %<>% ldply %>% tbl_df

data_tm1_30_impTS %<>% mutate(
  bias.open = if_else(AskOpen>AskHigh|AskOpen<AskLow, 1, 0), 
  bias.high = if_else(AskHigh<AskOpen|AskHigh<AskLow|AskHigh<AskClose, 1, 0), 
  bias.low = if_else(AskLow>AskOpen|AskLow>AskHigh|AskLow>AskClose, 1, 0), 
  bias.close = if_else(AskClose>AskHigh|AskClose<AskLow, 1, 0))

data_tm1_30_impTS %>% 
  dplyr::filter(bias.open==1|bias.high==1|bias.low==1|bias.close==1)
## # A tibble: 3,569 x 13
##    .id   AskOpen AskHigh AskLow AskClose BidOpen BidHigh BidLow BidClose
##    <chr>   <dbl>   <dbl>  <dbl>    <dbl>   <dbl>   <dbl>  <dbl>    <dbl>
##  1 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  2 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  3 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  4 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  5 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  6 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  7 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  8 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  9 inte~    111.    111.   111.     111.    111.    111.   111.     111.
## 10 inte~    111.    111.   111.     111.    111.    111.   111.     111.
## # ... with 3,559 more rows, and 4 more variables: bias.open <dbl>,
## #   bias.high <dbl>, bias.low <dbl>, bias.close <dbl>
data_tm1_30_impTS %<>% 
  ddply(.(.id), summarise, 
        AskOpen = mean((AskOpen - data_m1$AskOpen)^2), 
        AskHigh = mean((AskHigh - data_m1$AskHigh)^2), 
        AskLow = mean((AskLow - data_m1$AskLow)^2), 
        AskClose = mean((AskClose - data_m1$AskClose)^2), 
        Mean.HLC = (AskHigh + AskLow + AskClose)/3, 
        Mean.OHLC = (AskOpen + AskHigh + AskLow + AskClose)/4, 
        bias.open = sum(bias.open)/length(bias.open), 
        bias.high = sum(bias.high)/length(bias.high), 
        bias.low = sum(bias.low)/length(bias.low), 
        bias.close = sum(bias.close)/length(bias.close)) %>% tbl_df

data_tm1_30_impTS %>% 
  kable(caption = 'MSE 30% 缺失值') %>% 
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
  scroll_box(width = '100%')#, height = '400px')
MSE 30% 缺失
.id AskOpen AskHigh AskLow AskClose Mean.HLC Mean.OHLC bias.open bias.high bias.low bias.close
interpolation 48.06474 48.03922 48.09377 48.06792 48.06697 48.06641 0.1461916 0.1351351 0.1332924 0.1461916
kalman 48.06447 48.03942 48.09377 48.06791 48.06703 48.06639 0.1529484 0.1418919 0.1363636 0.1541769
locf 48.06791 48.04236 48.09580 48.06738 48.06851 48.06836 0.1621622 0.1461916 0.1541769 0.1658477
ma 48.06320 48.04018 48.09408 48.06792 48.06739 48.06635 0.1566339 0.1345209 0.1547912 0.1689189
mean 48.07771 48.04385 48.10906 48.09428 48.08240 48.08122 0.3642506 0.3507371 0.3224816 0.3869779
random 48.25972 48.06207 48.33286 47.77778 48.05757 48.10811 0.5165848 0.4864865 0.5558968 0.5423833

以下使用Amelia::amelia()弥补30%数据缺失值。

data_tm1_30_amelia <- data_tm1_NA %>% 
  amelia %>% 
  .$imputations %>% 
  ldply %>% tbl_df
## -- Imputation 1 --
## 
##   1  2  3  4  5  6  7  8
## 
## -- Imputation 2 --
## 
##   1  2  3  4  5  6  7  8
## 
## -- Imputation 3 --
## 
##   1  2  3  4  5  6  7  8  9
## 
## -- Imputation 4 --
## 
##   1  2  3  4  5  6  7
## 
## -- Imputation 5 --
## 
##   1  2  3  4  5  6  7  8
data_tm1_30_amelia %>% anyNA
## [1] FALSE
data_tm1_30_amelia %<>% mutate(
  bias.open = if_else(AskOpen>AskHigh|AskOpen<AskLow, 1, 0), 
  bias.high = if_else(AskHigh<AskOpen|AskHigh<AskLow|AskHigh<AskClose, 1, 0), 
  bias.low = if_else(AskLow>AskOpen|AskLow>AskHigh|AskLow>AskClose, 1, 0), 
  bias.close = if_else(AskClose>AskHigh|AskClose<AskLow, 1, 0))

data_tm1_30_amelia %>% 
  dplyr::filter(bias.open==1|bias.high==1|bias.low==1|bias.close==1)
## # A tibble: 1,704 x 14
##    .id   index               BidOpen BidHigh BidLow BidClose AskOpen
##    <chr> <dttm>                <dbl>   <dbl>  <dbl>    <dbl>   <dbl>
##  1 imp1  2017-08-07 00:04:00    111.    111.   111.     111.    111.
##  2 imp1  2017-08-07 00:07:00    111.    111.   111.     111.    111.
##  3 imp1  2017-08-07 00:17:00    111.    111.   111.     111.    111.
##  4 imp1  2017-08-07 00:18:00    111.    111.   111.     111.    111.
##  5 imp1  2017-08-07 00:22:00    111.    111.   111.     111.    111.
##  6 imp1  2017-08-07 00:50:00    111.    111.   111.     111.    111.
##  7 imp1  2017-08-07 00:52:00    111.    111.   111.     111.    111.
##  8 imp1  2017-08-07 00:59:00    111.    111.   111.     111.    111.
##  9 imp1  2017-08-07 01:00:00    111.    111.   111.     111.    111.
## 10 imp1  2017-08-07 01:12:00    111.    111.   111.     111.    111.
## # ... with 1,694 more rows, and 7 more variables: AskHigh <dbl>,
## #   AskLow <dbl>, AskClose <dbl>, bias.open <dbl>, bias.high <dbl>,
## #   bias.low <dbl>, bias.close <dbl>
data_tm1_30_amelia %<>% 
  ddply(.(.id), summarise, 
        AskOpen = mean((AskOpen - data_m1$AskOpen)^2), 
        AskHigh = mean((AskHigh - data_m1$AskHigh)^2), 
        AskLow = mean((AskLow - data_m1$AskLow)^2), 
        AskClose = mean((AskClose - data_m1$AskClose)^2), 
        Mean.HLC = (AskHigh + AskLow + AskClose)/3, 
        Mean.OHLC = (AskOpen + AskHigh + AskLow + AskClose)/4, 
        bias.open = sum(bias.open)/length(bias.open), 
        bias.high = sum(bias.high)/length(bias.high), 
        bias.low = sum(bias.low)/length(bias.low), 
        bias.close = sum(bias.close)/length(bias.close)) %>% tbl_df

data_tm1_30_amelia %>% 
  kable(caption = 'MSE') %>% 
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
  scroll_box(width = '100%')#, height = '400px')
MSE
.id AskOpen AskHigh AskLow AskClose Mean.HLC Mean.OHLC bias.open bias.high bias.low bias.close
imp1 48.06165 48.03872 48.09386 48.06788 48.06682 48.06553 0.1425061 0.1314496 0.1173219 0.1216216
imp2 48.06313 48.03819 48.09304 48.06734 48.06619 48.06543 0.1412776 0.1332924 0.1025799 0.1068796
imp3 48.06348 48.03940 48.09352 48.06649 48.06647 48.06572 0.1308354 0.1210074 0.1081081 0.1093366
imp4 48.06310 48.04006 48.09321 48.06804 48.06711 48.06610 0.1351351 0.1283784 0.1062654 0.1142506
imp5 48.06414 48.03950 48.09303 48.06767 48.06673 48.06608 0.1400491 0.1265356 0.1056511 0.1068796

以下使用tidyr::fill()弥补30%数据缺失值。

data_tm1_30_tidyr <- data_tm1_NA %>% 
  fill(BidOpen, BidHigh, BidLow, BidClose, 
       AskOpen, AskHigh, AskLow, AskClose) %>% #default direction down
  fill(BidOpen, BidHigh, BidLow, BidClose, 
       AskOpen, AskHigh, AskLow, AskClose, .direction = 'up')

data_tm1_30_tidyr %>% anyNA
## [1] FALSE
data_tm1_30_tidyr %<>% mutate(
  bias.open = if_else(AskOpen>AskHigh|AskOpen<AskLow, 1, 0), 
  bias.high = if_else(AskHigh<AskOpen|AskHigh<AskLow|AskHigh<AskClose, 1, 0), 
  bias.low = if_else(AskLow>AskOpen|AskLow>AskHigh|AskLow>AskClose, 1, 0), 
  bias.close = if_else(AskClose>AskHigh|AskClose<AskLow, 1, 0))

data_tm1_30_tidyr %>% 
  dplyr::filter(bias.open==1|bias.high==1|bias.low==1|bias.close==1)
## # A tibble: 431 x 13
##    index               BidOpen BidHigh BidLow BidClose AskOpen AskHigh
##    <dttm>                <dbl>   <dbl>  <dbl>    <dbl>   <dbl>   <dbl>
##  1 2017-08-07 00:02:00    111.    111.   111.     111.    111.    111.
##  2 2017-08-07 00:03:00    111.    111.   111.     111.    111.    111.
##  3 2017-08-07 00:04:00    111.    111.   111.     111.    111.    111.
##  4 2017-08-07 00:05:00    111.    111.   111.     111.    111.    111.
##  5 2017-08-07 00:06:00    111.    111.   111.     111.    111.    111.
##  6 2017-08-07 00:07:00    111.    111.   111.     111.    111.    111.
##  7 2017-08-07 00:18:00    111.    111.   111.     111.    111.    111.
##  8 2017-08-07 00:43:00    111.    111.   111.     111.    111.    111.
##  9 2017-08-07 00:44:00    111.    111.   111.     111.    111.    111.
## 10 2017-08-07 00:45:00    111.    111.   111.     111.    111.    111.
## # ... with 421 more rows, and 6 more variables: AskLow <dbl>,
## #   AskClose <dbl>, bias.open <dbl>, bias.high <dbl>, bias.low <dbl>,
## #   bias.close <dbl>
data_tm1_30_tidyr %<>% 
  summarise(
    AskOpen = mean((AskOpen - data_m1$AskOpen)^2), 
    AskHigh = mean((AskHigh - data_m1$AskHigh)^2), 
    AskLow = mean((AskLow - data_m1$AskLow)^2), 
    AskClose = mean((AskClose - data_m1$AskClose)^2), 
    Mean.HLC = (AskHigh + AskLow + AskClose)/3, 
    Mean.OHLC = (AskOpen + AskHigh + AskLow + AskClose)/4, 
    bias.open = sum(bias.open)/length(bias.open), 
    bias.high = sum(bias.high)/length(bias.high), 
    bias.low = sum(bias.low)/length(bias.low), 
    bias.close = sum(bias.close)/length(bias.close)) %>% tbl_df

data_tm1_30_tidyr %>% 
  kable(caption = 'MSE') %>% 
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
  scroll_box(width = '100%')#, height = '400px')
MSE
AskOpen AskHigh AskLow AskClose Mean.HLC Mean.OHLC bias.open bias.high bias.low bias.close
48.06791 48.04236 48.0958 48.06738 48.06851 48.06836 0.1621622 0.1461916 0.1541769 0.1658477

3.6 50% 缺失值

3.6.1 1分钟数据

以下使用imputeTS::na.seadec()弥补50%数据缺失值。

data_m1_NA <- data_m1 %>% 
  dplyr::select(BidOpen, BidHigh, BidLow, BidClose, 
                AskOpen, AskHigh, AskLow,  AskClose) %>% 
  prodNA(noNA = 0.5) %>% 
  cbind(data_m1[1], .) %>% tbl_df

data_m1_50_impTS <- llply(algo, function(x) {
  data_m1_NA %>% 
    dplyr::select(starts_with('Ask'), starts_with('Bid')) %>% 
    map(na.seadec, algorithm = x) %>% as.tibble
  })
names(data_m1_50_impTS) <- algo
data_m1_50_impTS %<>% ldply %>% tbl_df

data_m1_50_impTS %<>% mutate(
  bias.open = if_else(AskOpen>AskHigh|AskOpen<AskLow, 1, 0), 
  bias.high = if_else(AskHigh<AskOpen|AskHigh<AskLow|AskHigh<AskClose, 1, 0), 
  bias.low = if_else(AskLow>AskOpen|AskLow>AskHigh|AskLow>AskClose, 1, 0), 
  bias.close = if_else(AskClose>AskHigh|AskClose<AskLow, 1, 0))

data_m1_50_impTS %>% 
  dplyr::filter(bias.open==1|bias.high==1|bias.low==1|bias.close==1)
## # A tibble: 22,342 x 13
##    .id   AskOpen AskHigh AskLow AskClose BidOpen BidHigh BidLow BidClose
##    <chr>   <dbl>   <dbl>  <dbl>    <dbl>   <dbl>   <dbl>  <dbl>    <dbl>
##  1 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  2 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  3 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  4 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  5 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  6 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  7 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  8 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  9 inte~    104.    104.   104.     104.    104.    104.   104.     104.
## 10 inte~    104.    104.   104.     104.    104.    104.   104.     104.
## # ... with 22,332 more rows, and 4 more variables: bias.open <dbl>,
## #   bias.high <dbl>, bias.low <dbl>, bias.close <dbl>
data_m1_50_impTS %<>% 
  ddply(.(.id), summarise, 
        AskOpen = mean((AskOpen - data_m1$AskOpen)^2), 
        AskHigh = mean((AskHigh - data_m1$AskHigh)^2), 
        AskLow = mean((AskLow - data_m1$AskLow)^2), 
        AskClose = mean((AskClose - data_m1$AskClose)^2), 
        Mean.HLC = (AskHigh + AskLow + AskClose)/3, 
        Mean.OHLC = (AskOpen + AskHigh + AskLow + AskClose)/4, 
        bias.open = sum(bias.open)/length(bias.open), 
        bias.high = sum(bias.high)/length(bias.high), 
        bias.low = sum(bias.low)/length(bias.low), 
        bias.close = sum(bias.close)/length(bias.close)) %>% tbl_df

data_m1_50_impTS %>% 
  kable(caption = 'MSE 50% 缺失值') %>% 
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
  scroll_box(width = '100%')#, height = '400px')
MSE 50% 缺失
.id AskOpen AskHigh AskLow AskClose Mean.HLC Mean.OHLC bias.open bias.high bias.low bias.close
interpolation 0.0000704 0.0000582 0.0000780 0.0000750 0.0000704 0.0000704 0.2460484 0.2282837 0.2087005 0.2559799
kalman 0.0000704 0.0000582 0.0000783 0.0000750 0.0000705 0.0000705 0.2530424 0.2322003 0.2149951 0.2632536
locf 0.0001652 0.0001420 0.0001480 0.0001916 0.0001606 0.0001617 0.2764023 0.2480067 0.2397538 0.2771017
ma 0.0000804 0.0000680 0.0001001 0.0000872 0.0000851 0.0000839 0.2808784 0.2552805 0.2345783 0.2899706
mean 0.0306066 0.0306853 0.0305917 0.0302918 0.0305230 0.0305439 0.4684571 0.4066303 0.4129249 0.4671982
random 0.0401562 0.0407120 0.0546502 0.0518731 0.0490784 0.0468479 0.8181564 0.7682193 0.7954959 0.7911596

以下使用Amelia::amelia()弥补50%数据缺失值。

data_m1_50_amelia <- data_m1_NA %>% 
  amelia %>% 
  .$imputations %>% 
  ldply %>% tbl_df
## -- Imputation 1 --
## 
##   1  2  3  4  5  6  7  8  9 10 11 12
## 
## -- Imputation 2 --
## 
##   1  2  3  4  5  6  7  8  9
## 
## -- Imputation 3 --
## 
##   1  2  3  4  5  6  7  8  9
## 
## -- Imputation 4 --
## 
##   1  2  3  4  5  6  7  8  9 10 11
## 
## -- Imputation 5 --
## 
##   1  2  3  4  5  6  7  8
data_m1_50_amelia %>% anyNA
## [1] FALSE
data_m1_50_amelia %<>% mutate(
  bias.open = if_else(AskOpen>AskHigh|AskOpen<AskLow, 1, 0), 
  bias.high = if_else(AskHigh<AskOpen|AskHigh<AskLow|AskHigh<AskClose, 1, 0), 
  bias.low = if_else(AskLow>AskOpen|AskLow>AskHigh|AskLow>AskClose, 1, 0), 
  bias.close = if_else(AskClose>AskHigh|AskClose<AskLow, 1, 0))

data_m1_50_amelia %>% 
  dplyr::filter(bias.open==1|bias.high==1|bias.low==1|bias.close==1)
## # A tibble: 12,149 x 14
##    .id   index               BidOpen BidHigh BidLow BidClose AskOpen
##    <chr> <dttm>                <dbl>   <dbl>  <dbl>    <dbl>   <dbl>
##  1 imp1  2016-10-17 00:03:00    104.    104.   104.     104.    104.
##  2 imp1  2016-10-17 00:04:00    104.    104.   104.     104.    104.
##  3 imp1  2016-10-17 00:05:00    104.    104.   104.     104.    104.
##  4 imp1  2016-10-17 00:07:00    104.    104.   104.     104.    104.
##  5 imp1  2016-10-17 00:08:00    104.    104.   104.     104.    104.
##  6 imp1  2016-10-17 00:09:00    104.    104.   104.     104.    104.
##  7 imp1  2016-10-17 00:10:00    104.    104.   104.     104.    104.
##  8 imp1  2016-10-17 00:11:00    104.    104.   104.     104.    104.
##  9 imp1  2016-10-17 00:12:00    104.    104.   104.     104.    104.
## 10 imp1  2016-10-17 00:13:00    104.    104.   104.     104.    104.
## # ... with 12,139 more rows, and 7 more variables: AskHigh <dbl>,
## #   AskLow <dbl>, AskClose <dbl>, bias.open <dbl>, bias.high <dbl>,
## #   bias.low <dbl>, bias.close <dbl>
data_m1_50_amelia %<>% 
  ddply(.(.id), summarise, 
        AskOpen = mean((AskOpen - data_m1$AskOpen)^2), 
        AskHigh = mean((AskHigh - data_m1$AskHigh)^2), 
        AskLow = mean((AskLow - data_m1$AskLow)^2), 
        AskClose = mean((AskClose - data_m1$AskClose)^2), 
        Mean.HLC = (AskHigh + AskLow + AskClose)/3, 
        Mean.OHLC = (AskOpen + AskHigh + AskLow + AskClose)/4, 
        bias.open = sum(bias.open)/length(bias.open), 
        bias.high = sum(bias.high)/length(bias.high), 
        bias.low = sum(bias.low)/length(bias.low), 
        bias.close = sum(bias.close)/length(bias.close)) %>% tbl_df

data_m1_50_amelia %>% 
  kable(caption = 'MSE') %>% 
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
  scroll_box(width = '100%')#, height = '400px')
MSE
.id AskOpen AskHigh AskLow AskClose Mean.HLC Mean.OHLC bias.open bias.high bias.low bias.close
imp1 0.0002917 0.0002841 0.0002886 0.0002930 0.0002886 0.0002894 0.2471674 0.2261855 0.1993286 0.1994685
imp2 0.0004936 0.0004858 0.0004941 0.0004908 0.0004902 0.0004911 0.2330396 0.2022661 0.2007274 0.1881382
imp3 0.0005796 0.0005732 0.0005761 0.0005780 0.0005758 0.0005767 0.2238075 0.2071618 0.1945727 0.1948524
imp4 0.0004111 0.0004062 0.0004048 0.0004095 0.0004068 0.0004079 0.2572388 0.2260456 0.2201707 0.2075815
imp5 0.0005000 0.0004896 0.0005036 0.0004981 0.0004971 0.0004978 0.2285634 0.2045041 0.2000280 0.1926144

以下使用tidyr::fill()弥补50%数据缺失值。

data_m1_50_tidyr <- data_m1_NA %>% 
  fill(BidOpen, BidHigh, BidLow, BidClose, 
       AskOpen, AskHigh, AskLow, AskClose) %>% #default direction down
  fill(BidOpen, BidHigh, BidLow, BidClose, 
       AskOpen, AskHigh, AskLow, AskClose, .direction = 'up')

data_m1_50_tidyr %>% anyNA
## [1] FALSE
data_m1_50_tidyr %<>% mutate(
  bias.open = if_else(AskOpen>AskHigh|AskOpen<AskLow, 1, 0), 
  bias.high = if_else(AskHigh<AskOpen|AskHigh<AskLow|AskHigh<AskClose, 1, 0), 
  bias.low = if_else(AskLow>AskOpen|AskLow>AskHigh|AskLow>AskClose, 1, 0), 
  bias.close = if_else(AskClose>AskHigh|AskClose<AskLow, 1, 0))

data_m1_50_tidyr %>% 
  dplyr::filter(bias.open==1|bias.high==1|bias.low==1|bias.close==1)
## # A tibble: 2,974 x 13
##    index               BidOpen BidHigh BidLow BidClose AskOpen AskHigh
##    <dttm>                <dbl>   <dbl>  <dbl>    <dbl>   <dbl>   <dbl>
##  1 2016-10-17 00:04:00    104.    104.   104.     104.    104.    104.
##  2 2016-10-17 00:06:00    104.    104.   104.     104.    104.    104.
##  3 2016-10-17 00:07:00    104.    104.   104.     104.    104.    104.
##  4 2016-10-17 00:08:00    104.    104.   104.     104.    104.    104.
##  5 2016-10-17 00:09:00    104.    104.   104.     104.    104.    104.
##  6 2016-10-17 00:10:00    104.    104.   104.     104.    104.    104.
##  7 2016-10-17 00:11:00    104.    104.   104.     104.    104.    104.
##  8 2016-10-17 00:12:00    104.    104.   104.     104.    104.    104.
##  9 2016-10-17 00:13:00    104.    104.   104.     104.    104.    104.
## 10 2016-10-17 00:20:00    104.    104.   104.     104.    104.    104.
## # ... with 2,964 more rows, and 6 more variables: AskLow <dbl>,
## #   AskClose <dbl>, bias.open <dbl>, bias.high <dbl>, bias.low <dbl>,
## #   bias.close <dbl>
data_m1_50_tidyr %<>% 
  summarise(
    AskOpen = mean((AskOpen - data_m1$AskOpen)^2), 
    AskHigh = mean((AskHigh - data_m1$AskHigh)^2), 
    AskLow = mean((AskLow - data_m1$AskLow)^2), 
    AskClose = mean((AskClose - data_m1$AskClose)^2), 
    Mean.HLC = (AskHigh + AskLow + AskClose)/3, 
    Mean.OHLC = (AskOpen + AskHigh + AskLow + AskClose)/4, 
    bias.open = sum(bias.open)/length(bias.open), 
    bias.high = sum(bias.high)/length(bias.high), 
    bias.low = sum(bias.low)/length(bias.low), 
    bias.close = sum(bias.close)/length(bias.close)) %>% tbl_df

data_m1_50_tidyr %>% 
  kable(caption = 'MSE') %>% 
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
  scroll_box(width = '100%')#, height = '400px')
MSE
AskOpen AskHigh AskLow AskClose Mean.HLC Mean.OHLC bias.open bias.high bias.low bias.close
0.0001652 0.000142 0.000148 0.0001916 0.0001606 0.0001617 0.2764023 0.2480067 0.2397538 0.2771017

3.6.2 Tick数据转为1分钟数据

以下使用imputeTS::na.seadec()弥补50%数据缺失值。

data_tm1_NA <- data_tm1 %>% 
  dplyr::select(BidOpen, BidHigh, BidLow, BidClose, 
                AskOpen, AskHigh, AskLow,  AskClose) %>% 
  prodNA(noNA = 0.5) %>% 
  cbind(data_tm1[1], .) %>% tbl_df

data_tm1_50_impTS <- llply(algo, function(x) {
  data_tm1_NA %>% 
    dplyr::select(starts_with('Ask'), starts_with('Bid')) %>% 
    map(na.seadec, algorithm = x) %>% as.tibble
  })
names(data_tm1_50_impTS) <- algo
data_tm1_50_impTS %<>% ldply %>% tbl_df

data_tm1_50_impTS %<>% mutate(
  bias.open = if_else(AskOpen>AskHigh|AskOpen<AskLow, 1, 0), 
  bias.high = if_else(AskHigh<AskOpen|AskHigh<AskLow|AskHigh<AskClose, 1, 0), 
  bias.low = if_else(AskLow>AskOpen|AskLow>AskHigh|AskLow>AskClose, 1, 0), 
  bias.close = if_else(AskClose>AskHigh|AskClose<AskLow, 1, 0))

data_tm1_50_impTS %>% 
  dplyr::filter(bias.open==1|bias.high==1|bias.low==1|bias.close==1)
## # A tibble: 4,649 x 13
##    .id   AskOpen AskHigh AskLow AskClose BidOpen BidHigh BidLow BidClose
##    <chr>   <dbl>   <dbl>  <dbl>    <dbl>   <dbl>   <dbl>  <dbl>    <dbl>
##  1 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  2 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  3 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  4 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  5 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  6 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  7 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  8 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  9 inte~    111.    111.   111.     111.    111.    111.   111.     111.
## 10 inte~    111.    111.   111.     111.    111.    111.   111.     111.
## # ... with 4,639 more rows, and 4 more variables: bias.open <dbl>,
## #   bias.high <dbl>, bias.low <dbl>, bias.close <dbl>
data_tm1_50_impTS %<>% 
  ddply(.(.id), summarise, 
        AskOpen = mean((AskOpen - data_m1$AskOpen)^2), 
        AskHigh = mean((AskHigh - data_m1$AskHigh)^2), 
        AskLow = mean((AskLow - data_m1$AskLow)^2), 
        AskClose = mean((AskClose - data_m1$AskClose)^2), 
        Mean.HLC = (AskHigh + AskLow + AskClose)/3, 
        Mean.OHLC = (AskOpen + AskHigh + AskLow + AskClose)/4, 
        bias.open = sum(bias.open)/length(bias.open), 
        bias.high = sum(bias.high)/length(bias.high), 
        bias.low = sum(bias.low)/length(bias.low), 
        bias.close = sum(bias.close)/length(bias.close)) %>% tbl_df

data_tm1_50_impTS %>% 
  kable(caption = 'MSE 50% 缺失值') %>% 
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
  scroll_box(width = '100%')#, height = '400px')
MSE 50% 缺失
.id AskOpen AskHigh AskLow AskClose Mean.HLC Mean.OHLC bias.open bias.high bias.low bias.close
interpolation 48.06646 48.03692 48.09347 48.06623 48.06554 48.06577 0.2223587 0.2094595 0.1799754 0.2260442
kalman 48.06676 48.03676 48.09347 48.06627 48.06550 48.06581 0.2340295 0.2168305 0.1848894 0.2315725
locf 48.06433 48.03798 48.09349 48.06476 48.06541 48.06514 0.2807125 0.2444717 0.2242015 0.2585995
ma 48.06724 48.03628 48.09353 48.06685 48.06555 48.06597 0.2469287 0.2266585 0.1996314 0.2432432
mean 48.10011 48.05789 48.10226 48.06800 48.07605 48.08206 0.4484029 0.3900491 0.3832924 0.4318182
random 48.61875 48.18569 47.83183 47.39597 47.80450 48.00806 0.6044226 0.5761671 0.5909091 0.6173219

以下使用Amelia::amelia()弥补50%数据缺失值。

data_tm1_50_amelia <- data_tm1_NA %>% 
  amelia %>% 
  .$imputations %>% 
  ldply %>% tbl_df
## -- Imputation 1 --
## 
##   1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
##  21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
##  41 42
## 
## -- Imputation 2 --
## 
##   1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
##  21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38
## 
## -- Imputation 3 --
## 
##   1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
##  21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
##  41 42 43
## 
## -- Imputation 4 --
## 
##   1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
##  21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
##  41 42
## 
## -- Imputation 5 --
## 
##   1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
##  21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
##  41 42 43 44 45
data_tm1_50_amelia %>% anyNA
## [1] FALSE
data_tm1_50_amelia %<>% mutate(
  bias.open = if_else(AskOpen>AskHigh|AskOpen<AskLow, 1, 0), 
  bias.high = if_else(AskHigh<AskOpen|AskHigh<AskLow|AskHigh<AskClose, 1, 0), 
  bias.low = if_else(AskLow>AskOpen|AskLow>AskHigh|AskLow>AskClose, 1, 0), 
  bias.close = if_else(AskClose>AskHigh|AskClose<AskLow, 1, 0))

data_tm1_50_amelia %>% 
  dplyr::filter(bias.open==1|bias.high==1|bias.low==1|bias.close==1)
## # A tibble: 2,800 x 14
##    .id   index               BidOpen BidHigh BidLow BidClose AskOpen
##    <chr> <dttm>                <dbl>   <dbl>  <dbl>    <dbl>   <dbl>
##  1 imp1  2017-08-07 00:03:00    111.    111.   111.     111.    111.
##  2 imp1  2017-08-07 00:07:00    111.    111.   111.     111.    111.
##  3 imp1  2017-08-07 00:08:00    111.    111.   111.     111.    111.
##  4 imp1  2017-08-07 00:18:00    111.    111.   111.     111.    111.
##  5 imp1  2017-08-07 00:24:00    111.    111.   111.     111.    111.
##  6 imp1  2017-08-07 00:40:00    111.    111.   111.     111.    111.
##  7 imp1  2017-08-07 00:42:00    111.    111.   111.     111.    111.
##  8 imp1  2017-08-07 00:43:00    111.    111.   111.     111.    111.
##  9 imp1  2017-08-07 00:44:00    111.    111.   111.     111.    111.
## 10 imp1  2017-08-07 00:45:00    111.    111.   111.     111.    111.
## # ... with 2,790 more rows, and 7 more variables: AskHigh <dbl>,
## #   AskLow <dbl>, AskClose <dbl>, bias.open <dbl>, bias.high <dbl>,
## #   bias.low <dbl>, bias.close <dbl>
data_tm1_50_amelia %<>% 
  ddply(.(.id), summarise, 
        AskOpen = mean((AskOpen - data_m1$AskOpen)^2), 
        AskHigh = mean((AskHigh - data_m1$AskHigh)^2), 
        AskLow = mean((AskLow - data_m1$AskLow)^2), 
        AskClose = mean((AskClose - data_m1$AskClose)^2), 
        Mean.HLC = (AskHigh + AskLow + AskClose)/3, 
        Mean.OHLC = (AskOpen + AskHigh + AskLow + AskClose)/4, 
        bias.open = sum(bias.open)/length(bias.open), 
        bias.high = sum(bias.high)/length(bias.high), 
        bias.low = sum(bias.low)/length(bias.low), 
        bias.close = sum(bias.close)/length(bias.close)) %>% tbl_df

data_tm1_50_amelia %>% 
  kable(caption = 'MSE') %>% 
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
  scroll_box(width = '100%')#, height = '400px')
MSE
.id AskOpen AskHigh AskLow AskClose Mean.HLC Mean.OHLC bias.open bias.high bias.low bias.close
imp1 48.06908 48.04628 48.10127 48.07136 48.07297 48.07200 0.2192875 0.1934889 0.2039312 0.2027027
imp2 48.06819 48.04509 48.09811 48.07183 48.07168 48.07080 0.2340295 0.2235872 0.2094595 0.2260442
imp3 48.06598 48.04386 48.09902 48.06832 48.07040 48.06930 0.2156020 0.1971744 0.2082310 0.2100737
imp4 48.06758 48.04450 48.09766 48.07323 48.07180 48.07074 0.2143735 0.2039312 0.1947174 0.2063882
imp5 48.06570 48.04092 48.09609 48.06943 48.06881 48.06803 0.2506143 0.2346437 0.2223587 0.2297297

以下使用tidyr::fill()弥补50%数据缺失值。

data_tm1_50_tidyr <- data_tm1_NA %>% 
  fill(BidOpen, BidHigh, BidLow, BidClose, 
       AskOpen, AskHigh, AskLow, AskClose) %>% #default direction down
  fill(BidOpen, BidHigh, BidLow, BidClose, 
       AskOpen, AskHigh, AskLow, AskClose, .direction = 'up')

data_tm1_50_tidyr %>% anyNA
## [1] FALSE
data_tm1_50_tidyr %<>% mutate(
  bias.open = if_else(AskOpen>AskHigh|AskOpen<AskLow, 1, 0), 
  bias.high = if_else(AskHigh<AskOpen|AskHigh<AskLow|AskHigh<AskClose, 1, 0), 
  bias.low = if_else(AskLow>AskOpen|AskLow>AskHigh|AskLow>AskClose, 1, 0), 
  bias.close = if_else(AskClose>AskHigh|AskClose<AskLow, 1, 0))

data_tm1_50_tidyr %>% 
  dplyr::filter(bias.open==1|bias.high==1|bias.low==1|bias.close==1)
## # A tibble: 656 x 13
##    index               BidOpen BidHigh BidLow BidClose AskOpen AskHigh
##    <dttm>                <dbl>   <dbl>  <dbl>    <dbl>   <dbl>   <dbl>
##  1 2017-08-07 00:02:00    111.    111.   111.     111.    111.    111.
##  2 2017-08-07 00:03:00    111.    111.   111.     111.    111.    111.
##  3 2017-08-07 00:04:00    111.    111.   111.     111.    111.    111.
##  4 2017-08-07 00:05:00    111.    111.   111.     111.    111.    111.
##  5 2017-08-07 00:07:00    111.    111.   111.     111.    111.    111.
##  6 2017-08-07 00:17:00    111.    111.   111.     111.    111.    111.
##  7 2017-08-07 00:18:00    111.    111.   111.     111.    111.    111.
##  8 2017-08-07 00:43:00    111.    111.   111.     111.    111.    111.
##  9 2017-08-07 00:44:00    111.    111.   111.     111.    111.    111.
## 10 2017-08-07 00:45:00    111.    111.   111.     111.    111.    111.
## # ... with 646 more rows, and 6 more variables: AskLow <dbl>,
## #   AskClose <dbl>, bias.open <dbl>, bias.high <dbl>, bias.low <dbl>,
## #   bias.close <dbl>
data_tm1_50_tidyr %<>% 
  summarise(
    AskOpen = mean((AskOpen - data_m1$AskOpen)^2), 
    AskHigh = mean((AskHigh - data_m1$AskHigh)^2), 
    AskLow = mean((AskLow - data_m1$AskLow)^2), 
    AskClose = mean((AskClose - data_m1$AskClose)^2), 
    Mean.HLC = (AskHigh + AskLow + AskClose)/3, 
    Mean.OHLC = (AskOpen + AskHigh + AskLow + AskClose)/4, 
    bias.open = sum(bias.open)/length(bias.open), 
    bias.high = sum(bias.high)/length(bias.high), 
    bias.low = sum(bias.low)/length(bias.low), 
    bias.close = sum(bias.close)/length(bias.close)) %>% tbl_df

data_tm1_50_tidyr %>% 
  kable(caption = 'MSE') %>% 
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
  scroll_box(width = '100%')#, height = '400px')
MSE
AskOpen AskHigh AskLow AskClose Mean.HLC Mean.OHLC bias.open bias.high bias.low bias.close
48.06433 48.03798 48.09349 48.06476 48.06541 48.06514 0.2807125 0.2444717 0.2242015 0.2585995

3.7 65% 缺失值

3.7.1 1分钟数据

以下使用imputeTS::na.seadec()弥补65%数据缺失值。

data_m1_NA <- data_m1 %>% 
  dplyr::select(BidOpen, BidHigh, BidLow, BidClose, 
                AskOpen, AskHigh, AskLow,  AskClose) %>% 
  prodNA(noNA = 0.65) %>% 
  cbind(data_m1[1], .) %>% tbl_df

data_m1_65_impTS <- llply(algo, function(x) {
  data_m1_NA %>% 
    dplyr::select(starts_with('Ask'), starts_with('Bid')) %>% 
    map(na.seadec, algorithm = x) %>% as.tibble
  })
names(data_m1_65_impTS) <- algo
data_m1_65_impTS %<>% ldply %>% tbl_df

data_m1_65_impTS %<>% mutate(
  bias.open = if_else(AskOpen>AskHigh|AskOpen<AskLow, 1, 0), 
  bias.high = if_else(AskHigh<AskOpen|AskHigh<AskLow|AskHigh<AskClose, 1, 0), 
  bias.low = if_else(AskLow>AskOpen|AskLow>AskHigh|AskLow>AskClose, 1, 0), 
  bias.close = if_else(AskClose>AskHigh|AskClose<AskLow, 1, 0))

data_m1_65_impTS %>% 
  dplyr::filter(bias.open==1|bias.high==1|bias.low==1|bias.close==1)
## # A tibble: 24,251 x 13
##    .id   AskOpen AskHigh AskLow AskClose BidOpen BidHigh BidLow BidClose
##    <chr>   <dbl>   <dbl>  <dbl>    <dbl>   <dbl>   <dbl>  <dbl>    <dbl>
##  1 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  2 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  3 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  4 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  5 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  6 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  7 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  8 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  9 inte~    104.    104.   104.     104.    104.    104.   104.     104.
## 10 inte~    104.    104.   104.     104.    104.    104.   104.     104.
## # ... with 24,241 more rows, and 4 more variables: bias.open <dbl>,
## #   bias.high <dbl>, bias.low <dbl>, bias.close <dbl>
data_m1_65_impTS %<>% 
  ddply(.(.id), summarise, 
        AskOpen = mean((AskOpen - data_m1$AskOpen)^2), 
        AskHigh = mean((AskHigh - data_m1$AskHigh)^2), 
        AskLow = mean((AskLow - data_m1$AskLow)^2), 
        AskClose = mean((AskClose - data_m1$AskClose)^2), 
        Mean.HLC = (AskHigh + AskLow + AskClose)/3, 
        Mean.OHLC = (AskOpen + AskHigh + AskLow + AskClose)/4, 
        bias.open = sum(bias.open)/length(bias.open), 
        bias.high = sum(bias.high)/length(bias.high), 
        bias.low = sum(bias.low)/length(bias.low), 
        bias.close = sum(bias.close)/length(bias.close)) %>% tbl_df

data_m1_65_impTS %>% 
  kable(caption = 'MSE 65% 缺失值') %>% 
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
  scroll_box(width = '100%')#, height = '400px')
MSE 65% 缺失
.id AskOpen AskHigh AskLow AskClose Mean.HLC Mean.OHLC bias.open bias.high bias.low bias.close
interpolation 0.0001279 0.0001238 0.0001136 0.0001206 0.0001193 0.0001215 0.3662051 0.3241013 0.2831165 0.3394880
kalman 0.0001282 0.0001238 0.0001133 0.0001202 0.0001191 0.0001214 0.3674640 0.3241013 0.2845153 0.3415862
locf 0.0002955 0.0003304 0.0002383 0.0002986 0.0002891 0.0002907 0.3817317 0.3404672 0.3154287 0.3776752
ma 0.0001512 0.0001488 0.0001313 0.0001449 0.0001417 0.0001441 0.3813121 0.3329137 0.3022800 0.3621486
mean 0.0400602 0.0406286 0.0391629 0.0395314 0.0397743 0.0398458 0.4402014 0.3983774 0.3761365 0.4364247
random 0.0533171 0.0806651 0.1361035 0.1684688 0.1284125 0.1096386 0.2796195 0.2432508 0.6848510 0.6998182

以下使用Amelia::amelia()弥补65%数据缺失值。

data_m1_65_amelia <- data_m1_NA %>% 
  amelia %>% 
  .$imputations %>% 
  ldply %>% tbl_df
## -- Imputation 1 --
## 
##   1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
##  21 22 23 24 25 26 27 28 29 30 31 32 33
## 
## -- Imputation 2 --
## 
##   1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
##  21 22 23 24 25 26 27 28 29 30 31 32 33 34
## 
## -- Imputation 3 --
## 
##   1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
##  21 22 23 24 25 26 27 28 29 30 31
## 
## -- Imputation 4 --
## 
##   1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
##  21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37
## 
## -- Imputation 5 --
## 
##   1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
##  21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37
data_m1_65_amelia %>% anyNA
## [1] FALSE
data_m1_65_amelia %<>% mutate(
  bias.open = if_else(AskOpen>AskHigh|AskOpen<AskLow, 1, 0), 
  bias.high = if_else(AskHigh<AskOpen|AskHigh<AskLow|AskHigh<AskClose, 1, 0), 
  bias.low = if_else(AskLow>AskOpen|AskLow>AskHigh|AskLow>AskClose, 1, 0), 
  bias.close = if_else(AskClose>AskHigh|AskClose<AskLow, 1, 0))

data_m1_65_amelia %>% 
  dplyr::filter(bias.open==1|bias.high==1|bias.low==1|bias.close==1)
## # A tibble: 17,317 x 14
##    .id   index               BidOpen BidHigh BidLow BidClose AskOpen
##    <chr> <dttm>                <dbl>   <dbl>  <dbl>    <dbl>   <dbl>
##  1 imp1  2016-10-17 00:03:00    104.    104.   104.     104.    104.
##  2 imp1  2016-10-17 00:04:00    104.    104.   104.     104.    104.
##  3 imp1  2016-10-17 00:07:00    104.    104.   104.     104.    104.
##  4 imp1  2016-10-17 00:08:00    104.    104.   104.     104.    104.
##  5 imp1  2016-10-17 00:12:00    104.    104.   104.     104.    104.
##  6 imp1  2016-10-17 00:15:00    104.    104.   104.     104.    104.
##  7 imp1  2016-10-17 00:16:00    104.    104.   104.     104.    104.
##  8 imp1  2016-10-17 00:17:00    104.    104.   104.     104.    104.
##  9 imp1  2016-10-17 00:18:00    104.    104.   104.     104.    104.
## 10 imp1  2016-10-17 00:20:00    104.    104.   104.     104.    104.
## # ... with 17,307 more rows, and 7 more variables: AskHigh <dbl>,
## #   AskLow <dbl>, AskClose <dbl>, bias.open <dbl>, bias.high <dbl>,
## #   bias.low <dbl>, bias.close <dbl>
data_m1_65_amelia %<>% 
  ddply(.(.id), summarise, 
        AskOpen = mean((AskOpen - data_m1$AskOpen)^2), 
        AskHigh = mean((AskHigh - data_m1$AskHigh)^2), 
        AskLow = mean((AskLow - data_m1$AskLow)^2), 
        AskClose = mean((AskClose - data_m1$AskClose)^2), 
        Mean.HLC = (AskHigh + AskLow + AskClose)/3, 
        Mean.OHLC = (AskOpen + AskHigh + AskLow + AskClose)/4, 
        bias.open = sum(bias.open)/length(bias.open), 
        bias.high = sum(bias.high)/length(bias.high), 
        bias.low = sum(bias.low)/length(bias.low), 
        bias.close = sum(bias.close)/length(bias.close)) %>% tbl_df

data_m1_65_amelia %>% 
  kable(caption = 'MSE') %>% 
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
  scroll_box(width = '100%')#, height = '400px')
MSE
.id AskOpen AskHigh AskLow AskClose Mean.HLC Mean.OHLC bias.open bias.high bias.low bias.close
imp1 0.0037038 0.0036950 0.0037028 0.0037144 0.0037040 0.0037040 0.3534760 0.3313750 0.3049378 0.3376696
imp2 0.0037461 0.0037553 0.0037236 0.0037351 0.0037380 0.0037400 0.3432648 0.3056372 0.2940271 0.3113722
imp3 0.0040437 0.0040587 0.0040400 0.0040502 0.0040496 0.0040481 0.3513778 0.3303959 0.2906700 0.3392083
imp4 0.0040114 0.0039923 0.0040178 0.0040065 0.0040055 0.0040070 0.3424255 0.3155686 0.2971045 0.3151490
imp5 0.0041922 0.0041956 0.0041963 0.0042075 0.0041998 0.0041979 0.3347321 0.3033991 0.2861939 0.3043782

以下使用tidyr::fill()弥补65%数据缺失值。

data_m1_65_tidyr <- data_m1_NA %>% 
  fill(BidOpen, BidHigh, BidLow, BidClose, 
       AskOpen, AskHigh, AskLow, AskClose) %>% #default direction down
  fill(BidOpen, BidHigh, BidLow, BidClose, 
       AskOpen, AskHigh, AskLow, AskClose, .direction = 'up')

data_m1_65_tidyr %>% anyNA
## [1] FALSE
data_m1_65_tidyr %<>% mutate(
  bias.open = if_else(AskOpen>AskHigh|AskOpen<AskLow, 1, 0), 
  bias.high = if_else(AskHigh<AskOpen|AskHigh<AskLow|AskHigh<AskClose, 1, 0), 
  bias.low = if_else(AskLow>AskOpen|AskLow>AskHigh|AskLow>AskClose, 1, 0), 
  bias.close = if_else(AskClose>AskHigh|AskClose<AskLow, 1, 0))

data_m1_65_tidyr %>% 
  dplyr::filter(bias.open==1|bias.high==1|bias.low==1|bias.close==1)
## # A tibble: 3,737 x 13
##    index               BidOpen BidHigh BidLow BidClose AskOpen AskHigh
##    <dttm>                <dbl>   <dbl>  <dbl>    <dbl>   <dbl>   <dbl>
##  1 2016-10-17 00:07:00    104.    104.   104.     104.    104.    104.
##  2 2016-10-17 00:08:00    104.    104.   104.     104.    104.    104.
##  3 2016-10-17 00:09:00    104.    104.   104.     104.    104.    104.
##  4 2016-10-17 00:10:00    104.    104.   104.     104.    104.    104.
##  5 2016-10-17 00:11:00    104.    104.   104.     104.    104.    104.
##  6 2016-10-17 00:12:00    104.    104.   104.     104.    104.    104.
##  7 2016-10-17 00:13:00    104.    104.   104.     104.    104.    104.
##  8 2016-10-17 00:14:00    104.    104.   104.     104.    104.    104.
##  9 2016-10-17 00:15:00    104.    104.   104.     104.    104.    104.
## 10 2016-10-17 00:18:00    104.    104.   104.     104.    104.    104.
## # ... with 3,727 more rows, and 6 more variables: AskLow <dbl>,
## #   AskClose <dbl>, bias.open <dbl>, bias.high <dbl>, bias.low <dbl>,
## #   bias.close <dbl>
data_m1_65_tidyr %<>% 
  summarise(
    AskOpen = mean((AskOpen - data_m1$AskOpen)^2), 
    AskHigh = mean((AskHigh - data_m1$AskHigh)^2), 
    AskLow = mean((AskLow - data_m1$AskLow)^2), 
    AskClose = mean((AskClose - data_m1$AskClose)^2), 
    Mean.HLC = (AskHigh + AskLow + AskClose)/3, 
    Mean.OHLC = (AskOpen + AskHigh + AskLow + AskClose)/4, 
    bias.open = sum(bias.open)/length(bias.open), 
    bias.high = sum(bias.high)/length(bias.high), 
    bias.low = sum(bias.low)/length(bias.low), 
    bias.close = sum(bias.close)/length(bias.close)) %>% tbl_df

data_m1_65_tidyr %>% 
  kable(caption = 'MSE') %>% 
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
  scroll_box(width = '100%')#, height = '400px')
MSE
AskOpen AskHigh AskLow AskClose Mean.HLC Mean.OHLC bias.open bias.high bias.low bias.close
0.0002955 0.0003304 0.0002383 0.0002986 0.0002891 0.0002907 0.3817317 0.3404672 0.3154287 0.3776752

3.7.2 Tick数据转为1分钟数据

以下使用imputeTS::na.seadec()弥补65%数据缺失值。

data_tm1_NA <- data_tm1 %>% 
  dplyr::select(BidOpen, BidHigh, BidLow, BidClose, 
                AskOpen, AskHigh, AskLow,  AskClose) %>% 
  prodNA(noNA = 0.65) %>% 
  cbind(data_tm1[1], .) %>% tbl_df

data_tm1_65_impTS <- llply(algo, function(x) {
  data_tm1_NA %>% 
    dplyr::select(starts_with('Ask'), starts_with('Bid')) %>% 
    map(na.seadec, algorithm = x) %>% as.tibble
  })
names(data_tm1_65_impTS) <- algo
data_tm1_65_impTS %<>% ldply %>% tbl_df

data_tm1_65_impTS %<>% mutate(
  bias.open = if_else(AskOpen>AskHigh|AskOpen<AskLow, 1, 0), 
  bias.high = if_else(AskHigh<AskOpen|AskHigh<AskLow|AskHigh<AskClose, 1, 0), 
  bias.low = if_else(AskLow>AskOpen|AskLow>AskHigh|AskLow>AskClose, 1, 0), 
  bias.close = if_else(AskClose>AskHigh|AskClose<AskLow, 1, 0))

data_tm1_65_impTS %>% 
  dplyr::filter(bias.open==1|bias.high==1|bias.low==1|bias.close==1)
## # A tibble: 5,425 x 13
##    .id   AskOpen AskHigh AskLow AskClose BidOpen BidHigh BidLow BidClose
##    <chr>   <dbl>   <dbl>  <dbl>    <dbl>   <dbl>   <dbl>  <dbl>    <dbl>
##  1 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  2 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  3 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  4 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  5 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  6 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  7 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  8 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  9 inte~    111.    111.   111.     111.    111.    111.   111.     111.
## 10 inte~    111.    111.   111.     111.    111.    111.   111.     111.
## # ... with 5,415 more rows, and 4 more variables: bias.open <dbl>,
## #   bias.high <dbl>, bias.low <dbl>, bias.close <dbl>
data_tm1_65_impTS %<>% 
  ddply(.(.id), summarise, 
        AskOpen = mean((AskOpen - data_m1$AskOpen)^2), 
        AskHigh = mean((AskHigh - data_m1$AskHigh)^2), 
        AskLow = mean((AskLow - data_m1$AskLow)^2), 
        AskClose = mean((AskClose - data_m1$AskClose)^2), 
        Mean.HLC = (AskHigh + AskLow + AskClose)/3, 
        Mean.OHLC = (AskOpen + AskHigh + AskLow + AskClose)/4, 
        bias.open = sum(bias.open)/length(bias.open), 
        bias.high = sum(bias.high)/length(bias.high), 
        bias.low = sum(bias.low)/length(bias.low), 
        bias.close = sum(bias.close)/length(bias.close)) %>% tbl_df

data_tm1_65_impTS %>% 
  kable(caption = 'MSE 65% 缺失值') %>% 
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
  scroll_box(width = '100%')#, height = '400px')
MSE 65% 缺失
.id AskOpen AskHigh AskLow AskClose Mean.HLC Mean.OHLC bias.open bias.high bias.low bias.close
interpolation 48.07036 48.04468 48.08991 48.07116 48.06858 48.06903 0.3095823 0.2659705 0.2457002 0.2948403
kalman 48.07038 48.04468 48.08991 48.07061 48.06840 48.06890 0.3114251 0.2684275 0.2450860 0.2911548
locf 48.06478 48.04620 48.08847 48.06434 48.06634 48.06595 0.3544226 0.3138821 0.2942260 0.3445946
ma 48.07108 48.04394 48.09041 48.07027 48.06821 48.06892 0.3353808 0.2764128 0.2647420 0.3101966
mean 48.06675 48.02981 48.12859 48.11165 48.09002 48.08420 0.4367322 0.3814496 0.3783784 0.4176904
random 47.50568 48.49940 49.01626 47.85631 48.45733 48.21942 0.8832924 0.6977887 0.9195332 0.8298526

以下使用Amelia::amelia()弥补65%数据缺失值。

data_tm1_65_amelia <- data_tm1_NA %>% 
  amelia %>% 
  .$imputations %>% 
  ldply %>% tbl_df
## -- Imputation 1 --
## 
##   1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
##  21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
##  41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
##  61 62 63 64 65 66 67 68 69
## 
## -- Imputation 2 --
## 
##   1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
##  21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
##  41 42 43 44 45 46 47 48 49 50 51 52 53 54 55
## 
## -- Imputation 3 --
## 
##   1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
##  21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
##  41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
##  61 62 63
## 
## -- Imputation 4 --
## 
##   1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
##  21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
##  41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
##  61 62 63 64 65 66 67 68 69 70 71 72
## 
## -- Imputation 5 --
## 
##   1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
##  21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
##  41 42 43 44 45 46 47 48
data_tm1_65_amelia %>% anyNA
## [1] FALSE
data_tm1_65_amelia %<>% mutate(
  bias.open = if_else(AskOpen>AskHigh|AskOpen<AskLow, 1, 0), 
  bias.high = if_else(AskHigh<AskOpen|AskHigh<AskLow|AskHigh<AskClose, 1, 0), 
  bias.low = if_else(AskLow>AskOpen|AskLow>AskHigh|AskLow>AskClose, 1, 0), 
  bias.close = if_else(AskClose>AskHigh|AskClose<AskLow, 1, 0))

data_tm1_65_amelia %>% 
  dplyr::filter(bias.open==1|bias.high==1|bias.low==1|bias.close==1)
## # A tibble: 3,558 x 14
##    .id   index               BidOpen BidHigh BidLow BidClose AskOpen
##    <chr> <dttm>                <dbl>   <dbl>  <dbl>    <dbl>   <dbl>
##  1 imp1  2017-08-07 00:03:00    111.    111.   111.     111.    111.
##  2 imp1  2017-08-07 00:04:00    111.    111.   111.     111.    111.
##  3 imp1  2017-08-07 00:05:00    111.    111.   111.     111.    111.
##  4 imp1  2017-08-07 00:06:00    111.    111.   111.     111.    111.
##  5 imp1  2017-08-07 00:09:00    111.    111.   111.     111.    111.
##  6 imp1  2017-08-07 00:18:00    111.    111.   111.     111.    111.
##  7 imp1  2017-08-07 00:22:00    111.    111.   111.     111.    111.
##  8 imp1  2017-08-07 00:24:00    111.    111.   111.     111.    111.
##  9 imp1  2017-08-07 00:38:00    111.    111.   111.     111.    111.
## 10 imp1  2017-08-07 00:39:00    111.    111.   111.     111.    111.
## # ... with 3,548 more rows, and 7 more variables: AskHigh <dbl>,
## #   AskLow <dbl>, AskClose <dbl>, bias.open <dbl>, bias.high <dbl>,
## #   bias.low <dbl>, bias.close <dbl>
data_tm1_65_amelia %<>% 
  ddply(.(.id), summarise, 
        AskOpen = mean((AskOpen - data_m1$AskOpen)^2), 
        AskHigh = mean((AskHigh - data_m1$AskHigh)^2), 
        AskLow = mean((AskLow - data_m1$AskLow)^2), 
        AskClose = mean((AskClose - data_m1$AskClose)^2), 
        Mean.HLC = (AskHigh + AskLow + AskClose)/3, 
        Mean.OHLC = (AskOpen + AskHigh + AskLow + AskClose)/4, 
        bias.open = sum(bias.open)/length(bias.open), 
        bias.high = sum(bias.high)/length(bias.high), 
        bias.low = sum(bias.low)/length(bias.low), 
        bias.close = sum(bias.close)/length(bias.close)) %>% tbl_df

data_tm1_65_amelia %>% 
  kable(caption = 'MSE') %>% 
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
  scroll_box(width = '100%')#, height = '400px')
MSE
.id AskOpen AskHigh AskLow AskClose Mean.HLC Mean.OHLC bias.open bias.high bias.low bias.close
imp1 48.06753 48.03945 48.09411 48.06812 48.06723 48.06730 0.3519656 0.3028256 0.2714988 0.2542998
imp2 48.06781 48.03969 48.09423 48.06629 48.06673 48.06700 0.3347666 0.2819410 0.2813268 0.2647420
imp3 48.06148 48.03451 48.08888 48.06000 48.06113 48.06122 0.3132678 0.2727273 0.2880835 0.2788698
imp4 48.06864 48.03979 48.09638 48.06667 48.06761 48.06787 0.3464373 0.2929975 0.3015971 0.2745700
imp5 48.06606 48.03924 48.08798 48.06334 48.06352 48.06416 0.3335381 0.2899263 0.2506143 0.2555283

以下使用tidyr::fill()弥补65%数据缺失值。

data_tm1_65_tidyr <- data_tm1_NA %>% 
  fill(BidOpen, BidHigh, BidLow, BidClose, 
       AskOpen, AskHigh, AskLow, AskClose) %>% #default direction down
  fill(BidOpen, BidHigh, BidLow, BidClose, 
       AskOpen, AskHigh, AskLow, AskClose, .direction = 'up')

data_tm1_65_tidyr %>% anyNA
## [1] FALSE
data_tm1_65_tidyr %<>% mutate(
  bias.open = if_else(AskOpen>AskHigh|AskOpen<AskLow, 1, 0), 
  bias.high = if_else(AskHigh<AskOpen|AskHigh<AskLow|AskHigh<AskClose, 1, 0), 
  bias.low = if_else(AskLow>AskOpen|AskLow>AskHigh|AskLow>AskClose, 1, 0), 
  bias.close = if_else(AskClose>AskHigh|AskClose<AskLow, 1, 0))

data_tm1_65_tidyr %>% 
  dplyr::filter(bias.open==1|bias.high==1|bias.low==1|bias.close==1)
## # A tibble: 804 x 13
##    index               BidOpen BidHigh BidLow BidClose AskOpen AskHigh
##    <dttm>                <dbl>   <dbl>  <dbl>    <dbl>   <dbl>   <dbl>
##  1 2017-08-07 00:02:00    111.    111.   111.     111.    111.    111.
##  2 2017-08-07 00:03:00    111.    111.   111.     111.    111.    111.
##  3 2017-08-07 00:06:00    111.    111.   111.     111.    111.    111.
##  4 2017-08-07 00:07:00    111.    111.   111.     111.    111.    111.
##  5 2017-08-07 00:08:00    111.    111.   111.     111.    111.    111.
##  6 2017-08-07 00:09:00    111.    111.   111.     111.    111.    111.
##  7 2017-08-07 00:11:00    111.    111.   111.     111.    111.    111.
##  8 2017-08-07 00:22:00    111.    111.   111.     111.    111.    111.
##  9 2017-08-07 00:24:00    111.    111.   111.     111.    111.    111.
## 10 2017-08-07 00:38:00    111.    111.   111.     111.    111.    111.
## # ... with 794 more rows, and 6 more variables: AskLow <dbl>,
## #   AskClose <dbl>, bias.open <dbl>, bias.high <dbl>, bias.low <dbl>,
## #   bias.close <dbl>
data_tm1_65_tidyr %<>% 
  summarise(
    AskOpen = mean((AskOpen - data_m1$AskOpen)^2), 
    AskHigh = mean((AskHigh - data_m1$AskHigh)^2), 
    AskLow = mean((AskLow - data_m1$AskLow)^2), 
    AskClose = mean((AskClose - data_m1$AskClose)^2), 
    Mean.HLC = (AskHigh + AskLow + AskClose)/3, 
    Mean.OHLC = (AskOpen + AskHigh + AskLow + AskClose)/4, 
    bias.open = sum(bias.open)/length(bias.open), 
    bias.high = sum(bias.high)/length(bias.high), 
    bias.low = sum(bias.low)/length(bias.low), 
    bias.close = sum(bias.close)/length(bias.close)) %>% tbl_df

data_tm1_65_tidyr %>% 
  kable(caption = 'MSE') %>% 
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
  scroll_box(width = '100%')#, height = '400px')
MSE
AskOpen AskHigh AskLow AskClose Mean.HLC Mean.OHLC bias.open bias.high bias.low bias.close
48.06478 48.0462 48.08847 48.06434 48.06634 48.06595 0.3544226 0.3138821 0.294226 0.3445946

3.8 70% 缺失值

3.8.1 1分钟数据

以下使用imputeTS::na.seadec()弥补70%数据缺失值。

data_m1_NA <- data_m1 %>% 
  dplyr::select(BidOpen, BidHigh, BidLow, BidClose, 
                AskOpen, AskHigh, AskLow,  AskClose) %>% 
  prodNA(noNA = 0.7) %>% 
  cbind(data_m1[1], .) %>% tbl_df

data_m1_70_impTS <- llply(algo, function(x) {
  data_m1_NA %>% 
    dplyr::select(starts_with('Ask'), starts_with('Bid')) %>% 
    map(na.seadec, algorithm = x) %>% as.tibble
  })
names(data_m1_70_impTS) <- algo
data_m1_70_impTS %<>% ldply %>% tbl_df

data_m1_70_impTS %<>% mutate(
  bias.open = if_else(AskOpen>AskHigh|AskOpen<AskLow, 1, 0), 
  bias.high = if_else(AskHigh<AskOpen|AskHigh<AskLow|AskHigh<AskClose, 1, 0), 
  bias.low = if_else(AskLow>AskOpen|AskLow>AskHigh|AskLow>AskClose, 1, 0), 
  bias.close = if_else(AskClose>AskHigh|AskClose<AskLow, 1, 0))

data_m1_70_impTS %>% 
  dplyr::filter(bias.open==1|bias.high==1|bias.low==1|bias.close==1)
## # A tibble: 26,788 x 13
##    .id   AskOpen AskHigh AskLow AskClose BidOpen BidHigh BidLow BidClose
##    <chr>   <dbl>   <dbl>  <dbl>    <dbl>   <dbl>   <dbl>  <dbl>    <dbl>
##  1 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  2 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  3 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  4 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  5 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  6 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  7 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  8 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  9 inte~    104.    104.   104.     104.    104.    104.   104.     104.
## 10 inte~    104.    104.   104.     104.    104.    104.   104.     104.
## # ... with 26,778 more rows, and 4 more variables: bias.open <dbl>,
## #   bias.high <dbl>, bias.low <dbl>, bias.close <dbl>
data_m1_70_impTS %<>% 
  ddply(.(.id), summarise, 
        AskOpen = mean((AskOpen - data_m1$AskOpen)^2), 
        AskHigh = mean((AskHigh - data_m1$AskHigh)^2), 
        AskLow = mean((AskLow - data_m1$AskLow)^2), 
        AskClose = mean((AskClose - data_m1$AskClose)^2), 
        Mean.HLC = (AskHigh + AskLow + AskClose)/3, 
        Mean.OHLC = (AskOpen + AskHigh + AskLow + AskClose)/4, 
        bias.open = sum(bias.open)/length(bias.open), 
        bias.high = sum(bias.high)/length(bias.high), 
        bias.low = sum(bias.low)/length(bias.low), 
        bias.close = sum(bias.close)/length(bias.close)) %>% tbl_df

data_m1_70_impTS %>% 
  kable(caption = 'MSE 70% 缺失值') %>% 
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
  scroll_box(width = '100%')#, height = '400px')
MSE 70% 缺失
.id AskOpen AskHigh AskLow AskClose Mean.HLC Mean.OHLC bias.open bias.high bias.low bias.close
interpolation 0.0001618 0.0001265 0.0001262 0.0001725 0.0001417 0.0001468 0.3927822 0.3453630 0.3225626 0.3881662
kalman 0.0001618 0.0001265 0.0001262 0.0001725 0.0001417 0.0001468 0.3959994 0.3457826 0.3231221 0.3883061
locf 0.0003904 0.0003374 0.0003273 0.0004098 0.0003581 0.0003662 0.4253742 0.3855085 0.3597706 0.4227165
ma 0.0001928 0.0001537 0.0001548 0.0001982 0.0001689 0.0001749 0.4151630 0.3650860 0.3400476 0.4130648
mean 0.0434017 0.0423406 0.0426192 0.0426670 0.0425423 0.0427571 0.4084487 0.3775353 0.3564135 0.4071898
random 0.1551202 0.0523911 0.1117988 0.2454679 0.1365526 0.1411945 0.8742481 0.8708910 0.7637432 0.8887956

以下使用Amelia::amelia()弥补70%数据缺失值。

data_m1_70_amelia <- data_m1_NA %>% 
  amelia %>% 
  .$imputations %>% 
  ldply %>% tbl_df
## -- Imputation 1 --
## 
##   1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
##  21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
## 
## -- Imputation 2 --
## 
##   1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
##  21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37
## 
## -- Imputation 3 --
## 
##   1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
##  21 22 23 24 25 26 27 28 29 30 31 32 33 34 35
## 
## -- Imputation 4 --
## 
##   1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
##  21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
## 
## 
## -- Imputation 5 --
## 
##   1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
##  21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
data_m1_70_amelia %>% anyNA
## [1] FALSE
data_m1_70_amelia %<>% mutate(
  bias.open = if_else(AskOpen>AskHigh|AskOpen<AskLow, 1, 0), 
  bias.high = if_else(AskHigh<AskOpen|AskHigh<AskLow|AskHigh<AskClose, 1, 0), 
  bias.low = if_else(AskLow>AskOpen|AskLow>AskHigh|AskLow>AskClose, 1, 0), 
  bias.close = if_else(AskClose>AskHigh|AskClose<AskLow, 1, 0))

data_m1_70_amelia %>% 
  dplyr::filter(bias.open==1|bias.high==1|bias.low==1|bias.close==1)
## # A tibble: 20,129 x 14
##    .id   index               BidOpen BidHigh BidLow BidClose AskOpen
##    <chr> <dttm>                <dbl>   <dbl>  <dbl>    <dbl>   <dbl>
##  1 imp1  2016-10-17 00:00:00    104.    104.   104.     104.    104.
##  2 imp1  2016-10-17 00:01:00    104.    104.   104.     104.    104.
##  3 imp1  2016-10-17 00:03:00    104.    104.   104.     104.    104.
##  4 imp1  2016-10-17 00:04:00    104.    104.   104.     104.    104.
##  5 imp1  2016-10-17 00:06:00    104.    104.   104.     104.    104.
##  6 imp1  2016-10-17 00:08:00    104.    104.   104.     104.    104.
##  7 imp1  2016-10-17 00:09:00    104.    104.   104.     104.    104.
##  8 imp1  2016-10-17 00:10:00    104.    104.   104.     104.    104.
##  9 imp1  2016-10-17 00:12:00    104.    104.   104.     104.    104.
## 10 imp1  2016-10-17 00:14:00    104.    104.   104.     104.    104.
## # ... with 20,119 more rows, and 7 more variables: AskHigh <dbl>,
## #   AskLow <dbl>, AskClose <dbl>, bias.open <dbl>, bias.high <dbl>,
## #   bias.low <dbl>, bias.close <dbl>
data_m1_70_amelia %<>% 
  ddply(.(.id), summarise, 
        AskOpen = mean((AskOpen - data_m1$AskOpen)^2), 
        AskHigh = mean((AskHigh - data_m1$AskHigh)^2), 
        AskLow = mean((AskLow - data_m1$AskLow)^2), 
        AskClose = mean((AskClose - data_m1$AskClose)^2), 
        Mean.HLC = (AskHigh + AskLow + AskClose)/3, 
        Mean.OHLC = (AskOpen + AskHigh + AskLow + AskClose)/4, 
        bias.open = sum(bias.open)/length(bias.open), 
        bias.high = sum(bias.high)/length(bias.high), 
        bias.low = sum(bias.low)/length(bias.low), 
        bias.close = sum(bias.close)/length(bias.close)) %>% tbl_df

data_m1_70_amelia %>% 
  kable(caption = 'MSE') %>% 
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
  scroll_box(width = '100%')#, height = '400px')
MSE
.id AskOpen AskHigh AskLow AskClose Mean.HLC Mean.OHLC bias.open bias.high bias.low bias.close
imp1 0.0065354 0.0065133 0.0065083 0.0064779 0.0064999 0.0065087 0.4273325 0.3474612 0.3517975 0.3743181
imp2 0.0058556 0.0058747 0.0058369 0.0058674 0.0058597 0.0058586 0.4127850 0.3397678 0.3446636 0.3655057
imp3 0.0063261 0.0062797 0.0063244 0.0063016 0.0063019 0.0063080 0.4242551 0.3555742 0.3613093 0.3979578
imp4 0.0062205 0.0061725 0.0061818 0.0061766 0.0061770 0.0061879 0.4052315 0.3361309 0.3291369 0.3536159
imp5 0.0063617 0.0063084 0.0063286 0.0063186 0.0063185 0.0063293 0.4506924 0.3659253 0.3615890 0.3909638

以下使用tidyr::fill()弥补70%数据缺失值。

data_m1_70_tidyr <- data_m1_NA %>% 
  fill(BidOpen, BidHigh, BidLow, BidClose, 
       AskOpen, AskHigh, AskLow, AskClose) %>% #default direction down
  fill(BidOpen, BidHigh, BidLow, BidClose, 
       AskOpen, AskHigh, AskLow, AskClose, .direction = 'up')

data_m1_70_tidyr %>% anyNA
## [1] FALSE
data_m1_70_tidyr %<>% mutate(
  bias.open = if_else(AskOpen>AskHigh|AskOpen<AskLow, 1, 0), 
  bias.high = if_else(AskHigh<AskOpen|AskHigh<AskLow|AskHigh<AskClose, 1, 0), 
  bias.low = if_else(AskLow>AskOpen|AskLow>AskHigh|AskLow>AskClose, 1, 0), 
  bias.close = if_else(AskClose>AskHigh|AskClose<AskLow, 1, 0))

data_m1_70_tidyr %>% 
  dplyr::filter(bias.open==1|bias.high==1|bias.low==1|bias.close==1)
## # A tibble: 4,143 x 13
##    index               BidOpen BidHigh BidLow BidClose AskOpen AskHigh
##    <dttm>                <dbl>   <dbl>  <dbl>    <dbl>   <dbl>   <dbl>
##  1 2016-10-17 00:00:00    104.    104.   104.     104.    104.    104.
##  2 2016-10-17 00:01:00    104.    104.   104.     104.    104.    104.
##  3 2016-10-17 00:03:00    104.    104.   104.     104.    104.    104.
##  4 2016-10-17 00:04:00    104.    104.   104.     104.    104.    104.
##  5 2016-10-17 00:05:00    104.    104.   104.     104.    104.    104.
##  6 2016-10-17 00:06:00    104.    104.   104.     104.    104.    104.
##  7 2016-10-17 00:07:00    104.    104.   104.     104.    104.    104.
##  8 2016-10-17 00:08:00    104.    104.   104.     104.    104.    104.
##  9 2016-10-17 00:09:00    104.    104.   104.     104.    104.    104.
## 10 2016-10-17 00:10:00    104.    104.   104.     104.    104.    104.
## # ... with 4,133 more rows, and 6 more variables: AskLow <dbl>,
## #   AskClose <dbl>, bias.open <dbl>, bias.high <dbl>, bias.low <dbl>,
## #   bias.close <dbl>
data_m1_70_tidyr %<>% 
  summarise(
    AskOpen = mean((AskOpen - data_m1$AskOpen)^2), 
    AskHigh = mean((AskHigh - data_m1$AskHigh)^2), 
    AskLow = mean((AskLow - data_m1$AskLow)^2), 
    AskClose = mean((AskClose - data_m1$AskClose)^2), 
    Mean.HLC = (AskHigh + AskLow + AskClose)/3, 
    Mean.OHLC = (AskOpen + AskHigh + AskLow + AskClose)/4, 
    bias.open = sum(bias.open)/length(bias.open), 
    bias.high = sum(bias.high)/length(bias.high), 
    bias.low = sum(bias.low)/length(bias.low), 
    bias.close = sum(bias.close)/length(bias.close)) %>% tbl_df

data_m1_70_tidyr %>% 
  kable(caption = 'MSE') %>% 
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
  scroll_box(width = '100%')#, height = '400px')
MSE
AskOpen AskHigh AskLow AskClose Mean.HLC Mean.OHLC bias.open bias.high bias.low bias.close
0.0003904 0.0003374 0.0003273 0.0004098 0.0003581 0.0003662 0.4253742 0.3855085 0.3597706 0.4227165

3.8.2 Tick数据转为1分钟数据

以下使用imputeTS::na.seadec()弥补70%数据缺失值。

data_tm1_NA <- data_tm1 %>% 
  dplyr::select(BidOpen, BidHigh, BidLow, BidClose, 
                AskOpen, AskHigh, AskLow,  AskClose) %>% 
  prodNA(noNA = 0.7) %>% 
  cbind(data_tm1[1], .) %>% tbl_df

data_tm1_70_impTS <- llply(algo, function(x) {
  data_tm1_NA %>% 
    dplyr::select(starts_with('Ask'), starts_with('Bid')) %>% 
    map(na.seadec, algorithm = x) %>% as.tibble
  })
names(data_tm1_70_impTS) <- algo
data_tm1_70_impTS %<>% ldply %>% tbl_df

data_tm1_70_impTS %<>% mutate(
  bias.open = if_else(AskOpen>AskHigh|AskOpen<AskLow, 1, 0), 
  bias.high = if_else(AskHigh<AskOpen|AskHigh<AskLow|AskHigh<AskClose, 1, 0), 
  bias.low = if_else(AskLow>AskOpen|AskLow>AskHigh|AskLow>AskClose, 1, 0), 
  bias.close = if_else(AskClose>AskHigh|AskClose<AskLow, 1, 0))

data_tm1_70_impTS %>% 
  dplyr::filter(bias.open==1|bias.high==1|bias.low==1|bias.close==1)
## # A tibble: 6,299 x 13
##    .id   AskOpen AskHigh AskLow AskClose BidOpen BidHigh BidLow BidClose
##    <chr>   <dbl>   <dbl>  <dbl>    <dbl>   <dbl>   <dbl>  <dbl>    <dbl>
##  1 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  2 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  3 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  4 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  5 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  6 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  7 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  8 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  9 inte~    111.    111.   111.     111.    111.    111.   111.     111.
## 10 inte~    111.    111.   111.     111.    111.    111.   111.     111.
## # ... with 6,289 more rows, and 4 more variables: bias.open <dbl>,
## #   bias.high <dbl>, bias.low <dbl>, bias.close <dbl>
data_tm1_70_impTS %<>% 
  ddply(.(.id), summarise, 
        AskOpen = mean((AskOpen - data_m1$AskOpen)^2), 
        AskHigh = mean((AskHigh - data_m1$AskHigh)^2), 
        AskLow = mean((AskLow - data_m1$AskLow)^2), 
        AskClose = mean((AskClose - data_m1$AskClose)^2), 
        Mean.HLC = (AskHigh + AskLow + AskClose)/3, 
        Mean.OHLC = (AskOpen + AskHigh + AskLow + AskClose)/4, 
        bias.open = sum(bias.open)/length(bias.open), 
        bias.high = sum(bias.high)/length(bias.high), 
        bias.low = sum(bias.low)/length(bias.low), 
        bias.close = sum(bias.close)/length(bias.close)) %>% tbl_df

data_tm1_70_impTS %>% 
  kable(caption = 'MSE 70% 缺失值') %>% 
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
  scroll_box(width = '100%')#, height = '400px')
MSE 70% 缺失
.id AskOpen AskHigh AskLow AskClose Mean.HLC Mean.OHLC bias.open bias.high bias.low bias.close
interpolation 48.05448 48.02946 48.08666 48.07724 48.06445 48.06196 0.3421376 0.3243243 0.2616708 0.3452088
kalman 48.05440 48.02955 48.08650 48.07724 48.06443 48.06192 0.3415233 0.3212531 0.2653563 0.3445946
locf 48.05579 48.03106 48.07434 48.08234 48.06258 48.06088 0.4029484 0.4004914 0.3003686 0.4054054
ma 48.05462 48.02955 48.08596 48.07753 48.06435 48.06191 0.3617936 0.3445946 0.2856265 0.3703931
mean 48.12663 48.00807 48.08176 48.06426 48.05137 48.07018 0.8335381 0.7665848 0.3243243 0.3863636
random 47.68556 47.12402 47.95719 49.26794 48.11638 48.00868 0.9324324 0.9262899 0.9146192 0.9275184

以下使用Amelia::amelia()弥补70%数据缺失值。

data_tm1_70_amelia <- data_tm1_NA %>% 
  amelia %>% 
  .$imputations %>% 
  ldply %>% tbl_df
## -- Imputation 1 --
## 
##   1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
##  21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
##  41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
##  61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
##  81 82 83 84 85 86 87 88 89 90 91
## 
## -- Imputation 2 --
## 
##   1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
##  21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
##  41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
##  61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
##  81 82
## 
## -- Imputation 3 --
## 
##   1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
##  21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
##  41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
##  61 62 63 64 65 66 67 68 69 70 71 72 73 74
## 
## -- Imputation 4 --
## 
##   1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
##  21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
##  41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
##  61
## 
## -- Imputation 5 --
## 
##   1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
##  21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
##  41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
##  61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
data_tm1_70_amelia %>% anyNA
## [1] FALSE
data_tm1_70_amelia %<>% mutate(
  bias.open = if_else(AskOpen>AskHigh|AskOpen<AskLow, 1, 0), 
  bias.high = if_else(AskHigh<AskOpen|AskHigh<AskLow|AskHigh<AskClose, 1, 0), 
  bias.low = if_else(AskLow>AskOpen|AskLow>AskHigh|AskLow>AskClose, 1, 0), 
  bias.close = if_else(AskClose>AskHigh|AskClose<AskLow, 1, 0))

data_tm1_70_amelia %>% 
  dplyr::filter(bias.open==1|bias.high==1|bias.low==1|bias.close==1)
## # A tibble: 3,697 x 14
##    .id   index               BidOpen BidHigh BidLow BidClose AskOpen
##    <chr> <dttm>                <dbl>   <dbl>  <dbl>    <dbl>   <dbl>
##  1 imp1  2017-08-07 00:02:00    111.    111.   111.     111.    111.
##  2 imp1  2017-08-07 00:03:00    111.    111.   111.     111.    111.
##  3 imp1  2017-08-07 00:04:00    111.    111.   111.     111.    111.
##  4 imp1  2017-08-07 00:05:00    111.    111.   111.     111.    111.
##  5 imp1  2017-08-07 00:06:00    111.    111.   111.     111.    111.
##  6 imp1  2017-08-07 00:08:00    111.    111.   111.     111.    111.
##  7 imp1  2017-08-07 00:18:00    111.    111.   111.     111.    111.
##  8 imp1  2017-08-07 00:22:00    111.    111.   111.     111.    111.
##  9 imp1  2017-08-07 00:39:00    111.    111.   111.     111.    111.
## 10 imp1  2017-08-07 00:44:00    111.    111.   111.     111.    111.
## # ... with 3,687 more rows, and 7 more variables: AskHigh <dbl>,
## #   AskLow <dbl>, AskClose <dbl>, bias.open <dbl>, bias.high <dbl>,
## #   bias.low <dbl>, bias.close <dbl>
data_tm1_70_amelia %<>% 
  ddply(.(.id), summarise, 
        AskOpen = mean((AskOpen - data_m1$AskOpen)^2), 
        AskHigh = mean((AskHigh - data_m1$AskHigh)^2), 
        AskLow = mean((AskLow - data_m1$AskLow)^2), 
        AskClose = mean((AskClose - data_m1$AskClose)^2), 
        Mean.HLC = (AskHigh + AskLow + AskClose)/3, 
        Mean.OHLC = (AskOpen + AskHigh + AskLow + AskClose)/4, 
        bias.open = sum(bias.open)/length(bias.open), 
        bias.high = sum(bias.high)/length(bias.high), 
        bias.low = sum(bias.low)/length(bias.low), 
        bias.close = sum(bias.close)/length(bias.close)) %>% tbl_df

data_tm1_70_amelia %>% 
  kable(caption = 'MSE') %>% 
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
  scroll_box(width = '100%')#, height = '400px')
MSE
.id AskOpen AskHigh AskLow AskClose Mean.HLC Mean.OHLC bias.open bias.high bias.low bias.close
imp1 48.06299 48.03345 48.09202 48.06911 48.06486 48.06439 0.3378378 0.3015971 0.2702703 0.2850123
imp2 48.06721 48.03401 48.10299 48.07360 48.07020 48.06945 0.3310811 0.3065111 0.2788698 0.3003686
imp3 48.05517 48.02155 48.08947 48.06446 48.05849 48.05766 0.3187961 0.3175676 0.2911548 0.3402948
imp4 48.05450 48.03275 48.09509 48.06249 48.06344 48.06121 0.3243243 0.2628993 0.2807125 0.2585995
imp5 48.06417 48.04374 48.09782 48.08140 48.07432 48.07178 0.2997543 0.2972973 0.2899263 0.3230958

以下使用tidyr::fill()弥补70%数据缺失值。

data_tm1_70_tidyr <- data_tm1_NA %>% 
  fill(BidOpen, BidHigh, BidLow, BidClose, 
       AskOpen, AskHigh, AskLow, AskClose) %>% #default direction down
  fill(BidOpen, BidHigh, BidLow, BidClose, 
       AskOpen, AskHigh, AskLow, AskClose, .direction = 'up')

data_tm1_70_tidyr %>% anyNA
## [1] FALSE
data_tm1_70_tidyr %<>% mutate(
  bias.open = if_else(AskOpen>AskHigh|AskOpen<AskLow, 1, 0), 
  bias.high = if_else(AskHigh<AskOpen|AskHigh<AskLow|AskHigh<AskClose, 1, 0), 
  bias.low = if_else(AskLow>AskOpen|AskLow>AskHigh|AskLow>AskClose, 1, 0), 
  bias.close = if_else(AskClose>AskHigh|AskClose<AskLow, 1, 0))

data_tm1_70_tidyr %>% 
  dplyr::filter(bias.open==1|bias.high==1|bias.low==1|bias.close==1)
## # A tibble: 905 x 13
##    index               BidOpen BidHigh BidLow BidClose AskOpen AskHigh
##    <dttm>                <dbl>   <dbl>  <dbl>    <dbl>   <dbl>   <dbl>
##  1 2017-08-07 00:04:00    111.    111.   111.     111.    111.    111.
##  2 2017-08-07 00:05:00    111.    111.   111.     111.    111.    111.
##  3 2017-08-07 00:06:00    111.    111.   111.     111.    111.    111.
##  4 2017-08-07 00:07:00    111.    111.   111.     111.    111.    111.
##  5 2017-08-07 00:08:00    111.    111.   111.     111.    111.    111.
##  6 2017-08-07 00:17:00    111.    111.   111.     111.    111.    111.
##  7 2017-08-07 00:18:00    111.    111.   111.     111.    111.    111.
##  8 2017-08-07 00:22:00    111.    111.   111.     111.    111.    111.
##  9 2017-08-07 00:24:00    111.    111.   111.     111.    111.    111.
## 10 2017-08-07 00:38:00    111.    111.   111.     111.    111.    111.
## # ... with 895 more rows, and 6 more variables: AskLow <dbl>,
## #   AskClose <dbl>, bias.open <dbl>, bias.high <dbl>, bias.low <dbl>,
## #   bias.close <dbl>
data_tm1_70_tidyr %<>% 
  summarise(
    AskOpen = mean((AskOpen - data_m1$AskOpen)^2), 
    AskHigh = mean((AskHigh - data_m1$AskHigh)^2), 
    AskLow = mean((AskLow - data_m1$AskLow)^2), 
    AskClose = mean((AskClose - data_m1$AskClose)^2), 
    Mean.HLC = (AskHigh + AskLow + AskClose)/3, 
    Mean.OHLC = (AskOpen + AskHigh + AskLow + AskClose)/4, 
    bias.open = sum(bias.open)/length(bias.open), 
    bias.high = sum(bias.high)/length(bias.high), 
    bias.low = sum(bias.low)/length(bias.low), 
    bias.close = sum(bias.close)/length(bias.close)) %>% tbl_df

data_tm1_70_tidyr %>% 
  kable(caption = 'MSE') %>% 
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
  scroll_box(width = '100%')#, height = '400px')
MSE
AskOpen AskHigh AskLow AskClose Mean.HLC Mean.OHLC bias.open bias.high bias.low bias.close
48.05579 48.03106 48.07434 48.08234 48.06258 48.06088 0.4029484 0.4004914 0.3003686 0.4054054

3.9 80% 缺失值

3.9.1 1分钟数据

以下使用imputeTS::na.seadec()弥补80%数据缺失值。

data_m1_NA <- data_m1 %>% 
  dplyr::select(BidOpen, BidHigh, BidLow, BidClose, 
                AskOpen, AskHigh, AskLow,  AskClose) %>% 
  prodNA(noNA = 0.8) %>% 
  cbind(data_m1[1], .) %>% tbl_df

data_m1_80_impTS <- llply(algo, function(x) {
  data_m1_NA %>% 
    dplyr::select(starts_with('Ask'), starts_with('Bid')) %>% 
    map(na.seadec, algorithm = x) %>% as.tibble
  })
names(data_m1_80_impTS) <- algo
data_m1_80_impTS %<>% ldply %>% tbl_df

data_m1_80_impTS %<>% mutate(
  bias.open = if_else(AskOpen>AskHigh|AskOpen<AskLow, 1, 0), 
  bias.high = if_else(AskHigh<AskOpen|AskHigh<AskLow|AskHigh<AskClose, 1, 0), 
  bias.low = if_else(AskLow>AskOpen|AskLow>AskHigh|AskLow>AskClose, 1, 0), 
  bias.close = if_else(AskClose>AskHigh|AskClose<AskLow, 1, 0))

data_m1_80_impTS %>% 
  dplyr::filter(bias.open==1|bias.high==1|bias.low==1|bias.close==1)
## # A tibble: 28,397 x 13
##    .id   AskOpen AskHigh AskLow AskClose BidOpen BidHigh BidLow BidClose
##    <chr>   <dbl>   <dbl>  <dbl>    <dbl>   <dbl>   <dbl>  <dbl>    <dbl>
##  1 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  2 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  3 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  4 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  5 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  6 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  7 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  8 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  9 inte~    104.    104.   104.     104.    104.    104.   104.     104.
## 10 inte~    104.    104.   104.     104.    104.    104.   104.     104.
## # ... with 28,387 more rows, and 4 more variables: bias.open <dbl>,
## #   bias.high <dbl>, bias.low <dbl>, bias.close <dbl>
data_m1_80_impTS %<>% 
  ddply(.(.id), summarise, 
        AskOpen = mean((AskOpen - data_m1$AskOpen)^2), 
        AskHigh = mean((AskHigh - data_m1$AskHigh)^2), 
        AskLow = mean((AskLow - data_m1$AskLow)^2), 
        AskClose = mean((AskClose - data_m1$AskClose)^2), 
        Mean.HLC = (AskHigh + AskLow + AskClose)/3, 
        Mean.OHLC = (AskOpen + AskHigh + AskLow + AskClose)/4, 
        bias.open = sum(bias.open)/length(bias.open), 
        bias.high = sum(bias.high)/length(bias.high), 
        bias.low = sum(bias.low)/length(bias.low), 
        bias.close = sum(bias.close)/length(bias.close)) %>% tbl_df

data_m1_80_impTS %>% 
  kable(caption = 'MSE 80% 缺失值') %>% 
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
  scroll_box(width = '100%')#, height = '400px')
MSE 80% 缺失
.id AskOpen AskHigh AskLow AskClose Mean.HLC Mean.OHLC bias.open bias.high bias.low bias.close
interpolation 0.0002653 0.0002080 0.0002403 0.0002507 0.0002330 0.0002411 0.4865016 0.4063505 0.4062107 0.4692964
kalman 0.0002647 0.0002080 0.0002403 0.0002511 0.0002331 0.0002410 0.4876206 0.4055113 0.4049517 0.4662191
locf 0.0007388 0.0005612 0.0005608 0.0006582 0.0005934 0.0006297 0.5302840 0.4497132 0.4574066 0.5218912
ma 0.0003359 0.0002728 0.0002943 0.0003105 0.0002925 0.0003034 0.5045461 0.4288712 0.4227165 0.4869212
mean 0.0486525 0.0484767 0.0482804 0.0487259 0.0484943 0.0485339 0.2989229 0.2805987 0.2605959 0.2943069
random 0.0967558 0.0565147 0.1428038 0.0580634 0.0857940 0.0885344 0.8461323 0.9290810 0.1155406 0.8341027

以下使用Amelia::amelia()弥补80%数据缺失值。

data_m1_80_amelia <- data_m1_NA %>% 
  amelia %>% 
  .$imputations %>% 
  ldply %>% tbl_df
## -- Imputation 1 --
## 
##   1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
##  21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
##  41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
##  61 62 63 64 65 66 67 68 69 70 71 72 73
## 
## -- Imputation 2 --
## 
##   1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
##  21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
##  41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
##  61 62 63 64 65 66 67 68 69 70 71
## 
## -- Imputation 3 --
## 
##   1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
##  21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
##  41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
##  61 62 63 64 65 66
## 
## -- Imputation 4 --
## 
##   1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
##  21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
##  41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
##  61 62 63 64 65 66 67 68 69 70 71 72
## 
## -- Imputation 5 --
## 
##   1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
##  21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
##  41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
##  61 62 63 64 65 66 67 68 69 70 71 72 73 74
data_m1_80_amelia %>% anyNA
## [1] FALSE
data_m1_80_amelia %<>% mutate(
  bias.open = if_else(AskOpen>AskHigh|AskOpen<AskLow, 1, 0), 
  bias.high = if_else(AskHigh<AskOpen|AskHigh<AskLow|AskHigh<AskClose, 1, 0), 
  bias.low = if_else(AskLow>AskOpen|AskLow>AskHigh|AskLow>AskClose, 1, 0), 
  bias.close = if_else(AskClose>AskHigh|AskClose<AskLow, 1, 0))

data_m1_80_amelia %>% 
  dplyr::filter(bias.open==1|bias.high==1|bias.low==1|bias.close==1)
## # A tibble: 22,084 x 14
##    .id   index               BidOpen BidHigh BidLow BidClose AskOpen
##    <chr> <dttm>                <dbl>   <dbl>  <dbl>    <dbl>   <dbl>
##  1 imp1  2016-10-17 00:01:00    104.    104.   104.     104.    104.
##  2 imp1  2016-10-17 00:02:00    104.    104.   104.     104.    104.
##  3 imp1  2016-10-17 00:03:00    104.    104.   104.     104.    104.
##  4 imp1  2016-10-17 00:04:00    104.    104.   104.     104.    104.
##  5 imp1  2016-10-17 00:06:00    104.    104.   104.     104.    104.
##  6 imp1  2016-10-17 00:07:00    104.    104.   104.     104.    104.
##  7 imp1  2016-10-17 00:09:00    104.    104.   104.     104.    104.
##  8 imp1  2016-10-17 00:10:00    104.    104.   104.     104.    104.
##  9 imp1  2016-10-17 00:12:00    104.    104.   104.     104.    104.
## 10 imp1  2016-10-17 00:15:00    104.    104.   104.     104.    104.
## # ... with 22,074 more rows, and 7 more variables: AskHigh <dbl>,
## #   AskLow <dbl>, AskClose <dbl>, bias.open <dbl>, bias.high <dbl>,
## #   bias.low <dbl>, bias.close <dbl>
data_m1_80_amelia %<>% 
  ddply(.(.id), summarise, 
        AskOpen = mean((AskOpen - data_m1$AskOpen)^2), 
        AskHigh = mean((AskHigh - data_m1$AskHigh)^2), 
        AskLow = mean((AskLow - data_m1$AskLow)^2), 
        AskClose = mean((AskClose - data_m1$AskClose)^2), 
        Mean.HLC = (AskHigh + AskLow + AskClose)/3, 
        Mean.OHLC = (AskOpen + AskHigh + AskLow + AskClose)/4, 
        bias.open = sum(bias.open)/length(bias.open), 
        bias.high = sum(bias.high)/length(bias.high), 
        bias.low = sum(bias.low)/length(bias.low), 
        bias.close = sum(bias.close)/length(bias.close)) %>% tbl_df

data_m1_80_amelia %>% 
  kable(caption = 'MSE') %>% 
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
  scroll_box(width = '100%')#, height = '400px')
MSE
.id AskOpen AskHigh AskLow AskClose Mean.HLC Mean.OHLC bias.open bias.high bias.low bias.close
imp1 0.0179774 0.0179669 0.0179299 0.0179521 0.0179496 0.0179566 0.4511120 0.3912435 0.3925024 0.4497132
imp2 0.0181715 0.0182881 0.0182233 0.0182494 0.0182536 0.0182331 0.4567072 0.3929221 0.3660652 0.4357253
imp3 0.0188257 0.0189006 0.0189500 0.0188341 0.0188949 0.0188776 0.4779689 0.4242551 0.4165618 0.5053854
imp4 0.0178892 0.0178482 0.0178864 0.0178254 0.0178533 0.0178623 0.4648203 0.4056511 0.3899846 0.4616030
imp5 0.0171595 0.0171508 0.0172147 0.0171742 0.0171799 0.0171748 0.4348860 0.3748776 0.3765562 0.4386628

以下使用tidyr::fill()弥补80%数据缺失值。

data_m1_80_tidyr <- data_m1_NA %>% 
  fill(BidOpen, BidHigh, BidLow, BidClose, 
       AskOpen, AskHigh, AskLow, AskClose) %>% #default direction down
  fill(BidOpen, BidHigh, BidLow, BidClose, 
       AskOpen, AskHigh, AskLow, AskClose, .direction = 'up')

data_m1_80_tidyr %>% anyNA
## [1] FALSE
data_m1_80_tidyr %<>% mutate(
  bias.open = if_else(AskOpen>AskHigh|AskOpen<AskLow, 1, 0), 
  bias.high = if_else(AskHigh<AskOpen|AskHigh<AskLow|AskHigh<AskClose, 1, 0), 
  bias.low = if_else(AskLow>AskOpen|AskLow>AskHigh|AskLow>AskClose, 1, 0), 
  bias.close = if_else(AskClose>AskHigh|AskClose<AskLow, 1, 0))

data_m1_80_tidyr %>% 
  dplyr::filter(bias.open==1|bias.high==1|bias.low==1|bias.close==1)
## # A tibble: 4,865 x 13
##    index               BidOpen BidHigh BidLow BidClose AskOpen AskHigh
##    <dttm>                <dbl>   <dbl>  <dbl>    <dbl>   <dbl>   <dbl>
##  1 2016-10-17 00:00:00    104.    104.   104.     104.    104.    104.
##  2 2016-10-17 00:01:00    104.    104.   104.     104.    104.    104.
##  3 2016-10-17 00:02:00    104.    104.   104.     104.    104.    104.
##  4 2016-10-17 00:03:00    104.    104.   104.     104.    104.    104.
##  5 2016-10-17 00:04:00    104.    104.   104.     104.    104.    104.
##  6 2016-10-17 00:10:00    104.    104.   104.     104.    104.    104.
##  7 2016-10-17 00:17:00    104.    104.   104.     104.    104.    104.
##  8 2016-10-17 00:18:00    104.    104.   104.     104.    104.    104.
##  9 2016-10-17 00:19:00    104.    104.   104.     104.    104.    104.
## 10 2016-10-17 00:24:00    104.    104.   104.     104.    104.    104.
## # ... with 4,855 more rows, and 6 more variables: AskLow <dbl>,
## #   AskClose <dbl>, bias.open <dbl>, bias.high <dbl>, bias.low <dbl>,
## #   bias.close <dbl>
data_m1_80_tidyr %<>% 
  summarise(
    AskOpen = mean((AskOpen - data_m1$AskOpen)^2), 
    AskHigh = mean((AskHigh - data_m1$AskHigh)^2), 
    AskLow = mean((AskLow - data_m1$AskLow)^2), 
    AskClose = mean((AskClose - data_m1$AskClose)^2), 
    Mean.HLC = (AskHigh + AskLow + AskClose)/3, 
    Mean.OHLC = (AskOpen + AskHigh + AskLow + AskClose)/4, 
    bias.open = sum(bias.open)/length(bias.open), 
    bias.high = sum(bias.high)/length(bias.high), 
    bias.low = sum(bias.low)/length(bias.low), 
    bias.close = sum(bias.close)/length(bias.close)) %>% tbl_df

data_m1_80_tidyr %>% 
  kable(caption = 'MSE') %>% 
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
  scroll_box(width = '100%')#, height = '400px')
MSE
AskOpen AskHigh AskLow AskClose Mean.HLC Mean.OHLC bias.open bias.high bias.low bias.close
0.0007388 0.0005612 0.0005608 0.0006582 0.0005934 0.0006297 0.530284 0.4497132 0.4574066 0.5218912

3.9.2 Tick数据转为1分钟数据

以下使用imputeTS::na.seadec()弥补80%数据缺失值。

data_tm1_NA <- data_tm1 %>% 
  dplyr::select(BidOpen, BidHigh, BidLow, BidClose, 
                AskOpen, AskHigh, AskLow,  AskClose) %>% 
  prodNA(noNA = 0.8) %>% 
  cbind(data_tm1[1], .) %>% tbl_df

data_tm1_80_impTS <- llply(algo, function(x) {
  data_tm1_NA %>% 
    dplyr::select(starts_with('Ask'), starts_with('Bid')) %>% 
    map(na.seadec, algorithm = x) %>% as.tibble
  })
names(data_tm1_80_impTS) <- algo
data_tm1_80_impTS %<>% ldply %>% tbl_df

data_tm1_80_impTS %<>% mutate(
  bias.open = if_else(AskOpen>AskHigh|AskOpen<AskLow, 1, 0), 
  bias.high = if_else(AskHigh<AskOpen|AskHigh<AskLow|AskHigh<AskClose, 1, 0), 
  bias.low = if_else(AskLow>AskOpen|AskLow>AskHigh|AskLow>AskClose, 1, 0), 
  bias.close = if_else(AskClose>AskHigh|AskClose<AskLow, 1, 0))

data_tm1_80_impTS %>% 
  dplyr::filter(bias.open==1|bias.high==1|bias.low==1|bias.close==1)
## # A tibble: 6,887 x 13
##    .id   AskOpen AskHigh AskLow AskClose BidOpen BidHigh BidLow BidClose
##    <chr>   <dbl>   <dbl>  <dbl>    <dbl>   <dbl>   <dbl>  <dbl>    <dbl>
##  1 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  2 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  3 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  4 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  5 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  6 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  7 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  8 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  9 inte~    111.    111.   111.     111.    111.    111.   111.     111.
## 10 inte~    111.    111.   111.     111.    111.    111.   111.     111.
## # ... with 6,877 more rows, and 4 more variables: bias.open <dbl>,
## #   bias.high <dbl>, bias.low <dbl>, bias.close <dbl>
data_tm1_80_impTS %<>% 
  ddply(.(.id), summarise, 
        AskOpen = mean((AskOpen - data_m1$AskOpen)^2), 
        AskHigh = mean((AskHigh - data_m1$AskHigh)^2), 
        AskLow = mean((AskLow - data_m1$AskLow)^2), 
        AskClose = mean((AskClose - data_m1$AskClose)^2), 
        Mean.HLC = (AskHigh + AskLow + AskClose)/3, 
        Mean.OHLC = (AskOpen + AskHigh + AskLow + AskClose)/4, 
        bias.open = sum(bias.open)/length(bias.open), 
        bias.high = sum(bias.high)/length(bias.high), 
        bias.low = sum(bias.low)/length(bias.low), 
        bias.close = sum(bias.close)/length(bias.close)) %>% tbl_df

data_tm1_80_impTS %>% 
  kable(caption = 'MSE 80% 缺失值') %>% 
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
  scroll_box(width = '100%')#, height = '400px')
MSE 80% 缺失
.id AskOpen AskHigh AskLow AskClose Mean.HLC Mean.OHLC bias.open bias.high bias.low bias.close
interpolation 48.06756 48.04682 48.09353 48.06823 48.06952 48.06903 0.4250614 0.3525799 0.3789926 0.4244472
kalman 48.06761 48.04682 48.09357 48.06821 48.06953 48.06905 0.4256757 0.3519656 0.3820639 0.4256757
locf 48.07033 48.03224 48.09007 48.06271 48.06167 48.06384 0.4932432 0.4404177 0.3912776 0.4551597
ma 48.06795 48.04561 48.09361 48.06843 48.06922 48.06890 0.4398034 0.3808354 0.3863636 0.4391892
mean 48.05587 48.05459 48.17520 48.12306 48.11762 48.10218 0.8802211 0.2708845 0.8157248 0.2954545
random 49.44640 48.28223 49.41269 47.95915 48.55136 48.77512 0.9686732 0.9705160 0.9017199 0.9133907

以下使用Amelia::amelia()弥补80%数据缺失值。

data_tm1_80_amelia <- data_tm1_NA %>% 
  amelia %>% 
  .$imputations %>% 
  ldply %>% tbl_df
## -- Imputation 1 --
## 
##   1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
##  21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
##  41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
##  61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
##  81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
##  101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
##  121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140
##  141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160
## 
## 
## -- Imputation 2 --
## 
##   1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
##  21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
##  41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
##  61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
##  81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
##  101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
##  121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140
##  141 142 143 144 145 146 147 148 149 150 151 152 153 154
## 
## -- Imputation 3 --
## 
##   1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
##  21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
##  41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
##  61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
##  81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
##  101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
##  121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136
## 
## -- Imputation 4 --
## 
##   1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
##  21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
##  41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
##  61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
##  81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
##  101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
##  121 122 123
## 
## -- Imputation 5 --
## 
##   1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
##  21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
##  41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
##  61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
##  81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96
data_tm1_80_amelia %>% anyNA
## [1] FALSE
data_tm1_80_amelia %<>% mutate(
  bias.open = if_else(AskOpen>AskHigh|AskOpen<AskLow, 1, 0), 
  bias.high = if_else(AskHigh<AskOpen|AskHigh<AskLow|AskHigh<AskClose, 1, 0), 
  bias.low = if_else(AskLow>AskOpen|AskLow>AskHigh|AskLow>AskClose, 1, 0), 
  bias.close = if_else(AskClose>AskHigh|AskClose<AskLow, 1, 0))

data_tm1_80_amelia %>% 
  dplyr::filter(bias.open==1|bias.high==1|bias.low==1|bias.close==1)
## # A tibble: 4,328 x 14
##    .id   index               BidOpen BidHigh BidLow BidClose AskOpen
##    <chr> <dttm>                <dbl>   <dbl>  <dbl>    <dbl>   <dbl>
##  1 imp1  2017-08-07 00:03:00    111.    111.   111.     111.    111.
##  2 imp1  2017-08-07 00:04:00    111.    111.   111.     111.    111.
##  3 imp1  2017-08-07 00:06:00    111.    111.   111.     111.    111.
##  4 imp1  2017-08-07 00:39:00    111.    111.   111.     111.    111.
##  5 imp1  2017-08-07 00:41:00    111.    111.   111.     111.    111.
##  6 imp1  2017-08-07 00:44:00    111.    111.   111.     111.    111.
##  7 imp1  2017-08-07 00:52:00    111.    111.   111.     111.    111.
##  8 imp1  2017-08-07 00:55:00    111.    111.   111.     111.    111.
##  9 imp1  2017-08-07 00:56:00    111.    111.   111.     111.    111.
## 10 imp1  2017-08-07 00:57:00    111.    111.   111.     111.    111.
## # ... with 4,318 more rows, and 7 more variables: AskHigh <dbl>,
## #   AskLow <dbl>, AskClose <dbl>, bias.open <dbl>, bias.high <dbl>,
## #   bias.low <dbl>, bias.close <dbl>
data_tm1_80_amelia %<>% 
  ddply(.(.id), summarise, 
        AskOpen = mean((AskOpen - data_m1$AskOpen)^2), 
        AskHigh = mean((AskHigh - data_m1$AskHigh)^2), 
        AskLow = mean((AskLow - data_m1$AskLow)^2), 
        AskClose = mean((AskClose - data_m1$AskClose)^2), 
        Mean.HLC = (AskHigh + AskLow + AskClose)/3, 
        Mean.OHLC = (AskOpen + AskHigh + AskLow + AskClose)/4, 
        bias.open = sum(bias.open)/length(bias.open), 
        bias.high = sum(bias.high)/length(bias.high), 
        bias.low = sum(bias.low)/length(bias.low), 
        bias.close = sum(bias.close)/length(bias.close)) %>% tbl_df

data_tm1_80_amelia %>% 
  kable(caption = 'MSE') %>% 
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
  scroll_box(width = '100%')#, height = '400px')
MSE
.id AskOpen AskHigh AskLow AskClose Mean.HLC Mean.OHLC bias.open bias.high bias.low bias.close
imp1 48.09547 48.06079 48.12047 48.08773 48.08966 48.09112 0.3599509 0.3396806 0.3065111 0.3476658
imp2 48.09635 48.03502 48.09513 48.06611 48.06542 48.07315 0.4809582 0.4379607 0.3550369 0.3574939
imp3 48.06543 48.03745 48.09131 48.05879 48.06252 48.06325 0.4778870 0.3384521 0.3673219 0.2868550
imp4 48.07939 48.04066 48.10873 48.06645 48.07195 48.07381 0.4662162 0.3765356 0.4041769 0.4011057
imp5 48.08769 48.05208 48.11440 48.07758 48.08135 48.08294 0.3980344 0.3470516 0.3200246 0.3304668

以下使用tidyr::fill()弥补80%数据缺失值。

data_tm1_80_tidyr <- data_tm1_NA %>% 
  fill(BidOpen, BidHigh, BidLow, BidClose, 
       AskOpen, AskHigh, AskLow, AskClose) %>% #default direction down
  fill(BidOpen, BidHigh, BidLow, BidClose, 
       AskOpen, AskHigh, AskLow, AskClose, .direction = 'up')

data_tm1_80_tidyr %>% anyNA
## [1] FALSE
data_tm1_80_tidyr %<>% mutate(
  bias.open = if_else(AskOpen>AskHigh|AskOpen<AskLow, 1, 0), 
  bias.high = if_else(AskHigh<AskOpen|AskHigh<AskLow|AskHigh<AskClose, 1, 0), 
  bias.low = if_else(AskLow>AskOpen|AskLow>AskHigh|AskLow>AskClose, 1, 0), 
  bias.close = if_else(AskClose>AskHigh|AskClose<AskLow, 1, 0))

data_tm1_80_tidyr %>% 
  dplyr::filter(bias.open==1|bias.high==1|bias.low==1|bias.close==1)
## # A tibble: 1,028 x 13
##    index               BidOpen BidHigh BidLow BidClose AskOpen AskHigh
##    <dttm>                <dbl>   <dbl>  <dbl>    <dbl>   <dbl>   <dbl>
##  1 2017-08-07 00:02:00    111.    111.   111.     111.    111.    111.
##  2 2017-08-07 00:03:00    111.    111.   111.     111.    111.    111.
##  3 2017-08-07 00:04:00    111.    111.   111.     111.    111.    111.
##  4 2017-08-07 00:05:00    111.    111.   111.     111.    111.    111.
##  5 2017-08-07 00:40:00    111.    111.   111.     111.    111.    111.
##  6 2017-08-07 00:41:00    111.    111.   111.     111.    111.    111.
##  7 2017-08-07 00:42:00    111.    111.   111.     111.    111.    111.
##  8 2017-08-07 00:43:00    111.    111.   111.     111.    111.    111.
##  9 2017-08-07 00:44:00    111.    111.   111.     111.    111.    111.
## 10 2017-08-07 00:45:00    111.    111.   111.     111.    111.    111.
## # ... with 1,018 more rows, and 6 more variables: AskLow <dbl>,
## #   AskClose <dbl>, bias.open <dbl>, bias.high <dbl>, bias.low <dbl>,
## #   bias.close <dbl>
data_tm1_80_tidyr %<>% 
  summarise(
    AskOpen = mean((AskOpen - data_m1$AskOpen)^2), 
    AskHigh = mean((AskHigh - data_m1$AskHigh)^2), 
    AskLow = mean((AskLow - data_m1$AskLow)^2), 
    AskClose = mean((AskClose - data_m1$AskClose)^2), 
    Mean.HLC = (AskHigh + AskLow + AskClose)/3, 
    Mean.OHLC = (AskOpen + AskHigh + AskLow + AskClose)/4, 
    bias.open = sum(bias.open)/length(bias.open), 
    bias.high = sum(bias.high)/length(bias.high), 
    bias.low = sum(bias.low)/length(bias.low), 
    bias.close = sum(bias.close)/length(bias.close)) %>% tbl_df

data_tm1_80_tidyr %>% 
  kable(caption = 'MSE') %>% 
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
  scroll_box(width = '100%')#, height = '400px')
MSE
AskOpen AskHigh AskLow AskClose Mean.HLC Mean.OHLC bias.open bias.high bias.low bias.close
48.07033 48.03224 48.09007 48.06271 48.06167 48.06384 0.4932432 0.4404177 0.3912776 0.4551597

3.10 85% 缺失值

3.10.1 1分钟数据

以下使用imputeTS::na.seadec()弥补85%数据缺失值。

data_m1_NA <- data_m1 %>% 
  dplyr::select(BidOpen, BidHigh, BidLow, BidClose, 
                AskOpen, AskHigh, AskLow,  AskClose) %>% 
  prodNA(noNA = 0.85) %>% 
  cbind(data_m1[1], .) %>% tbl_df

data_m1_85_impTS <- llply(algo, function(x) {
  data_m1_NA %>% 
    dplyr::select(starts_with('Ask'), starts_with('Bid')) %>% 
    map(na.seadec, algorithm = x) %>% as.tibble
  })
names(data_m1_85_impTS) <- algo
data_m1_85_impTS %<>% ldply %>% tbl_df

data_m1_85_impTS %<>% mutate(
  bias.open = if_else(AskOpen>AskHigh|AskOpen<AskLow, 1, 0), 
  bias.high = if_else(AskHigh<AskOpen|AskHigh<AskLow|AskHigh<AskClose, 1, 0), 
  bias.low = if_else(AskLow>AskOpen|AskLow>AskHigh|AskLow>AskClose, 1, 0), 
  bias.close = if_else(AskClose>AskHigh|AskClose<AskLow, 1, 0))

data_m1_85_impTS %>% 
  dplyr::filter(bias.open==1|bias.high==1|bias.low==1|bias.close==1)
## # A tibble: 29,379 x 13
##    .id   AskOpen AskHigh AskLow AskClose BidOpen BidHigh BidLow BidClose
##    <chr>   <dbl>   <dbl>  <dbl>    <dbl>   <dbl>   <dbl>  <dbl>    <dbl>
##  1 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  2 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  3 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  4 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  5 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  6 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  7 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  8 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  9 inte~    104.    104.   104.     104.    104.    104.   104.     104.
## 10 inte~    104.    104.   104.     104.    104.    104.   104.     104.
## # ... with 29,369 more rows, and 4 more variables: bias.open <dbl>,
## #   bias.high <dbl>, bias.low <dbl>, bias.close <dbl>
data_m1_85_impTS %<>% 
  ddply(.(.id), summarise, 
        AskOpen = mean((AskOpen - data_m1$AskOpen)^2), 
        AskHigh = mean((AskHigh - data_m1$AskHigh)^2), 
        AskLow = mean((AskLow - data_m1$AskLow)^2), 
        AskClose = mean((AskClose - data_m1$AskClose)^2), 
        Mean.HLC = (AskHigh + AskLow + AskClose)/3, 
        Mean.OHLC = (AskOpen + AskHigh + AskLow + AskClose)/4, 
        bias.open = sum(bias.open)/length(bias.open), 
        bias.high = sum(bias.high)/length(bias.high), 
        bias.low = sum(bias.low)/length(bias.low), 
        bias.close = sum(bias.close)/length(bias.close)) %>% tbl_df

data_m1_85_impTS %>% 
  kable(caption = 'MSE 85% 缺失值') %>% 
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
  scroll_box(width = '100%')#, height = '400px')
MSE 85% 缺失
.id AskOpen AskHigh AskLow AskClose Mean.HLC Mean.OHLC bias.open bias.high bias.low bias.close
interpolation 0.0003568 0.0002415 0.0003425 0.0003855 0.0003232 0.0003316 0.5390964 0.4617429 0.4374038 0.5209120
kalman 0.0003567 0.0002414 0.0003399 0.0003848 0.0003220 0.0003307 0.5336411 0.4581060 0.4311092 0.5150371
locf 0.0009119 0.0007365 0.0009503 0.0008236 0.0008368 0.0008556 0.5592391 0.4825850 0.4667786 0.5409148
ma 0.0004343 0.0003234 0.0004471 0.0004450 0.0004052 0.0004124 0.5718282 0.4860820 0.4788082 0.5623164
mean 0.0521145 0.0515481 0.0514613 0.0510225 0.0513439 0.0515366 0.2410127 0.2432508 0.2106588 0.2477270
random 0.3240766 0.0639116 0.0581029 0.0510604 0.0576916 0.1242879 0.9664289 0.1843614 0.9805567 0.9148133

以下使用Amelia::amelia()弥补85%数据缺失值。

data_m1_85_amelia <- data_m1_NA %>% 
  amelia %>% 
  .$imputations %>% 
  ldply %>% tbl_df
## -- Imputation 1 --
## 
##   1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
##  21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
##  41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
##  61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
##  81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
##  101 102 103 104 105 106 107 108 109 110 111 112
## 
## -- Imputation 2 --
## 
##   1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
##  21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
##  41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
##  61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
##  81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
##  101 102 103 104 105 106 107 108 109 110 111
## 
## -- Imputation 3 --
## 
##   1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
##  21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
##  41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
##  61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
##  81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
## 
## 
## -- Imputation 4 --
## 
##   1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
##  21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
##  41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
##  61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
##  81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
##  101 102 103 104 105
## 
## -- Imputation 5 --
## 
##   1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
##  21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
##  41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
##  61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
##  81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
##  101 102 103 104 105 106
data_m1_85_amelia %>% anyNA
## [1] FALSE
data_m1_85_amelia %<>% mutate(
  bias.open = if_else(AskOpen>AskHigh|AskOpen<AskLow, 1, 0), 
  bias.high = if_else(AskHigh<AskOpen|AskHigh<AskLow|AskHigh<AskClose, 1, 0), 
  bias.low = if_else(AskLow>AskOpen|AskLow>AskHigh|AskLow>AskClose, 1, 0), 
  bias.close = if_else(AskClose>AskHigh|AskClose<AskLow, 1, 0))

data_m1_85_amelia %>% 
  dplyr::filter(bias.open==1|bias.high==1|bias.low==1|bias.close==1)
## # A tibble: 24,613 x 14
##    .id   index               BidOpen BidHigh BidLow BidClose AskOpen
##    <chr> <dttm>                <dbl>   <dbl>  <dbl>    <dbl>   <dbl>
##  1 imp1  2016-10-17 00:00:00    104.    104.   104.     104.    104.
##  2 imp1  2016-10-17 00:01:00    104.    104.   104.     104.    104.
##  3 imp1  2016-10-17 00:02:00    104.    104.   104.     104.    104.
##  4 imp1  2016-10-17 00:03:00    104.    104.   104.     104.    104.
##  5 imp1  2016-10-17 00:05:00    104.    104.   104.     104.    104.
##  6 imp1  2016-10-17 00:06:00    104.    104.   104.     104.    104.
##  7 imp1  2016-10-17 00:08:00    104.    104.   104.     104.    104.
##  8 imp1  2016-10-17 00:10:00    104.    104.   104.     104.    104.
##  9 imp1  2016-10-17 00:11:00    104.    104.   104.     104.    104.
## 10 imp1  2016-10-17 00:12:00    104.    104.   104.     104.    104.
## # ... with 24,603 more rows, and 7 more variables: AskHigh <dbl>,
## #   AskLow <dbl>, AskClose <dbl>, bias.open <dbl>, bias.high <dbl>,
## #   bias.low <dbl>, bias.close <dbl>
data_m1_85_amelia %<>% 
  ddply(.(.id), summarise, 
        AskOpen = mean((AskOpen - data_m1$AskOpen)^2), 
        AskHigh = mean((AskHigh - data_m1$AskHigh)^2), 
        AskLow = mean((AskLow - data_m1$AskLow)^2), 
        AskClose = mean((AskClose - data_m1$AskClose)^2), 
        Mean.HLC = (AskHigh + AskLow + AskClose)/3, 
        Mean.OHLC = (AskOpen + AskHigh + AskLow + AskClose)/4, 
        bias.open = sum(bias.open)/length(bias.open), 
        bias.high = sum(bias.high)/length(bias.high), 
        bias.low = sum(bias.low)/length(bias.low), 
        bias.close = sum(bias.close)/length(bias.close)) %>% tbl_df

data_m1_85_amelia %>% 
  kable(caption = 'MSE') %>% 
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
  scroll_box(width = '100%')#, height = '400px')
MSE
.id AskOpen AskHigh AskLow AskClose Mean.HLC Mean.OHLC bias.open bias.high bias.low bias.close
imp1 0.0296854 0.0294646 0.0295183 0.0295090 0.0294973 0.0295443 0.5305637 0.4369842 0.4357253 0.5021681
imp2 0.0285124 0.0284944 0.0284871 0.0284606 0.0284807 0.0284886 0.5101413 0.4222968 0.4008952 0.4583858
imp3 0.0309386 0.0309092 0.0309865 0.0308879 0.0309279 0.0309305 0.5620366 0.4683172 0.4411806 0.5178347
imp4 0.0295915 0.0292050 0.0294749 0.0293755 0.0293518 0.0294117 0.5662330 0.4571269 0.4434187 0.5016086
imp5 0.0296229 0.0295257 0.0296607 0.0295922 0.0295929 0.0296004 0.5403553 0.4505525 0.4560078 0.5295846

以下使用tidyr::fill()弥补85%数据缺失值。

data_m1_85_tidyr <- data_m1_NA %>% 
  fill(BidOpen, BidHigh, BidLow, BidClose, 
       AskOpen, AskHigh, AskLow, AskClose) %>% #default direction down
  fill(BidOpen, BidHigh, BidLow, BidClose, 
       AskOpen, AskHigh, AskLow, AskClose, .direction = 'up')

data_m1_85_tidyr %>% anyNA
## [1] FALSE
data_m1_85_tidyr %<>% mutate(
  bias.open = if_else(AskOpen>AskHigh|AskOpen<AskLow, 1, 0), 
  bias.high = if_else(AskHigh<AskOpen|AskHigh<AskLow|AskHigh<AskClose, 1, 0), 
  bias.low = if_else(AskLow>AskOpen|AskLow>AskHigh|AskLow>AskClose, 1, 0), 
  bias.close = if_else(AskClose>AskHigh|AskClose<AskLow, 1, 0))

data_m1_85_tidyr %>% 
  dplyr::filter(bias.open==1|bias.high==1|bias.low==1|bias.close==1)
## # A tibble: 4,980 x 13
##    index               BidOpen BidHigh BidLow BidClose AskOpen AskHigh
##    <dttm>                <dbl>   <dbl>  <dbl>    <dbl>   <dbl>   <dbl>
##  1 2016-10-17 00:00:00    104.    104.   104.     104.    104.    104.
##  2 2016-10-17 00:01:00    104.    104.   104.     104.    104.    104.
##  3 2016-10-17 00:02:00    104.    104.   104.     104.    104.    104.
##  4 2016-10-17 00:03:00    104.    104.   104.     104.    104.    104.
##  5 2016-10-17 00:18:00    104.    104.   104.     104.    104.    104.
##  6 2016-10-17 00:19:00    104.    104.   104.     104.    104.    104.
##  7 2016-10-17 00:20:00    104.    104.   104.     104.    104.    104.
##  8 2016-10-17 00:21:00    104.    104.   104.     104.    104.    104.
##  9 2016-10-17 00:22:00    104.    104.   104.     104.    104.    104.
## 10 2016-10-17 00:23:00    104.    104.   104.     104.    104.    104.
## # ... with 4,970 more rows, and 6 more variables: AskLow <dbl>,
## #   AskClose <dbl>, bias.open <dbl>, bias.high <dbl>, bias.low <dbl>,
## #   bias.close <dbl>
data_m1_85_tidyr %<>% 
  summarise(
    AskOpen = mean((AskOpen - data_m1$AskOpen)^2), 
    AskHigh = mean((AskHigh - data_m1$AskHigh)^2), 
    AskLow = mean((AskLow - data_m1$AskLow)^2), 
    AskClose = mean((AskClose - data_m1$AskClose)^2), 
    Mean.HLC = (AskHigh + AskLow + AskClose)/3, 
    Mean.OHLC = (AskOpen + AskHigh + AskLow + AskClose)/4, 
    bias.open = sum(bias.open)/length(bias.open), 
    bias.high = sum(bias.high)/length(bias.high), 
    bias.low = sum(bias.low)/length(bias.low), 
    bias.close = sum(bias.close)/length(bias.close)) %>% tbl_df

data_m1_85_tidyr %>% 
  kable(caption = 'MSE') %>% 
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
  scroll_box(width = '100%')#, height = '400px')
MSE
AskOpen AskHigh AskLow AskClose Mean.HLC Mean.OHLC bias.open bias.high bias.low bias.close
0.0009119 0.0007365 0.0009503 0.0008236 0.0008368 0.0008556 0.5592391 0.482585 0.4667786 0.5409148

3.10.2 Tick数据转为1分钟数据

以下使用imputeTS::na.seadec()弥补85%数据缺失值。

data_tm1_NA <- data_tm1 %>% 
  dplyr::select(BidOpen, BidHigh, BidLow, BidClose, 
                AskOpen, AskHigh, AskLow,  AskClose) %>% 
  prodNA(noNA = 0.85) %>% 
  cbind(data_tm1[1], .) %>% tbl_df

data_tm1_85_impTS <- llply(algo, function(x) {
  data_tm1_NA %>% 
    dplyr::select(starts_with('Ask'), starts_with('Bid')) %>% 
    map(na.seadec, algorithm = x) %>% as.tibble
  })
names(data_tm1_85_impTS) <- algo
data_tm1_85_impTS %<>% ldply %>% tbl_df

data_tm1_85_impTS %<>% mutate(
  bias.open = if_else(AskOpen>AskHigh|AskOpen<AskLow, 1, 0), 
  bias.high = if_else(AskHigh<AskOpen|AskHigh<AskLow|AskHigh<AskClose, 1, 0), 
  bias.low = if_else(AskLow>AskOpen|AskLow>AskHigh|AskLow>AskClose, 1, 0), 
  bias.close = if_else(AskClose>AskHigh|AskClose<AskLow, 1, 0))

data_tm1_85_impTS %>% 
  dplyr::filter(bias.open==1|bias.high==1|bias.low==1|bias.close==1)
## # A tibble: 6,434 x 13
##    .id   AskOpen AskHigh AskLow AskClose BidOpen BidHigh BidLow BidClose
##    <chr>   <dbl>   <dbl>  <dbl>    <dbl>   <dbl>   <dbl>  <dbl>    <dbl>
##  1 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  2 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  3 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  4 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  5 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  6 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  7 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  8 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  9 inte~    111.    111.   111.     111.    111.    111.   111.     111.
## 10 inte~    111.    111.   111.     111.    111.    111.   111.     111.
## # ... with 6,424 more rows, and 4 more variables: bias.open <dbl>,
## #   bias.high <dbl>, bias.low <dbl>, bias.close <dbl>
data_tm1_85_impTS %<>% 
  ddply(.(.id), summarise, 
        AskOpen = mean((AskOpen - data_m1$AskOpen)^2), 
        AskHigh = mean((AskHigh - data_m1$AskHigh)^2), 
        AskLow = mean((AskLow - data_m1$AskLow)^2), 
        AskClose = mean((AskClose - data_m1$AskClose)^2), 
        Mean.HLC = (AskHigh + AskLow + AskClose)/3, 
        Mean.OHLC = (AskOpen + AskHigh + AskLow + AskClose)/4, 
        bias.open = sum(bias.open)/length(bias.open), 
        bias.high = sum(bias.high)/length(bias.high), 
        bias.low = sum(bias.low)/length(bias.low), 
        bias.close = sum(bias.close)/length(bias.close)) %>% tbl_df

data_tm1_85_impTS %>% 
  kable(caption = 'MSE 85% 缺失值') %>% 
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
  scroll_box(width = '100%')#, height = '400px')
MSE 85% 缺失
.id AskOpen AskHigh AskLow AskClose Mean.HLC Mean.OHLC bias.open bias.high bias.low bias.close
interpolation 48.05584 48.04052 48.08479 48.06113 48.06214 48.06057 0.4410319 0.3660934 0.3900491 0.4606880
kalman 48.05593 48.04052 48.08479 48.06086 48.06206 48.06052 0.4404177 0.3593366 0.3875921 0.4484029
locf 48.06139 48.04274 48.09568 48.04604 48.06149 48.06146 0.5479115 0.4570025 0.5012285 0.5491400
ma 48.05618 48.04110 48.08498 48.06091 48.06233 48.06079 0.5116708 0.4183047 0.4551597 0.5098280
mean 48.11939 48.09357 48.10504 48.10364 48.10075 48.10541 0.2260442 0.1984029 0.2125307 0.2278870
random 49.15168 47.72769 49.75232 47.47577 48.31860 48.52687 0.9895577 0.9889435 0.9631450 0.9649877

以下使用Amelia::amelia()弥补85%数据缺失值。

data_tm1_85_amelia <- data_tm1_NA %>% 
  amelia %>% 
  .$imputations %>% 
  ldply %>% tbl_df
## -- Imputation 1 --
## 
##   1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
##  21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
##  41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
##  61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
##  81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
##  101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
##  121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140
##  141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157
## 
## -- Imputation 2 --
## 
##   1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
##  21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
##  41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
##  61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
##  81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
##  101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
##  121 122 123 124 125 126 127 128
## 
## -- Imputation 3 --
## 
##   1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
##  21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
##  41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
##  61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
##  81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
##  101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
##  121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140
##  141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160
##  161 162 163 164 165 166 167 168 169 170 171 172 173 174
## 
## -- Imputation 4 --
## 
##   1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
##  21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
##  41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
##  61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
##  81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
##  101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
##  121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140
##  141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160
##  161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
##  181 182 183
## 
## -- Imputation 5 --
## 
##   1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
##  21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
##  41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
##  61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
##  81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
##  101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
##  121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140
##  141 142 143 144 145 146 147
data_tm1_85_amelia %>% anyNA
## [1] FALSE
data_tm1_85_amelia %<>% mutate(
  bias.open = if_else(AskOpen>AskHigh|AskOpen<AskLow, 1, 0), 
  bias.high = if_else(AskHigh<AskOpen|AskHigh<AskLow|AskHigh<AskClose, 1, 0), 
  bias.low = if_else(AskLow>AskOpen|AskLow>AskHigh|AskLow>AskClose, 1, 0), 
  bias.close = if_else(AskClose>AskHigh|AskClose<AskLow, 1, 0))

data_tm1_85_amelia %>% 
  dplyr::filter(bias.open==1|bias.high==1|bias.low==1|bias.close==1)
## # A tibble: 4,380 x 14
##    .id   index               BidOpen BidHigh BidLow BidClose AskOpen
##    <chr> <dttm>                <dbl>   <dbl>  <dbl>    <dbl>   <dbl>
##  1 imp1  2017-08-07 00:02:00    111.    111.   111.     111.    111.
##  2 imp1  2017-08-07 00:03:00    111.    111.   111.     111.    111.
##  3 imp1  2017-08-07 00:06:00    111.    111.   111.     111.    111.
##  4 imp1  2017-08-07 00:07:00    111.    111.   111.     111.    111.
##  5 imp1  2017-08-07 00:08:00    111.    111.   111.     111.    111.
##  6 imp1  2017-08-07 00:09:00    111.    111.   111.     111.    111.
##  7 imp1  2017-08-07 00:11:00    111.    111.   111.     111.    111.
##  8 imp1  2017-08-07 00:17:00    111.    111.   111.     111.    111.
##  9 imp1  2017-08-07 00:18:00    111.    111.   111.     111.    111.
## 10 imp1  2017-08-07 00:24:00    111.    111.   111.     111.    111.
## # ... with 4,370 more rows, and 7 more variables: AskHigh <dbl>,
## #   AskLow <dbl>, AskClose <dbl>, bias.open <dbl>, bias.high <dbl>,
## #   bias.low <dbl>, bias.close <dbl>
data_tm1_85_amelia %<>% 
  ddply(.(.id), summarise, 
        AskOpen = mean((AskOpen - data_m1$AskOpen)^2), 
        AskHigh = mean((AskHigh - data_m1$AskHigh)^2), 
        AskLow = mean((AskLow - data_m1$AskLow)^2), 
        AskClose = mean((AskClose - data_m1$AskClose)^2), 
        Mean.HLC = (AskHigh + AskLow + AskClose)/3, 
        Mean.OHLC = (AskOpen + AskHigh + AskLow + AskClose)/4, 
        bias.open = sum(bias.open)/length(bias.open), 
        bias.high = sum(bias.high)/length(bias.high), 
        bias.low = sum(bias.low)/length(bias.low), 
        bias.close = sum(bias.close)/length(bias.close)) %>% tbl_df

data_tm1_85_amelia %>% 
  kable(caption = 'MSE') %>% 
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
  scroll_box(width = '100%')#, height = '400px')
MSE
.id AskOpen AskHigh AskLow AskClose Mean.HLC Mean.OHLC bias.open bias.high bias.low bias.close
imp1 48.06422 48.06042 48.11247 48.11366 48.09552 48.08769 0.3765356 0.3568796 0.3427518 0.3820639
imp2 48.07333 48.06250 48.11384 48.10463 48.09366 48.08858 0.3562654 0.3427518 0.3200246 0.3839066
imp3 48.06908 48.06095 48.10012 48.10192 48.08767 48.08302 0.3568796 0.3396806 0.3218673 0.3488943
imp4 48.07092 48.06230 48.11810 48.08630 48.08890 48.08441 0.4459459 0.3200246 0.3495086 0.2696560
imp5 48.05452 48.04422 48.10806 48.09895 48.08375 48.07644 0.4299754 0.4103194 0.3869779 0.4434889

以下使用tidyr::fill()弥补85%数据缺失值。

data_tm1_85_tidyr <- data_tm1_NA %>% 
  fill(BidOpen, BidHigh, BidLow, BidClose, 
       AskOpen, AskHigh, AskLow, AskClose) %>% #default direction down
  fill(BidOpen, BidHigh, BidLow, BidClose, 
       AskOpen, AskHigh, AskLow, AskClose, .direction = 'up')

data_tm1_85_tidyr %>% anyNA
## [1] FALSE
data_tm1_85_tidyr %<>% mutate(
  bias.open = if_else(AskOpen>AskHigh|AskOpen<AskLow, 1, 0), 
  bias.high = if_else(AskHigh<AskOpen|AskHigh<AskLow|AskHigh<AskClose, 1, 0), 
  bias.low = if_else(AskLow>AskOpen|AskLow>AskHigh|AskLow>AskClose, 1, 0), 
  bias.close = if_else(AskClose>AskHigh|AskClose<AskLow, 1, 0))

data_tm1_85_tidyr %>% 
  dplyr::filter(bias.open==1|bias.high==1|bias.low==1|bias.close==1)
## # A tibble: 1,170 x 13
##    index               BidOpen BidHigh BidLow BidClose AskOpen AskHigh
##    <dttm>                <dbl>   <dbl>  <dbl>    <dbl>   <dbl>   <dbl>
##  1 2017-08-07 00:02:00    111.    111.   111.     111.    111.    111.
##  2 2017-08-07 00:03:00    111.    111.   111.     111.    111.    111.
##  3 2017-08-07 00:04:00    111.    111.   111.     111.    111.    111.
##  4 2017-08-07 00:17:00    111.    111.   111.     111.    111.    111.
##  5 2017-08-07 00:18:00    111.    111.   111.     111.    111.    111.
##  6 2017-08-07 00:22:00    111.    111.   111.     111.    111.    111.
##  7 2017-08-07 00:24:00    111.    111.   111.     111.    111.    111.
##  8 2017-08-07 00:38:00    111.    111.   111.     111.    111.    111.
##  9 2017-08-07 00:39:00    111.    111.   111.     111.    111.    111.
## 10 2017-08-07 00:40:00    111.    111.   111.     111.    111.    111.
## # ... with 1,160 more rows, and 6 more variables: AskLow <dbl>,
## #   AskClose <dbl>, bias.open <dbl>, bias.high <dbl>, bias.low <dbl>,
## #   bias.close <dbl>
data_tm1_85_tidyr %<>% 
  summarise(
    AskOpen = mean((AskOpen - data_m1$AskOpen)^2), 
    AskHigh = mean((AskHigh - data_m1$AskHigh)^2), 
    AskLow = mean((AskLow - data_m1$AskLow)^2), 
    AskClose = mean((AskClose - data_m1$AskClose)^2), 
    Mean.HLC = (AskHigh + AskLow + AskClose)/3, 
    Mean.OHLC = (AskOpen + AskHigh + AskLow + AskClose)/4, 
    bias.open = sum(bias.open)/length(bias.open), 
    bias.high = sum(bias.high)/length(bias.high), 
    bias.low = sum(bias.low)/length(bias.low), 
    bias.close = sum(bias.close)/length(bias.close)) %>% tbl_df

data_tm1_85_tidyr %>% 
  kable(caption = 'MSE') %>% 
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
  scroll_box(width = '100%')#, height = '400px')
MSE
AskOpen AskHigh AskLow AskClose Mean.HLC Mean.OHLC bias.open bias.high bias.low bias.close
48.06139 48.04274 48.09568 48.04604 48.06149 48.06146 0.5479115 0.4570025 0.5012285 0.54914

3.11 90% 缺失值

3.11.1 1分钟数据

以下使用imputeTS::na.seadec()弥补90%数据缺失值。

data_m1_NA <- data_m1 %>% 
  dplyr::select(BidOpen, BidHigh, BidLow, BidClose, 
                AskOpen, AskHigh, AskLow,  AskClose) %>% 
  prodNA(noNA = 0.9) %>% 
  cbind(data_m1[1], .) %>% tbl_df

data_m1_90_impTS <- llply(algo, function(x) {
  data_m1_NA %>% 
    dplyr::select(starts_with('Ask'), starts_with('Bid')) %>% 
    map(na.seadec, algorithm = x) %>% as.tibble
  })
names(data_m1_90_impTS) <- algo
data_m1_90_impTS %<>% ldply %>% tbl_df

data_m1_90_impTS %<>% mutate(
  bias.open = if_else(AskOpen>AskHigh|AskOpen<AskLow, 1, 0), 
  bias.high = if_else(AskHigh<AskOpen|AskHigh<AskLow|AskHigh<AskClose, 1, 0), 
  bias.low = if_else(AskLow>AskOpen|AskLow>AskHigh|AskLow>AskClose, 1, 0), 
  bias.close = if_else(AskClose>AskHigh|AskClose<AskLow, 1, 0))

data_m1_90_impTS %>% 
  dplyr::filter(bias.open==1|bias.high==1|bias.low==1|bias.close==1)
## # A tibble: 35,473 x 13
##    .id   AskOpen AskHigh AskLow AskClose BidOpen BidHigh BidLow BidClose
##    <chr>   <dbl>   <dbl>  <dbl>    <dbl>   <dbl>   <dbl>  <dbl>    <dbl>
##  1 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  2 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  3 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  4 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  5 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  6 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  7 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  8 inte~    104.    104.   104.     104.    104.    104.   104.     104.
##  9 inte~    104.    104.   104.     104.    104.    104.   104.     104.
## 10 inte~    104.    104.   104.     104.    104.    104.   104.     104.
## # ... with 35,463 more rows, and 4 more variables: bias.open <dbl>,
## #   bias.high <dbl>, bias.low <dbl>, bias.close <dbl>
data_m1_90_impTS %<>% 
  ddply(.(.id), summarise, 
        AskOpen = mean((AskOpen - data_m1$AskOpen)^2), 
        AskHigh = mean((AskHigh - data_m1$AskHigh)^2), 
        AskLow = mean((AskLow - data_m1$AskLow)^2), 
        AskClose = mean((AskClose - data_m1$AskClose)^2), 
        Mean.HLC = (AskHigh + AskLow + AskClose)/3, 
        Mean.OHLC = (AskOpen + AskHigh + AskLow + AskClose)/4, 
        bias.open = sum(bias.open)/length(bias.open), 
        bias.high = sum(bias.high)/length(bias.high), 
        bias.low = sum(bias.low)/length(bias.low), 
        bias.close = sum(bias.close)/length(bias.close)) %>% tbl_df

data_m1_90_impTS %>% 
  kable(caption = 'MSE 90% 缺失值') %>% 
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
  scroll_box(width = '100%')#, height = '400px')
MSE 90% 缺失
.id AskOpen AskHigh AskLow AskClose Mean.HLC Mean.OHLC bias.open bias.high bias.low bias.close
interpolation 0.0004602 0.0005112 0.0004253 0.0005540 0.0004968 0.0004877 0.5963072 0.5367184 0.4974122 0.6212058
kalman 0.0004602 0.0005172 0.0004253 0.0005541 0.0004989 0.0004892 0.5978459 0.5383970 0.4965729 0.6207861
locf 0.0015472 0.0012908 0.0012546 0.0015022 0.0013492 0.0013987 0.6398098 0.5505665 0.5153168 0.6048398
ma 0.0006192 0.0006425 0.0005668 0.0007449 0.0006514 0.0006434 0.6475031 0.5676318 0.5312631 0.6360330
mean 0.0554369 0.0545033 0.0548398 0.0556674 0.0550035 0.0551119 0.9363547 0.9440481 0.1286893 0.9380333
random 0.0716249 0.1366404 0.0568678 0.0790605 0.0908562 0.0860484 0.1349839 0.0870052 0.9282417 0.9373339

以下使用Amelia::amelia()弥补90%数据缺失值。

data_m1_90_amelia <- data_m1_NA %>% 
  amelia %>% 
  .$imputations %>% 
  ldply %>% tbl_df
## -- Imputation 1 --
## 
##   1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
##  21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
##  41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
##  61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
##  81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
##  101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
##  121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140
##  141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160
##  161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
##  181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198
## 
## -- Imputation 2 --
## 
##   1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
##  21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
##  41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
##  61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
##  81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
##  101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
##  121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140
##  141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160
##  161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
##  181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200
##  201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220
##  221 222
## 
## -- Imputation 3 --
## 
##   1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
##  21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
##  41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
##  61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
##  81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
##  101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
##  121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140
##  141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160
##  161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
##  181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200
##  201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220
##  221 222 223 224 225 226 227 228 229 230 231 232
## 
## -- Imputation 4 --
## 
##   1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
##  21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
##  41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
##  61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
##  81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
##  101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
##  121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140
##  141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160
##  161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
##  181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200
##  201 202 203 204 205 206 207 208 209 210 211
## 
## -- Imputation 5 --
## 
##   1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
##  21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
##  41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
##  61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
##  81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
##  101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
##  121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140
##  141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160
##  161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
##  181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200
##  201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220
##  221 222 223
data_m1_90_amelia %>% anyNA
## [1] FALSE
data_m1_90_amelia %<>% mutate(
  bias.open = if_else(AskOpen>AskHigh|AskOpen<AskLow, 1, 0), 
  bias.high = if_else(AskHigh<AskOpen|AskHigh<AskLow|AskHigh<AskClose, 1, 0), 
  bias.low = if_else(AskLow>AskOpen|AskLow>AskHigh|AskLow>AskClose, 1, 0), 
  bias.close = if_else(AskClose>AskHigh|AskClose<AskLow, 1, 0))

data_m1_90_amelia %>% 
  dplyr::filter(bias.open==1|bias.high==1|bias.low==1|bias.close==1)
## # A tibble: 24,833 x 14
##    .id   index               BidOpen BidHigh BidLow BidClose AskOpen
##    <chr> <dttm>                <dbl>   <dbl>  <dbl>    <dbl>   <dbl>
##  1 imp1  2016-10-17 00:00:00    104.    104.   104.     104.    104.
##  2 imp1  2016-10-17 00:01:00    104.    104.   104.     104.    104.
##  3 imp1  2016-10-17 00:02:00    104.    104.   104.     104.    104.
##  4 imp1  2016-10-17 00:04:00    104.    104.   104.     104.    104.
##  5 imp1  2016-10-17 00:05:00    104.    104.   104.     104.    104.
##  6 imp1  2016-10-17 00:08:00    104.    104.   104.     104.    104.
##  7 imp1  2016-10-17 00:09:00    104.    104.   104.     104.    104.
##  8 imp1  2016-10-17 00:10:00    104.    104.   104.     104.    104.
##  9 imp1  2016-10-17 00:13:00    104.    104.   104.     104.    104.
## 10 imp1  2016-10-17 00:14:00    104.    104.   104.     104.    104.
## # ... with 24,823 more rows, and 7 more variables: AskHigh <dbl>,
## #   AskLow <dbl>, AskClose <dbl>, bias.open <dbl>, bias.high <dbl>,
## #   bias.low <dbl>, bias.close <dbl>
data_m1_90_amelia %<>% 
  ddply(.(.id), summarise, 
        AskOpen = mean((AskOpen - data_m1$AskOpen)^2), 
        AskHigh = mean((AskHigh - data_m1$AskHigh)^2), 
        AskLow = mean((AskLow - data_m1$AskLow)^2), 
        AskClose = mean((AskClose - data_m1$AskClose)^2), 
        Mean.HLC = (AskHigh + AskLow + AskClose)/3, 
        Mean.OHLC = (AskOpen + AskHigh + AskLow + AskClose)/4, 
        bias.open = sum(bias.open)/length(bias.open), 
        bias.high = sum(bias.high)/length(bias.high), 
        bias.low = sum(bias.low)/length(bias.low), 
        bias.close = sum(bias.close)/length(bias.close)) %>% tbl_df

data_m1_90_amelia %>% 
  kable(caption = 'MSE') %>% 
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
  scroll_box(width = '100%')#, height = '400px')
MSE
.id AskOpen AskHigh AskLow AskClose Mean.HLC Mean.OHLC bias.open bias.high bias.low bias.close
imp1 0.0461595 0.0461769 0.0463285 0.0462737 0.0462597 0.0462346 0.5740663 0.4841237 0.4734928 0.5550427
imp2 0.0444710 0.0444534 0.0444560 0.0445331 0.0444808 0.0444784 0.5337809 0.4397818 0.4294307 0.4981116
imp3 0.0458608 0.0460663 0.0465595 0.0462600 0.0462953 0.0461867 0.5509862 0.4834243 0.4192195 0.5245489
imp4 0.0462646 0.0464083 0.0464868 0.0463752 0.0464234 0.0463837 0.5572807 0.4809064 0.4618828 0.5704294
imp5 0.0440889 0.0441617 0.0444101 0.0440200 0.0441973 0.0441702 0.5168555 0.4466359 0.4127850 0.5109806

以下使用tidyr::fill()弥补90%数据缺失值。

data_m1_90_tidyr <- data_m1_NA %>% 
  fill(BidOpen, BidHigh, BidLow, BidClose, 
       AskOpen, AskHigh, AskLow, AskClose) %>% #default direction down
  fill(BidOpen, BidHigh, BidLow, BidClose, 
       AskOpen, AskHigh, AskLow, AskClose, .direction = 'up')

data_m1_90_tidyr %>% anyNA
## [1] FALSE
data_m1_90_tidyr %<>% mutate(
  bias.open = if_else(AskOpen>AskHigh|AskOpen<AskLow, 1, 0), 
  bias.high = if_else(AskHigh<AskOpen|AskHigh<AskLow|AskHigh<AskClose, 1, 0), 
  bias.low = if_else(AskLow>AskOpen|AskLow>AskHigh|AskLow>AskClose, 1, 0), 
  bias.close = if_else(AskClose>AskHigh|AskClose<AskLow, 1, 0))

data_m1_90_tidyr %>% 
  dplyr::filter(bias.open==1|bias.high==1|bias.low==1|bias.close==1)
## # A tibble: 5,503 x 13
##    index               BidOpen BidHigh BidLow BidClose AskOpen AskHigh
##    <dttm>                <dbl>   <dbl>  <dbl>    <dbl>   <dbl>   <dbl>
##  1 2016-10-17 00:00:00    104.    104.   104.     104.    104.    104.
##  2 2016-10-17 00:01:00    104.    104.   104.     104.    104.    104.
##  3 2016-10-17 00:02:00    104.    104.   104.     104.    104.    104.
##  4 2016-10-17 00:03:00    104.    104.   104.     104.    104.    104.
##  5 2016-10-17 00:04:00    104.    104.   104.     104.    104.    104.
##  6 2016-10-17 00:05:00    104.    104.   104.     104.    104.    104.
##  7 2016-10-17 00:06:00    104.    104.   104.     104.    104.    104.
##  8 2016-10-17 00:07:00    104.    104.   104.     104.    104.    104.
##  9 2016-10-17 00:08:00    104.    104.   104.     104.    104.    104.
## 10 2016-10-17 00:09:00    104.    104.   104.     104.    104.    104.
## # ... with 5,493 more rows, and 6 more variables: AskLow <dbl>,
## #   AskClose <dbl>, bias.open <dbl>, bias.high <dbl>, bias.low <dbl>,
## #   bias.close <dbl>
data_m1_90_tidyr %<>% 
  summarise(
    AskOpen = mean((AskOpen - data_m1$AskOpen)^2), 
    AskHigh = mean((AskHigh - data_m1$AskHigh)^2), 
    AskLow = mean((AskLow - data_m1$AskLow)^2), 
    AskClose = mean((AskClose - data_m1$AskClose)^2), 
    Mean.HLC = (AskHigh + AskLow + AskClose)/3, 
    Mean.OHLC = (AskOpen + AskHigh + AskLow + AskClose)/4, 
    bias.open = sum(bias.open)/length(bias.open), 
    bias.high = sum(bias.high)/length(bias.high), 
    bias.low = sum(bias.low)/length(bias.low), 
    bias.close = sum(bias.close)/length(bias.close)) %>% tbl_df

data_m1_90_tidyr %>% 
  kable(caption = 'MSE') %>% 
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
  scroll_box(width = '100%')#, height = '400px')
MSE
AskOpen AskHigh AskLow AskClose Mean.HLC Mean.OHLC bias.open bias.high bias.low bias.close
0.0015472 0.0012908 0.0012546 0.0015022 0.0013492 0.0013987 0.6398098 0.5505665 0.5153168 0.6048398

3.11.2 Tick数据转为1分钟数据

以下使用imputeTS::na.seadec()弥补90%数据缺失值。

data_tm1_NA <- data_tm1 %>% 
  dplyr::select(BidOpen, BidHigh, BidLow, BidClose, 
                AskOpen, AskHigh, AskLow,  AskClose) %>% 
  prodNA(noNA = 0.9) %>% 
  cbind(data_tm1[1], .) %>% tbl_df

data_tm1_90_impTS <- llply(algo, function(x) {
  data_tm1_NA %>% 
    dplyr::select(starts_with('Ask'), starts_with('Bid')) %>% 
    map(na.seadec, algorithm = x) %>% as.tibble
  })
names(data_tm1_90_impTS) <- algo
data_tm1_90_impTS %<>% ldply %>% tbl_df

data_tm1_90_impTS %<>% mutate(
  bias.open = if_else(AskOpen>AskHigh|AskOpen<AskLow, 1, 0), 
  bias.high = if_else(AskHigh<AskOpen|AskHigh<AskLow|AskHigh<AskClose, 1, 0), 
  bias.low = if_else(AskLow>AskOpen|AskLow>AskHigh|AskLow>AskClose, 1, 0), 
  bias.close = if_else(AskClose>AskHigh|AskClose<AskLow, 1, 0))

data_tm1_90_impTS %>% 
  dplyr::filter(bias.open==1|bias.high==1|bias.low==1|bias.close==1)
## # A tibble: 6,763 x 13
##    .id   AskOpen AskHigh AskLow AskClose BidOpen BidHigh BidLow BidClose
##    <chr>   <dbl>   <dbl>  <dbl>    <dbl>   <dbl>   <dbl>  <dbl>    <dbl>
##  1 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  2 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  3 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  4 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  5 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  6 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  7 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  8 inte~    111.    111.   111.     111.    111.    111.   111.     111.
##  9 inte~    111.    111.   111.     111.    111.    111.   111.     111.
## 10 inte~    111.    111.   111.     111.    111.    111.   111.     111.
## # ... with 6,753 more rows, and 4 more variables: bias.open <dbl>,
## #   bias.high <dbl>, bias.low <dbl>, bias.close <dbl>
data_tm1_90_impTS %<>% 
  ddply(.(.id), summarise, 
        AskOpen = mean((AskOpen - data_m1$AskOpen)^2), 
        AskHigh = mean((AskHigh - data_m1$AskHigh)^2), 
        AskLow = mean((AskLow - data_m1$AskLow)^2), 
        AskClose = mean((AskClose - data_m1$AskClose)^2), 
        Mean.HLC = (AskHigh + AskLow + AskClose)/3, 
        Mean.OHLC = (AskOpen + AskHigh + AskLow + AskClose)/4, 
        bias.open = sum(bias.open)/length(bias.open), 
        bias.high = sum(bias.high)/length(bias.high), 
        bias.low = sum(bias.low)/length(bias.low), 
        bias.close = sum(bias.close)/length(bias.close)) %>% tbl_df

data_tm1_90_impTS %>% 
  kable(caption = 'MSE 90% 缺失值') %>% 
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
  scroll_box(width = '100%')#, height = '400px')
MSE 90% 缺失
.id AskOpen AskHigh AskLow AskClose Mean.HLC Mean.OHLC bias.open bias.high bias.low bias.close
interpolation 48.09238 48.02948 48.08101 48.06488 48.05846 48.06694 0.5270270 0.5165848 0.4600737 0.6339066
kalman 48.09188 48.03086 48.08101 48.06581 48.05923 48.06739 0.5221130 0.5079853 0.4459459 0.6216216
locf 48.09342 48.02838 48.08001 48.02318 48.04386 48.05625 0.5368550 0.5018428 0.5122850 0.6375921
ma 48.09193 48.03144 48.08093 48.06593 48.05943 48.06756 0.5737101 0.5110565 0.4944717 0.6228501
mean 48.03712 48.06868 48.06397 48.17040 48.10102 48.08504 0.1572482 0.1547912 0.1480344 0.1658477
random 48.11770 47.55666 47.88066 47.62295 47.68676 47.79449 0.9410319 0.9367322 0.9613022 0.9637592

以下使用Amelia::amelia()弥补90%数据缺失值。

data_tm1_90_amelia <- data_tm1_NA %>% 
  amelia %>% 
  .$imputations %>% 
  ldply %>% tbl_df
## -- Imputation 1 --
## 
##   1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
##  21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
##  41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
##  61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
##  81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
##  101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
##  121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140
##  141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160
##  161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
##  181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200
##  201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220
##  221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240
##  241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260
##  261
## 
## -- Imputation 2 --
## 
##   1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
##  21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
##  41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
##  61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
##  81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
##  101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
##  121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140
##  141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160
##  161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
##  181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200
##  201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220
##  221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240
##  241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260
##  261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280
##  281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300
##  301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320
##  321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340
##  341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360
##  361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380
##  381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396
## 
## -- Imputation 3 --
## 
##   1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
##  21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
##  41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
##  61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
##  81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
##  101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
##  121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140
##  141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160
##  161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
##  181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200
##  201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220
##  221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240
##  241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260
##  261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280
##  281 282 283 284 285 286 287 288 289
## 
## -- Imputation 4 --
## 
##   1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
##  21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
##  41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
##  61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
##  81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
##  101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
##  121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140
##  141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160
##  161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
##  181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200
##  201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220
##  221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240
##  241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260
##  261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280
## 
## 
## -- Imputation 5 --
## 
##   1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
##  21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
##  41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
##  61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
##  81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
##  101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
##  121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140
##  141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160
##  161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
##  181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200
##  201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220
##  221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240
##  241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260
##  261
data_tm1_90_amelia %>% anyNA
## [1] FALSE
data_tm1_90_amelia %<>% mutate(
  bias.open = if_else(AskOpen>AskHigh|AskOpen<AskLow, 1, 0), 
  bias.high = if_else(AskHigh<AskOpen|AskHigh<AskLow|AskHigh<AskClose, 1, 0), 
  bias.low = if_else(AskLow>AskOpen|AskLow>AskHigh|AskLow>AskClose, 1, 0), 
  bias.close = if_else(AskClose>AskHigh|AskClose<AskLow, 1, 0))

data_tm1_90_amelia %>% 
  dplyr::filter(bias.open==1|bias.high==1|bias.low==1|bias.close==1)
## # A tibble: 4,291 x 14
##    .id   index               BidOpen BidHigh BidLow BidClose AskOpen
##    <chr> <dttm>                <dbl>   <dbl>  <dbl>    <dbl>   <dbl>
##  1 imp1  2017-08-07 00:02:00    111.    111.   111.     111.    111.
##  2 imp1  2017-08-07 00:07:00    111.    111.   111.     111.    111.
##  3 imp1  2017-08-07 00:08:00    111.    111.   111.     111.    111.
##  4 imp1  2017-08-07 00:09:00    111.    111.   111.     111.    111.
##  5 imp1  2017-08-07 00:22:00    111.    111.   111.     111.    111.
##  6 imp1  2017-08-07 00:42:00    111.    111.   111.     111.    111.
##  7 imp1  2017-08-07 00:44:00    111.    111.   111.     111.    111.
##  8 imp1  2017-08-07 00:45:00    111.    111.   111.     111.    111.
##  9 imp1  2017-08-07 00:55:00    111.    111.   111.     111.    111.
## 10 imp1  2017-08-07 00:57:00    111.    111.   111.     111.    111.
## # ... with 4,281 more rows, and 7 more variables: AskHigh <dbl>,
## #   AskLow <dbl>, AskClose <dbl>, bias.open <dbl>, bias.high <dbl>,
## #   bias.low <dbl>, bias.close <dbl>
data_tm1_90_amelia %<>% 
  ddply(.(.id), summarise, 
        AskOpen = mean((AskOpen - data_m1$AskOpen)^2), 
        AskHigh = mean((AskHigh - data_m1$AskHigh)^2), 
        AskLow = mean((AskLow - data_m1$AskLow)^2), 
        AskClose = mean((AskClose - data_m1$AskClose)^2), 
        Mean.HLC = (AskHigh + AskLow + AskClose)/3, 
        Mean.OHLC = (AskOpen + AskHigh + AskLow + AskClose)/4, 
        bias.open = sum(bias.open)/length(bias.open), 
        bias.high = sum(bias.high)/length(bias.high), 
        bias.low = sum(bias.low)/length(bias.low), 
        bias.close = sum(bias.close)/length(bias.close)) %>% tbl_df

data_tm1_90_amelia %>% 
  kable(caption = 'MSE') %>% 
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
  scroll_box(width = '100%')#, height = '400px')
MSE
.id AskOpen AskHigh AskLow AskClose Mean.HLC Mean.OHLC bias.open bias.high bias.low bias.close
imp1 48.08072 48.03486 48.07313 48.05499 48.05433 48.06093 0.4189189 0.3409091 0.3114251 0.3544226
imp2 48.10489 48.05921 48.08469 48.07178 48.07189 48.08014 0.4600737 0.3789926 0.2911548 0.3568796
imp3 48.07160 48.04393 48.06767 48.04916 48.05359 48.05809 0.3452088 0.3132678 0.2463145 0.3003686
imp4 48.10228 48.04703 48.07421 48.07541 48.06555 48.07473 0.4576167 0.3851351 0.2585995 0.3538084
imp5 48.06921 48.04484 48.06735 48.03865 48.05028 48.05501 0.4299754 0.2929975 0.3108108 0.3882064

以下使用tidyr::fill()弥补90%数据缺失值。

data_tm1_90_tidyr <- data_tm1_NA %>% 
  fill(BidOpen, BidHigh, BidLow, BidClose, 
       AskOpen, AskHigh, AskLow, AskClose) %>% #default direction down
  fill(BidOpen, BidHigh, BidLow, BidClose, 
       AskOpen, AskHigh, AskLow, AskClose, .direction = 'up')

data_tm1_90_tidyr %>% anyNA
## [1] FALSE
data_tm1_90_tidyr %<>% mutate(
  bias.open = if_else(AskOpen>AskHigh|AskOpen<AskLow, 1, 0), 
  bias.high = if_else(AskHigh<AskOpen|AskHigh<AskLow|AskHigh<AskClose, 1, 0), 
  bias.low = if_else(AskLow>AskOpen|AskLow>AskHigh|AskLow>AskClose, 1, 0), 
  bias.close = if_else(AskClose>AskHigh|AskClose<AskLow, 1, 0))

data_tm1_90_tidyr %>% 
  dplyr::filter(bias.open==1|bias.high==1|bias.low==1|bias.close==1)
## # A tibble: 1,183 x 13
##    index               BidOpen BidHigh BidLow BidClose AskOpen AskHigh
##    <dttm>                <dbl>   <dbl>  <dbl>    <dbl>   <dbl>   <dbl>
##  1 2017-08-07 00:18:00    111.    111.   111.     111.    111.    111.
##  2 2017-08-07 00:22:00    111.    111.   111.     111.    111.    111.
##  3 2017-08-07 00:24:00    111.    111.   111.     111.    111.    111.
##  4 2017-08-07 00:38:00    111.    111.   111.     111.    111.    111.
##  5 2017-08-07 00:39:00    111.    111.   111.     111.    111.    111.
##  6 2017-08-07 00:40:00    111.    111.   111.     111.    111.    111.
##  7 2017-08-07 00:41:00    111.    111.   111.     111.    111.    111.
##  8 2017-08-07 00:42:00    111.    111.   111.     111.    111.    111.
##  9 2017-08-07 00:43:00    111.    111.   111.     111.    111.    111.
## 10 2017-08-07 00:44:00    111.    111.   111.     111.    111.    111.
## # ... with 1,173 more rows, and 6 more variables: AskLow <dbl>,
## #   AskClose <dbl>, bias.open <dbl>, bias.high <dbl>, bias.low <dbl>,
## #   bias.close <dbl>
data_tm1_90_tidyr %<>% 
  summarise(
    AskOpen = mean((AskOpen - data_m1$AskOpen)^2), 
    AskHigh = mean((AskHigh - data_m1$AskHigh)^2), 
    AskLow = mean((AskLow - data_m1$AskLow)^2), 
    AskClose = mean((AskClose - data_m1$AskClose)^2), 
    Mean.HLC = (AskHigh + AskLow + AskClose)/3, 
    Mean.OHLC = (AskOpen + AskHigh + AskLow + AskClose)/4, 
    bias.open = sum(bias.open)/length(bias.open), 
    bias.high = sum(bias.high)/length(bias.high), 
    bias.low = sum(bias.low)/length(bias.low), 
    bias.close = sum(bias.close)/length(bias.close)) %>% tbl_df

data_tm1_90_tidyr %>% 
  kable(caption = 'MSE') %>% 
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
  scroll_box(width = '100%')#, height = '400px')
MSE
AskOpen AskHigh AskLow AskClose Mean.HLC Mean.OHLC bias.open bias.high bias.low bias.close
48.09342 48.02838 48.08001 48.02318 48.04386 48.05625 0.536855 0.5018428 0.512285 0.6375921

4 偏差比较

4.1 单变量弥补数据偏差比较

以下日内数据比较出在指定的缺失值占有数,该数据弥补得来后的误差与偏差会有多高。

dfm1_impTS <- list(`1%` = data_m1_1_impTS, `10%` = data_m1_10_impTS, 
                   `20%` = data_m1_20_impTS, `30%` = data_m1_30_impTS, 
                   `50%` = data_m1_50_impTS, `65%` = data_m1_65_impTS, 
                   `70%` = data_m1_70_impTS, `80%` = data_m1_80_impTS, 
                   `85%` = data_m1_85_impTS, `90%` = data_m1_90_impTS)

dfm2_impTS <- list(`1%` = data_tm1_1_impTS, `10%` = data_tm1_10_impTS, 
                   `20%` = data_tm1_20_impTS, `30%` = data_tm1_30_impTS, 
                   `50%` = data_tm1_50_impTS, `65%` = data_tm1_65_impTS, 
                   `70%` = data_tm1_70_impTS, `80%` = data_tm1_80_impTS, 
                   `85%` = data_tm1_85_impTS, `90%` = data_tm1_90_impTS)

## Summarise
dfm1_impTS %<>% ldply(function(x) x %>% dplyr::rename(Model = .id)) %>% 
  tbl_df %>% 
  mutate(bias = (bias.open + bias.high + bias.low + bias.close)/4) %>% 
  arrange(Mean.HLC, Mean.OHLC, bias)
dfm2_impTS %<>% ldply(function(x) x %>% dplyr::rename(Model = .id)) %>% 
  tbl_df %>% 
  mutate(bias = (bias.open + bias.high + bias.low + bias.close)/4) %>% 
  arrange(Mean.HLC, Mean.OHLC, bias)

dfm1_impTS %>% 
  kable(caption = 'MSE 缺失值') %>% 
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
  scroll_box(width = '100%', height = '400px')
MSE 缺失
.id Model AskOpen AskHigh AskLow AskClose Mean.HLC Mean.OHLC bias.open bias.high bias.low bias.close bias
1% kalman 0.0000006 0.0000008 0.0000009 0.0000013 0.0000010 0.0000009 0.0050357 0.0064345 0.0050357 0.0064345 0.0057351
1% interpolation 0.0000006 0.0000008 0.0000009 0.0000013 0.0000010 0.0000009 0.0048958 0.0062946 0.0048958 0.0062946 0.0055952
1% ma 0.0000007 0.0000014 0.0000012 0.0000013 0.0000013 0.0000012 0.0079731 0.0088124 0.0067142 0.0086725 0.0080431
1% locf 0.0000014 0.0000031 0.0000018 0.0000036 0.0000029 0.0000025 0.0037768 0.0053154 0.0043363 0.0058749 0.0048258
10% interpolation 0.0000090 0.0000071 0.0000075 0.0000095 0.0000080 0.0000083 0.0495174 0.0479787 0.0492377 0.0496573 0.0490978
10% kalman 0.0000090 0.0000071 0.0000075 0.0000096 0.0000081 0.0000083 0.0513359 0.0500769 0.0509162 0.0517555 0.0510211
10% ma 0.0000108 0.0000100 0.0000107 0.0000129 0.0000112 0.0000111 0.0672821 0.0636453 0.0614072 0.0665827 0.0647293
20% interpolation 0.0000229 0.0000146 0.0000154 0.0000201 0.0000167 0.0000182 0.1054693 0.0988950 0.0949783 0.0948384 0.0985453
20% kalman 0.0000229 0.0000146 0.0000154 0.0000202 0.0000167 0.0000183 0.1088264 0.1022521 0.0976360 0.0986152 0.1018324
10% locf 0.0000190 0.0000160 0.0000167 0.0000208 0.0000178 0.0000181 0.0530144 0.0507763 0.0471395 0.0464401 0.0493426
20% ma 0.0000275 0.0000192 0.0000198 0.0000245 0.0000212 0.0000228 0.1292488 0.1209959 0.1112044 0.1222549 0.1209260
30% interpolation 0.0000374 0.0000247 0.0000282 0.0000342 0.0000290 0.0000311 0.1573647 0.1435166 0.1454749 0.1502308 0.1491467
30% kalman 0.0000373 0.0000247 0.0000282 0.0000344 0.0000291 0.0000312 0.1603021 0.1458945 0.1481326 0.1537278 0.1520143
30% ma 0.0000425 0.0000321 0.0000351 0.0000407 0.0000359 0.0000376 0.1805847 0.1612813 0.1653378 0.1772276 0.1711078
20% locf 0.0000526 0.0000327 0.0000319 0.0000438 0.0000362 0.0000403 0.1105050 0.0969366 0.0981956 0.0916212 0.0993146
30% locf 0.0000830 0.0000646 0.0000672 0.0000785 0.0000701 0.0000733 0.1658973 0.1559659 0.1465939 0.1577843 0.1565604
50% interpolation 0.0000704 0.0000582 0.0000780 0.0000750 0.0000704 0.0000704 0.2460484 0.2282837 0.2087005 0.2559799 0.2347531
50% kalman 0.0000704 0.0000582 0.0000783 0.0000750 0.0000705 0.0000705 0.2530424 0.2322003 0.2149951 0.2632536 0.2408728
50% ma 0.0000804 0.0000680 0.0001001 0.0000872 0.0000851 0.0000839 0.2808784 0.2552805 0.2345783 0.2899706 0.2651769
65% kalman 0.0001282 0.0001238 0.0001133 0.0001202 0.0001191 0.0001214 0.3674640 0.3241013 0.2845153 0.3415862 0.3294167
65% interpolation 0.0001279 0.0001238 0.0001136 0.0001206 0.0001193 0.0001215 0.3662051 0.3241013 0.2831165 0.3394880 0.3282277
65% ma 0.0001512 0.0001488 0.0001313 0.0001449 0.0001417 0.0001441 0.3813121 0.3329137 0.3022800 0.3621486 0.3446636
70% kalman 0.0001618 0.0001265 0.0001262 0.0001725 0.0001417 0.0001468 0.3959994 0.3457826 0.3231221 0.3883061 0.3633026
70% interpolation 0.0001618 0.0001265 0.0001262 0.0001725 0.0001417 0.0001468 0.3927822 0.3453630 0.3225626 0.3881662 0.3622185
50% locf 0.0001652 0.0001420 0.0001480 0.0001916 0.0001606 0.0001617 0.2764023 0.2480067 0.2397538 0.2771017 0.2603161
70% ma 0.0001928 0.0001537 0.0001548 0.0001982 0.0001689 0.0001749 0.4151630 0.3650860 0.3400476 0.4130648 0.3833403
80% interpolation 0.0002653 0.0002080 0.0002403 0.0002507 0.0002330 0.0002411 0.4865016 0.4063505 0.4062107 0.4692964 0.4420898
80% kalman 0.0002647 0.0002080 0.0002403 0.0002511 0.0002331 0.0002410 0.4876206 0.4055113 0.4049517 0.4662191 0.4410757
65% locf 0.0002955 0.0003304 0.0002383 0.0002986 0.0002891 0.0002907 0.3817317 0.3404672 0.3154287 0.3776752 0.3538257
80% ma 0.0003359 0.0002728 0.0002943 0.0003105 0.0002925 0.0003034 0.5045461 0.4288712 0.4227165 0.4869212 0.4607637
85% kalman 0.0003567 0.0002414 0.0003399 0.0003848 0.0003220 0.0003307 0.5336411 0.4581060 0.4311092 0.5150371 0.4844734
85% interpolation 0.0003568 0.0002415 0.0003425 0.0003855 0.0003232 0.0003316 0.5390964 0.4617429 0.4374038 0.5209120 0.4897888
70% locf 0.0003904 0.0003374 0.0003273 0.0004098 0.0003581 0.0003662 0.4253742 0.3855085 0.3597706 0.4227165 0.3983424
85% ma 0.0004343 0.0003234 0.0004471 0.0004450 0.0004052 0.0004124 0.5718282 0.4860820 0.4788082 0.5623164 0.5247587
90% interpolation 0.0004602 0.0005112 0.0004253 0.0005540 0.0004968 0.0004877 0.5963072 0.5367184 0.4974122 0.6212058 0.5629109
90% kalman 0.0004602 0.0005172 0.0004253 0.0005541 0.0004989 0.0004892 0.5978459 0.5383970 0.4965729 0.6207861 0.5634005
1% mean 0.0005507 0.0005905 0.0004618 0.0006969 0.0005831 0.0005750 0.0194433 0.0179046 0.0205623 0.0194433 0.0193384
80% locf 0.0007388 0.0005612 0.0005608 0.0006582 0.0005934 0.0006297 0.5302840 0.4497132 0.4574066 0.5218912 0.4898238
90% ma 0.0006192 0.0006425 0.0005668 0.0007449 0.0006514 0.0006434 0.6475031 0.5676318 0.5312631 0.6360330 0.5956078
85% locf 0.0009119 0.0007365 0.0009503 0.0008236 0.0008368 0.0008556 0.5592391 0.4825850 0.4667786 0.5409148 0.5123794
1% random 0.0024076 0.0006038 0.0023222 0.0008296 0.0012519 0.0015408 0.0145475 0.0093719 0.0195832 0.0145475 0.0145125
90% locf 0.0015472 0.0012908 0.0012546 0.0015022 0.0013492 0.0013987 0.6398098 0.5505665 0.5153168 0.6048398 0.5776332
10% mean 0.0057660 0.0063513 0.0060217 0.0066473 0.0063401 0.0061966 0.1719122 0.1493915 0.1814240 0.1751294 0.1694643
20% mean 0.0123887 0.0117783 0.0116303 0.0126437 0.0120174 0.0121103 0.3066163 0.2652119 0.2957057 0.3014408 0.2922437
30% mean 0.0190881 0.0174648 0.0182534 0.0190274 0.0182485 0.0184584 0.4018744 0.3536159 0.3729193 0.4076095 0.3840048
10% random 0.0077868 0.0226350 0.0087245 0.0291799 0.0201798 0.0170816 0.1092460 0.0870052 0.1375017 0.1214156 0.1137921
50% mean 0.0306066 0.0306853 0.0305917 0.0302918 0.0305230 0.0305439 0.4684571 0.4066303 0.4129249 0.4671982 0.4388026
20% random 0.0308892 0.0318163 0.0327874 0.0353737 0.0333258 0.0327167 0.3551546 0.3561337 0.3119317 0.3477409 0.3427402
65% mean 0.0400602 0.0406286 0.0391629 0.0395314 0.0397743 0.0398458 0.4402014 0.3983774 0.3761365 0.4364247 0.4127850
70% mean 0.0434017 0.0423406 0.0426192 0.0426670 0.0425423 0.0427571 0.4084487 0.3775353 0.3564135 0.4071898 0.3873968
80% mean 0.0486525 0.0484767 0.0482804 0.0487259 0.0484943 0.0485339 0.2989229 0.2805987 0.2605959 0.2943069 0.2836061
50% random 0.0401562 0.0407120 0.0546502 0.0518731 0.0490784 0.0468479 0.8181564 0.7682193 0.7954959 0.7911596 0.7932578
85% mean 0.0521145 0.0515481 0.0514613 0.0510225 0.0513439 0.0515366 0.2410127 0.2432508 0.2106588 0.2477270 0.2356623
90% mean 0.0554369 0.0545033 0.0548398 0.0556674 0.0550035 0.0551119 0.9363547 0.9440481 0.1286893 0.9380333 0.7367814
85% random 0.3240766 0.0639116 0.0581029 0.0510604 0.0576916 0.1242879 0.9664289 0.1843614 0.9805567 0.9148133 0.7615401
30% random 0.0891399 0.0503076 0.0991078 0.0534894 0.0676349 0.0730112 0.4879004 0.5048258 0.3189257 0.3983774 0.4275073
80% random 0.0967558 0.0565147 0.1428038 0.0580634 0.0857940 0.0885344 0.8461323 0.9290810 0.1155406 0.8341027 0.6812142
90% random 0.0716249 0.1366404 0.0568678 0.0790605 0.0908562 0.0860484 0.1349839 0.0870052 0.9282417 0.9373339 0.5218912
65% random 0.0533171 0.0806651 0.1361035 0.1684688 0.1284125 0.1096386 0.2796195 0.2432508 0.6848510 0.6998182 0.4768849
70% random 0.1551202 0.0523911 0.1117988 0.2454679 0.1365526 0.1411945 0.8742481 0.8708910 0.7637432 0.8887956 0.8494195
dfm2_impTS %>% 
  kable(caption = 'MSE 缺失值') %>% 
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
  scroll_box(width = '100%', height = '400px')
MSE 缺失
.id Model AskOpen AskHigh AskLow AskClose Mean.HLC Mean.OHLC bias.open bias.high bias.low bias.close bias
90% random 48.11770 47.55666 47.88066 47.62295 47.68676 47.79449 0.9410319 0.9367322 0.9613022 0.9637592 0.9507064
50% random 48.61875 48.18569 47.83183 47.39597 47.80450 48.00806 0.6044226 0.5761671 0.5909091 0.6173219 0.5972052
20% random 48.45310 48.01260 48.05931 47.86129 47.97773 48.09657 0.3366093 0.3218673 0.3126536 0.3359951 0.3267813
90% locf 48.09342 48.02838 48.08001 48.02318 48.04386 48.05625 0.5368550 0.5018428 0.5122850 0.6375921 0.5471437
70% mean 48.12663 48.00807 48.08176 48.06426 48.05137 48.07018 0.8335381 0.7665848 0.3243243 0.3863636 0.5777027
30% random 48.25972 48.06207 48.33286 47.77778 48.05757 48.10811 0.5165848 0.4864865 0.5558968 0.5423833 0.5253378
90% interpolation 48.09238 48.02948 48.08101 48.06488 48.05846 48.06694 0.5270270 0.5165848 0.4600737 0.6339066 0.5343980
90% kalman 48.09188 48.03086 48.08101 48.06581 48.05923 48.06739 0.5221130 0.5079853 0.4459459 0.6216216 0.5244165
90% ma 48.09193 48.03144 48.08093 48.06593 48.05943 48.06756 0.5737101 0.5110565 0.4944717 0.6228501 0.5505221
85% locf 48.06139 48.04274 48.09568 48.04604 48.06149 48.06146 0.5479115 0.4570025 0.5012285 0.5491400 0.5138206
80% locf 48.07033 48.03224 48.09007 48.06271 48.06167 48.06384 0.4932432 0.4404177 0.3912776 0.4551597 0.4450246
85% kalman 48.05593 48.04052 48.08479 48.06086 48.06206 48.06052 0.4404177 0.3593366 0.3875921 0.4484029 0.4089373
85% interpolation 48.05584 48.04052 48.08479 48.06113 48.06214 48.06057 0.4410319 0.3660934 0.3900491 0.4606880 0.4144656
85% ma 48.05618 48.04110 48.08498 48.06091 48.06233 48.06079 0.5116708 0.4183047 0.4551597 0.5098280 0.4737408
70% locf 48.05579 48.03106 48.07434 48.08234 48.06258 48.06088 0.4029484 0.4004914 0.3003686 0.4054054 0.3773034
70% ma 48.05462 48.02955 48.08596 48.07753 48.06435 48.06191 0.3617936 0.3445946 0.2856265 0.3703931 0.3406020
70% kalman 48.05440 48.02955 48.08650 48.07724 48.06443 48.06192 0.3415233 0.3212531 0.2653563 0.3445946 0.3181818
70% interpolation 48.05448 48.02946 48.08666 48.07724 48.06445 48.06196 0.3421376 0.3243243 0.2616708 0.3452088 0.3183354
50% locf 48.06433 48.03798 48.09349 48.06476 48.06541 48.06514 0.2807125 0.2444717 0.2242015 0.2585995 0.2519963
50% kalman 48.06676 48.03676 48.09347 48.06627 48.06550 48.06581 0.2340295 0.2168305 0.1848894 0.2315725 0.2168305
50% interpolation 48.06646 48.03692 48.09347 48.06623 48.06554 48.06577 0.2223587 0.2094595 0.1799754 0.2260442 0.2094595
50% ma 48.06724 48.03628 48.09353 48.06685 48.06555 48.06597 0.2469287 0.2266585 0.1996314 0.2432432 0.2291155
1% locf 48.06471 48.03943 48.09250 48.06706 48.06633 48.06592 0.0079853 0.0067568 0.0030713 0.0024570 0.0050676
65% locf 48.06478 48.04620 48.08847 48.06434 48.06634 48.06595 0.3544226 0.3138821 0.2942260 0.3445946 0.3267813
10% ma 48.06389 48.03796 48.09338 48.06803 48.06646 48.06581 0.0626536 0.0540541 0.0540541 0.0552826 0.0565111
1% kalman 48.06484 48.03953 48.09285 48.06702 48.06646 48.06606 0.0030713 0.0024570 0.0030713 0.0024570 0.0027641
1% interpolation 48.06478 48.03953 48.09285 48.06702 48.06647 48.06605 0.0024570 0.0012285 0.0030713 0.0018428 0.0021499
1% ma 48.06507 48.03947 48.09286 48.06706 48.06647 48.06612 0.0055283 0.0055283 0.0042998 0.0055283 0.0052211
10% kalman 48.06387 48.03828 48.09319 48.06821 48.06656 48.06589 0.0528256 0.0491400 0.0448403 0.0448403 0.0479115
1% mean 48.06836 48.03966 48.09137 48.06870 48.06658 48.06702 0.0165848 0.0184275 0.0128993 0.0159705 0.0159705
10% interpolation 48.06382 48.03829 48.09319 48.06827 48.06658 48.06589 0.0546683 0.0485258 0.0454545 0.0423833 0.0477580
20% ma 48.06454 48.03813 48.09355 48.06860 48.06676 48.06620 0.1087224 0.1068796 0.0939803 0.1081081 0.1044226
20% locf 48.06368 48.03972 48.09354 48.06721 48.06683 48.06604 0.1068796 0.0976658 0.0970516 0.0952088 0.0992015
30% interpolation 48.06474 48.03922 48.09377 48.06792 48.06697 48.06641 0.1461916 0.1351351 0.1332924 0.1461916 0.1402027
30% kalman 48.06447 48.03942 48.09377 48.06791 48.06703 48.06639 0.1529484 0.1418919 0.1363636 0.1541769 0.1463452
20% kalman 48.06438 48.03886 48.09359 48.06910 48.06718 48.06648 0.0927518 0.0945946 0.0816953 0.0933661 0.0906020
1% random 48.06803 48.03820 48.10323 48.06020 48.06721 48.06742 0.0221130 0.0178133 0.0233415 0.0196560 0.0207310
10% locf 48.06388 48.04010 48.09277 48.06879 48.06722 48.06639 0.0589681 0.0485258 0.0558968 0.0472973 0.0526720
20% interpolation 48.06431 48.03889 48.09359 48.06921 48.06723 48.06650 0.0872236 0.0896806 0.0755528 0.0853808 0.0844595
30% ma 48.06320 48.04018 48.09408 48.06792 48.06739 48.06635 0.1566339 0.1345209 0.1547912 0.1689189 0.1537162
20% mean 48.07973 48.02832 48.08681 48.08731 48.06748 48.07054 0.2782555 0.2721130 0.2395577 0.2800983 0.2675061
65% ma 48.07108 48.04394 48.09041 48.07027 48.06821 48.06892 0.3353808 0.2764128 0.2647420 0.3101966 0.2966830
65% kalman 48.07038 48.04468 48.08991 48.07061 48.06840 48.06890 0.3114251 0.2684275 0.2450860 0.2911548 0.2790233
30% locf 48.06791 48.04236 48.09580 48.06738 48.06851 48.06836 0.1621622 0.1461916 0.1541769 0.1658477 0.1570946
65% interpolation 48.07036 48.04468 48.08991 48.07116 48.06858 48.06903 0.3095823 0.2659705 0.2457002 0.2948403 0.2790233
10% mean 48.07145 48.03286 48.10848 48.06614 48.06916 48.06973 0.1726044 0.1603194 0.1572482 0.1664619 0.1641585
80% ma 48.06795 48.04561 48.09361 48.06843 48.06922 48.06890 0.4398034 0.3808354 0.3863636 0.4391892 0.4115479
80% interpolation 48.06756 48.04682 48.09353 48.06823 48.06952 48.06903 0.4250614 0.3525799 0.3789926 0.4244472 0.3952703
80% kalman 48.06761 48.04682 48.09357 48.06821 48.06953 48.06905 0.4256757 0.3519656 0.3820639 0.4256757 0.3963452
50% mean 48.10011 48.05789 48.10226 48.06800 48.07605 48.08206 0.4484029 0.3900491 0.3832924 0.4318182 0.4133907
30% mean 48.07771 48.04385 48.10906 48.09428 48.08240 48.08122 0.3642506 0.3507371 0.3224816 0.3869779 0.3561118
65% mean 48.06675 48.02981 48.12859 48.11165 48.09002 48.08420 0.4367322 0.3814496 0.3783784 0.4176904 0.4035627
85% mean 48.11939 48.09357 48.10504 48.10364 48.10075 48.10541 0.2260442 0.1984029 0.2125307 0.2278870 0.2162162
90% mean 48.03712 48.06868 48.06397 48.17040 48.10102 48.08504 0.1572482 0.1547912 0.1480344 0.1658477 0.1564803
70% random 47.68556 47.12402 47.95719 49.26794 48.11638 48.00868 0.9324324 0.9262899 0.9146192 0.9275184 0.9252150
80% mean 48.05587 48.05459 48.17520 48.12306 48.11762 48.10218 0.8802211 0.2708845 0.8157248 0.2954545 0.5655713
10% random 47.91482 48.20938 48.19080 48.04397 48.14805 48.08974 0.1848894 0.1025799 0.2291155 0.1633907 0.1699939
85% random 49.15168 47.72769 49.75232 47.47577 48.31860 48.52687 0.9895577 0.9889435 0.9631450 0.9649877 0.9766585
65% random 47.50568 48.49940 49.01626 47.85631 48.45733 48.21942 0.8832924 0.6977887 0.9195332 0.8298526 0.8326167
80% random 49.44640 48.28223 49.41269 47.95915 48.55136 48.77512 0.9686732 0.9705160 0.9017199 0.9133907 0.9385749

4.2 多变量弥补数据偏差比较

dfm1_amelia <- list(`1%` = data_m1_1_amelia, `10%` = data_m1_10_amelia, 
                    `20%` = data_m1_20_amelia, `30%` = data_m1_30_amelia, 
                    `50%` = data_m1_50_amelia, `65%` = data_m1_65_amelia, 
                    `70%` = data_m1_70_amelia, `80%` = data_m1_80_amelia, 
                    `85%` = data_m1_85_amelia, `90%` = data_m1_90_amelia)

dfm2_amelia <- list(`1%` = data_tm1_1_amelia, `10%` = data_tm1_10_amelia, 
                    `20%` = data_tm1_20_amelia, `30%` = data_tm1_30_amelia, 
                    `50%` = data_tm1_50_amelia, `65%` = data_tm1_65_amelia, 
                    `70%` = data_tm1_70_amelia, `80%` = data_tm1_80_amelia, 
                    `85%` = data_tm1_85_amelia, `90%` = data_tm1_90_amelia)

## Summarise
dfm1_amelia %<>% ldply(function(x) x %>% dplyr::rename(Model = .id)) %>% 
  tbl_df %>% 
  mutate(bias = (bias.open + bias.high + bias.low + bias.close)/4) %>% 
  arrange(Mean.HLC, Mean.OHLC, bias)
dfm2_amelia %<>% ldply(function(x) x %>% dplyr::rename(Model = .id)) %>% 
  tbl_df %>% 
  mutate(bias = (bias.open + bias.high + bias.low + bias.close)/4) %>% 
  arrange(Mean.HLC, Mean.OHLC, bias)

dfm1_amelia %>% 
  kable(caption = 'MSE 缺失值') %>% 
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
  scroll_box(width = '100%', height = '400px')
MSE 缺失
.id Model AskOpen AskHigh AskLow AskClose Mean.HLC Mean.OHLC bias.open bias.high bias.low bias.close bias
1% imp3 0.0000000 0.0000000 0.0000000 0.0000002 0.0000001 0.0000001 0.0036369 0.0033571 0.0030774 0.0032172 0.0033221
1% imp5 0.0000000 0.0000000 0.0000000 0.0000002 0.0000001 0.0000001 0.0043363 0.0043363 0.0034970 0.0040565 0.0040565
1% imp2 0.0000000 0.0000001 0.0000000 0.0000002 0.0000001 0.0000001 0.0032172 0.0034970 0.0029375 0.0036369 0.0033221
1% imp4 0.0000000 0.0000001 0.0000000 0.0000002 0.0000001 0.0000001 0.0037768 0.0036369 0.0029375 0.0032172 0.0033921
1% imp1 0.0000000 0.0000001 0.0000001 0.0000003 0.0000001 0.0000001 0.0039166 0.0034970 0.0030774 0.0029375 0.0033571
10% imp2 0.0000011 0.0000009 0.0000011 0.0000009 0.0000010 0.0000010 0.0432228 0.0395860 0.0372080 0.0365086 0.0391313
10% imp5 0.0000011 0.0000011 0.0000011 0.0000009 0.0000010 0.0000011 0.0416842 0.0401455 0.0348300 0.0359491 0.0381522
10% imp4 0.0000012 0.0000011 0.0000011 0.0000010 0.0000011 0.0000011 0.0402854 0.0415443 0.0345503 0.0379074 0.0385718
10% imp3 0.0000009 0.0000012 0.0000011 0.0000010 0.0000011 0.0000011 0.0440621 0.0432228 0.0370681 0.0381872 0.0406351
10% imp1 0.0000010 0.0000013 0.0000011 0.0000011 0.0000012 0.0000011 0.0430829 0.0442020 0.0348300 0.0386068 0.0401804
20% imp5 0.0000042 0.0000038 0.0000037 0.0000034 0.0000036 0.0000038 0.0959575 0.0844873 0.0770737 0.0706393 0.0820394
20% imp2 0.0000050 0.0000039 0.0000036 0.0000037 0.0000037 0.0000040 0.0952581 0.0830885 0.0769338 0.0706393 0.0814799
20% imp4 0.0000044 0.0000039 0.0000040 0.0000035 0.0000038 0.0000039 0.0952581 0.0853266 0.0769338 0.0730172 0.0826339
20% imp3 0.0000046 0.0000035 0.0000045 0.0000038 0.0000039 0.0000041 0.0951182 0.0811302 0.0819695 0.0742761 0.0831235
20% imp1 0.0000051 0.0000039 0.0000044 0.0000039 0.0000041 0.0000043 0.0969366 0.0854665 0.0762344 0.0698000 0.0821094
30% imp2 0.0000106 0.0000077 0.0000092 0.0000080 0.0000083 0.0000089 0.1419779 0.1198769 0.1221150 0.1113442 0.1238285
30% imp3 0.0000105 0.0000079 0.0000094 0.0000080 0.0000084 0.0000090 0.1437963 0.1226745 0.1253322 0.1134424 0.1263114
30% imp5 0.0000117 0.0000083 0.0000096 0.0000082 0.0000087 0.0000094 0.1407190 0.1202965 0.1254721 0.1179186 0.1261016
30% imp4 0.0000112 0.0000089 0.0000092 0.0000081 0.0000088 0.0000094 0.1421178 0.1278500 0.1281298 0.1249126 0.1307526
30% imp1 0.0000112 0.0000086 0.0000097 0.0000082 0.0000088 0.0000094 0.1486921 0.1268709 0.1289691 0.1187579 0.1308225
50% imp1 0.0002917 0.0002841 0.0002886 0.0002930 0.0002886 0.0002894 0.2471674 0.2261855 0.1993286 0.1994685 0.2180375
50% imp4 0.0004111 0.0004062 0.0004048 0.0004095 0.0004068 0.0004079 0.2572388 0.2260456 0.2201707 0.2075815 0.2277591
50% imp2 0.0004936 0.0004858 0.0004941 0.0004908 0.0004902 0.0004911 0.2330396 0.2022661 0.2007274 0.1881382 0.2060428
50% imp5 0.0005000 0.0004896 0.0005036 0.0004981 0.0004971 0.0004978 0.2285634 0.2045041 0.2000280 0.1926144 0.2064275
50% imp3 0.0005796 0.0005732 0.0005761 0.0005780 0.0005758 0.0005767 0.2238075 0.2071618 0.1945727 0.1948524 0.2050986
65% imp1 0.0037038 0.0036950 0.0037028 0.0037144 0.0037040 0.0037040 0.3534760 0.3313750 0.3049378 0.3376696 0.3318646
65% imp2 0.0037461 0.0037553 0.0037236 0.0037351 0.0037380 0.0037400 0.3432648 0.3056372 0.2940271 0.3113722 0.3135753
65% imp4 0.0040114 0.0039923 0.0040178 0.0040065 0.0040055 0.0040070 0.3424255 0.3155686 0.2971045 0.3151490 0.3175619
65% imp3 0.0040437 0.0040587 0.0040400 0.0040502 0.0040496 0.0040481 0.3513778 0.3303959 0.2906700 0.3392083 0.3279130
65% imp5 0.0041922 0.0041956 0.0041963 0.0042075 0.0041998 0.0041979 0.3347321 0.3033991 0.2861939 0.3043782 0.3071758
70% imp2 0.0058556 0.0058747 0.0058369 0.0058674 0.0058597 0.0058586 0.4127850 0.3397678 0.3446636 0.3655057 0.3656805
70% imp4 0.0062205 0.0061725 0.0061818 0.0061766 0.0061770 0.0061879 0.4052315 0.3361309 0.3291369 0.3536159 0.3560288
70% imp3 0.0063261 0.0062797 0.0063244 0.0063016 0.0063019 0.0063080 0.4242551 0.3555742 0.3613093 0.3979578 0.3847741
70% imp5 0.0063617 0.0063084 0.0063286 0.0063186 0.0063185 0.0063293 0.4506924 0.3659253 0.3615890 0.3909638 0.3922926
70% imp1 0.0065354 0.0065133 0.0065083 0.0064779 0.0064999 0.0065087 0.4273325 0.3474612 0.3517975 0.3743181 0.3752273
80% imp5 0.0171595 0.0171508 0.0172147 0.0171742 0.0171799 0.0171748 0.4348860 0.3748776 0.3765562 0.4386628 0.4062456
80% imp4 0.0178892 0.0178482 0.0178864 0.0178254 0.0178533 0.0178623 0.4648203 0.4056511 0.3899846 0.4616030 0.4305148
80% imp1 0.0179774 0.0179669 0.0179299 0.0179521 0.0179496 0.0179566 0.4511120 0.3912435 0.3925024 0.4497132 0.4211428
80% imp2 0.0181715 0.0182881 0.0182233 0.0182494 0.0182536 0.0182331 0.4567072 0.3929221 0.3660652 0.4357253 0.4128549
80% imp3 0.0188257 0.0189006 0.0189500 0.0188341 0.0188949 0.0188776 0.4779689 0.4242551 0.4165618 0.5053854 0.4560428
85% imp2 0.0285124 0.0284944 0.0284871 0.0284606 0.0284807 0.0284886 0.5101413 0.4222968 0.4008952 0.4583858 0.4479298
85% imp4 0.0295915 0.0292050 0.0294749 0.0293755 0.0293518 0.0294117 0.5662330 0.4571269 0.4434187 0.5016086 0.4920968
85% imp1 0.0296854 0.0294646 0.0295183 0.0295090 0.0294973 0.0295443 0.5305637 0.4369842 0.4357253 0.5021681 0.4763603
85% imp5 0.0296229 0.0295257 0.0296607 0.0295922 0.0295929 0.0296004 0.5403553 0.4505525 0.4560078 0.5295846 0.4941251
85% imp3 0.0309386 0.0309092 0.0309865 0.0308879 0.0309279 0.0309305 0.5620366 0.4683172 0.4411806 0.5178347 0.4973423
90% imp5 0.0440889 0.0441617 0.0444101 0.0440200 0.0441973 0.0441702 0.5168555 0.4466359 0.4127850 0.5109806 0.4718142
90% imp2 0.0444710 0.0444534 0.0444560 0.0445331 0.0444808 0.0444784 0.5337809 0.4397818 0.4294307 0.4981116 0.4752763
90% imp1 0.0461595 0.0461769 0.0463285 0.0462737 0.0462597 0.0462346 0.5740663 0.4841237 0.4734928 0.5550427 0.5216814
90% imp3 0.0458608 0.0460663 0.0465595 0.0462600 0.0462953 0.0461867 0.5509862 0.4834243 0.4192195 0.5245489 0.4945447
90% imp4 0.0462646 0.0464083 0.0464868 0.0463752 0.0464234 0.0463837 0.5572807 0.4809064 0.4618828 0.5704294 0.5176248
dfm2_amelia %>% 
  kable(caption = 'MSE 缺失值') %>% 
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
  scroll_box(width = '100%', height = '400px')
MSE 缺失
.id Model AskOpen AskHigh AskLow AskClose Mean.HLC Mean.OHLC bias.open bias.high bias.low bias.close bias
90% imp5 48.06921 48.04484 48.06735 48.03865 48.05028 48.05501 0.4299754 0.2929975 0.3108108 0.3882064 0.3554975
90% imp3 48.07160 48.04393 48.06767 48.04916 48.05359 48.05809 0.3452088 0.3132678 0.2463145 0.3003686 0.3012899
90% imp1 48.08072 48.03486 48.07313 48.05499 48.05433 48.06093 0.4189189 0.3409091 0.3114251 0.3544226 0.3564189
70% imp3 48.05517 48.02155 48.08947 48.06446 48.05849 48.05766 0.3187961 0.3175676 0.2911548 0.3402948 0.3169533
65% imp3 48.06148 48.03451 48.08888 48.06000 48.06113 48.06122 0.3132678 0.2727273 0.2880835 0.2788698 0.2882371
80% imp3 48.06543 48.03745 48.09131 48.05879 48.06252 48.06325 0.4778870 0.3384521 0.3673219 0.2868550 0.3676290
70% imp4 48.05450 48.03275 48.09509 48.06249 48.06344 48.06121 0.3243243 0.2628993 0.2807125 0.2585995 0.2816339
65% imp5 48.06606 48.03924 48.08798 48.06334 48.06352 48.06416 0.3335381 0.2899263 0.2506143 0.2555283 0.2824017
70% imp1 48.06299 48.03345 48.09202 48.06911 48.06486 48.06439 0.3378378 0.3015971 0.2702703 0.2850123 0.2986794
80% imp2 48.09635 48.03502 48.09513 48.06611 48.06542 48.07315 0.4809582 0.4379607 0.3550369 0.3574939 0.4078624
90% imp4 48.10228 48.04703 48.07421 48.07541 48.06555 48.07473 0.4576167 0.3851351 0.2585995 0.3538084 0.3637899
30% imp2 48.06313 48.03819 48.09304 48.06734 48.06619 48.06543 0.1412776 0.1332924 0.1025799 0.1068796 0.1210074
10% imp4 48.06482 48.03913 48.09300 48.06658 48.06624 48.06588 0.0466830 0.0386978 0.0417690 0.0350123 0.0405405
1% imp2 48.06482 48.03929 48.09266 48.06683 48.06626 48.06590 0.0055283 0.0042998 0.0055283 0.0049140 0.0050676
10% imp2 48.06486 48.03910 48.09271 48.06702 48.06627 48.06592 0.0423833 0.0393120 0.0337838 0.0337838 0.0373157
10% imp5 48.06506 48.03883 48.09322 48.06684 48.06630 48.06599 0.0565111 0.0466830 0.0448403 0.0399263 0.0469902
1% imp5 48.06493 48.03940 48.09263 48.06691 48.06631 48.06597 0.0049140 0.0042998 0.0049140 0.0042998 0.0046069
1% imp3 48.06477 48.03933 48.09253 48.06716 48.06634 48.06595 0.0036855 0.0036855 0.0024570 0.0024570 0.0030713
1% imp1 48.06482 48.03931 48.09267 48.06713 48.06637 48.06598 0.0055283 0.0055283 0.0030713 0.0030713 0.0042998
1% imp4 48.06483 48.03946 48.09270 48.06697 48.06638 48.06599 0.0036855 0.0030713 0.0042998 0.0036855 0.0036855
10% imp1 48.06460 48.03961 48.09248 48.06715 48.06642 48.06596 0.0479115 0.0337838 0.0411548 0.0307125 0.0383907
10% imp3 48.06492 48.03909 48.09336 48.06684 48.06643 48.06605 0.0472973 0.0411548 0.0509828 0.0485258 0.0469902
30% imp3 48.06348 48.03940 48.09352 48.06649 48.06647 48.06572 0.1308354 0.1210074 0.1081081 0.1093366 0.1173219
20% imp4 48.06378 48.03896 48.09348 48.06752 48.06665 48.06593 0.0921376 0.0804668 0.0878378 0.0853808 0.0864558
65% imp2 48.06781 48.03969 48.09423 48.06629 48.06673 48.06700 0.3347666 0.2819410 0.2813268 0.2647420 0.2906941
30% imp5 48.06414 48.03950 48.09303 48.06767 48.06673 48.06608 0.1400491 0.1265356 0.1056511 0.1068796 0.1197789
20% imp2 48.06493 48.03925 48.09323 48.06796 48.06682 48.06635 0.0921376 0.0884521 0.0823096 0.0902948 0.0882985
20% imp3 48.06402 48.04021 48.09313 48.06711 48.06682 48.06612 0.0902948 0.0743243 0.0939803 0.0859951 0.0861486
30% imp1 48.06165 48.03872 48.09386 48.06788 48.06682 48.06553 0.1425061 0.1314496 0.1173219 0.1216216 0.1282248
20% imp5 48.06468 48.03944 48.09354 48.06753 48.06684 48.06630 0.1007371 0.0786241 0.0853808 0.0687961 0.0833845
20% imp1 48.06460 48.03932 48.09376 48.06750 48.06686 48.06629 0.1056511 0.0970516 0.0970516 0.0982801 0.0995086
30% imp4 48.06310 48.04006 48.09321 48.06804 48.06711 48.06610 0.1351351 0.1283784 0.1062654 0.1142506 0.1210074
65% imp1 48.06753 48.03945 48.09411 48.06812 48.06723 48.06730 0.3519656 0.3028256 0.2714988 0.2542998 0.2951474
65% imp4 48.06864 48.03979 48.09638 48.06667 48.06761 48.06787 0.3464373 0.2929975 0.3015971 0.2745700 0.3039005
50% imp5 48.06570 48.04092 48.09609 48.06943 48.06881 48.06803 0.2506143 0.2346437 0.2223587 0.2297297 0.2343366
70% imp2 48.06721 48.03401 48.10299 48.07360 48.07020 48.06945 0.3310811 0.3065111 0.2788698 0.3003686 0.3042076
50% imp3 48.06598 48.04386 48.09902 48.06832 48.07040 48.06930 0.2156020 0.1971744 0.2082310 0.2100737 0.2077703
50% imp2 48.06819 48.04509 48.09811 48.07183 48.07168 48.07080 0.2340295 0.2235872 0.2094595 0.2260442 0.2232801
50% imp4 48.06758 48.04450 48.09766 48.07323 48.07180 48.07074 0.2143735 0.2039312 0.1947174 0.2063882 0.2048526
90% imp2 48.10489 48.05921 48.08469 48.07178 48.07189 48.08014 0.4600737 0.3789926 0.2911548 0.3568796 0.3717752
80% imp4 48.07939 48.04066 48.10873 48.06645 48.07195 48.07381 0.4662162 0.3765356 0.4041769 0.4011057 0.4120086
50% imp1 48.06908 48.04628 48.10127 48.07136 48.07297 48.07200 0.2192875 0.1934889 0.2039312 0.2027027 0.2048526
70% imp5 48.06417 48.04374 48.09782 48.08140 48.07432 48.07178 0.2997543 0.2972973 0.2899263 0.3230958 0.3025184
80% imp5 48.08769 48.05208 48.11440 48.07758 48.08135 48.08294 0.3980344 0.3470516 0.3200246 0.3304668 0.3488943
85% imp5 48.05452 48.04422 48.10806 48.09895 48.08375 48.07644 0.4299754 0.4103194 0.3869779 0.4434889 0.4176904
85% imp3 48.06908 48.06095 48.10012 48.10192 48.08767 48.08302 0.3568796 0.3396806 0.3218673 0.3488943 0.3418305
85% imp4 48.07092 48.06230 48.11810 48.08630 48.08890 48.08441 0.4459459 0.3200246 0.3495086 0.2696560 0.3462838
80% imp1 48.09547 48.06079 48.12047 48.08773 48.08966 48.09112 0.3599509 0.3396806 0.3065111 0.3476658 0.3384521
85% imp2 48.07333 48.06250 48.11384 48.10463 48.09366 48.08858 0.3562654 0.3427518 0.3200246 0.3839066 0.3507371
85% imp1 48.06422 48.06042 48.11247 48.11366 48.09552 48.08769 0.3765356 0.3568796 0.3427518 0.3820639 0.3645577

4.3 tidyr::fill弥补数据偏差比较

dfm1_tidyr <- list(`1%` = data_m1_1_tidyr, `10%` = data_m1_10_tidyr, 
                   `20%` = data_m1_20_tidyr, `30%` = data_m1_30_tidyr, 
                   `50%` = data_m1_50_tidyr, `65%` = data_m1_65_tidyr, 
                   `70%` = data_m1_70_tidyr, `80%` = data_m1_80_tidyr, 
                   `85%` = data_m1_85_tidyr, `90%` = data_m1_90_tidyr)

dfm2_tidyr <- list(`1%` = data_tm1_1_tidyr, `10%` = data_tm1_10_tidyr, 
                   `20%` = data_tm1_20_tidyr, `30%` = data_tm1_30_tidyr, 
                   `50%` = data_tm1_50_tidyr, `65%` = data_tm1_65_tidyr, 
                   `70%` = data_tm1_70_tidyr, `80%` = data_tm1_80_tidyr, 
                   `85%` = data_tm1_85_tidyr, `90%` = data_tm1_90_tidyr)

## Summarise
dfm1_tidyr %<>% ldply(function(x) x %>% mutate(Model = factor('tidyr_fill'))) %>% 
  tbl_df %>% 
  mutate(bias = (bias.open + bias.high + bias.low + bias.close)/4) %>% 
  dplyr::select(.id, Model, AskOpen, AskHigh, AskLow, AskClose, 
                Mean.HLC, Mean.OHLC, bias.open, bias.high, bias.low, bias.close, bias) %>% 
  arrange(Mean.HLC, Mean.OHLC, bias)
dfm2_tidyr %<>% ldply(function(x) x %>% mutate(Model = factor('tidyr_fill'))) %>% 
  tbl_df %>% 
  mutate(bias = (bias.open + bias.high + bias.low + bias.close)/4) %>% 
  dplyr::select(.id, Model, AskOpen, AskHigh, AskLow, AskClose, 
                Mean.HLC, Mean.OHLC, bias.open, bias.high, bias.low, bias.close, bias) %>% 
  arrange(Mean.HLC, Mean.OHLC, bias)

dfm1_tidyr %>% 
  kable(caption = 'MSE 缺失值') %>% 
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
  scroll_box(width = '100%', height = '400px')
MSE 缺失
.id Model AskOpen AskHigh AskLow AskClose Mean.HLC Mean.OHLC bias.open bias.high bias.low bias.close bias
1% tidyr_fill 0.0000014 0.0000031 0.0000018 0.0000036 0.0000029 0.0000025 0.0037768 0.0053154 0.0043363 0.0058749 0.0048258
10% tidyr_fill 0.0000190 0.0000160 0.0000167 0.0000208 0.0000178 0.0000181 0.0530144 0.0507763 0.0471395 0.0464401 0.0493426
20% tidyr_fill 0.0000526 0.0000327 0.0000319 0.0000438 0.0000362 0.0000403 0.1105050 0.0969366 0.0981956 0.0916212 0.0993146
30% tidyr_fill 0.0000830 0.0000646 0.0000672 0.0000785 0.0000701 0.0000733 0.1658973 0.1559659 0.1465939 0.1577843 0.1565604
50% tidyr_fill 0.0001652 0.0001420 0.0001480 0.0001916 0.0001606 0.0001617 0.2764023 0.2480067 0.2397538 0.2771017 0.2603161
65% tidyr_fill 0.0002955 0.0003304 0.0002383 0.0002986 0.0002891 0.0002907 0.3817317 0.3404672 0.3154287 0.3776752 0.3538257
70% tidyr_fill 0.0003904 0.0003374 0.0003273 0.0004098 0.0003581 0.0003662 0.4253742 0.3855085 0.3597706 0.4227165 0.3983424
80% tidyr_fill 0.0007388 0.0005612 0.0005608 0.0006582 0.0005934 0.0006297 0.5302840 0.4497132 0.4574066 0.5218912 0.4898238
85% tidyr_fill 0.0009119 0.0007365 0.0009503 0.0008236 0.0008368 0.0008556 0.5592391 0.4825850 0.4667786 0.5409148 0.5123794
90% tidyr_fill 0.0015472 0.0012908 0.0012546 0.0015022 0.0013492 0.0013987 0.6398098 0.5505665 0.5153168 0.6048398 0.5776332
dfm2_tidyr %>% 
  kable(caption = 'MSE 缺失值') %>% 
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
  scroll_box(width = '100%', height = '400px')
MSE 缺失
.id Model AskOpen AskHigh AskLow AskClose Mean.HLC Mean.OHLC bias.open bias.high bias.low bias.close bias
90% tidyr_fill 48.09342 48.02838 48.08001 48.02318 48.04386 48.05625 0.5368550 0.5018428 0.5122850 0.6375921 0.5471437
85% tidyr_fill 48.06139 48.04274 48.09568 48.04604 48.06149 48.06146 0.5479115 0.4570025 0.5012285 0.5491400 0.5138206
80% tidyr_fill 48.07033 48.03224 48.09007 48.06271 48.06167 48.06384 0.4932432 0.4404177 0.3912776 0.4551597 0.4450246
70% tidyr_fill 48.05579 48.03106 48.07434 48.08234 48.06258 48.06088 0.4029484 0.4004914 0.3003686 0.4054054 0.3773034
50% tidyr_fill 48.06433 48.03798 48.09349 48.06476 48.06541 48.06514 0.2807125 0.2444717 0.2242015 0.2585995 0.2519963
1% tidyr_fill 48.06471 48.03943 48.09250 48.06706 48.06633 48.06592 0.0079853 0.0067568 0.0030713 0.0024570 0.0050676
65% tidyr_fill 48.06478 48.04620 48.08847 48.06434 48.06634 48.06595 0.3544226 0.3138821 0.2942260 0.3445946 0.3267813
20% tidyr_fill 48.06368 48.03972 48.09354 48.06721 48.06683 48.06604 0.1068796 0.0976658 0.0970516 0.0952088 0.0992015
10% tidyr_fill 48.06388 48.04010 48.09277 48.06879 48.06722 48.06639 0.0589681 0.0485258 0.0558968 0.0472973 0.0526720
30% tidyr_fill 48.06791 48.04236 48.09580 48.06738 48.06851 48.06836 0.1621622 0.1461916 0.1541769 0.1658477 0.1570946

4.4 综合数据偏差比较

## 1-min data.
dfm1 <- list(dfm1_impTS, dfm1_amelia, dfm1_tidyr) %>% bind_rows

dfm1 %>% ddply(.(.id), arrange, Mean.HLC, Mean.OHLC, bias) %>% tbl_df %>% 
  kable(caption = 'Bias Comparison') %>% 
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>% 
  group_rows('1%', 1, 12, label_row_css = 'background-color: #e68a00; color: #fff;') %>%
  group_rows('10%', 13, 24, label_row_css = 'background-color: #ff0000; color: #fff;') %>%
  group_rows('20%', 25, 36, label_row_css = 'background-color: #bf80ff; color: #fff;') %>%
  group_rows('30%', 37, 48, label_row_css = 'background-color: #66ff33; color: #fff;') %>%
  group_rows('50%', 49, 60, label_row_css = 'background-color: #6666ff; color: #fff;') %>%
  group_rows('65%', 61, 72, label_row_css = 'background-color: #66e0ff; color: #fff;') %>%
  group_rows('70%', 73, 84, label_row_css = 'background-color:#0066ff; color: #fff;') %>%
  group_rows('80%', 85, 96, label_row_css = 'background-color: #ff9900; color: #fff;') %>%
  group_rows('85%', 97, 108, label_row_css = 'background-color: #33ff33; color: #fff;') %>%
  group_rows('90%', 109, 120, label_row_css = 'background-color: #339966; color: #fff;') %>%
  scroll_box(width = '100%', height = '400px')
Bias Comparison
.id Model AskOpen AskHigh AskLow AskClose Mean.HLC Mean.OHLC bias.open bias.high bias.low bias.close bias
1%
1% imp3 0.0000000 0.0000000 0.0000000 0.0000002 0.0000001 0.0000001 0.0036369 0.0033571 0.0030774 0.0032172 0.0033221
1% imp5 0.0000000 0.0000000 0.0000000 0.0000002 0.0000001 0.0000001 0.0043363 0.0043363 0.0034970 0.0040565 0.0040565
1% imp2 0.0000000 0.0000001 0.0000000 0.0000002 0.0000001 0.0000001 0.0032172 0.0034970 0.0029375 0.0036369 0.0033221
1% imp4 0.0000000 0.0000001 0.0000000 0.0000002 0.0000001 0.0000001 0.0037768 0.0036369 0.0029375 0.0032172 0.0033921
1% imp1 0.0000000 0.0000001 0.0000001 0.0000003 0.0000001 0.0000001 0.0039166 0.0034970 0.0030774 0.0029375 0.0033571
1% kalman 0.0000006 0.0000008 0.0000009 0.0000013 0.0000010 0.0000009 0.0050357 0.0064345 0.0050357 0.0064345 0.0057351
1% interpolation 0.0000006 0.0000008 0.0000009 0.0000013 0.0000010 0.0000009 0.0048958 0.0062946 0.0048958 0.0062946 0.0055952
1% ma 0.0000007 0.0000014 0.0000012 0.0000013 0.0000013 0.0000012 0.0079731 0.0088124 0.0067142 0.0086725 0.0080431
1% locf 0.0000014 0.0000031 0.0000018 0.0000036 0.0000029 0.0000025 0.0037768 0.0053154 0.0043363 0.0058749 0.0048258
1% tidyr_fill 0.0000014 0.0000031 0.0000018 0.0000036 0.0000029 0.0000025 0.0037768 0.0053154 0.0043363 0.0058749 0.0048258
1% mean 0.0005507 0.0005905 0.0004618 0.0006969 0.0005831 0.0005750 0.0194433 0.0179046 0.0205623 0.0194433 0.0193384
1% random 0.0024076 0.0006038 0.0023222 0.0008296 0.0012519 0.0015408 0.0145475 0.0093719 0.0195832 0.0145475 0.0145125
10%
10% imp2 0.0000011 0.0000009 0.0000011 0.0000009 0.0000010 0.0000010 0.0432228 0.0395860 0.0372080 0.0365086 0.0391313
10% imp5 0.0000011 0.0000011 0.0000011 0.0000009 0.0000010 0.0000011 0.0416842 0.0401455 0.0348300 0.0359491 0.0381522
10% imp4 0.0000012 0.0000011 0.0000011 0.0000010 0.0000011 0.0000011 0.0402854 0.0415443 0.0345503 0.0379074 0.0385718
10% imp3 0.0000009 0.0000012 0.0000011 0.0000010 0.0000011 0.0000011 0.0440621 0.0432228 0.0370681 0.0381872 0.0406351
10% imp1 0.0000010 0.0000013 0.0000011 0.0000011 0.0000012 0.0000011 0.0430829 0.0442020 0.0348300 0.0386068 0.0401804
10% interpolation 0.0000090 0.0000071 0.0000075 0.0000095 0.0000080 0.0000083 0.0495174 0.0479787 0.0492377 0.0496573 0.0490978
10% kalman 0.0000090 0.0000071 0.0000075 0.0000096 0.0000081 0.0000083 0.0513359 0.0500769 0.0509162 0.0517555 0.0510211
10% ma 0.0000108 0.0000100 0.0000107 0.0000129 0.0000112 0.0000111 0.0672821 0.0636453 0.0614072 0.0665827 0.0647293
10% locf 0.0000190 0.0000160 0.0000167 0.0000208 0.0000178 0.0000181 0.0530144 0.0507763 0.0471395 0.0464401 0.0493426
10% tidyr_fill 0.0000190 0.0000160 0.0000167 0.0000208 0.0000178 0.0000181 0.0530144 0.0507763 0.0471395 0.0464401 0.0493426
10% mean 0.0057660 0.0063513 0.0060217 0.0066473 0.0063401 0.0061966 0.1719122 0.1493915 0.1814240 0.1751294 0.1694643
10% random 0.0077868 0.0226350 0.0087245 0.0291799 0.0201798 0.0170816 0.1092460 0.0870052 0.1375017 0.1214156 0.1137921
20%
20% imp5 0.0000042 0.0000038 0.0000037 0.0000034 0.0000036 0.0000038 0.0959575 0.0844873 0.0770737 0.0706393 0.0820394
20% imp2 0.0000050 0.0000039 0.0000036 0.0000037 0.0000037 0.0000040 0.0952581 0.0830885 0.0769338 0.0706393 0.0814799
20% imp4 0.0000044 0.0000039 0.0000040 0.0000035 0.0000038 0.0000039 0.0952581 0.0853266 0.0769338 0.0730172 0.0826339
20% imp3 0.0000046 0.0000035 0.0000045 0.0000038 0.0000039 0.0000041 0.0951182 0.0811302 0.0819695 0.0742761 0.0831235
20% imp1 0.0000051 0.0000039 0.0000044 0.0000039 0.0000041 0.0000043 0.0969366 0.0854665 0.0762344 0.0698000 0.0821094
20% interpolation 0.0000229 0.0000146 0.0000154 0.0000201 0.0000167 0.0000182 0.1054693 0.0988950 0.0949783 0.0948384 0.0985453
20% kalman 0.0000229 0.0000146 0.0000154 0.0000202 0.0000167 0.0000183 0.1088264 0.1022521 0.0976360 0.0986152 0.1018324
20% ma 0.0000275 0.0000192 0.0000198 0.0000245 0.0000212 0.0000228 0.1292488 0.1209959 0.1112044 0.1222549 0.1209260
20% locf 0.0000526 0.0000327 0.0000319 0.0000438 0.0000362 0.0000403 0.1105050 0.0969366 0.0981956 0.0916212 0.0993146
20% tidyr_fill 0.0000526 0.0000327 0.0000319 0.0000438 0.0000362 0.0000403 0.1105050 0.0969366 0.0981956 0.0916212 0.0993146
20% mean 0.0123887 0.0117783 0.0116303 0.0126437 0.0120174 0.0121103 0.3066163 0.2652119 0.2957057 0.3014408 0.2922437
20% random 0.0308892 0.0318163 0.0327874 0.0353737 0.0333258 0.0327167 0.3551546 0.3561337 0.3119317 0.3477409 0.3427402
30%
30% imp2 0.0000106 0.0000077 0.0000092 0.0000080 0.0000083 0.0000089 0.1419779 0.1198769 0.1221150 0.1113442 0.1238285
30% imp3 0.0000105 0.0000079 0.0000094 0.0000080 0.0000084 0.0000090 0.1437963 0.1226745 0.1253322 0.1134424 0.1263114
30% imp5 0.0000117 0.0000083 0.0000096 0.0000082 0.0000087 0.0000094 0.1407190 0.1202965 0.1254721 0.1179186 0.1261016
30% imp4 0.0000112 0.0000089 0.0000092 0.0000081 0.0000088 0.0000094 0.1421178 0.1278500 0.1281298 0.1249126 0.1307526
30% imp1 0.0000112 0.0000086 0.0000097 0.0000082 0.0000088 0.0000094 0.1486921 0.1268709 0.1289691 0.1187579 0.1308225
30% interpolation 0.0000374 0.0000247 0.0000282 0.0000342 0.0000290 0.0000311 0.1573647 0.1435166 0.1454749 0.1502308 0.1491467
30% kalman 0.0000373 0.0000247 0.0000282 0.0000344 0.0000291 0.0000312 0.1603021 0.1458945 0.1481326 0.1537278 0.1520143
30% ma 0.0000425 0.0000321 0.0000351 0.0000407 0.0000359 0.0000376 0.1805847 0.1612813 0.1653378 0.1772276 0.1711078
30% locf 0.0000830 0.0000646 0.0000672 0.0000785 0.0000701 0.0000733 0.1658973 0.1559659 0.1465939 0.1577843 0.1565604
30% tidyr_fill 0.0000830 0.0000646 0.0000672 0.0000785 0.0000701 0.0000733 0.1658973 0.1559659 0.1465939 0.1577843 0.1565604
30% mean 0.0190881 0.0174648 0.0182534 0.0190274 0.0182485 0.0184584 0.4018744 0.3536159 0.3729193 0.4076095 0.3840048
30% random 0.0891399 0.0503076 0.0991078 0.0534894 0.0676349 0.0730112 0.4879004 0.5048258 0.3189257 0.3983774 0.4275073
50%
50% interpolation 0.0000704 0.0000582 0.0000780 0.0000750 0.0000704 0.0000704 0.2460484 0.2282837 0.2087005 0.2559799 0.2347531
50% kalman 0.0000704 0.0000582 0.0000783 0.0000750 0.0000705 0.0000705 0.2530424 0.2322003 0.2149951 0.2632536 0.2408728
50% ma 0.0000804 0.0000680 0.0001001 0.0000872 0.0000851 0.0000839 0.2808784 0.2552805 0.2345783 0.2899706 0.2651769
50% locf 0.0001652 0.0001420 0.0001480 0.0001916 0.0001606 0.0001617 0.2764023 0.2480067 0.2397538 0.2771017 0.2603161
50% tidyr_fill 0.0001652 0.0001420 0.0001480 0.0001916 0.0001606 0.0001617 0.2764023 0.2480067 0.2397538 0.2771017 0.2603161
50% imp1 0.0002917 0.0002841 0.0002886 0.0002930 0.0002886 0.0002894 0.2471674 0.2261855 0.1993286 0.1994685 0.2180375
50% imp4 0.0004111 0.0004062 0.0004048 0.0004095 0.0004068 0.0004079 0.2572388 0.2260456 0.2201707 0.2075815 0.2277591
50% imp2 0.0004936 0.0004858 0.0004941 0.0004908 0.0004902 0.0004911 0.2330396 0.2022661 0.2007274 0.1881382 0.2060428
50% imp5 0.0005000 0.0004896 0.0005036 0.0004981 0.0004971 0.0004978 0.2285634 0.2045041 0.2000280 0.1926144 0.2064275
50% imp3 0.0005796 0.0005732 0.0005761 0.0005780 0.0005758 0.0005767 0.2238075 0.2071618 0.1945727 0.1948524 0.2050986
50% mean 0.0306066 0.0306853 0.0305917 0.0302918 0.0305230 0.0305439 0.4684571 0.4066303 0.4129249 0.4671982 0.4388026
50% random 0.0401562 0.0407120 0.0546502 0.0518731 0.0490784 0.0468479 0.8181564 0.7682193 0.7954959 0.7911596 0.7932578
65%
65% kalman 0.0001282 0.0001238 0.0001133 0.0001202 0.0001191 0.0001214 0.3674640 0.3241013 0.2845153 0.3415862 0.3294167
65% interpolation 0.0001279 0.0001238 0.0001136 0.0001206 0.0001193 0.0001215 0.3662051 0.3241013 0.2831165 0.3394880 0.3282277
65% ma 0.0001512 0.0001488 0.0001313 0.0001449 0.0001417 0.0001441 0.3813121 0.3329137 0.3022800 0.3621486 0.3446636
65% locf 0.0002955 0.0003304 0.0002383 0.0002986 0.0002891 0.0002907 0.3817317 0.3404672 0.3154287 0.3776752 0.3538257
65% tidyr_fill 0.0002955 0.0003304 0.0002383 0.0002986 0.0002891 0.0002907 0.3817317 0.3404672 0.3154287 0.3776752 0.3538257
65% imp1 0.0037038 0.0036950 0.0037028 0.0037144 0.0037040 0.0037040 0.3534760 0.3313750 0.3049378 0.3376696 0.3318646
65% imp2 0.0037461 0.0037553 0.0037236 0.0037351 0.0037380 0.0037400 0.3432648 0.3056372 0.2940271 0.3113722 0.3135753
65% imp4 0.0040114 0.0039923 0.0040178 0.0040065 0.0040055 0.0040070 0.3424255 0.3155686 0.2971045 0.3151490 0.3175619
65% imp3 0.0040437 0.0040587 0.0040400 0.0040502 0.0040496 0.0040481 0.3513778 0.3303959 0.2906700 0.3392083 0.3279130
65% imp5 0.0041922 0.0041956 0.0041963 0.0042075 0.0041998 0.0041979 0.3347321 0.3033991 0.2861939 0.3043782 0.3071758
65% mean 0.0400602 0.0406286 0.0391629 0.0395314 0.0397743 0.0398458 0.4402014 0.3983774 0.3761365 0.4364247 0.4127850
65% random 0.0533171 0.0806651 0.1361035 0.1684688 0.1284125 0.1096386 0.2796195 0.2432508 0.6848510 0.6998182 0.4768849
70%
70% kalman 0.0001618 0.0001265 0.0001262 0.0001725 0.0001417 0.0001468 0.3959994 0.3457826 0.3231221 0.3883061 0.3633026
70% interpolation 0.0001618 0.0001265 0.0001262 0.0001725 0.0001417 0.0001468 0.3927822 0.3453630 0.3225626 0.3881662 0.3622185
70% ma 0.0001928 0.0001537 0.0001548 0.0001982 0.0001689 0.0001749 0.4151630 0.3650860 0.3400476 0.4130648 0.3833403
70% locf 0.0003904 0.0003374 0.0003273 0.0004098 0.0003581 0.0003662 0.4253742 0.3855085 0.3597706 0.4227165 0.3983424
70% tidyr_fill 0.0003904 0.0003374 0.0003273 0.0004098 0.0003581 0.0003662 0.4253742 0.3855085 0.3597706 0.4227165 0.3983424
70% imp2 0.0058556 0.0058747 0.0058369 0.0058674 0.0058597 0.0058586 0.4127850 0.3397678 0.3446636 0.3655057 0.3656805
70% imp4 0.0062205 0.0061725 0.0061818 0.0061766 0.0061770 0.0061879 0.4052315 0.3361309 0.3291369 0.3536159 0.3560288
70% imp3 0.0063261 0.0062797 0.0063244 0.0063016 0.0063019 0.0063080 0.4242551 0.3555742 0.3613093 0.3979578 0.3847741
70% imp5 0.0063617 0.0063084 0.0063286 0.0063186 0.0063185 0.0063293 0.4506924 0.3659253 0.3615890 0.3909638 0.3922926
70% imp1 0.0065354 0.0065133 0.0065083 0.0064779 0.0064999 0.0065087 0.4273325 0.3474612 0.3517975 0.3743181 0.3752273
70% mean 0.0434017 0.0423406 0.0426192 0.0426670 0.0425423 0.0427571 0.4084487 0.3775353 0.3564135 0.4071898 0.3873968
70% random 0.1551202 0.0523911 0.1117988 0.2454679 0.1365526 0.1411945 0.8742481 0.8708910 0.7637432 0.8887956 0.8494195
80%
80% interpolation 0.0002653 0.0002080 0.0002403 0.0002507 0.0002330 0.0002411 0.4865016 0.4063505 0.4062107 0.4692964 0.4420898
80% kalman 0.0002647 0.0002080 0.0002403 0.0002511 0.0002331 0.0002410 0.4876206 0.4055113 0.4049517 0.4662191 0.4410757
80% ma 0.0003359 0.0002728 0.0002943 0.0003105 0.0002925 0.0003034 0.5045461 0.4288712 0.4227165 0.4869212 0.4607637
80% locf 0.0007388 0.0005612 0.0005608 0.0006582 0.0005934 0.0006297 0.5302840 0.4497132 0.4574066 0.5218912 0.4898238
80% tidyr_fill 0.0007388 0.0005612 0.0005608 0.0006582 0.0005934 0.0006297 0.5302840 0.4497132 0.4574066 0.5218912 0.4898238
80% imp5 0.0171595 0.0171508 0.0172147 0.0171742 0.0171799 0.0171748 0.4348860 0.3748776 0.3765562 0.4386628 0.4062456
80% imp4 0.0178892 0.0178482 0.0178864 0.0178254 0.0178533 0.0178623 0.4648203 0.4056511 0.3899846 0.4616030 0.4305148
80% imp1 0.0179774 0.0179669 0.0179299 0.0179521 0.0179496 0.0179566 0.4511120 0.3912435 0.3925024 0.4497132 0.4211428
80% imp2 0.0181715 0.0182881 0.0182233 0.0182494 0.0182536 0.0182331 0.4567072 0.3929221 0.3660652 0.4357253 0.4128549
80% imp3 0.0188257 0.0189006 0.0189500 0.0188341 0.0188949 0.0188776 0.4779689 0.4242551 0.4165618 0.5053854 0.4560428
80% mean 0.0486525 0.0484767 0.0482804 0.0487259 0.0484943 0.0485339 0.2989229 0.2805987 0.2605959 0.2943069 0.2836061
80% random 0.0967558 0.0565147 0.1428038 0.0580634 0.0857940 0.0885344 0.8461323 0.9290810 0.1155406 0.8341027 0.6812142
85%
85% kalman 0.0003567 0.0002414 0.0003399 0.0003848 0.0003220 0.0003307 0.5336411 0.4581060 0.4311092 0.5150371 0.4844734
85% interpolation 0.0003568 0.0002415 0.0003425 0.0003855 0.0003232 0.0003316 0.5390964 0.4617429 0.4374038 0.5209120 0.4897888
85% ma 0.0004343 0.0003234 0.0004471 0.0004450 0.0004052 0.0004124 0.5718282 0.4860820 0.4788082 0.5623164 0.5247587
85% locf 0.0009119 0.0007365 0.0009503 0.0008236 0.0008368 0.0008556 0.5592391 0.4825850 0.4667786 0.5409148 0.5123794
85% tidyr_fill 0.0009119 0.0007365 0.0009503 0.0008236 0.0008368 0.0008556 0.5592391 0.4825850 0.4667786 0.5409148 0.5123794
85% imp2 0.0285124 0.0284944 0.0284871 0.0284606 0.0284807 0.0284886 0.5101413 0.4222968 0.4008952 0.4583858 0.4479298
85% imp4 0.0295915 0.0292050 0.0294749 0.0293755 0.0293518 0.0294117 0.5662330 0.4571269 0.4434187 0.5016086 0.4920968
85% imp1 0.0296854 0.0294646 0.0295183 0.0295090 0.0294973 0.0295443 0.5305637 0.4369842 0.4357253 0.5021681 0.4763603
85% imp5 0.0296229 0.0295257 0.0296607 0.0295922 0.0295929 0.0296004 0.5403553 0.4505525 0.4560078 0.5295846 0.4941251
85% imp3 0.0309386 0.0309092 0.0309865 0.0308879 0.0309279 0.0309305 0.5620366 0.4683172 0.4411806 0.5178347 0.4973423
85% mean 0.0521145 0.0515481 0.0514613 0.0510225 0.0513439 0.0515366 0.2410127 0.2432508 0.2106588 0.2477270 0.2356623
85% random 0.3240766 0.0639116 0.0581029 0.0510604 0.0576916 0.1242879 0.9664289 0.1843614 0.9805567 0.9148133 0.7615401
90%
90% interpolation 0.0004602 0.0005112 0.0004253 0.0005540 0.0004968 0.0004877 0.5963072 0.5367184 0.4974122 0.6212058 0.5629109
90% kalman 0.0004602 0.0005172 0.0004253 0.0005541 0.0004989 0.0004892 0.5978459 0.5383970 0.4965729 0.6207861 0.5634005
90% ma 0.0006192 0.0006425 0.0005668 0.0007449 0.0006514 0.0006434 0.6475031 0.5676318 0.5312631 0.6360330 0.5956078
90% locf 0.0015472 0.0012908 0.0012546 0.0015022 0.0013492 0.0013987 0.6398098 0.5505665 0.5153168 0.6048398 0.5776332
90% tidyr_fill 0.0015472 0.0012908 0.0012546 0.0015022 0.0013492 0.0013987 0.6398098 0.5505665 0.5153168 0.6048398 0.5776332
90% imp5 0.0440889 0.0441617 0.0444101 0.0440200 0.0441973 0.0441702 0.5168555 0.4466359 0.4127850 0.5109806 0.4718142
90% imp2 0.0444710 0.0444534 0.0444560 0.0445331 0.0444808 0.0444784 0.5337809 0.4397818 0.4294307 0.4981116 0.4752763
90% imp1 0.0461595 0.0461769 0.0463285 0.0462737 0.0462597 0.0462346 0.5740663 0.4841237 0.4734928 0.5550427 0.5216814
90% imp3 0.0458608 0.0460663 0.0465595 0.0462600 0.0462953 0.0461867 0.5509862 0.4834243 0.4192195 0.5245489 0.4945447
90% imp4 0.0462646 0.0464083 0.0464868 0.0463752 0.0464234 0.0463837 0.5572807 0.4809064 0.4618828 0.5704294 0.5176248
90% mean 0.0554369 0.0545033 0.0548398 0.0556674 0.0550035 0.0551119 0.9363547 0.9440481 0.1286893 0.9380333 0.7367814
90% random 0.0716249 0.1366404 0.0568678 0.0790605 0.0908562 0.0860484 0.1349839 0.0870052 0.9282417 0.9373339 0.5218912
## tick-data to 1-min data.
dfm2 <- list(dfm2_impTS, dfm2_amelia, dfm2_tidyr) %>% bind_rows

dfm2 %>% ddply(.(.id), arrange, Mean.HLC, Mean.OHLC, bias) %>% tbl_df %>% 
  kable(caption = 'Bias Comparison') %>% 
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>% 
  group_rows('1%', 1, 12, label_row_css = 'background-color: #e68a00; color: #fff;') %>%
  group_rows('10%', 13, 24, label_row_css = 'background-color: #ff0000; color: #fff;') %>%
  group_rows('20%', 25, 36, label_row_css = 'background-color: #bf80ff; color: #fff;') %>%
  group_rows('30%', 37, 48, label_row_css = 'background-color: #66ff33; color: #fff;') %>%
  group_rows('50%', 49, 60, label_row_css = 'background-color: #6666ff; color: #fff;') %>%
  group_rows('65%', 61, 72, label_row_css = 'background-color: #66e0ff; color: #fff;') %>%
  group_rows('70%', 73, 84, label_row_css = 'background-color:#0066ff; color: #fff;') %>%
  group_rows('80%', 85, 96, label_row_css = 'background-color: #ff9900; color: #fff;') %>%
  group_rows('85%', 97, 108, label_row_css = 'background-color: #33ff33; color: #fff;') %>%
  group_rows('90%', 109, 120, label_row_css = 'background-color: #339966; color: #fff;') %>%
  scroll_box(width = '100%', height = '400px')
Bias Comparison
.id Model AskOpen AskHigh AskLow AskClose Mean.HLC Mean.OHLC bias.open bias.high bias.low bias.close bias
1%
1% imp2 48.06482 48.03929 48.09266 48.06683 48.06626 48.06590 0.0055283 0.0042998 0.0055283 0.0049140 0.0050676
1% imp5 48.06493 48.03940 48.09263 48.06691 48.06631 48.06597 0.0049140 0.0042998 0.0049140 0.0042998 0.0046069
1% locf 48.06471 48.03943 48.09250 48.06706 48.06633 48.06592 0.0079853 0.0067568 0.0030713 0.0024570 0.0050676
1% tidyr_fill 48.06471 48.03943 48.09250 48.06706 48.06633 48.06592 0.0079853 0.0067568 0.0030713 0.0024570 0.0050676
1% imp3 48.06477 48.03933 48.09253 48.06716 48.06634 48.06595 0.0036855 0.0036855 0.0024570 0.0024570 0.0030713
1% imp1 48.06482 48.03931 48.09267 48.06713 48.06637 48.06598 0.0055283 0.0055283 0.0030713 0.0030713 0.0042998
1% imp4 48.06483 48.03946 48.09270 48.06697 48.06638 48.06599 0.0036855 0.0030713 0.0042998 0.0036855 0.0036855
1% kalman 48.06484 48.03953 48.09285 48.06702 48.06646 48.06606 0.0030713 0.0024570 0.0030713 0.0024570 0.0027641
1% interpolation 48.06478 48.03953 48.09285 48.06702 48.06647 48.06605 0.0024570 0.0012285 0.0030713 0.0018428 0.0021499
1% ma 48.06507 48.03947 48.09286 48.06706 48.06647 48.06612 0.0055283 0.0055283 0.0042998 0.0055283 0.0052211
1% mean 48.06836 48.03966 48.09137 48.06870 48.06658 48.06702 0.0165848 0.0184275 0.0128993 0.0159705 0.0159705
1% random 48.06803 48.03820 48.10323 48.06020 48.06721 48.06742 0.0221130 0.0178133 0.0233415 0.0196560 0.0207310
10%
10% imp4 48.06482 48.03913 48.09300 48.06658 48.06624 48.06588 0.0466830 0.0386978 0.0417690 0.0350123 0.0405405
10% imp2 48.06486 48.03910 48.09271 48.06702 48.06627 48.06592 0.0423833 0.0393120 0.0337838 0.0337838 0.0373157
10% imp5 48.06506 48.03883 48.09322 48.06684 48.06630 48.06599 0.0565111 0.0466830 0.0448403 0.0399263 0.0469902
10% imp1 48.06460 48.03961 48.09248 48.06715 48.06642 48.06596 0.0479115 0.0337838 0.0411548 0.0307125 0.0383907
10% imp3 48.06492 48.03909 48.09336 48.06684 48.06643 48.06605 0.0472973 0.0411548 0.0509828 0.0485258 0.0469902
10% ma 48.06389 48.03796 48.09338 48.06803 48.06646 48.06581 0.0626536 0.0540541 0.0540541 0.0552826 0.0565111
10% kalman 48.06387 48.03828 48.09319 48.06821 48.06656 48.06589 0.0528256 0.0491400 0.0448403 0.0448403 0.0479115
10% interpolation 48.06382 48.03829 48.09319 48.06827 48.06658 48.06589 0.0546683 0.0485258 0.0454545 0.0423833 0.0477580
10% locf 48.06388 48.04010 48.09277 48.06879 48.06722 48.06639 0.0589681 0.0485258 0.0558968 0.0472973 0.0526720
10% tidyr_fill 48.06388 48.04010 48.09277 48.06879 48.06722 48.06639 0.0589681 0.0485258 0.0558968 0.0472973 0.0526720
10% mean 48.07145 48.03286 48.10848 48.06614 48.06916 48.06973 0.1726044 0.1603194 0.1572482 0.1664619 0.1641585
10% random 47.91482 48.20938 48.19080 48.04397 48.14805 48.08974 0.1848894 0.1025799 0.2291155 0.1633907 0.1699939
20%
20% random 48.45310 48.01260 48.05931 47.86129 47.97773 48.09657 0.3366093 0.3218673 0.3126536 0.3359951 0.3267813
20% imp4 48.06378 48.03896 48.09348 48.06752 48.06665 48.06593 0.0921376 0.0804668 0.0878378 0.0853808 0.0864558
20% ma 48.06454 48.03813 48.09355 48.06860 48.06676 48.06620 0.1087224 0.1068796 0.0939803 0.1081081 0.1044226
20% imp2 48.06493 48.03925 48.09323 48.06796 48.06682 48.06635 0.0921376 0.0884521 0.0823096 0.0902948 0.0882985
20% imp3 48.06402 48.04021 48.09313 48.06711 48.06682 48.06612 0.0902948 0.0743243 0.0939803 0.0859951 0.0861486
20% locf 48.06368 48.03972 48.09354 48.06721 48.06683 48.06604 0.1068796 0.0976658 0.0970516 0.0952088 0.0992015
20% tidyr_fill 48.06368 48.03972 48.09354 48.06721 48.06683 48.06604 0.1068796 0.0976658 0.0970516 0.0952088 0.0992015
20% imp5 48.06468 48.03944 48.09354 48.06753 48.06684 48.06630 0.1007371 0.0786241 0.0853808 0.0687961 0.0833845
20% imp1 48.06460 48.03932 48.09376 48.06750 48.06686 48.06629 0.1056511 0.0970516 0.0970516 0.0982801 0.0995086
20% kalman 48.06438 48.03886 48.09359 48.06910 48.06718 48.06648 0.0927518 0.0945946 0.0816953 0.0933661 0.0906020
20% interpolation 48.06431 48.03889 48.09359 48.06921 48.06723 48.06650 0.0872236 0.0896806 0.0755528 0.0853808 0.0844595
20% mean 48.07973 48.02832 48.08681 48.08731 48.06748 48.07054 0.2782555 0.2721130 0.2395577 0.2800983 0.2675061
30%
30% random 48.25972 48.06207 48.33286 47.77778 48.05757 48.10811 0.5165848 0.4864865 0.5558968 0.5423833 0.5253378
30% imp2 48.06313 48.03819 48.09304 48.06734 48.06619 48.06543 0.1412776 0.1332924 0.1025799 0.1068796 0.1210074
30% imp3 48.06348 48.03940 48.09352 48.06649 48.06647 48.06572 0.1308354 0.1210074 0.1081081 0.1093366 0.1173219
30% imp5 48.06414 48.03950 48.09303 48.06767 48.06673 48.06608 0.1400491 0.1265356 0.1056511 0.1068796 0.1197789
30% imp1 48.06165 48.03872 48.09386 48.06788 48.06682 48.06553 0.1425061 0.1314496 0.1173219 0.1216216 0.1282248
30% interpolation 48.06474 48.03922 48.09377 48.06792 48.06697 48.06641 0.1461916 0.1351351 0.1332924 0.1461916 0.1402027
30% kalman 48.06447 48.03942 48.09377 48.06791 48.06703 48.06639 0.1529484 0.1418919 0.1363636 0.1541769 0.1463452
30% imp4 48.06310 48.04006 48.09321 48.06804 48.06711 48.06610 0.1351351 0.1283784 0.1062654 0.1142506 0.1210074
30% ma 48.06320 48.04018 48.09408 48.06792 48.06739 48.06635 0.1566339 0.1345209 0.1547912 0.1689189 0.1537162
30% locf 48.06791 48.04236 48.09580 48.06738 48.06851 48.06836 0.1621622 0.1461916 0.1541769 0.1658477 0.1570946
30% tidyr_fill 48.06791 48.04236 48.09580 48.06738 48.06851 48.06836 0.1621622 0.1461916 0.1541769 0.1658477 0.1570946
30% mean 48.07771 48.04385 48.10906 48.09428 48.08240 48.08122 0.3642506 0.3507371 0.3224816 0.3869779 0.3561118
50%
50% random 48.61875 48.18569 47.83183 47.39597 47.80450 48.00806 0.6044226 0.5761671 0.5909091 0.6173219 0.5972052
50% locf 48.06433 48.03798 48.09349 48.06476 48.06541 48.06514 0.2807125 0.2444717 0.2242015 0.2585995 0.2519963
50% tidyr_fill 48.06433 48.03798 48.09349 48.06476 48.06541 48.06514 0.2807125 0.2444717 0.2242015 0.2585995 0.2519963
50% kalman 48.06676 48.03676 48.09347 48.06627 48.06550 48.06581 0.2340295 0.2168305 0.1848894 0.2315725 0.2168305
50% interpolation 48.06646 48.03692 48.09347 48.06623 48.06554 48.06577 0.2223587 0.2094595 0.1799754 0.2260442 0.2094595
50% ma 48.06724 48.03628 48.09353 48.06685 48.06555 48.06597 0.2469287 0.2266585 0.1996314 0.2432432 0.2291155
50% imp5 48.06570 48.04092 48.09609 48.06943 48.06881 48.06803 0.2506143 0.2346437 0.2223587 0.2297297 0.2343366
50% imp3 48.06598 48.04386 48.09902 48.06832 48.07040 48.06930 0.2156020 0.1971744 0.2082310 0.2100737 0.2077703
50% imp2 48.06819 48.04509 48.09811 48.07183 48.07168 48.07080 0.2340295 0.2235872 0.2094595 0.2260442 0.2232801
50% imp4 48.06758 48.04450 48.09766 48.07323 48.07180 48.07074 0.2143735 0.2039312 0.1947174 0.2063882 0.2048526
50% imp1 48.06908 48.04628 48.10127 48.07136 48.07297 48.07200 0.2192875 0.1934889 0.2039312 0.2027027 0.2048526
50% mean 48.10011 48.05789 48.10226 48.06800 48.07605 48.08206 0.4484029 0.3900491 0.3832924 0.4318182 0.4133907
65%
65% imp3 48.06148 48.03451 48.08888 48.06000 48.06113 48.06122 0.3132678 0.2727273 0.2880835 0.2788698 0.2882371
65% imp5 48.06606 48.03924 48.08798 48.06334 48.06352 48.06416 0.3335381 0.2899263 0.2506143 0.2555283 0.2824017
65% locf 48.06478 48.04620 48.08847 48.06434 48.06634 48.06595 0.3544226 0.3138821 0.2942260 0.3445946 0.3267813
65% tidyr_fill 48.06478 48.04620 48.08847 48.06434 48.06634 48.06595 0.3544226 0.3138821 0.2942260 0.3445946 0.3267813
65% imp2 48.06781 48.03969 48.09423 48.06629 48.06673 48.06700 0.3347666 0.2819410 0.2813268 0.2647420 0.2906941
65% imp1 48.06753 48.03945 48.09411 48.06812 48.06723 48.06730 0.3519656 0.3028256 0.2714988 0.2542998 0.2951474
65% imp4 48.06864 48.03979 48.09638 48.06667 48.06761 48.06787 0.3464373 0.2929975 0.3015971 0.2745700 0.3039005
65% ma 48.07108 48.04394 48.09041 48.07027 48.06821 48.06892 0.3353808 0.2764128 0.2647420 0.3101966 0.2966830
65% kalman 48.07038 48.04468 48.08991 48.07061 48.06840 48.06890 0.3114251 0.2684275 0.2450860 0.2911548 0.2790233
65% interpolation 48.07036 48.04468 48.08991 48.07116 48.06858 48.06903 0.3095823 0.2659705 0.2457002 0.2948403 0.2790233
65% mean 48.06675 48.02981 48.12859 48.11165 48.09002 48.08420 0.4367322 0.3814496 0.3783784 0.4176904 0.4035627
65% random 47.50568 48.49940 49.01626 47.85631 48.45733 48.21942 0.8832924 0.6977887 0.9195332 0.8298526 0.8326167
70%
70% mean 48.12663 48.00807 48.08176 48.06426 48.05137 48.07018 0.8335381 0.7665848 0.3243243 0.3863636 0.5777027
70% imp3 48.05517 48.02155 48.08947 48.06446 48.05849 48.05766 0.3187961 0.3175676 0.2911548 0.3402948 0.3169533
70% locf 48.05579 48.03106 48.07434 48.08234 48.06258 48.06088 0.4029484 0.4004914 0.3003686 0.4054054 0.3773034
70% tidyr_fill 48.05579 48.03106 48.07434 48.08234 48.06258 48.06088 0.4029484 0.4004914 0.3003686 0.4054054 0.3773034
70% imp4 48.05450 48.03275 48.09509 48.06249 48.06344 48.06121 0.3243243 0.2628993 0.2807125 0.2585995 0.2816339
70% ma 48.05462 48.02955 48.08596 48.07753 48.06435 48.06191 0.3617936 0.3445946 0.2856265 0.3703931 0.3406020
70% kalman 48.05440 48.02955 48.08650 48.07724 48.06443 48.06192 0.3415233 0.3212531 0.2653563 0.3445946 0.3181818
70% interpolation 48.05448 48.02946 48.08666 48.07724 48.06445 48.06196 0.3421376 0.3243243 0.2616708 0.3452088 0.3183354
70% imp1 48.06299 48.03345 48.09202 48.06911 48.06486 48.06439 0.3378378 0.3015971 0.2702703 0.2850123 0.2986794
70% imp2 48.06721 48.03401 48.10299 48.07360 48.07020 48.06945 0.3310811 0.3065111 0.2788698 0.3003686 0.3042076
70% imp5 48.06417 48.04374 48.09782 48.08140 48.07432 48.07178 0.2997543 0.2972973 0.2899263 0.3230958 0.3025184
70% random 47.68556 47.12402 47.95719 49.26794 48.11638 48.00868 0.9324324 0.9262899 0.9146192 0.9275184 0.9252150
80%
80% locf 48.07033 48.03224 48.09007 48.06271 48.06167 48.06384 0.4932432 0.4404177 0.3912776 0.4551597 0.4450246
80% tidyr_fill 48.07033 48.03224 48.09007 48.06271 48.06167 48.06384 0.4932432 0.4404177 0.3912776 0.4551597 0.4450246
80% imp3 48.06543 48.03745 48.09131 48.05879 48.06252 48.06325 0.4778870 0.3384521 0.3673219 0.2868550 0.3676290
80% imp2 48.09635 48.03502 48.09513 48.06611 48.06542 48.07315 0.4809582 0.4379607 0.3550369 0.3574939 0.4078624
80% ma 48.06795 48.04561 48.09361 48.06843 48.06922 48.06890 0.4398034 0.3808354 0.3863636 0.4391892 0.4115479
80% interpolation 48.06756 48.04682 48.09353 48.06823 48.06952 48.06903 0.4250614 0.3525799 0.3789926 0.4244472 0.3952703
80% kalman 48.06761 48.04682 48.09357 48.06821 48.06953 48.06905 0.4256757 0.3519656 0.3820639 0.4256757 0.3963452
80% imp4 48.07939 48.04066 48.10873 48.06645 48.07195 48.07381 0.4662162 0.3765356 0.4041769 0.4011057 0.4120086
80% imp5 48.08769 48.05208 48.11440 48.07758 48.08135 48.08294 0.3980344 0.3470516 0.3200246 0.3304668 0.3488943
80% imp1 48.09547 48.06079 48.12047 48.08773 48.08966 48.09112 0.3599509 0.3396806 0.3065111 0.3476658 0.3384521
80% mean 48.05587 48.05459 48.17520 48.12306 48.11762 48.10218 0.8802211 0.2708845 0.8157248 0.2954545 0.5655713
80% random 49.44640 48.28223 49.41269 47.95915 48.55136 48.77512 0.9686732 0.9705160 0.9017199 0.9133907 0.9385749
85%
85% locf 48.06139 48.04274 48.09568 48.04604 48.06149 48.06146 0.5479115 0.4570025 0.5012285 0.5491400 0.5138206
85% tidyr_fill 48.06139 48.04274 48.09568 48.04604 48.06149 48.06146 0.5479115 0.4570025 0.5012285 0.5491400 0.5138206
85% kalman 48.05593 48.04052 48.08479 48.06086 48.06206 48.06052 0.4404177 0.3593366 0.3875921 0.4484029 0.4089373
85% interpolation 48.05584 48.04052 48.08479 48.06113 48.06214 48.06057 0.4410319 0.3660934 0.3900491 0.4606880 0.4144656
85% ma 48.05618 48.04110 48.08498 48.06091 48.06233 48.06079 0.5116708 0.4183047 0.4551597 0.5098280 0.4737408
85% imp5 48.05452 48.04422 48.10806 48.09895 48.08375 48.07644 0.4299754 0.4103194 0.3869779 0.4434889 0.4176904
85% imp3 48.06908 48.06095 48.10012 48.10192 48.08767 48.08302 0.3568796 0.3396806 0.3218673 0.3488943 0.3418305
85% imp4 48.07092 48.06230 48.11810 48.08630 48.08890 48.08441 0.4459459 0.3200246 0.3495086 0.2696560 0.3462838
85% imp2 48.07333 48.06250 48.11384 48.10463 48.09366 48.08858 0.3562654 0.3427518 0.3200246 0.3839066 0.3507371
85% imp1 48.06422 48.06042 48.11247 48.11366 48.09552 48.08769 0.3765356 0.3568796 0.3427518 0.3820639 0.3645577
85% mean 48.11939 48.09357 48.10504 48.10364 48.10075 48.10541 0.2260442 0.1984029 0.2125307 0.2278870 0.2162162
85% random 49.15168 47.72769 49.75232 47.47577 48.31860 48.52687 0.9895577 0.9889435 0.9631450 0.9649877 0.9766585
90%
90% random 48.11770 47.55666 47.88066 47.62295 47.68676 47.79449 0.9410319 0.9367322 0.9613022 0.9637592 0.9507064
90% locf 48.09342 48.02838 48.08001 48.02318 48.04386 48.05625 0.5368550 0.5018428 0.5122850 0.6375921 0.5471437
90% tidyr_fill 48.09342 48.02838 48.08001 48.02318 48.04386 48.05625 0.5368550 0.5018428 0.5122850 0.6375921 0.5471437
90% imp5 48.06921 48.04484 48.06735 48.03865 48.05028 48.05501 0.4299754 0.2929975 0.3108108 0.3882064 0.3554975
90% imp3 48.07160 48.04393 48.06767 48.04916 48.05359 48.05809 0.3452088 0.3132678 0.2463145 0.3003686 0.3012899
90% imp1 48.08072 48.03486 48.07313 48.05499 48.05433 48.06093 0.4189189 0.3409091 0.3114251 0.3544226 0.3564189
90% interpolation 48.09238 48.02948 48.08101 48.06488 48.05846 48.06694 0.5270270 0.5165848 0.4600737 0.6339066 0.5343980
90% kalman 48.09188 48.03086 48.08101 48.06581 48.05923 48.06739 0.5221130 0.5079853 0.4459459 0.6216216 0.5244165
90% ma 48.09193 48.03144 48.08093 48.06593 48.05943 48.06756 0.5737101 0.5110565 0.4944717 0.6228501 0.5505221
90% imp4 48.10228 48.04703 48.07421 48.07541 48.06555 48.07473 0.4576167 0.3851351 0.2585995 0.3538084 0.3637899
90% imp2 48.10489 48.05921 48.08469 48.07178 48.07189 48.08014 0.4600737 0.3789926 0.2911548 0.3568796 0.3717752
90% mean 48.03712 48.06868 48.06397 48.17040 48.10102 48.08504 0.1572482 0.1547912 0.1480344 0.1658477 0.1564803

4.5 GARCH模型预测数据偏差比较

以下乃日间数据,虽然与此文献的日内数据有所分别,但是也可作为参考。

fls <- list.files('data/fx/USDJPY', pattern = 'pred2.+.rds|^DCC.GARCH.USDJPY.HLC.[0-9]{4}-[0-9]{2}-[0-9]{2}.rds|^aDCC.GARCH.USDJPY.HLC.[0-9]{4}-[0-9]{2}-[0-9]{2}.rds|DCC.GARCH.USDJPY.OHLC.[0-9]{4}-[0-9]{2}-[0-9]{2}.rds')

## extract date
validate <- fls %>% 
    str_extract_all('[0-9]{4}-[0-9]{2}-[0-9]{2}') %>% 
    unlist %>% 
    plyr::count() %>% 
    tbl_df
validate %>% dplyr::filter(freq != 5) %>% 
  kable(caption = 'Count missing observation') %>% 
  kable_styling(
    bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>% 
  scroll_box(height = '400px')
Count missing observation
x freq
2012-12-31 1
2013-02-05 3
2013-02-21 4
2013-02-25 3
2013-03-18 4
2013-03-20 3
2013-04-10 4
2013-04-14 3
2013-04-16 4
2013-04-18 4
2013-04-24 3
2014-10-23 4
2014-10-28 3
2014-12-15 3
2014-12-25 4
2014-12-29 3
2015-02-13 4
2015-02-17 3
2017-04-18 4
2017-04-20 1
2017-05-15 4
2017-05-17 1
2017-05-18 1
2017-08-27 4
2017-08-29 3
## Univariate
td <- validate %>% dplyr::filter(freq == 5) %>% .$x %>% ymd
flv <- llply(td, grep, fls, value = TRUE) %>% unlist %>% unique

## get only MSE and AIC/BIC but ommit VaR.
MSE.com <- ldply(flv, function(x) {
    dfm <- readRDS(paste0('data/fx/USDJPY/', x))#[[1]]
    
    if (!is.data.frame(dfm)) {
      dfm %<>% .$res
    }
    
    names(dfm) %<>% str_replace_all('USDJPY', 'Price')
    dfm %<>% separate(Type, c('Cat', 'Type', 'Model'))
    
    if (ncol(dfm) == 10) {
      dfm %<>% dplyr::filter(Type == 'Op' | Type == 'Hi' | 
                             Type == 'Lo' | Type == 'Cl') %>% 
        mutate(Type2 = Type) %>% 
        spread(Type, Price) %>% 
        dplyr::rename(Price.Open = Op, Price.High = Hi, 
                      Price.Low = Lo, Price.Close = Cl) %>% 
        spread(Type2, `Price.T+1`) %>% 
        dplyr::rename(Price.Open.T1 = Op, Price.High.T1 = Hi, 
                      Price.Low.T1 = Lo, Price.Close.T1 = Cl) %>% 
        dplyr::select(Date, Price.Open, Price.High, Price.Low, Price.Close, 
                      Price.Open.T1, Price.High.T1, Price.Low.T1, Price.Close.T1, 
                      Akaike, Bayes, Shibata, Hannan.Quinn)
      dfm %<>% mutate(Model = 'gjrGARCH', Cat = 'OHLC')
      
    } else if (ncol(dfm) == 14) {
      dfm %<>% dplyr::select(Date, Model, Price.High, Price.Low, Price.Close, 
                             Price.High.T1, Price.Low.T1, Price.Close.T1, 
                             Akaike, Bayes, Shibata, Hannan.Quinn) %>% 
        unique
      dfm %<>% mutate(Cat = 'HLC')
      
    } else if (ncol(dfm) == 16) {
      
      dfm %<>% dplyr::select(Date, Model, 
                               Price.Open, Price.High, Price.Low, Price.Close, 
                               Price.Open.T1, Price.High.T1, Price.Low.T1, 
                               Price.Close.T1, Akaike, Bayes, Shibata, Hannan.Quinn) %>% 
          mutate(Akaike = mean(Akaike), Bayes = mean(Bayes), 
                 Shibata = mean(Shibata), Hannan.Quinn = mean(Hannan.Quinn)) %>% 
          unique
        dfm %<>% mutate(Cat = 'OHLC')
      
    } else {
      dfm %<>% dfm
    }
    
    return(dfm)
  }) %>% tbl_df
## Filter bias.
bias <- MSE.com %>% 
  mutate(Model = factor(Model)) %>% 
  dplyr::select(Date, Model, Cat, Price.Open.T1, Price.High.T1, Price.Low.T1, Price.Close.T1)
bias1 <- bias %>% dplyr::filter(Model != 'gjrGARCH')
bias2 <- bias %>% dplyr::filter(Model == 'gjrGARCH')
bias2A <- bias2[c(1:3)] %>% unique
bias2 <- bias2[-c(1:3)] %>% 
  rowSums(na.rm=TRUE) %>% 
  matrix(nc = 4, byrow=TRUE) %>% 
  as_data_frame %>% 
  dplyr::rename(Price.Open.T1 = V1, Price.High.T1 = V2, 
                Price.Low.T1 = V3, Price.Close.T1 = V4)
bias2 <- cbind(bias2A, bias2) %>% tbl_df
bias <- rbind(bias1, bias2) %>% tbl_df %>% arrange(Date)
rm(bias1, bias2A, bias2)

bias %<>% 
  mutate(
  bias.open = if_else(Price.Open.T1>Price.High.T1|Price.Open.T1<Price.Low.T1, 1, 0), 
  bias.high = if_else(Price.High.T1<Price.Open.T1|Price.High.T1<Price.Low.T1|Price.High.T1<Price.Close.T1, 1, 0), 
  bias.low = if_else(Price.Low.T1>Price.Open.T1|Price.Low.T1>Price.High.T1|Price.Low.T1>Price.Close.T1, 1, 0), 
  bias.close = if_else(Price.Close.T1>Price.High.T1|Price.Close.T1<Price.Low.T1, 1, 0)) %>% 
  dplyr::select(Date, Model, Cat, Price.Open.T1, Price.High.T1, Price.Low.T1, Price.Close.T1, bias.open, bias.high, bias.low, bias.close) #%>% 
#dplyr::filter(bias.open==1|bias.high==1|bias.low==1|bias.close==1)
bias
## # A tibble: 5,955 x 11
##    Date       Model Cat   Price.Open.T1 Price.High.T1 Price.Low.T1
##    <date>     <fct> <chr>         <dbl>         <dbl>        <dbl>
##  1 2013-01-01 aDCC  HLC            NA            86.8         86.8
##  2 2013-01-01 aDCC  OHLC           86.5          86.8         86.8
##  3 2013-01-01 DCC   HLC            NA            86.8         86.8
##  4 2013-01-01 DCC   OHLC           86.5          86.8         86.8
##  5 2013-01-01 gjrG~ OHLC           86.8          86.8         86.5
##  6 2013-01-02 aDCC  HLC            NA            87.3         86.5
##  7 2013-01-02 aDCC  OHLC           86.7          87.3         86.5
##  8 2013-01-02 DCC   HLC            NA            87.3         86.5
##  9 2013-01-02 DCC   OHLC           86.7          87.3         86.5
## 10 2013-01-02 gjrG~ OHLC           86.5          87.3         86.7
## # ... with 5,945 more rows, and 5 more variables: Price.Close.T1 <dbl>,
## #   bias.open <dbl>, bias.high <dbl>, bias.low <dbl>, bias.close <dbl>
bias %>% ddply(.(Model, Cat), summarise, 
               bias.open = sum(bias.open, na.rm=TRUE)/length(bias.open), 
               bias.high = sum(bias.high, na.rm=TRUE)/length(bias.high), 
               bias.low = sum(bias.low, na.rm=TRUE)/length(bias.low), 
               bias.close = sum(bias.close, na.rm=TRUE)/length(bias.close), 
               bias = (bias.open + bias.high + bias.low + bias.close)/4, 
               n = length(Cat)) %>% 
  kable(caption = 'Bias Dataset') %>% 
  kable_styling(
    bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>% 
  scroll_box(width = '100%')#, height = '400px')
Bias Dataset
Model Cat bias.open bias.high bias.low bias.close bias n
aDCC HLC 0.0000000 0.1192275 0.1032746 0.2157851 0.1095718 1191
aDCC OHLC 0.1897565 0.1326616 0.1141898 0.2149454 0.1628883 1191
DCC HLC 0.0000000 0.1183879 0.1032746 0.2149454 0.1091520 1191
DCC OHLC 0.1897565 0.1326616 0.1150294 0.2174643 0.1637280 1191
gjrGARCH OHLC 0.9756507 0.9076406 0.9378673 0.9269521 0.9370277 1191

5 结论

弥补来的数据得以以下标准:

  • 最低的MSE.HLC(倘若是交易的话,一些模型不包括开市价)
  • 最低的MSE.OHLC(一些交易模型会拿开市价与上一个闭市价进行比较)
  • 最低的bias(误差与偏差,例如开闭市价都必须在最高低价之间,否则就是弥补偏差)

从以上数据证明,弥补来的数据确实有误,如之前单变量的误差(开市或闭市价高于最高价、低于最低价)。僕们可以通过auto.arimaETSGARCH或者其它方式回测数据和弥补缺失值,不过那就比较费时了。根据以上的imputeTS::na.seadec()弥补来的数据binary.com 面试试题 I - 单变量数据缺失值管理algorithm ='kalman'algorithm ='interpolation'俩的误差率最低。

综合数据偏差比较中证明当1-min数据缺失值\(\leq\)30%的时候,amelia多变量最为精准,然后当50%\(\leq\)缺失值\(\leq\)65%时,MSE和偏差(bias)俩都不及单变量sea.dec。与此同时,tick to 1-min数据证明sea.dec乃最佳弥补数据的模型。

此文献的结论乃无法断定单变量或多变量比较适合弥补数据缺失值,不过有一点可以断定的是无论缺失值多寡,sea.dec(algorithm = 'interpolation')sea.dec(algorithm = 'kalman')都将会比较有效弥补数据。最有效的方法可能是使用多变量DCC模式(预测)3弥补缺失值,不过会非常耗时,而且该GARCH模型乃预测价格模型。

6 附录

6.1 文件与系统资讯

以下乃此文献资讯:

  • 文件建立日期:2018-10-10
  • 文件最新更新日期:2018-10-25
  • R version 3.5.1 (2018-07-02)
  • R语言版本:3.5.1
  • rmarkdown 程序包版本:1.10
  • 文件版本:1.0.1
  • 作者简历:®γσ, Eng Lian Hu
  • GitHub:源代码
  • 其它系统资讯:
Additional session information:
Category session_info Category Sys.info
version R version 3.5.1 (2018-07-02) sysname Windows
os Windows 10 x64 release 10 x64
system x86_64, mingw32 version build 17134
ui RTerm nodename RSTUDIO-SCIBROK
language en machine x86-64
collate Japanese_Japan.932 login scibr
ctype Japanese_Japan.932 user scibr
tz Asia/Tokyo effective_user scibr
date 2018-10-25 Current time 2018-10-25 23:59:22 JST

  1. 欲知更多详情,请查阅binary.com Interview Question I - Interday High Frequency Trading Models Comparison

  2. 欲知更多详情,请参阅一、什么是Tick Data

  3. binary.com Interview Question I - Multivariate GARCH Models中的多变量模型将会计算不同价格中的关系系数。