rm(list=ls())
etf4<-read.table("ETF4_2000_2018_d.txt", fileEncoding = "UTF-8-BOM")
head(etf4)
##         V1         V2       V3               V4           V5
## 1 證券代碼       簡稱     日期 未調整收盤價(元) 當日均價(元)
## 2     0050 元大台灣50 20090105            34.20        34.30
## 3     0056 元大高股息 20090105            13.92        14.00
## 4     0050 元大台灣50 20090106            34.18        34.21
## 5     0056 元大高股息 20090106            14.04        14.02
## 6     0050 元大台灣50 20090107            34.63        34.59
etf4<-read.table("ETF4_2000_2018_d_ansi.txt", header = T)
str(etf4)
## 'data.frame':    7493 obs. of  5 variables:
##  $ 證券代碼        : int  50 56 50 56 50 56 50 56 50 56 ...
##  $ 簡稱            : Factor w/ 4 levels "元大S&P500","元大台灣50",..: 2 3 2 3 2 3 2 3 2 3 ...
##  $ 日期            : int  20090105 20090105 20090106 20090106 20090107 20090107 20090108 20090108 20090109 20090109 ...
##  $ 未調整收盤價.元.: num  34.2 13.9 34.2 14 34.6 ...
##  $ 當日均價.元.    : num  34.3 14 34.2 14 34.6 ...
etf4<-read.table("ETF4_2000_2018_d_ansi.txt", header = T, stringsAsFactors = T, 
                 colClasses = c("證券代碼"="character"))
str(etf4)
## 'data.frame':    7493 obs. of  5 variables:
##  $ 證券代碼        : chr  "0050" "0056" "0050" "0056" ...
##  $ 簡稱            : Factor w/ 4 levels "元大S&P500","元大台灣50",..: 2 3 2 3 2 3 2 3 2 3 ...
##  $ 日期            : int  20090105 20090105 20090106 20090106 20090107 20090107 20090108 20090108 20090109 20090109 ...
##  $ 未調整收盤價.元.: num  34.2 13.9 34.2 14 34.6 ...
##  $ 當日均價.元.    : num  34.3 14 34.2 14 34.6 ...
head(etf4)
##   證券代碼       簡稱     日期 未調整收盤價.元. 當日均價.元.
## 1     0050 元大台灣50 20090105            34.20        34.30
## 2     0056 元大高股息 20090105            13.92        14.00
## 3     0050 元大台灣50 20090106            34.18        34.21
## 4     0056 元大高股息 20090106            14.04        14.02
## 5     0050 元大台灣50 20090107            34.63        34.59
## 6     0056 元大高股息 20090107            14.28        14.28

讀取文字檔用read.table

rm=remove

ls()=是設定為空的

rm(list=ls())=可以把所有的資料清除

txt.=文字檔 (read.table.)

header=T 是要有表投的意思

stringsAsFactors 字串變成文字分類

str=structure

etf4<-read.csv("ETF4_2000_2018_d.csv", colClasses = c("證券代碼"="character"))
str(etf4)
## 'data.frame':    7493 obs. of  5 variables:
##  $ 證券代碼        : chr  "0050   " "0056   " "0050   " "0056   " ...
##  $ 簡稱            : Factor w/ 4 levels "元大S&P500   ",..: 2 3 2 3 2 3 2 3 2 3 ...
##  $ 日期            : int  20090105 20090105 20090106 20090106 20090107 20090107 20090108 20090108 20090109 20090109 ...
##  $ 未調整收盤價.元.: num  34.2 13.9 34.2 14 34.6 ...
##  $ 當日均價.元.    : num  34.3 14 34.2 14 34.6 ...
etf4.csv<-read.csv("ETF4_2000_2018_d.csv", fileEncoding='big5', 
                   colClasses=c('factor', 'factor', 'factor', 'numeric', 'numeric'))
head(etf4.csv)
##   證券代碼          簡稱     日期 未調整收盤價.元. 當日均價.元.
## 1  0050    元大台灣50    20090105            34.20        34.30
## 2  0056    元大高股息    20090105            13.92        14.00
## 3  0050    元大台灣50    20090106            34.18        34.21
## 4  0056    元大高股息    20090106            14.04        14.02
## 5  0050    元大台灣50    20090107            34.63        34.59
## 6  0056    元大高股息    20090107            14.28        14.28
str(etf4.csv)
## 'data.frame':    7493 obs. of  5 variables:
##  $ 證券代碼        : Factor w/ 4 levels "0050   ","0056   ",..: 1 2 1 2 1 2 1 2 1 2 ...
##  $ 簡稱            : Factor w/ 4 levels "元大S&P500   ",..: 2 3 2 3 2 3 2 3 2 3 ...
##  $ 日期            : Factor w/ 2474 levels "20090105","20090106",..: 1 1 2 2 3 3 4 4 5 5 ...
##  $ 未調整收盤價.元.: num  34.2 13.9 34.2 14 34.6 ...
##  $ 當日均價.元.    : num  34.3 14 34.2 14 34.6 ...
library(readr)
etf4_csv<-read_csv("ETF4_2000_2018_d.csv")
## Parsed with column specification:
## cols(
##   `<c3><U+04A8><e9><U+00A5>N<U+00BD>X` = col_character(),
##   `2<U+00BA><d9>` = col_character(),
##   `<U+00A4><e9><U+00B4><c1>` = col_double(),
##   `<U+00A5><U+00BC><U+00BD><U+057E><U+39AC><U+00BD>L<U+00BB><f9>(<U+00A4><U+00B8>)` = col_double(),
##   `<U+00B7><ed><U+00A4><U+99E1><U+00BB><f9>(<U+00A4><U+00B8>)` = col_double()
## )
etf4_csv<-read_csv("ETF4_2000_2018_d.csv", locale = locale(encoding='big5'))
## Parsed with column specification:
## cols(
##   證券代碼 = col_character(),
##   簡稱 = col_character(),
##   日期 = col_double(),
##   `未調整收盤價(元)` = col_double(),
##   `當日均價(元)` = col_double()
## )
head(etf4_csv)
## # A tibble: 6 x 5
##   證券代碼 簡稱           日期 `未調整收盤價(元)` `當日均價(元)`
##   <chr>    <chr>         <dbl>              <dbl>          <dbl>
## 1 0050     元大台灣50 20090105               34.2           34.3
## 2 0056     元大高股息 20090105               13.9           14  
## 3 0050     元大台灣50 20090106               34.2           34.2
## 4 0056     元大高股息 20090106               14.0           14.0
## 5 0050     元大台灣50 20090107               34.6           34.6
## 6 0056     元大高股息 20090107               14.3           14.3
str(etf4_csv)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 7493 obs. of  5 variables:
##  $ 證券代碼        : chr  "0050" "0056" "0050" "0056" ...
##  $ 簡稱            : chr  "元大台灣50" "元大高股息" "元大台灣50" "元大高股息" ...
##  $ 日期            : num  20090105 20090105 20090106 20090106 20090107 ...
##  $ 未調整收盤價(元): num  34.2 13.9 34.2 14 34.6 ...
##  $ 當日均價(元)    : num  34.3 14 34.2 14 34.6 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   證券代碼 = col_character(),
##   ..   簡稱 = col_character(),
##   ..   日期 = col_double(),
##   ..   `未調整收盤價(元)` = col_double(),
##   ..   `當日均價(元)` = col_double()
##   .. )

讀取csv檔用read.csv

head/tail 是看前/後幾筆資料

str=structure

int=整數

colclasses=c 欄位的屬性

col=列

factor(因子)

numeric(數值)

character(文字)

fileEncoding=‘big5’(有亂碼時改編碼方式)

csv檔用readr(要先安裝程式)

library(readr) 執行才可跑程式

locale 是區域設定

encoding(編碼)

etf4_csv<-read_csv(“ETF4_2000_2018_d.csv”, locale = locale(encoding=‘big5’)) (會有tibble)

library(readxl)
etf4_xls<-read_excel("ETF4_2000_2018_d.xls", 
                     col_types =c("text", "text","text", "numeric","numeric"))
head(etf4_xls)
## # A tibble: 6 x 5
##   證券代碼 簡稱       日期     `未調整收盤價(元)` `當日均價(元)`
##   <chr>    <chr>      <chr>                 <dbl>          <dbl>
## 1 0050     元大台灣50 20090105               34.2           34.3
## 2 0056     元大高股息 20090105               13.9           14  
## 3 0050     元大台灣50 20090106               34.2           34.2
## 4 0056     元大高股息 20090106               14.0           14.0
## 5 0050     元大台灣50 20090107               34.6           34.6
## 6 0056     元大高股息 20090107               14.3           14.3
etf4.c<-etf4_csv[, c(-2, -4)]
etf4.c<-etf4.c[-1,]
colnames(etf4.c)<-c("id", "date", "price")
library(magrittr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
etf4.c<-etf4_csv%>%select(c(1,3,5))%>%rename("id" = "證券代碼", "date"= "日期", "price" = "當日均價(元)")
etf4.c
## # A tibble: 7,493 x 3
##    id        date price
##    <chr>    <dbl> <dbl>
##  1 0050  20090105  34.3
##  2 0056  20090105  14  
##  3 0050  20090106  34.2
##  4 0056  20090106  14.0
##  5 0050  20090107  34.6
##  6 0056  20090107  14.3
##  7 0050  20090108  33.2
##  8 0056  20090108  13.9
##  9 0050  20090109  32.3
## 10 0056  20090109  13.6
## # ... with 7,483 more rows

讀取excel用read_excel

etf4_xls<-read_excel(“ETF4_2000_2018_d.xls”, col_types =c(“text”, “text”,“text”, “numeric”,“numeric”)) 結果會是date.frame 的形式

library(readxl) 執行才可跑程式

col_types(列的形式)

整理資料可以用etf4.c<-etf4_csv[, c(欄, 列)]

etf4.c$ (看哪一個欄位)

c(欄,列)的c代表連結

colnames=欄位名稱

pipe operator (簡化成是不必要程式)

library(magrittr)、library(dplyr)執行才可跑程式

etf4_135 = etf4_csv[,(1,3,5)] (看1、3、5欄的資料)

%>% (可省略打變數名稱的時間)

id" = 證券代碼、date= 日期、price" = 當日均價(元)

library(reshape2)
etf4.reorder = dcast(etf4.c, date~id)
## Using price as value column: use value.var to override.
dim(etf4.reorder)
## [1] 2474    5
head(etf4.reorder)
##       date  0050  0056 006205 00646
## 1 20090105 34.30 14.00     NA    NA
## 2 20090106 34.21 14.02     NA    NA
## 3 20090107 34.59 14.28     NA    NA
## 4 20090108 33.21 13.86     NA    NA
## 5 20090109 32.32 13.61     NA    NA
## 6 20090110 31.91 13.55     NA    NA
str(etf4.reorder)
## 'data.frame':    2474 obs. of  5 variables:
##  $ date  : num  20090105 20090106 20090107 20090108 20090109 ...
##  $ 0050  : num  34.3 34.2 34.6 33.2 32.3 ...
##  $ 0056  : num  14 14 14.3 13.9 13.6 ...
##  $ 006205: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ 00646 : num  NA NA NA NA NA NA NA NA NA NA ...
etf4.reorder$date<-as.Date(as.character(etf4.reorder$date), "%Y%m%d") 
head(etf4.reorder)
##         date  0050  0056 006205 00646
## 1 2009-01-05 34.30 14.00     NA    NA
## 2 2009-01-06 34.21 14.02     NA    NA
## 3 2009-01-07 34.59 14.28     NA    NA
## 4 2009-01-08 33.21 13.86     NA    NA
## 5 2009-01-09 32.32 13.61     NA    NA
## 6 2009-01-10 31.91 13.55     NA    NA
str(etf4.reorder)
## 'data.frame':    2474 obs. of  5 variables:
##  $ date  : Date, format: "2009-01-05" "2009-01-06" ...
##  $ 0050  : num  34.3 34.2 34.6 33.2 32.3 ...
##  $ 0056  : num  14 14 14.3 13.9 13.6 ...
##  $ 006205: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ 00646 : num  NA NA NA NA NA NA NA NA NA NA ...

使用dcast按日期重新排序數據

library(reshape2) 執行才可跑程式

reorder 為重新整理的意思

etf4.reorder = dcast(etf4.c, date~id) -用日期做一個欄位的分類,做資料重整

as.Date -把日期數字資料做轉換後的變數 (要先轉換成文字)

etf4.reorder\(date<-as.Date(as.character(etf4.reorder\)date), “%Y%m%d”- 把文字轉成數字再轉成日期

%Y大寫為4個數字(2009)、小寫為2個數字(09)

library(xts)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
## 
##     first, last
etf4.xts<-xts(etf4.reorder[,-1], order.by = etf4.reorder$date)
head(etf4.xts)
##             0050  0056 006205 00646
## 2009-01-05 34.30 14.00     NA    NA
## 2009-01-06 34.21 14.02     NA    NA
## 2009-01-07 34.59 14.28     NA    NA
## 2009-01-08 33.21 13.86     NA    NA
## 2009-01-09 32.32 13.61     NA    NA
## 2009-01-10 31.91 13.55     NA    NA
tail(etf4.xts)
##             0050  0056 006205 00646
## 2018-12-22 74.75 24.15  25.08 22.93
## 2018-12-24 74.67 24.16  25.25 22.72
## 2018-12-25 73.57 23.90  24.90 22.51
## 2018-12-26 73.87 23.83  25.16 22.13
## 2018-12-27 74.81 23.96  25.30 22.95
## 2018-12-28 75.21 23.92  25.24 23.16
str(etf4.xts)
## An 'xts' object on 2009-01-05/2018-12-28 containing:
##   Data: num [1:2474, 1:4] 34.3 34.2 34.6 33.2 32.3 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : NULL
##   ..$ : chr [1:4] "0050" "0056" "006205" "00646"
##   Indexed by objects of class: [Date] TZ: UTC
##   xts Attributes:  
##  NULL
etf4.xts<-na.locf(etf4.xts)                
tail(etf4.xts)
##             0050  0056 006205 00646
## 2018-12-22 74.75 24.15  25.08 22.93
## 2018-12-24 74.67 24.16  25.25 22.72
## 2018-12-25 73.57 23.90  24.90 22.51
## 2018-12-26 73.87 23.83  25.16 22.13
## 2018-12-27 74.81 23.96  25.30 22.95
## 2018-12-28 75.21 23.92  25.24 23.16
etf4.xts.fill<-na.locf(etf4.xts, fromLast = TRUE) 
head(etf4.xts.fill)
##             0050  0056 006205 00646
## 2009-01-05 34.30 14.00  20.33 19.54
## 2009-01-06 34.21 14.02  20.33 19.54
## 2009-01-07 34.59 14.28  20.33 19.54
## 2009-01-08 33.21 13.86  20.33 19.54
## 2009-01-09 32.32 13.61  20.33 19.54
## 2009-01-10 31.91 13.55  20.33 19.54
etf4.xts<-na.omit(etf4.xts)
head(etf4.xts)
##             0050  0056 006205 00646
## 2015-12-14 59.35 21.06  30.98 19.54
## 2015-12-15 59.59 21.25  31.66 19.70
## 2015-12-16 60.11 21.50  31.67 19.80
## 2015-12-17 60.78 21.76  32.06 20.05
## 2015-12-18 60.78 21.97  32.23 19.87
## 2015-12-21 60.31 21.99  32.62 19.64

轉變成處理時間序列的資料

xts是處理時間序列的資料

處理數據中的缺失

head/tail 是看前/後幾筆資料 na.locf中locf=last obs. carried forward (用現有的資料補齊沒有的資料)

last obs. carried forward (用未來的資料補前面沒有的資料)

na.omit中omit代表省略的意思

etf4.xts<-na.omit(etf4.xts)-刪除沒有值得部分

library(tidyr)
## 
## Attaching package: 'tidyr'
## The following object is masked from 'package:reshape2':
## 
##     smiths
## The following object is masked from 'package:magrittr':
## 
##     extract
etf4.xts1<-etf4.xts[complete.cases(etf4.xts),]
head(etf4.xts1)
##             0050  0056 006205 00646
## 2015-12-14 59.35 21.06  30.98 19.54
## 2015-12-15 59.59 21.25  31.66 19.70
## 2015-12-16 60.11 21.50  31.67 19.80
## 2015-12-17 60.78 21.76  32.06 20.05
## 2015-12-18 60.78 21.97  32.23 19.87
## 2015-12-21 60.31 21.99  32.62 19.64
lag_x <- lag(etf4.xts$`0050`, 1)
head(lag_x)
##             0050
## 2015-12-14    NA
## 2015-12-15 59.35
## 2015-12-16 59.59
## 2015-12-17 60.11
## 2015-12-18 60.78
## 2015-12-21 60.78

整理資料的套件tidyr

complete.cases為完整的資料

[lag operator(落後運算元)]

write.csv(etf4.xts, file = "myetf4.csv")
write.zoo(etf4.xts, sep = ',', file = "myetf4.csv.1")
saveRDS(etf4.xts, file = "etf4.xts.rds")
etf4.xts2 <- readRDS("etf4.xts.rds")
head(etf4.xts2)
##             0050  0056 006205 00646
## 2015-12-14 59.35 21.06  30.98 19.54
## 2015-12-15 59.59 21.25  31.66 19.70
## 2015-12-16 60.11 21.50  31.67 19.80
## 2015-12-17 60.78 21.76  32.06 20.05
## 2015-12-18 60.78 21.97  32.23 19.87
## 2015-12-21 60.31 21.99  32.62 19.64
etf4.zoo <- read.zoo("myetf4.csv.1", header = TRUE, index.column =1, 
                     sep = ",", format = "%Y-%m-%d")
head(etf4.zoo)
##            X0050 X0056 X006205 X00646
## 2015-12-14 59.35 21.06   30.98  19.54
## 2015-12-15 59.59 21.25   31.66  19.70
## 2015-12-16 60.11 21.50   31.67  19.80
## 2015-12-17 60.78 21.76   32.06  20.05
## 2015-12-18 60.78 21.97   32.23  19.87
## 2015-12-21 60.31 21.99   32.62  19.64
class(etf4.zoo)
## [1] "zoo"
etf4.xts3<-as.xts(etf4.zoo)
head(etf4.xts3)
##            X0050 X0056 X006205 X00646
## 2015-12-14 59.35 21.06   30.98  19.54
## 2015-12-15 59.59 21.25   31.66  19.70
## 2015-12-16 60.11 21.50   31.67  19.80
## 2015-12-17 60.78 21.76   32.06  20.05
## 2015-12-18 60.78 21.97   32.23  19.87
## 2015-12-21 60.31 21.99   32.62  19.64
etf4_2016<-etf4.xts['2016']
etf4_2016_01_06 <- etf4.xts["20160101/20160630"]
head(etf4_2016_01_06)
##             0050  0056 006205 00646
## 2016-01-04 59.62 21.51  31.02 19.90
## 2016-01-05 59.30 21.42  29.94 19.78
## 2016-01-06 58.33 21.15  29.94 19.68
## 2016-01-07 57.30 20.91  28.41 19.50
## 2016-01-08 57.33 20.95  28.62 19.32
## 2016-01-11 56.47 20.64  28.47 18.92
lastweek <- last(etf4_2016, "1 week")
last(lastweek, 2)
##             0050  0056 006205 00646
## 2016-12-29 71.35 22.97  26.96 21.55
## 2016-12-30 71.77 23.06  27.00 21.54
first(lastweek, "-2 days")
##             0050  0056 006205 00646
## 2016-12-28 71.37 22.96  27.01 21.62
## 2016-12-29 71.35 22.97  26.96 21.55
## 2016-12-30 71.77 23.06  27.00 21.54

數據輸出

輸出csv檔可使用write.csv函數

write.zoo 可保存時間序列

日期索引消失必須使用write.zoo來保存xts文件

saveRDS 可把大量資料存在較小的地方

數據查詢

lastweek <- last(etf4_2016, “1 week”) - 打印上週的最後2個觀察結果

last(lastweek, 2) - 提取上週前兩天的所有內容