import R

開啟「ETF4_2000_2018_d.txt」檔案，header = T表示第一列為變數的名稱及fileEncoding = “UTF-8-BOM”解決亂碼問題。

# Read the delimited text export of the ETF prices.
# - header = TRUE: the first row holds the column names.
# - fileEncoding = "UTF-8-BOM": decodes the file as UTF-8 with a byte-order
#   mark.
# - colClasses: the security code is read as character, so codes such as
#   "0050" keep their leading zeros.
# The former `rm(list = ls())` was dropped: it wipes the caller's workspace as
# a hidden side effect and is not needed for the import.
etf4 <- read.table(
  "ETF4_2000_2018_d.txt",
  header = TRUE,
  fileEncoding = "UTF-8-BOM",
  stringsAsFactors = TRUE,
  colClasses = c("證券代碼" = "character")
)
head(etf4)

##   證券代碼       簡稱     日期 未調整收盤價.元. 當日均價.元.
## 1     0050 元大台灣50 20090105            34.20        34.30
## 2     0056 元大高股息 20090105            13.92        14.00
## 3     0050 元大台灣50 20090106            34.18        34.21
## 4     0056 元大高股息 20090106            14.04        14.02
## 5     0050 元大台灣50 20090107            34.63        34.59
## 6     0056 元大高股息 20090107            14.28        14.28

設定stringsAsFactors，讓文字欄位以因子（factor）讀入，並以colClasses將證券代碼指定為文字型態，其餘欄位（簡稱、日期、價格）由R自動判斷型態；另外利用fileEncoding='big5'指定檔案以Big5編碼讀取，使中文能正確顯示。

# Read the Big5-encoded CSV export of the same data.
# `colClasses` is matched by name here, so only the named entry is supplied.
# Mixing one named entry with unnamed ones made read.table() warn that
# "not all columns named in 'colClasses' exist" and ignore the unnamed entries.
# The remaining columns keep their automatically detected types, as before.
etf4.csv <- read.csv(
  "ETF4_2000_2018_d.csv",
  stringsAsFactors = TRUE,
  fileEncoding = "big5",
  colClasses = c("證券代碼" = "character")
)

head(etf4.csv)

##   證券代碼          簡稱     日期 未調整收盤價.元. 當日均價.元.
## 1  0050    元大台灣50    20090105            34.20        34.30
## 2  0056    元大高股息    20090105            13.92        14.00
## 3  0050    元大台灣50    20090106            34.18        34.21
## 4  0056    元大高股息    20090106            14.04        14.02
## 5  0050    元大台灣50    20090107            34.63        34.59
## 6  0056    元大高股息    20090107            14.28        14.28

使用readr以提供完整的文字檔讀取功能，且以locale = locale(encoding = 'big5')指定檔案為Big5編碼，使中文能正確顯示。

# Load readr and import the CSV as a tibble, decoding the file as Big5.
library(readr)
etf4_csv <- read_csv(
  file = "ETF4_2000_2018_d.csv",
  locale = locale(encoding = "big5")
)

## Parsed with column specification:
## cols(
##   證券代碼 = col_character(),
##   簡稱 = col_character(),
##   日期 = col_double(),
##   `未調整收盤價(元)` = col_double(),
##   `當日均價(元)` = col_double()
## )

head(etf4_csv)

## # A tibble: 6 x 5
##   證券代碼 簡稱           日期 `未調整收盤價(元)` `當日均價(元)`
##   <chr>    <chr>         <dbl>              <dbl>          <dbl>
## 1 0050     元大台灣50 20090105               34.2           34.3
## 2 0056     元大高股息 20090105               13.9           14  
## 3 0050     元大台灣50 20090106               34.2           34.2
## 4 0056     元大高股息 20090106               14.0           14.0
## 5 0050     元大台灣50 20090107               34.6           34.6
## 6 0056     元大高股息 20090107               14.3           14.3

str(etf4_csv)

## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 7493 obs. of  5 variables:
##  $ 證券代碼        : chr  "0050" "0056" "0050" "0056" ...
##  $ 簡稱            : chr  "元大台灣50" "元大高股息" "元大台灣50" "元大高股息" ...
##  $ 日期            : num  20090105 20090105 20090106 20090106 20090107 ...
##  $ 未調整收盤價(元): num  34.2 13.9 34.2 14 34.6 ...
##  $ 當日均價(元)    : num  34.3 14 34.2 14 34.6 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   證券代碼 = col_character(),
##   ..   簡稱 = col_character(),
##   ..   日期 = col_double(),
##   ..   `未調整收盤價(元)` = col_double(),
##   ..   `當日均價(元)` = col_double()
##   .. )

利用readxl讀取Excel的檔案，並且使用col_types，以向量依序指定各欄位的類別。

# Load readxl and import the Excel workbook. `col_types` gives one type per
# column, in column order: three text columns followed by two numeric ones.
library(readxl)
etf4_xls <- read_excel(
  path = "ETF4_2000_2018_d.xls",
  col_types = c(rep("text", 3), rep("numeric", 2))
)
head(etf4_xls)

## # A tibble: 6 x 5
##   證券代碼 簡稱       日期     `未調整收盤價(元)` `當日均價(元)`
##   <chr>    <chr>      <chr>                 <dbl>          <dbl>
## 1 0050     元大台灣50 20090105               34.2           34.3
## 2 0056     元大高股息 20090105               13.9           14  
## 3 0050     元大台灣50 20090106               34.2           34.2
## 4 0056     元大高股息 20090106               14.0           14.0
## 5 0050     元大台灣50 20090107               34.6           34.6
## 6 0056     元大高股息 20090107               14.3           14.3

把第二、第四欄刪除（保留第一、第三、第五欄），並使用magrittr、dplyr，將檔案變成以id、date、price顯示。

# Drop the 2nd and 4th columns and keep every row. The row position is left
# empty on purpose: the previous `-1` there silently discarded the first
# observation. This value is reassigned by the dplyr pipeline further down.
etf4.c <- etf4_csv[, c(-2, -4)]
library(magrittr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Keep columns 1, 3 and 5 (security code, date, average price) and give them
# short English names. rename() already sets the final names, so the extra
# `colnames<-` call that repeated them was removed.
etf4.c <- etf4_csv %>%
  select(c(1, 3, 5)) %>%
  rename("id" = "證券代碼", "date" = "日期", "price" = "當日均價(元)")
head(etf4.c)

## # A tibble: 6 x 3
##   id        date price
##   <chr>    <dbl> <dbl>
## 1 0050  20090105  34.3
## 2 0056  20090105  14  
## 3 0050  20090106  34.2
## 4 0056  20090106  14.0
## 5 0050  20090107  34.6
## 6 0056  20090107  14.3

使用dcast將資料由長格式轉為寬格式，以日期為列、id為欄；再以“%Y%m%d”格式解析日期，使其以年-月-日顯示。

library(reshape2)
# Reshape from long to wide: one row per date, one column per security id.
# `value.var` is named explicitly so dcast() does not have to guess the value
# column (it previously guessed `price` and printed a message saying so).
etf4.reorder <- dcast(etf4.c, date ~ id, value.var = "price")

# Dates arrive as numbers such as 20090105; convert to character first so they
# can be parsed with the "%Y%m%d" format.
etf4.reorder$date <- as.Date(
  as.character(etf4.reorder$date),
  format = "%Y%m%d"
)
dim(etf4.reorder)

## [1] 2474    5

head(etf4.reorder)

##         date  0050  0056 006205 00646
## 1 2009-01-05 34.30 14.00     NA    NA
## 2 2009-01-06 34.21 14.02     NA    NA
## 3 2009-01-07 34.59 14.28     NA    NA
## 4 2009-01-08 33.21 13.86     NA    NA
## 5 2009-01-09 32.32 13.61     NA    NA
## 6 2009-01-10 31.91 13.55     NA    NA

將資料轉換成以日期為索引的xts時間序列物件。

library(xts)

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

## 
## Attaching package: 'xts'

## The following objects are masked from 'package:dplyr':
## 
##     first, last

# Build an xts time series from every column except the first (`date`),
# indexed by the parsed dates.
etf4.xts <- xts(x = etf4.reorder[, -1], order.by = etf4.reorder$date)
head(etf4.xts)

##             0050  0056 006205 00646
## 2009-01-05 34.30 14.00     NA    NA
## 2009-01-06 34.21 14.02     NA    NA
## 2009-01-07 34.59 14.28     NA    NA
## 2009-01-08 33.21 13.86     NA    NA
## 2009-01-09 32.32 13.61     NA    NA
## 2009-01-10 31.91 13.55     NA    NA

tail(etf4.xts)

##             0050  0056 006205 00646
## 2018-12-22 74.75 24.15  25.08 22.93
## 2018-12-24 74.67 24.16  25.25 22.72
## 2018-12-25 73.57 23.90  24.90 22.51
## 2018-12-26 73.87 23.83  25.16 22.13
## 2018-12-27 74.81 23.96  25.30 22.95
## 2018-12-28 75.21 23.92  25.24 23.16

str(etf4.xts)

## An 'xts' object on 2009-01-05/2018-12-28 containing:
##   Data: num [1:2474, 1:4] 34.3 34.2 34.6 33.2 32.3 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : NULL
##   ..$ : chr [1:4] "0050" "0056" "006205" "00646"
##   Indexed by objects of class: [Date] TZ: UTC
##   xts Attributes:  
##  NULL

使用na.locf替代NA值。

# Replace NA values by carrying the last observation forward.
etf4.xts <- na.locf(object = etf4.xts)
tail(etf4.xts)

##             0050  0056 006205 00646
## 2018-12-22 74.75 24.15  25.08 22.93
## 2018-12-24 74.67 24.16  25.25 22.72
## 2018-12-25 73.57 23.90  24.90 22.51
## 2018-12-26 73.87 23.83  25.16 22.13
## 2018-12-27 74.81 23.96  25.30 22.95
## 2018-12-28 75.21 23.92  25.24 23.16

利用fromLast = TRUE，改以後一筆觀測值往前填補，處理序列開頭仍然存在的NA值。

# Fill in the other direction: each NA takes the next available observation,
# which covers values missing at the start of a series.
etf4.xts.fill <- na.locf(object = etf4.xts, fromLast = TRUE)
head(etf4.xts.fill)

##             0050  0056 006205 00646
## 2009-01-05 34.30 14.00  20.33 19.54
## 2009-01-06 34.21 14.02  20.33 19.54
## 2009-01-07 34.59 14.28  20.33 19.54
## 2009-01-08 33.21 13.86  20.33 19.54
## 2009-01-09 32.32 13.61  20.33 19.54
## 2009-01-10 31.91 13.55  20.33 19.54

使用na.omit去除NA值。

# Drop every row that still contains an NA.
etf4.xts <- na.omit(object = etf4.xts)
head(etf4.xts)

##             0050  0056 006205 00646
## 2015-12-14 59.35 21.06  30.98 19.54
## 2015-12-15 59.59 21.25  31.66 19.70
## 2015-12-16 60.11 21.50  31.67 19.80
## 2015-12-17 60.78 21.76  32.06 20.05
## 2015-12-18 60.78 21.97  32.23 19.87
## 2015-12-21 60.31 21.99  32.62 19.64

載入tidyr（可用於長寬表格轉換），並利用complete.cases只保留沒有NA值的完整資料列。

library(tidyr)

## 
## Attaching package: 'tidyr'

## The following object is masked from 'package:reshape2':
## 
##     smiths

## The following object is masked from 'package:magrittr':
## 
##     extract

# Keep only the rows that have no missing value in any column.
etf4.xts1 <- etf4.xts[complete.cases(etf4.xts), ]
head(etf4.xts1)

##             0050  0056 006205 00646
## 2015-12-14 59.35 21.06  30.98 19.54
## 2015-12-15 59.59 21.25  31.66 19.70
## 2015-12-16 60.11 21.50  31.67 19.80
## 2015-12-17 60.78 21.76  32.06 20.05
## 2015-12-18 60.78 21.97  32.23 19.87
## 2015-12-21 60.31 21.99  32.62 19.64

只取出0050的資料，並將其落後一期（lag 1），即每個日期顯示前一個交易日的價格。

# Previous-period value of the 0050 series. stats::lag() is called explicitly
# because dplyr, attached earlier in this file, masks lag() with a version
# that is not meant for time-series objects. The qualified call dispatches to
# the xts method, which keeps the date index and pads the first row with NA.
lag_x <- stats::lag(etf4.xts$`0050`, k = 1)
head(lag_x)

##             0050
## 2015-12-14    NA
## 2015-12-15 59.35
## 2015-12-16 59.59
## 2015-12-17 60.11
## 2015-12-18 60.78
## 2015-12-21 60.78

利用saveRDS將資料儲存為rds檔。

# Persist the cleaned series in three ways.
# 1. Plain CSV. The xts object is converted to a data frame first so that the
#    date index becomes the row names and is written to the file; the xts
#    object itself carries NULL row dimnames, so writing it directly would
#    emit row numbers instead of dates.
write.csv(as.data.frame(etf4.xts), file = "myetf4.csv")
# 2. CSV via zoo, which writes the index as the first column.
write.zoo(etf4.xts, sep = ",", file = "myetf4.csv.1")
# 3. RDS, then read it straight back to check the round trip.
saveRDS(etf4.xts, file = "etf4.xts.rds")
etf4.xts2 <- readRDS("etf4.xts.rds")
head(etf4.xts2)

##             0050  0056 006205 00646
## 2015-12-14 59.35 21.06  30.98 19.54
## 2015-12-15 59.59 21.25  31.66 19.70
## 2015-12-16 60.11 21.50  31.67 19.80
## 2015-12-17 60.78 21.76  32.06 20.05
## 2015-12-18 60.78 21.97  32.23 19.87
## 2015-12-21 60.31 21.99  32.62 19.64

zoo是以時間為基礎。

# Read the zoo-written CSV back in: the first column is the index and is
# parsed as a date in year-month-day form.
etf4.zoo <- read.zoo(
  file = "myetf4.csv.1",
  header = TRUE,
  sep = ",",
  index.column = 1,
  format = "%Y-%m-%d"
)
head(etf4.zoo)

##            X0050 X0056 X006205 X00646
## 2015-12-14 59.35 21.06   30.98  19.54
## 2015-12-15 59.59 21.25   31.66  19.70
## 2015-12-16 60.11 21.50   31.67  19.80
## 2015-12-17 60.78 21.76   32.06  20.05
## 2015-12-18 60.78 21.97   32.23  19.87
## 2015-12-21 60.31 21.99   32.62  19.64

class(etf4.zoo)

## [1] "zoo"

# Convert the zoo series back into an xts object.
etf4.xts3 <- as.xts(x = etf4.zoo)
head(etf4.xts3)

##            X0050 X0056 X006205 X00646
## 2015-12-14 59.35 21.06   30.98  19.54
## 2015-12-15 59.59 21.25   31.66  19.70
## 2015-12-16 60.11 21.50   31.67  19.80
## 2015-12-17 60.78 21.76   32.06  20.05
## 2015-12-18 60.78 21.97   32.23  19.87
## 2015-12-21 60.31 21.99   32.62  19.64

顯示2016年1月至6月的資料。

# Select rows of the xts object by date string: a bare year selects that
# whole year, and "from/to" selects an inclusive date range.
etf4_2016 <- etf4.xts["2016"]
etf4_2016_01_06 <- etf4.xts["20160101/20160630"]
head(etf4_2016_01_06)

##             0050  0056 006205 00646
## 2016-01-04 59.62 21.51  31.02 19.90
## 2016-01-05 59.30 21.42  29.94 19.78
## 2016-01-06 58.33 21.15  29.94 19.68
## 2016-01-07 57.30 20.91  28.41 19.50
## 2016-01-08 57.33 20.95  28.62 19.32
## 2016-01-11 56.47 20.64  28.47 18.92

只取出2016年最後一個禮拜的資料。

# Final week of the 2016 subset.
lastweek <- last(etf4_2016, n = "1 week")

只顯示2016年最後兩天的資料。

last(lastweek, 2)

##             0050  0056 006205 00646
## 2016-12-29 71.35 22.97  26.96 21.55
## 2016-12-30 71.77 23.06  27.00 21.54

只顯示2016最後三天的資料。

first(lastweek, "-2 days")

##             0050  0056 006205 00646
## 2016-12-28 71.37 22.96  27.01 21.62
## 2016-12-29 71.35 22.97  26.96 21.55
## 2016-12-30 71.77 23.06  27.00 21.54

import R

張瑜君

2019年3月11日