data_wrangling_tws

library(tidyquant)
## Warning: package 'tidyquant' was built under R version 3.5.3
## Loading required package: lubridate
## Warning: package 'lubridate' was built under R version 3.5.3
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
## Loading required package: PerformanceAnalytics
## Warning: package 'PerformanceAnalytics' was built under R version 3.5.3
## Loading required package: xts
## Warning: package 'xts' was built under R version 3.5.3
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 3.5.3
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:graphics':
## 
##     legend
## Loading required package: quantmod
## Warning: package 'quantmod' was built under R version 3.5.3
## Loading required package: TTR
## Warning: package 'TTR' was built under R version 3.5.3
## Version 0.4-0 included new data defaults. See ?getSymbols.
## Loading required package: tidyverse
## Warning: package 'tidyverse' was built under R version 3.5.3
## -- Attaching packages --------------------------------------------------------------------------------- tidyverse 1.2.1 --
## √ ggplot2 3.1.1     √ purrr   0.3.2
## √ tibble  2.1.1     √ dplyr   0.8.1
## √ tidyr   0.8.3     √ stringr 1.4.0
## √ readr   1.3.1     √ forcats 0.4.0
## Warning: package 'ggplot2' was built under R version 3.5.3
## Warning: package 'tibble' was built under R version 3.5.3
## Warning: package 'tidyr' was built under R version 3.5.3
## Warning: package 'readr' was built under R version 3.5.3
## Warning: package 'purrr' was built under R version 3.5.3
## Warning: package 'dplyr' was built under R version 3.5.3
## Warning: package 'stringr' was built under R version 3.5.3
## Warning: package 'forcats' was built under R version 3.5.3
## -- Conflicts ------------------------------------------------------------------------------------ tidyverse_conflicts() --
## x lubridate::as.difftime() masks base::as.difftime()
## x lubridate::date()        masks base::date()
## x dplyr::filter()          masks stats::filter()
## x dplyr::first()           masks xts::first()
## x lubridate::intersect()   masks base::intersect()
## x dplyr::lag()             masks stats::lag()
## x dplyr::last()            masks xts::last()
## x lubridate::setdiff()     masks base::setdiff()
## x lubridate::union()       masks base::union()
tej <- read_tsv("w2_importing data/tej_day_price_2017_2018_UTF8.txt", col_names = TRUE)
## Parsed with column specification:
## cols(
##   證券代碼 = col_double(),
##   簡稱 = col_character(),
##   TSE產業別 = col_character(),
##   上市別 = col_character(),
##   年月日 = col_double(),
##   `開盤價(元)` = col_double(),
##   `最高價(元)` = col_double(),
##   `收盤價(元)` = col_double(),
##   `最低價(元)` = col_double(),
##   `成交值(千元)` = col_double(),
##   `市值(百萬元)` = col_double(),
##   `成交量(千股)` = col_double()
## )
saveRDS(tej, "w2_importing data/tej_d_2017_18.rds")
tej <- readRDS("w2_importing data/tej_d_2017_18.rds")

讀取資料

執行tidyquant

用read_tsv函數讀取txt檔資料

用saveRDS函數將資料存成rds檔

用readRDS函數讀取rds檔資料

glimpse(tej)
## Observations: 443,171
## Variables: 12
## $ 證券代碼       <dbl> 1101, 1102, 1103, 1104, 1108, 1109, 1110, 1201, 120...
## $ 簡稱           <chr> "台泥", "亞泥", "嘉泥", "環泥", "幸福", "信大", "東泥", "味全", "...
## $ TSE產業別      <chr> "01", "01", "01", "01", "01", "01", "01", "02", "0...
## $ 上市別         <chr> "TSE", "TSE", "TSE", "TSE", "TSE", "TSE", "TSE", "...
## $ 年月日         <dbl> 20170103, 20170103, 20170103, 20170103, 20170103, ...
## $ `開盤價(元)`   <dbl> 29.90, 24.91, 8.27, 21.41, 8.56, 10.11, 15.58, 17.5...
## $ `最高價(元)`   <dbl> 29.90, 24.95, 8.27, 21.54, 8.57, 10.11, 15.63, 17.7...
## $ `收盤價(元)`   <dbl> 29.64, 24.95, 8.27, 21.50, 8.57, 10.11, 15.54, 17.6...
## $ `最低價(元)`   <dbl> 29.35, 24.76, 8.17, 21.37, 8.56, 10.11, 15.54, 17.5...
## $ `成交值(千元)` <dbl> 101450, 33550, 2411, 3705, 182, 42, 406, 11504, 171,...
## $ `市值(百萬元)` <dbl> 129779, 89078, 6748, 15610, 3703, 3789, 9009, 8931, ...
## $ `成交量(千股)` <dbl> 2890, 1271, 278, 150, 20, 4, 25, 653, 7, 2213, 19, 2...
tej1<-tej %>% select('證券代碼', '年月日', '收盤價(元)') %>% 
  rename(ID = '證券代碼', date = '年月日', close = '收盤價(元)') %>%      
  mutate(date = date %>% as.character %>% as.Date('%Y%m%d')) %>% 
  mutate(ID = ID %>% as.character) %>% 
  arrange(ID)  
tej1
## # A tibble: 443,171 x 3
##    ID    date       close
##    <chr> <date>     <dbl>
##  1 1101  2017-01-03  29.6
##  2 1101  2017-01-04  29.7
##  3 1101  2017-01-05  29.7
##  4 1101  2017-01-06  29.7
##  5 1101  2017-01-09  29.4
##  6 1101  2017-01-10  29.5
##  7 1101  2017-01-11  30.2
##  8 1101  2017-01-12  30.6
##  9 1101  2017-01-13  30.4
## 10 1101  2017-01-16  30.2
## # ... with 443,161 more rows

用select函數選擇需要的資料欄位

用rename函數將名稱重新命名

用mutate函數新增新的欄位

用arrange函數將資料排列整理

tej.3 <- tej1 %>%
  arrange(ID) %>% 
  filter(ID %in% c("1101", "2317", "2330"))
tej.3
## # A tibble: 1,479 x 3
##    ID    date       close
##    <chr> <date>     <dbl>
##  1 1101  2017-01-03  29.6
##  2 1101  2017-01-04  29.7
##  3 1101  2017-01-05  29.7
##  4 1101  2017-01-06  29.7
##  5 1101  2017-01-09  29.4
##  6 1101  2017-01-10  29.5
##  7 1101  2017-01-11  30.2
##  8 1101  2017-01-12  30.6
##  9 1101  2017-01-13  30.4
## 10 1101  2017-01-16  30.2
## # ... with 1,469 more rows

select 3 stocks from tej1: 1101, 2317, 2330;

用arrange函數將資料排列整理

用filter函數選取需要的資料

%in% 將選取資料於ID裡面

tej.3 <- tej.3 %>%
  group_by(ID) %>% 
  tq_mutate(select = c(close),     
            mutate_fun = SMA,           
            n = 5) %>%                
  rename(ma5 = SMA)  %>% 
  
  tq_mutate(select = c(close),
            mutate_fun = SMA,
            n = 10) %>%
  rename(ma10 = SMA) %>%
 
  tq_mutate(select = c(close),
            mutate_fun = SMA,
            n = 20) %>%
  rename(ma20 = SMA) %>% 
  ungroup()
tej.3
## # A tibble: 1,479 x 6
##    ID    date       close   ma5  ma10  ma20
##    <chr> <date>     <dbl> <dbl> <dbl> <dbl>
##  1 1101  2017-01-03  29.6  NA    NA      NA
##  2 1101  2017-01-04  29.7  NA    NA      NA
##  3 1101  2017-01-05  29.7  NA    NA      NA
##  4 1101  2017-01-06  29.7  NA    NA      NA
##  5 1101  2017-01-09  29.4  29.7  NA      NA
##  6 1101  2017-01-10  29.5  29.6  NA      NA
##  7 1101  2017-01-11  30.2  29.7  NA      NA
##  8 1101  2017-01-12  30.6  29.9  NA      NA
##  9 1101  2017-01-13  30.4  30.0  NA      NA
## 10 1101  2017-01-16  30.2  30.2  29.9    NA
## # ... with 1,469 more rows

用select函數選擇收盤價、選擇簡單移動平均線

計算5日簡單移動平均線參數

用rename函數將名稱重新命名

計算10日簡單移動平均線參數

計算20日簡單移動平均線參數

tej.3 %>% select(-starts_with("ma")) %>% 
  spread(key = ID, value = close)
## # A tibble: 493 x 4
##    date       `1101` `2317` `2330`
##    <date>      <dbl>  <dbl>  <dbl>
##  1 2017-01-03   29.6   96.1   171.
##  2 2017-01-04   29.7   96.0   171.
##  3 2017-01-05   29.7   95.7   171.
##  4 2017-01-06   29.7   95.5   172.
##  5 2017-01-09   29.4   95.2   172.
##  6 2017-01-10   29.5   95.7   172.
##  7 2017-01-11   30.2   96.0   170.
##  8 2017-01-12   30.6   96.1   172.
##  9 2017-01-13   30.4   96.0   169.
## 10 2017-01-16   30.2   95.2   168.
## # ... with 483 more rows

spread() long to wide;

將常資料轉為寬資料

checkpoint_1_lubridate

library(tidyverse)
library(lubridate)
library(tidyquant)

bike_orderlines_tbl <- read_rds("data_wrangled_student/bike_orderlines.rds")

glimpse(bike_orderlines_tbl)
## Observations: 15,644
## Variables: 13
## $ order_date     <dttm> 2011-01-07, 2011-01-07, 2011-01-10, 2011-01-10...
## $ order_id       <dbl> 1, 1, 2, 2, 3, 3, 3, 3, 3, 4, 5, 5, 5, 5, 6, 6,...
## $ order_line     <dbl> 1, 2, 1, 2, 1, 2, 3, 4, 5, 1, 1, 2, 3, 4, 1, 2,...
## $ quantity       <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1,...
## $ price          <dbl> 6070, 5970, 2770, 5970, 10660, 3200, 12790, 533...
## $ total_price    <dbl> 6070, 5970, 2770, 5970, 10660, 3200, 12790, 533...
## $ model          <chr> "Jekyll Carbon 2", "Trigger Carbon 2", "Beast o...
## $ category_1     <chr> "Mountain", "Mountain", "Mountain", "Mountain",...
## $ category_2     <chr> "Over Mountain", "Over Mountain", "Trail", "Ove...
## $ frame_material <chr> "Carbon", "Carbon", "Aluminum", "Carbon", "Carb...
## $ bikeshop_name  <chr> "Ithaca Mountain Climbers", "Ithaca Mountain Cl...
## $ city           <chr> "Ithaca", "Ithaca", "Kansas City", "Kansas City...
## $ state          <chr> "NY", "NY", "KS", "KS", "KY", "KY", "KY", "KY",...

執行tidyquant、lubridate、tidyquant

用readRDS函數讀取rds檔資料

order_date_tbl <- bike_orderlines_tbl %>%
    select(order_date)

order_date_tbl %>%
    pull(order_date) %>%
    class()
## [1] "POSIXct" "POSIXt"

1.1 Character vs Date/Datetime

用select函數選擇需要的資料欄位

class() - 輸出的種類

order_date_tbl %>%
    mutate(order_date_chr = as.character(order_date)) %>%
    mutate(order_date_chr2 = order_date_chr %>% str_c(" 00:00:00")) %>%
    
    mutate(order_date_date = order_date_chr %>% ymd()) %>%
    mutate(order_date_dttm = order_date_chr2 %>% ymd_hms())
## # A tibble: 15,644 x 5
##    order_date          order_date_chr order_date_chr2 order_date_date
##    <dttm>              <chr>          <chr>           <date>         
##  1 2011-01-07 00:00:00 2011-01-07     2011-01-07 00:~ 2011-01-07     
##  2 2011-01-07 00:00:00 2011-01-07     2011-01-07 00:~ 2011-01-07     
##  3 2011-01-10 00:00:00 2011-01-10     2011-01-10 00:~ 2011-01-10     
##  4 2011-01-10 00:00:00 2011-01-10     2011-01-10 00:~ 2011-01-10     
##  5 2011-01-10 00:00:00 2011-01-10     2011-01-10 00:~ 2011-01-10     
##  6 2011-01-10 00:00:00 2011-01-10     2011-01-10 00:~ 2011-01-10     
##  7 2011-01-10 00:00:00 2011-01-10     2011-01-10 00:~ 2011-01-10     
##  8 2011-01-10 00:00:00 2011-01-10     2011-01-10 00:~ 2011-01-10     
##  9 2011-01-10 00:00:00 2011-01-10     2011-01-10 00:~ 2011-01-10     
## 10 2011-01-11 00:00:00 2011-01-11     2011-01-11 00:~ 2011-01-11     
## # ... with 15,634 more rows, and 1 more variable: order_date_dttm <dttm>

1.2 Date Classes

用mutate函數新增新的欄位

as.character - 為文字

ymd()函數 - 將資料轉為年月日

ymd_hms() - 將資料轉為年月日小時 分鐘 秒

"06/01/18" %>% mdy() %>% class()
## [1] "Date"
"06/01/18 12:30:15" %>% mdy_hms() %>% class()
## [1] "POSIXct" "POSIXt"
"January 1, 1985" %>% mdy()
## [1] "1985-01-01"
"2011-01-01" %>% ymd() %>% year()
## [1] 2011
"2011-01-01" %>% ymd() %>% month(label = TRUE, abbr = FALSE) 
## [1] 一月
## 12 Levels: 一月 < 二月 < 三月 < 四月 < 五月 < 六月 < 七月 < ... < 十二月
"2011-01-01" %>% ymd() %>% wday(label = TRUE, abbr = FALSE)
## [1] 星期六
## 7 Levels: 星期日 < 星期一 < 星期二 < 星期三 < 星期四 < ... < 星期六
"2011-01-01" %>% ymd() %>% day()
## [1] 1
# Helpers
now()
## [1] "2019-05-20 12:12:32 CST"
today()
## [1] "2019-05-20"

1.3 Lubridate Functions

mdy()函數 - 將資料轉為月 日 年

ymd()函數 - 將資料轉為年 月 日

ymd_hms() - 將資料轉為年月日小時 分鐘 秒

abbr - 縮寫

today() + days(12)
## [1] "2019-06-01"
today() + ddays(12)
## [1] "2019-06-01"
today() + years(4)  
## [1] "2023-05-20"
today() + dyears(4)
## [1] "2023-05-19"

Periods & Durations - Add/subract time to/from a date

簡單的運算式 - Period(一段時間)、Duration(期間)

i <- interval(today(), today() + ddays(12)) 

i / ddays(1) 
## [1] 12
i / dminutes(1) 
## [1] 17280
order_date_tbl %>%
    mutate(today = today()) %>%
    mutate(diff_days = interval(order_date, today) / ddays(1))
## # A tibble: 15,644 x 3
##    order_date          today      diff_days
##    <dttm>              <date>         <dbl>
##  1 2011-01-07 00:00:00 2019-05-20      3055
##  2 2011-01-07 00:00:00 2019-05-20      3055
##  3 2011-01-10 00:00:00 2019-05-20      3052
##  4 2011-01-10 00:00:00 2019-05-20      3052
##  5 2011-01-10 00:00:00 2019-05-20      3052
##  6 2011-01-10 00:00:00 2019-05-20      3052
##  7 2011-01-10 00:00:00 2019-05-20      3052
##  8 2011-01-10 00:00:00 2019-05-20      3052
##  9 2011-01-10 00:00:00 2019-05-20      3052
## 10 2011-01-11 00:00:00 2019-05-20      3051
## # ... with 15,634 more rows

Intervals - Calculate time-based distance

interval / ddays =間隔多少天

interval / dminutes =間隔中的分鐘數

用mutate函數新增新的欄位

bike_sales_y_tbl <- bike_orderlines_tbl %>%
    select(order_date, total_price) %>%
    mutate(order_date = ymd(order_date)) %>%
    mutate(year = year(order_date)) %>%
    group_by(year) %>%
    summarize(sales = sum(total_price)) %>%
    ungroup()

bike_sales_y_tbl
## # A tibble: 5 x 2
##    year    sales
##   <dbl>    <dbl>
## 1  2011 11292885
## 2  2012 12163075
## 3  2013 16480775
## 4  2014 13924085
## 5  2015 17171510
bike_sales_m_tbl <- bike_orderlines_tbl %>%
    select(order_date, total_price) %>%
    mutate(order_date = ymd(order_date)) %>%
    mutate(
        year  = year(order_date),
        month = month(order_date, label = TRUE, abbr = TRUE)
    ) %>%
    group_by(year, month) %>%
    summarize(sales = sum(total_price)) %>%
    ungroup()

bike_sales_m_tbl
## # A tibble: 60 x 3
##     year month   sales
##    <dbl> <ord>   <dbl>
##  1  2011 一月   483015
##  2  2011 二月  1162075
##  3  2011 三月   659975
##  4  2011 四月  1827140
##  5  2011 五月   844170
##  6  2011 六月  1413445
##  7  2011 七月  1194430
##  8  2011 八月   679790
##  9  2011 九月   814720
## 10  2011 十月   734920
## # ... with 50 more rows

2.0 Time-Based Data Grouping

bike_sales_y_tbl - 計算年的資料

用select函數選擇需要的資料欄位

用mutate函數新增新的欄位

ymd()函數 - 將資料轉為年 月 日

用group_by函數 - 群組起來

用summarize函數加總

bike_sales_m_tbl - 計算月的資料

bike_orderlines_tbl %>%
    select(order_date, total_price) %>%
    mutate(order_date = ymd(order_date)) %>%
    mutate(year_month = floor_date(order_date, unit = "month")) %>%
    group_by(year_month) %>%
    summarize(sales = sum(total_price))
## # A tibble: 60 x 2
##    year_month   sales
##    <date>       <dbl>
##  1 2011-01-01  483015
##  2 2011-02-01 1162075
##  3 2011-03-01  659975
##  4 2011-04-01 1827140
##  5 2011-05-01  844170
##  6 2011-06-01 1413445
##  7 2011-07-01 1194430
##  8 2011-08-01  679790
##  9 2011-09-01  814720
## 10 2011-10-01  734920
## # ... with 50 more rows

Floor Date

用select函數選擇需要的資料欄位

用mutate函數新增新的欄位

第一個日期floor_date

用group_by函數 - 群組起來

用summarize函數加總

bike_sales_y_tbl %>%
    mutate(sales_lag_1 = lag(sales, n = 1)) %>%
    mutate(sales_lag_1 = case_when(
        is.na(sales_lag_1) ~ sales,
        TRUE ~ sales_lag_1
    )) %>%
    mutate(diff_1 = sales - sales_lag_1) %>%
    mutate(pct_diff_1 = diff_1 / sales_lag_1) %>%
    mutate(pct_diff_1_chr = scales::percent(pct_diff_1))
## # A tibble: 5 x 6
##    year    sales sales_lag_1   diff_1 pct_diff_1 pct_diff_1_chr
##   <dbl>    <dbl>       <dbl>    <dbl>      <dbl> <chr>         
## 1  2011 11292885    11292885        0     0      0.0%          
## 2  2012 12163075    11292885   870190     0.0771 7.7%          
## 3  2013 16480775    12163075  4317700     0.355  35.5%         
## 4  2014 13924085    16480775 -2556690    -0.155  -15.5%        
## 5  2015 17171510    13924085  3247425     0.233  23.3%
calculate_pct_diff <- function(data) {
    data %>% 
        mutate(sales_lag_1 = lag(sales, n = 1)) %>%
        mutate(sales_lag_1 = case_when(
            is.na(sales_lag_1) ~ sales,
            TRUE ~ sales_lag_1
        )) %>%
        mutate(diff_1 = sales - sales_lag_1) %>%
        mutate(pct_diff_1 = diff_1 / sales_lag_1) %>%
        mutate(pct_diff_1_chr = scales::percent(pct_diff_1))
}

bike_sales_m_tbl %>%
    calculate_pct_diff()
## # A tibble: 60 x 7
##     year month   sales sales_lag_1  diff_1 pct_diff_1 pct_diff_1_chr
##    <dbl> <ord>   <dbl>       <dbl>   <dbl>      <dbl> <chr>         
##  1  2011 一月   483015      483015       0     0      0%            
##  2  2011 二月  1162075      483015  679060     1.41   141%          
##  3  2011 三月   659975     1162075 -502100    -0.432  -43%          
##  4  2011 四月  1827140      659975 1167165     1.77   177%          
##  5  2011 五月   844170     1827140 -982970    -0.538  -54%          
##  6  2011 六月  1413445      844170  569275     0.674  67%           
##  7  2011 七月  1194430     1413445 -219015    -0.155  -15%          
##  8  2011 八月   679790     1194430 -514640    -0.431  -43%          
##  9  2011 九月   814720      679790  134930     0.198  20%           
## 10  2011 十月   734920      814720  -79800    -0.0979 -10%          
## # ... with 50 more rows

3.1 Difference from most recent observation

用mutate函數新增新的欄位並處理NA資料

lag - 往後一期

percent(pct_diff_1) - 轉回百分比

calculate_pct_diff <- function(data) {… - 固定的程式(百分比)

case_when - 向EXCEL裡的IF EALES

bike_sales_y_tbl %>%
    mutate(sales_2011 = first(sales)) %>%
    mutate(diff_2011  = sales - sales_2011) %>%
    mutate(pct_diff_2011 = diff_2011 / sales_2011) %>%
    mutate(pct_diff_2011_chr = scales::percent(pct_diff_2011))
## # A tibble: 5 x 6
##    year    sales sales_2011 diff_2011 pct_diff_2011 pct_diff_2011_chr
##   <dbl>    <dbl>      <dbl>     <dbl>         <dbl> <chr>            
## 1  2011 11292885   11292885         0        0      0.0%             
## 2  2012 12163075   11292885    870190        0.0771 7.7%             
## 3  2013 16480775   11292885   5187890        0.459  45.9%            
## 4  2014 13924085   11292885   2631200        0.233  23.3%            
## 5  2015 17171510   11292885   5878625        0.521  52.1%
bike_sales_m_tbl %>%
    group_by(year) %>%
    mutate(sales_jan = first(sales)) %>% 
    mutate(
        diff_jan         = sales - sales_jan,
        pct_diff_jan     = diff_jan / sales_jan,
        pct_diff_jan_chr = scales::percent(pct_diff_jan)
    ) 
## # A tibble: 60 x 7
## # Groups:   year [5]
##     year month   sales sales_jan diff_jan pct_diff_jan pct_diff_jan_chr
##    <dbl> <ord>   <dbl>     <dbl>    <dbl>        <dbl> <chr>           
##  1  2011 一月   483015    483015        0        0     0%              
##  2  2011 二月  1162075    483015   679060        1.41  141%            
##  3  2011 三月   659975    483015   176960        0.366 37%             
##  4  2011 四月  1827140    483015  1344125        2.78  278%            
##  5  2011 五月   844170    483015   361155        0.748 75%             
##  6  2011 六月  1413445    483015   930430        1.93  193%            
##  7  2011 七月  1194430    483015   711415        1.47  147%            
##  8  2011 八月   679790    483015   196775        0.407 41%             
##  9  2011 九月   814720    483015   331705        0.687 69%             
## 10  2011 十月   734920    483015   251905        0.522 52%             
## # ... with 50 more rows

3.2 Difference from first observation

bike_sales_y_tbl - 處理計算年的資料

用mutate函數新增新的欄

用group_by函數 - 群組起來

bike_sales_m_tbl - 處理計算月的資料

bike_sales_y_tbl %>%
    mutate(cumulative_sales = cumsum(sales)) %>%
    mutate(cumulative_sales_pct = cumulative_sales / sum(sales)) %>%
    mutate(cumulative_sales_pct_chr = cumulative_sales_pct %>% scales::percent())
## # A tibble: 5 x 5
##    year    sales cumulative_sales cumulative_sales_p~ cumulative_sales_pct~
##   <dbl>    <dbl>            <dbl>               <dbl> <chr>                
## 1  2011 11292885         11292885               0.159 15.9%                
## 2  2012 12163075         23455960               0.330 33.0%                
## 3  2013 16480775         39936735               0.562 56.2%                
## 4  2014 13924085         53860820               0.758 75.8%                
## 5  2015 17171510         71032330               1     100.0%
bike_sales_m_tbl %>%
    group_by(year) %>%
    mutate(cumulative_sales = cumsum(sales)) %>%
    mutate(cumulative_sales_pct = cumulative_sales / sum(sales)) %>%
    mutate(cumulative_sales_pct_chr = scales::percent(cumulative_sales_pct))
## # A tibble: 60 x 6
## # Groups:   year [5]
##     year month   sales cumulative_sales cumulative_sales~ cumulative_sales~
##    <dbl> <ord>   <dbl>            <dbl>             <dbl> <chr>            
##  1  2011 一月   483015           483015            0.0428 4.3%             
##  2  2011 二月  1162075          1645090            0.146  14.6%            
##  3  2011 三月   659975          2305065            0.204  20.4%            
##  4  2011 四月  1827140          4132205            0.366  36.6%            
##  5  2011 五月   844170          4976375            0.441  44.1%            
##  6  2011 六月  1413445          6389820            0.566  56.6%            
##  7  2011 七月  1194430          7584250            0.672  67.2%            
##  8  2011 八月   679790          8264040            0.732  73.2%            
##  9  2011 九月   814720          9078760            0.804  80.4%            
## 10  2011 十月   734920          9813680            0.869  86.9%            
## # ... with 50 more rows

4.0 Cumulative Calculations

用mutate函數新增新的欄

用group_by函數 - 群組起來

bike_sales_m_tbl %>%
    mutate(roll_mean_3 = rollmean(sales, k = 3, na.pad = TRUE, align = "right", fill = NA)) %>%
    mutate(roll_mean_6 = rollmean(sales, k = 6, na.pad = TRUE, align = "right", fill = NA))
## # A tibble: 60 x 5
##     year month   sales roll_mean_3 roll_mean_6
##    <dbl> <ord>   <dbl>       <dbl>       <dbl>
##  1  2011 一月   483015         NA          NA 
##  2  2011 二月  1162075         NA          NA 
##  3  2011 三月   659975     768355          NA 
##  4  2011 四月  1827140    1216397.         NA 
##  5  2011 五月   844170    1110428.         NA 
##  6  2011 六月  1413445    1361585     1064970 
##  7  2011 七月  1194430    1150682.    1183539.
##  8  2011 八月   679790    1095888.    1103158.
##  9  2011 九月   814720     896313.    1128949.
## 10  2011 十月   734920     743143.     946912.
## # ... with 50 more rows

5.0 Rolling Calculations

用mutate函數新增新的欄

rollmean類似SMA取線

bike_orderlines_tbl %>%
    mutate(order_date = ymd(order_date)) %>%
    filter(order_date %>% between(left = ymd("2012-01-01"), right = ymd("2013-12-31"))) 
## # A tibble: 6,311 x 13
##    order_date order_id order_line quantity price total_price model
##    <date>        <dbl>      <dbl>    <dbl> <dbl>       <dbl> <chr>
##  1 2012-01-03      323          1        1  3200        3200 Slic~
##  2 2012-01-03      323          2        1  2130        2130 Fat ~
##  3 2012-01-03      323          3        1  1620        1620 Beas~
##  4 2012-01-03      323          4        2  1410        2820 CAAD~
##  5 2012-01-03      323          5        1  3200        3200 Jeky~
##  6 2012-01-03      324          1        1  3200        3200 Trig~
##  7 2012-01-03      324          2        2  1950        3900 CAAD~
##  8 2012-01-03      324          3        1   815         815 CAAD~
##  9 2012-01-03      324          4        1   815         815 Trai~
## 10 2012-01-03      324          5        1  2880        2880 F-Si~
## # ... with 6,301 more rows, and 6 more variables: category_1 <chr>,
## #   category_2 <chr>, frame_material <chr>, bikeshop_name <chr>,
## #   city <chr>, state <chr>
bike_orderlines_tbl %>%
    mutate(order_date = ymd(order_date)) %>%
    filter(year(order_date) %in% c(2012, 2013))
## # A tibble: 6,311 x 13
##    order_date order_id order_line quantity price total_price model
##    <date>        <dbl>      <dbl>    <dbl> <dbl>       <dbl> <chr>
##  1 2012-01-03      323          1        1  3200        3200 Slic~
##  2 2012-01-03      323          2        1  2130        2130 Fat ~
##  3 2012-01-03      323          3        1  1620        1620 Beas~
##  4 2012-01-03      323          4        2  1410        2820 CAAD~
##  5 2012-01-03      323          5        1  3200        3200 Jeky~
##  6 2012-01-03      324          1        1  3200        3200 Trig~
##  7 2012-01-03      324          2        2  1950        3900 CAAD~
##  8 2012-01-03      324          3        1   815         815 CAAD~
##  9 2012-01-03      324          4        1   815         815 Trai~
## 10 2012-01-03      324          5        1  2880        2880 F-Si~
## # ... with 6,301 more rows, and 6 more variables: category_1 <chr>,
## #   category_2 <chr>, frame_material <chr>, bikeshop_name <chr>,
## #   city <chr>, state <chr>

6.0 Filtering Date Ranges

用mutate函數新增新的欄

用filter函數設定資料

%in% 將選取資料於year(order_date)裡面