library(tidyverse)
library(lubridate)
# 1. 讀取資料-注意編碼問題
df_stock <- read_csv("TWSE_Stock Data_2012-2017.csv", locale = locale(encoding = "Big5"))
df_stock                     
## # A tibble: 96 × 1,237
##    security_id type  `2015/12/31` `2015/12/30` `2015/12/29` `2015/12/28`
##    <chr>       <chr>        <dbl>        <dbl>        <dbl>        <dbl>
##  1 1101 台泥   open          25.4         25.8         25.8         25.8
##  2 1101 台泥   close         25.2         25.1         25.5         25.8
##  3 1102 亞泥   open          25.4         25.7         25.9         25.9
##  4 1102 亞泥   close         25.5         25.3         25.6         25.9
##  5 1216 統一   open          51.2         51.0         51.4         52.1
##  6 1216 統一   close         51.3         51.0         50.7         52.0
##  7 1301 台塑   open          66.3         66.6         66.7         66.7
##  8 1301 台塑   close         69.9         66.7         66.2         67.2
##  9 1303 南亞   open          52.1         52.6         52.3         52.8
## 10 1303 南亞   close         54.5         52.8         52.3         52.5
## # ℹ 86 more rows
## # ℹ 1,231 more variables: `2015/12/25` <dbl>, `2015/12/24` <dbl>,
## #   `2015/12/23` <dbl>, `2015/12/22` <dbl>, `2015/12/21` <dbl>,
## #   `2015/12/18` <dbl>, `2015/12/17` <dbl>, `2015/12/16` <dbl>,
## #   `2015/12/15` <dbl>, `2015/12/14` <dbl>, `2015/12/11` <dbl>,
## #   `2015/12/10` <dbl>, `2015/12/9` <dbl>, `2015/12/8` <dbl>,
## #   `2015/12/7` <dbl>, `2015/12/4` <dbl>, `2015/12/3` <dbl>, …
# 2. data tidying-轉成長表格並確認
df_longer <- df_stock %>%
  pivot_longer(cols = -c(security_id, type), names_to = "date", values_to = "price")
df_longer
## # A tibble: 118,560 × 4
##    security_id type  date       price
##    <chr>       <chr> <chr>      <dbl>
##  1 1101 台泥   open  2015/12/31  25.4
##  2 1101 台泥   open  2015/12/30  25.8
##  3 1101 台泥   open  2015/12/29  25.8
##  4 1101 台泥   open  2015/12/28  25.8
##  5 1101 台泥   open  2015/12/25  26.0
##  6 1101 台泥   open  2015/12/24  25.5
##  7 1101 台泥   open  2015/12/23  25.5
##  8 1101 台泥   open  2015/12/22  25.8
##  9 1101 台泥   open  2015/12/21  24.9
## 10 1101 台泥   open  2015/12/18  25.8
## # ℹ 118,550 more rows
# 3. data cleaning-清洗數據(date是chr, 得改)
library(lubridate)
df_clean <- df_longer %>% 
  mutate(date = ymd(date)) %>% 
  separate(col = security_id, into = c("id", "name"), sep = " ")
df_clean
## # A tibble: 118,560 × 5
##    id    name  type  date       price
##    <chr> <chr> <chr> <date>     <dbl>
##  1 1101  台泥  open  2015-12-31  25.4
##  2 1101  台泥  open  2015-12-30  25.8
##  3 1101  台泥  open  2015-12-29  25.8
##  4 1101  台泥  open  2015-12-28  25.8
##  5 1101  台泥  open  2015-12-25  26.0
##  6 1101  台泥  open  2015-12-24  25.5
##  7 1101  台泥  open  2015-12-23  25.5
##  8 1101  台泥  open  2015-12-22  25.8
##  9 1101  台泥  open  2015-12-21  24.9
## 10 1101  台泥  open  2015-12-18  25.8
## # ℹ 118,550 more rows
## 股價趨勢探索
# 只留下收盤價
df_close <- df_clean %>% 
  filter(type == "close")
df_close %>% 
  ggplot(mapping = aes(x = date, y = price, color = name))+
  geom_line()+
  labs(title = "台灣權值股股價走勢 (2011-2015)", 
       x = "日期", 
       y = "股價 (TWD)", 
       color = "公司名稱")

# 指數化
df_trend <- df_clean %>%
  filter(type == "close") %>% # 只看收盤價
  group_by(name) %>% # 依照公司分組
  arrange(date) %>% # 確保日期是從 2011 開始
  mutate(idx_price = price / first(price) * 100) %>% 
  ungroup()
df_trend
## # A tibble: 59,280 × 6
##    id    name   type  date       price idx_price
##    <chr> <chr>  <chr> <date>     <dbl>     <dbl>
##  1 1101  台泥   close 2011-01-03  23.7       100
##  2 1102  亞泥   close 2011-01-03  21.3       100
##  3 1216  統一   close 2011-01-03  26.8       100
##  4 1301  台塑   close 2011-01-03  70.8       100
##  5 1303  南亞   close 2011-01-03  54.4       100
##  6 1326  台化   close 2011-01-03  71.3       100
##  7 1402  遠東新 close 2011-01-03  33.5       100
##  8 2002  中鋼   close 2011-01-03  24.5       100
##  9 2105  正新   close 2011-01-03  32.9       100
## 10 2301  光寶科 close 2011-01-03  25.6       100
## # ℹ 59,270 more rows
# 畫圖
df_trend %>% 
  ggplot(mapping = aes(x = date, y = idx_price, colour = name))+
  geom_line(size = 1)+
  labs(
    title = "台灣權值股成長競賽 (2011-2015)",
    subtitle = "基準日:2011/01/03 = 100",
    x = "年份", 
    y = "股價指數 (起點=100)", 
    color = "公司名稱"
  )
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 2470 rows containing missing values or values outside the scale range
## (`geom_line()`).

# 個股太多了,只挑電子股出來
tech_prefixes <- c("23", "24", "30", "34", "49")
df_tech <- df_trend %>%
  filter(str_sub(as.character(id), 1, 2) %in% tech_prefixes) # str_sub(字串, 開始, 結束): 取出前兩個字
df_tech %>% 
  ggplot(aes(x = date, y = idx_price, color = name)) + 
  geom_line(size = 1) + 
  labs(
    title = "台灣電子權值股成長競賽 (2011-2015)",
    subtitle = "基準日:2011/01/03 = 100",
    x = "年份", 
    y = "股價指數 (Index)", 
    color = "公司名稱"
  )

# 評估風險
df_risk <- df_clean %>%
  filter(type == "close") %>%
  group_by(name) %>%
  arrange(date) %>% 
  mutate(daily_return = (price - lag(price)) / lag(price)) %>%
  ungroup() %>%
  na.omit() # 去除掉第一天

head(df_risk %>% select(date, name, price, daily_return))
## # A tibble: 6 × 4
##   date       name  price daily_return
##   <date>     <chr> <dbl>        <dbl>
## 1 2011-01-04 台泥   23.5     -0.00761
## 2 2011-01-04 亞泥   21.6      0.0150 
## 3 2011-01-04 統一   26.7     -0.00224
## 4 2011-01-04 台塑   70.6     -0.00311
## 5 2011-01-04 南亞   54.2     -0.00276
## 6 2011-01-04 台化   71.1     -0.00308
# 風險比較
target_stocks <- c("台積電", "大立光", "中華電", "群創")

df_risk %>%
  filter(name %in% target_stocks) %>% 
  ggplot(aes(x = name, y = daily_return, fill = name)) + 
  geom_boxplot() + 
  labs(
    title = "個股風險分析:日報酬率分布 (2011-2015)",
    subtitle = "箱子越『扁』代表越穩;越『長』代表波動越劇烈",
    x = "公司",
    y = "日漲跌幅"
  )

中華電的波動(風險)最小; 群創與大立光的風險都很大,但獲利群創遠遠小於大立光 可見波動是獲利的必要條件,但不是充分條件