library(tidyverse)
library(lubridate)
# 1. 讀取資料-注意編碼問題
df_stock <- read_csv("TWSE_Stock Data_2012-2017.csv", locale = locale(encoding = "Big5"))
df_stock
## # A tibble: 96 × 1,237
## security_id type `2015/12/31` `2015/12/30` `2015/12/29` `2015/12/28`
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 1101 台泥 open 25.4 25.8 25.8 25.8
## 2 1101 台泥 close 25.2 25.1 25.5 25.8
## 3 1102 亞泥 open 25.4 25.7 25.9 25.9
## 4 1102 亞泥 close 25.5 25.3 25.6 25.9
## 5 1216 統一 open 51.2 51.0 51.4 52.1
## 6 1216 統一 close 51.3 51.0 50.7 52.0
## 7 1301 台塑 open 66.3 66.6 66.7 66.7
## 8 1301 台塑 close 69.9 66.7 66.2 67.2
## 9 1303 南亞 open 52.1 52.6 52.3 52.8
## 10 1303 南亞 close 54.5 52.8 52.3 52.5
## # ℹ 86 more rows
## # ℹ 1,231 more variables: `2015/12/25` <dbl>, `2015/12/24` <dbl>,
## # `2015/12/23` <dbl>, `2015/12/22` <dbl>, `2015/12/21` <dbl>,
## # `2015/12/18` <dbl>, `2015/12/17` <dbl>, `2015/12/16` <dbl>,
## # `2015/12/15` <dbl>, `2015/12/14` <dbl>, `2015/12/11` <dbl>,
## # `2015/12/10` <dbl>, `2015/12/9` <dbl>, `2015/12/8` <dbl>,
## # `2015/12/7` <dbl>, `2015/12/4` <dbl>, `2015/12/3` <dbl>, …
# 2. data tidying-轉成長表格並確認
df_longer <- df_stock %>%
pivot_longer(cols = -c(security_id, type), names_to = "date", values_to = "price")
df_longer
## # A tibble: 118,560 × 4
## security_id type date price
## <chr> <chr> <chr> <dbl>
## 1 1101 台泥 open 2015/12/31 25.4
## 2 1101 台泥 open 2015/12/30 25.8
## 3 1101 台泥 open 2015/12/29 25.8
## 4 1101 台泥 open 2015/12/28 25.8
## 5 1101 台泥 open 2015/12/25 26.0
## 6 1101 台泥 open 2015/12/24 25.5
## 7 1101 台泥 open 2015/12/23 25.5
## 8 1101 台泥 open 2015/12/22 25.8
## 9 1101 台泥 open 2015/12/21 24.9
## 10 1101 台泥 open 2015/12/18 25.8
## # ℹ 118,550 more rows
# 3. data cleaning-清洗數據(date是chr, 得改)
library(lubridate)
df_clean <- df_longer %>%
mutate(date = ymd(date)) %>%
separate(col = security_id, into = c("id", "name"), sep = " ")
df_clean
## # A tibble: 118,560 × 5
## id name type date price
## <chr> <chr> <chr> <date> <dbl>
## 1 1101 台泥 open 2015-12-31 25.4
## 2 1101 台泥 open 2015-12-30 25.8
## 3 1101 台泥 open 2015-12-29 25.8
## 4 1101 台泥 open 2015-12-28 25.8
## 5 1101 台泥 open 2015-12-25 26.0
## 6 1101 台泥 open 2015-12-24 25.5
## 7 1101 台泥 open 2015-12-23 25.5
## 8 1101 台泥 open 2015-12-22 25.8
## 9 1101 台泥 open 2015-12-21 24.9
## 10 1101 台泥 open 2015-12-18 25.8
## # ℹ 118,550 more rows
## 股價趨勢探索
# 只留下收盤價
df_close <- df_clean %>%
filter(type == "close")
df_close %>%
ggplot(mapping = aes(x = date, y = price, color = name))+
geom_line()+
labs(title = "台灣權值股股價走勢 (2011-2015)",
x = "日期",
y = "股價 (TWD)",
color = "公司名稱")
# 指數化
df_trend <- df_clean %>%
filter(type == "close") %>% # 只看收盤價
group_by(name) %>% # 依照公司分組
arrange(date) %>% # 確保日期是從 2011 開始
mutate(idx_price = price / first(price) * 100) %>%
ungroup()
df_trend
## # A tibble: 59,280 × 6
## id name type date price idx_price
## <chr> <chr> <chr> <date> <dbl> <dbl>
## 1 1101 台泥 close 2011-01-03 23.7 100
## 2 1102 亞泥 close 2011-01-03 21.3 100
## 3 1216 統一 close 2011-01-03 26.8 100
## 4 1301 台塑 close 2011-01-03 70.8 100
## 5 1303 南亞 close 2011-01-03 54.4 100
## 6 1326 台化 close 2011-01-03 71.3 100
## 7 1402 遠東新 close 2011-01-03 33.5 100
## 8 2002 中鋼 close 2011-01-03 24.5 100
## 9 2105 正新 close 2011-01-03 32.9 100
## 10 2301 光寶科 close 2011-01-03 25.6 100
## # ℹ 59,270 more rows
# 畫圖
df_trend %>%
ggplot(mapping = aes(x = date, y = idx_price, colour = name))+
geom_line(size = 1)+
labs(
title = "台灣權值股成長競賽 (2011-2015)",
subtitle = "基準日:2011/01/03 = 100",
x = "年份",
y = "股價指數 (起點=100)",
color = "公司名稱"
)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 2470 rows containing missing values or values outside the scale range
## (`geom_line()`).
# 個股太多了,只挑電子股出來
tech_prefixes <- c("23", "24", "30", "34", "49")
df_tech <- df_trend %>%
filter(str_sub(as.character(id), 1, 2) %in% tech_prefixes) # str_sub(字串, 開始, 結束): 取出前兩個字
df_tech %>%
ggplot(aes(x = date, y = idx_price, color = name)) +
geom_line(size = 1) +
labs(
title = "台灣電子權值股成長競賽 (2011-2015)",
subtitle = "基準日:2011/01/03 = 100",
x = "年份",
y = "股價指數 (Index)",
color = "公司名稱"
)
# 評估風險
df_risk <- df_clean %>%
filter(type == "close") %>%
group_by(name) %>%
arrange(date) %>%
mutate(daily_return = (price - lag(price)) / lag(price)) %>%
ungroup() %>%
na.omit() # 去除掉第一天
head(df_risk %>% select(date, name, price, daily_return))
## # A tibble: 6 × 4
## date name price daily_return
## <date> <chr> <dbl> <dbl>
## 1 2011-01-04 台泥 23.5 -0.00761
## 2 2011-01-04 亞泥 21.6 0.0150
## 3 2011-01-04 統一 26.7 -0.00224
## 4 2011-01-04 台塑 70.6 -0.00311
## 5 2011-01-04 南亞 54.2 -0.00276
## 6 2011-01-04 台化 71.1 -0.00308
# 風險比較
target_stocks <- c("台積電", "大立光", "中華電", "群創")
df_risk %>%
filter(name %in% target_stocks) %>%
ggplot(aes(x = name, y = daily_return, fill = name)) +
geom_boxplot() +
labs(
title = "個股風險分析:日報酬率分布 (2011-2015)",
subtitle = "箱子越『扁』代表越穩;越『長』代表波動越劇烈",
x = "公司",
y = "日漲跌幅"
)
中華電的波動(風險)最小;
群創與大立光的風險都很大,但獲利群創遠遠小於大立光
可見波動是獲利的必要條件,但不是充分條件