## 1. 專案背景 (Introduction)
本專案旨在透過**主成分分析 (PCA)**,將 2017 年上市公司的 16 項複雜財務指標濃縮為少數幾個關鍵維度,以協助投資人快速識別具有「高獲利」與「高效率」潛力的優質企業。
## 2. 資料匯入與整理 (Import & Tidy)
我們使用《R for Data Science》建議的 Tidyverse 流程進行資料清洗。
# Load the 2017 financial-indicator CSV into `financial` with read_csv().
financial <- read_csv(file = "2017_financial index_163 comp.csv")
# List the imported column names.
colnames(financial)
## [1] "comp_id" "roe" "roa"
## [4] "profit_margin_rate" "gross_margin_rate" "expense_rate"
## [7] "asset_turnover" "inventory_turnover" "equity_turnnover"
## [10] "rev_growth_rate" "margin_growth_rate" "op_profit_growth_rate"
## [13] "cash_reinv_rate" "asset_growth_rate" "current_ratio"
## [16] "quick_rartio" "debt_ratio"
# Data cleaning
# Three columns are converted to character and re-parsed with parse_number();
# afterwards every row that still contains an NA is dropped.
# NOTE(review): presumably these columns were not read as clean numerics --
# confirm against the raw CSV.
fin_clean <- financial %>%
  mutate(
    across(
      c(op_profit_growth_rate, current_ratio, quick_rartio),
      function(col) parse_number(as.character(col))
    )
  ) %>%
  drop_na()
# Preview the first rows of the cleaned data as a formatted table.
knitr::kable(head(fin_clean))
| comp_id | roe | roa | profit_margin_rate | gross_margin_rate | expense_rate | asset_turnover | inventory_turnover | equity_turnnover | rev_growth_rate | margin_growth_rate | op_profit_growth_rate | cash_reinv_rate | asset_growth_rate | current_ratio | quick_rartio | debt_ratio |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2303 | 3.06 | 2.21 | 4.40 | 18.12 | 14.83 | 0.38 | 6.93 | 0.69 | 0.96 | -10.93 | 6.05 | 4.66 | 1.93 | 158.03 | 118.71 | 45.69 |
| 2330 | 23.56 | 17.84 | 39.45 | 50.62 | 11.04 | 0.50 | 7.88 | 0.67 | 3.11 | 4.21 | 2.01 | 11.06 | 5.59 | 238.97 | 215.17 | 23.55 |
| 2337 | 25.68 | 14.29 | 16.82 | 36.95 | 20.12 | 0.86 | 2.54 | 1.59 | 41.75 | 116.49 | 1708.73 | 5.31 | 24.33 | 187.85 | 110.85 | 44.21 |
| 2342 | -3.41 | -0.72 | 3.86 | 16.86 | 13.00 | 0.66 | 8.87 | 1.55 | 12.73 | 13.25 | 22.41 | 0.56 | 17.54 | 184.99 | 158.19 | 51.42 |
| 2344 | 10.90 | 7.69 | 13.99 | 34.30 | 20.31 | 0.61 | 3.99 | 0.89 | 13.07 | 35.83 | 79.26 | 5.91 | 29.60 | 229.31 | 173.05 | 30.06 |
| 2408 | 37.01 | 28.27 | 34.22 | 44.87 | 10.66 | 0.38 | 5.16 | 0.50 | 31.91 | 91.77 | 119.72 | 7.94 | 10.08 | 424.44 | 366.56 | 12.38 |
由於財務指標間的單位差異巨大(如 ROE 為百分比,週轉率為小數),我們在 `prcomp()` 中設定 `scale. = TRUE` 進行標準化,避免特定變數主導模型。
# Run the PCA on every column except comp_id (the id column is left out of
# the model input); scale. = TRUE standardizes each variable first.
pca_model <- fin_clean %>%
  select(-comp_id) %>%
  prcomp(scale. = TRUE)
# Show the standard deviation, proportion of variance and cumulative
# proportion for each principal component.
summary(pca_model)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.0330 1.8739 1.4399 1.16215 1.01323 0.91896 0.87186
## Proportion of Variance 0.2583 0.2195 0.1296 0.08441 0.06417 0.05278 0.04751
## Cumulative Proportion 0.2583 0.4778 0.6074 0.69177 0.75594 0.80872 0.85623
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 0.77645 0.73927 0.67853 0.56103 0.50353 0.27912 0.15178
## Proportion of Variance 0.03768 0.03416 0.02878 0.01967 0.01585 0.00487 0.00144
## Cumulative Proportion 0.89391 0.92806 0.95684 0.97651 0.99236 0.99723 0.99867
## PC15 PC16
## Standard deviation 0.12692 0.07223
## Proportion of Variance 0.00101 0.00033
## Cumulative Proportion 0.99967 1.00000
決定要保留多少主成分:以累積解釋變異量 80% 為參考門檻(下圖橘色虛線),前 6 個主成分的累積解釋比例約為 80.9%。
# Assemble the plotting data: one row per principal component, holding its
# index (pc), its share of the total variance (var_pct) and the running
# cumulative share (cum_var).
pca_var <- tibble(
  # seq_along() stays correct for a zero-length vector, where 1:length(x)
  # would yield c(1, 0).
  pc = seq_along(pca_model$sdev),
  # Variance of a component is its squared standard deviation.
  var_pct = pca_model$sdev^2 / sum(pca_model$sdev^2),
  cum_var = cumsum(var_pct)
)
# Scree plot: bars show each component's share of variance, the red line and
# points show the cumulative share, and the dashed orange line marks the 0.8
# reference level.
ggplot(pca_var, aes(x = pc)) +
  geom_col(aes(y = var_pct), fill = "steelblue", alpha = 0.7) +
  # Line thickness is set with `linewidth`; using `size` for lines is
  # deprecated since ggplot2 3.4.0 and emits a warning.
  geom_line(aes(y = cum_var), color = "red", linewidth = 1) +
  geom_point(aes(y = cum_var), color = "red") +
  geom_hline(yintercept = 0.8, linetype = "dashed", color = "orange") +
  labs(title = "Scree Plot: 累積解釋力", x = "主成分", y = "解釋變異比例") +
  theme_minimal()
定義主成分的商業意義。發現:

- PC1 (橫軸):與 ROE、ROA 高度相關 -> 定義為「獲利指標」。
- PC2 (縱軸):與資產週轉率高度相關 -> 定義為「效率指標」。
# Extract the loadings: convert the rotation matrix to a tibble, keeping its
# row names in a "variable" column.
pca_loadings <- pca_model$rotation %>%
  as_tibble(rownames = "variable")
# Loading plot: one arrow per variable from the origin to its (PC1, PC2)
# loadings, with a repelled text label for each variable.
pca_loadings %>%
  ggplot(aes(PC1, PC2, label = variable)) +
  geom_segment(
    aes(xend = PC1, yend = PC2),
    # Fixed start point for every arrow, set as constants rather than mapped.
    x = 0,
    y = 0,
    arrow = arrow(length = unit(0.3, "cm")),
    color = "darkblue",
    alpha = 0.5
  ) +
  geom_text_repel(color = "darkred", size = 3) +
  coord_fixed() +
  ggtitle("Loading Plot: 變數與主成分的關係") +
  xlab("PC1 (獲利)") +
  ylab("PC2 (效率)") +
  theme_minimal()
將所有公司投影到這兩個新維度上。
# Attach the PCA scores (the columns of pca_model$x) to the cleaned data so
# that every row keeps its company id next to its scores.
fin_scores <- bind_cols(fin_clean, as_tibble(pca_model$x))
# Find the standout "superstar" companies (assumes positive PC1 / PC2 means
# good; adjust to the actual signs of the loadings).
# Keeps the companies whose PC2 (efficiency) score lies more than 3 away
# from zero.
# NOTE(review): this selects both extremes of PC2, not only the high end --
# confirm that is intended given the name `top_efficiency`.
top_efficiency <- filter(fin_scores, PC2 > 3 | PC2 < -3)
# Strategy map: scatter every company on the PC1 / PC2 plane, with dashed
# grey reference lines through zero on both axes.
fin_scores %>%
  ggplot(aes(PC1, PC2)) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "grey") +
  geom_vline(xintercept = 0, linetype = "dashed", color = "grey") +
  geom_point(color = "steelblue", alpha = 0.6) +
  # Label only the companies selected in top_efficiency, by company id.
  geom_text_repel(
    data = top_efficiency,
    aes(label = comp_id),
    color = "red",
    size = 4
  ) +
  ggtitle("企業戰略地圖", subtitle = "尋找兼具獲利與效率的投資標的") +
  xlab("獲利指標 (PC1)") +
  ylab("效率指標 (PC2)") +
  theme_minimal()
透過 PCA 分析,我們成功將 16 維的財務數據降維,並發現 3219 (倚強科) 與 4967 (十銓) 在效率與獲利指標上表現突出,值得投資人進一步深入研究。