This document analyzes Taiwan stock market data from the TEJ database covering the period from January 2, 2024 to June 30, 2025. The analysis includes data cleaning, transformation, return calculations, and identification of the largest market cap firms.
Import the data file using read_tsv() function and
display the structure.
# Set the file path to Downloads folder
file_path <- "C:/Users/97688/Downloads/tej_day_price_2024_20250630.txt"
# Import tab-delimited data
data <- read_tsv(file_path)
# Show data structure
glimpse(data)## Rows: 337,347
## Columns: 12
## $ CO_ID <chr> "1101 TCC", "1102 ACC", "1103 CHC", "1104 UCC", …
## $ Date <dbl> 20240102, 20240102, 20240102, 20240102, 20240102…
## $ `TSE ID` <dbl> 1101, 1102, 1103, 1104, 1108, 1109, 1110, 1201, …
## $ `TSE Sector` <chr> "01", "01", "01", "01", "01", "01", "01", "02", …
## $ `English Short Name` <chr> "TCC", "ACC", "CHC", "UCC", "Lucky Cement", "HSI…
## $ `Open(NTD)` <dbl> 32.5373, 37.2642, 17.7825, 26.0628, 14.1679, 16.…
## $ `High(NTD)` <dbl> 32.5373, 37.4442, 17.7825, 26.1505, 14.1679, 16.…
## $ `Low(NTD)` <dbl> 32.3038, 36.9492, 17.5953, 25.9750, 14.0343, 16.…
## $ `Close(NTD)` <dbl> 32.3972, 37.0392, 17.6421, 26.0628, 14.0788, 16.…
## $ `Volume(1000S)` <dbl> 14937, 6223, 171, 260, 442, 228, 57, 126, 48, 18…
## $ `Amount(NTD1000)` <dbl> 518751, 256522, 3240, 7736, 6992, 4159, 1075, 24…
## $ `Market Cap.(NTD MN)` <dbl> 262026, 145941, 14896, 19995, 6395, 6209, 10754,…
The dataset contains 337347 rows and 12 columns with information about stock prices, trading volume, and market capitalization.
Select and rename columns 2, 3, 5, 9, and 12 to: “id”, “name”, “date”, “price”, and “cap”.
data_renamed <- data %>%
select(id = `TSE ID`,
name = `English Short Name`,
date = Date,
price = `Close(NTD)`,
cap = `Market Cap.(NTD MN)`)
glimpse(data_renamed)## Rows: 337,347
## Columns: 5
## $ id <dbl> 1101, 1102, 1103, 1104, 1108, 1109, 1110, 1201, 1203, 1210, 1213…
## $ name <chr> "TCC", "ACC", "CHC", "UCC", "Lucky Cement", "HSINGTA", "Tuna Cem…
## $ date <dbl> 20240102, 20240102, 20240102, 20240102, 20240102, 20240102, 2024…
## $ price <dbl> 32.3972, 37.0392, 17.6421, 26.0628, 14.0788, 16.1807, 18.3336, 1…
## $ cap <dbl> 262026, 145941, 14896, 19995, 6395, 6209, 10754, 9640, 13992, 52…
Convert id to text, date to date format, and transform data from long to wide format.
data_wide <- data_renamed %>%
mutate(id = as.character(id),
date = ymd(date)) %>%
select(id, date, price) %>%
pivot_wider(names_from = id, values_from = price)
# Display first 10 rows and 10 columns
data_wide %>%
select(1:10) %>%
head(10) %>%
kable(caption = "First 10 rows and 10 columns of wide format data")| date | 1101 | 1102 | 1103 | 1104 | 1108 | 1109 | 1110 | 1201 | 1203 |
|---|---|---|---|---|---|---|---|---|---|
| 2024-01-02 | 32.3972 | 37.0392 | 17.6421 | 26.0628 | 14.0788 | 16.1807 | 18.3336 | 18.1893 | 55.3475 |
| 2024-01-03 | 31.9304 | 36.5892 | 17.5017 | 25.8873 | 14.0343 | 16.0918 | 18.1873 | 18.1893 | 54.6830 |
| 2024-01-04 | 31.9304 | 37.0392 | 17.5017 | 25.5363 | 14.0788 | 16.1362 | 18.3823 | 18.1416 | 54.1134 |
| 2024-01-05 | 32.0704 | 36.9942 | 17.5485 | 25.7995 | 14.0788 | 16.1362 | 18.3336 | 18.1416 | 54.8729 |
| 2024-01-08 | 31.9771 | 37.1742 | 17.5953 | 25.7118 | 14.1679 | 16.0918 | 18.3336 | 18.1416 | 54.5881 |
| 2024-01-09 | 31.7903 | 36.9042 | 17.5485 | 25.3607 | 13.8115 | 16.0473 | 18.4311 | 18.0939 | 54.5881 |
| 2024-01-10 | 31.4636 | 36.6342 | 17.4081 | 25.3607 | 13.6778 | 16.0473 | 18.4799 | 17.9984 | 53.4488 |
| 2024-01-11 | 31.4636 | 36.7242 | 17.5017 | 25.5801 | 13.7670 | 16.0918 | 18.4799 | 17.9984 | 55.0627 |
| 2024-01-12 | 31.5103 | 36.5892 | 17.5017 | 25.6240 | 13.8115 | 16.0473 | 18.4311 | 17.9984 | 54.3982 |
| 2024-01-15 | 31.4169 | 36.5892 | 17.4081 | 25.4485 | 13.7670 | 16.0029 | 18.3336 | 18.0461 | 53.8286 |
Dimensions: 358 rows × 947 columns
Count the number of NA values for each stock.
na_counts <- data_wide %>%
select(-date) %>%
summarise(across(everything(), ~sum(is.na(.)))) %>%
pivot_longer(everything(), names_to = "key", values_to = "value") %>%
filter(value > 0) %>%
arrange(desc(value))
na_counts %>%
head(10) %>%
kable(caption = "Top 10 stocks with NA values")| key | value |
|---|---|
| 7799 | 217 |
| 7788 | 198 |
| 7780 | 196 |
| 4585 | 177 |
| 3716 | 160 |
| 7765 | 151 |
| 7750 | 108 |
| 7736 | 53 |
| 7732 | 43 |
| 7722 | 18 |
Total stocks with NA values: 10
Replace NA values with the closest available stock prices using
na.locf().
data_filled <- data_wide %>%
tk_xts(date_var = date) %>%
na.locf() %>%
tk_tbl(rename_index = "date")
# Display first 10 rows and 10 columns
data_filled %>%
select(1:10) %>%
head(10) %>%
kable(caption = "Data after filling NA values")| date | 1101 | 1102 | 1103 | 1104 | 1108 | 1109 | 1110 | 1201 | 1203 |
|---|---|---|---|---|---|---|---|---|---|
| 2024-01-02 | 32.3972 | 37.0392 | 17.6421 | 26.0628 | 14.0788 | 16.1807 | 18.3336 | 18.1893 | 55.3475 |
| 2024-01-03 | 31.9304 | 36.5892 | 17.5017 | 25.8873 | 14.0343 | 16.0918 | 18.1873 | 18.1893 | 54.6830 |
| 2024-01-04 | 31.9304 | 37.0392 | 17.5017 | 25.5363 | 14.0788 | 16.1362 | 18.3823 | 18.1416 | 54.1134 |
| 2024-01-05 | 32.0704 | 36.9942 | 17.5485 | 25.7995 | 14.0788 | 16.1362 | 18.3336 | 18.1416 | 54.8729 |
| 2024-01-08 | 31.9771 | 37.1742 | 17.5953 | 25.7118 | 14.1679 | 16.0918 | 18.3336 | 18.1416 | 54.5881 |
| 2024-01-09 | 31.7903 | 36.9042 | 17.5485 | 25.3607 | 13.8115 | 16.0473 | 18.4311 | 18.0939 | 54.5881 |
| 2024-01-10 | 31.4636 | 36.6342 | 17.4081 | 25.3607 | 13.6778 | 16.0473 | 18.4799 | 17.9984 | 53.4488 |
| 2024-01-11 | 31.4636 | 36.7242 | 17.5017 | 25.5801 | 13.7670 | 16.0918 | 18.4799 | 17.9984 | 55.0627 |
| 2024-01-12 | 31.5103 | 36.5892 | 17.5017 | 25.6240 | 13.8115 | 16.0473 | 18.4311 | 17.9984 | 54.3982 |
| 2024-01-15 | 31.4169 | 36.5892 | 17.4081 | 25.4485 | 13.7670 | 16.0029 | 18.3336 | 18.0461 | 53.8286 |
Total rows after filling: 358
Delete stocks that contained NA values from the original data.
# Get stock IDs with NA values
stocks_with_na <- na_counts$key
# Remove these stocks
data_no_na <- data_wide %>%
select(-all_of(stocks_with_na))
# Show dimensions
dimensions <- dim(data_no_na)
cat("Dimensions after removing stocks with NA:\n")## Dimensions after removing stocks with NA:
## Rows: 358
## Columns: 937
Updated dimensions: 358 rows × 937 columns
Convert to xts format and calculate daily discrete returns.
# Convert to xts
data_xts <- data_no_na %>%
tk_xts(date_var = date)
# Calculate daily returns
daily_returns <- Return.calculate(data_xts, method = "discrete")
# Remove first row and show first 5 stocks, first 5 days
daily_returns_clean <- daily_returns[-1, ]
daily_returns_clean[1:5, 1:5] %>%
as.data.frame() %>%
rownames_to_column("Date") %>%
kable(caption = "Daily returns for first 5 stocks (first 5 days)",
digits = 6)| Date | 1101 | 1102 | 1103 | 1104 | 1108 |
|---|---|---|---|---|---|
| 2024-01-03 | -0.014409 | -0.012149 | -0.007958 | -0.006734 | -0.003161 |
| 2024-01-04 | 0.000000 | 0.012299 | 0.000000 | -0.013559 | 0.003171 |
| 2024-01-05 | 0.004385 | -0.001215 | 0.002674 | 0.010307 | 0.000000 |
| 2024-01-08 | -0.002909 | 0.004866 | 0.002667 | -0.003399 | 0.006329 |
| 2024-01-09 | -0.005842 | -0.007263 | -0.002660 | -0.013655 | -0.025155 |
Convert to monthly frequency and calculate monthly returns.
# Convert to monthly and calculate returns
monthly_returns <- data_xts %>%
to.monthly(indexAt = "lastof", OHLC = FALSE) %>%
Return.calculate(method = "discrete")
# Remove first row and show first 5 stocks, first 5 months
monthly_returns_clean <- monthly_returns[-1, ]
monthly_returns_clean[1:5, 1:5] %>%
as.data.frame() %>%
rownames_to_column("Date") %>%
kable(caption = "Monthly returns for first 5 stocks (first 5 months)",
digits = 6)| Date | 1101 | 1102 | 1103 | 1104 | 1108 |
|---|---|---|---|---|---|
| 2024-02-29 | 0.006272 | 0.017608 | -0.008404 | 0.018870 | 0.036185 |
| 2024-03-31 | 0.001555 | 0.021014 | -0.016951 | 0.063972 | 0.003171 |
| 2024-04-30 | -0.003108 | 0.058113 | 0.057470 | 0.112340 | 0.072790 |
| 2024-05-31 | 0.029639 | -0.049201 | 0.032612 | -0.032715 | 0.000000 |
| 2024-06-30 | 0.036365 | 0.055357 | -0.036845 | 0.048528 | -0.011805 |
Find the 20 largest market cap firms at year-end 2024 and mid-year 2025.
top20_firms <- data_renamed %>%
mutate(date = ymd(date),
id = as.character(id),
year1 = year(date)) %>%
filter((date == "2024-12-31") | (date == "2025-06-30")) %>%
group_by(date, year1) %>%
arrange(desc(cap)) %>%
slice(1:20) %>%
ungroup() %>%
mutate(cap1 = paste0("$", format(cap, big.mark = ","))) %>%
select(date, year1, cap, cap1, id, name)top20_2024 <- top20_firms %>% filter(year1 == 2024)
top20_2024 %>%
select(Rank = id, Name = name, `Market Cap` = cap1) %>%
mutate(Rank = row_number()) %>%
kable(caption = "Top 20 largest cap firms at 2024-12-31")| Rank | Name | Market Cap |
|---|---|---|
| 1 | TSMC | \(27,877,688 | | 2|Hon Hai |\) 2,556,073 |
| 3 | MediaTek | $ 2,266,389 |
| 4 | Fubon Financial | $ 1,234,015 |
| 5 | DELTA | $ 1,118,242 |
| 6 | QCI | $ 1,108,574 |
| 7 | CATHAY FHC | $ 1,001,907 |
| 8 | CHT | $ 958,045 |
| 9 | CTBC Holding | $ 767,154 |
| 10 | ASEH | $ 715,219 |
| 11 | Mega FHC | $ 574,052 |
| 12 | UMC | $ 540,739 |
| 13 | EMC | $ 487,135 |
| 14 | Wiwynn | $ 486,903 |
| 15 | Uni-President | $ 459,675 |
| 16 | Asustek | $ 457,540 |
| 17 | Yuanta Group | $ 440,057 |
| 18 | Accton | $ 433,744 |
| 19 | E.S.F.H | $ 431,087 |
| 20 | TWM | $ 422,590 |
top20_2025 <- top20_firms %>% filter(year1 == 2025)
top20_2025 %>%
select(ID = id, Name = name, `Market Cap` = cap1) %>%
mutate(Rank = row_number()) %>%
kable(caption = "Top 20 largest cap firms at 2025-06-30")| ID | Name | Market Cap | Rank |
|---|---|---|---|
| 2330 | TSMC | \(27,488,572 | 1| |2317 |Hon Hai |\) 2,236,570 | 2 |
| 2454 | MediaTek | $ 2,002,073 | 3 |
| 2881 | Fubon Financial | $ 1,193,018 | 4 |
| 2308 | DELTA | $ 1,072,785 | 5 |
| 2382 | QCI | $ 1,060,291 | 6 |
| 2412 | CHT | $ 1,047,255 | 7 |
| 2882 | CATHAY FHC | $ 921,226 | 8 |
| 2891 | CTBC Holding | $ 858,722 | 9 |
| 3711 | ASEH | $ 652,308 | 10 |
| 2886 | Mega FHC | $ 608,910 | 11 |
| 2303 | UMC | $ 554,998 | 12 |
| 2884 | E.S.F.H | $ 525,462 | 13 |
| 2357 | Asustek | $ 478,338 | 14 |
| 6669 | Wiwynn | $ 470,177 | 15 |
| 1216 | Uni-President | $ 460,243 | 16 |
| 2885 | Yuanta Group | $ 441,999 | 17 |
| 2603 | EMC | $ 430,844 | 18 |
| 3045 | TWM | $ 428,175 | 19 |
| 2345 | Accton | $ 409,616 | 20 |
This analysis successfully:
Key Findings:
## R version 4.5.1 (2025-06-13 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26100)
##
## Matrix products: default
## LAPACK version 3.12.1
##
## locale:
## [1] LC_COLLATE=English_United States.utf8
## [2] LC_CTYPE=English_United States.utf8
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.utf8
##
## time zone: Asia/Taipei
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] knitr_1.50 timetk_2.9.1
## [3] lubridate_1.9.4 forcats_1.0.1
## [5] stringr_1.5.2 dplyr_1.1.4
## [7] purrr_1.1.0 readr_2.1.5
## [9] tidyr_1.3.1 tibble_3.3.0
## [11] ggplot2_4.0.0 tidyverse_2.0.0
## [13] PerformanceAnalytics_2.0.8 quantmod_0.4.28
## [15] TTR_0.24.4 xts_0.14.1
## [17] zoo_1.8-14 tidyquant_1.0.11
##
## loaded via a namespace (and not attached):
## [1] gtable_0.3.6 xfun_0.53 bslib_0.9.0
## [4] recipes_1.3.1 lattice_0.22-7 tzdb_0.5.0
## [7] quadprog_1.5-8 vctrs_0.6.5 tools_4.5.1
## [10] generics_0.1.4 parallel_4.5.1 curl_7.0.0
## [13] RobStatTM_1.0.11 pkgconfig_2.0.3 Matrix_1.7-3
## [16] data.table_1.17.8 RColorBrewer_1.1-3 S7_0.2.0
## [19] lifecycle_1.0.4 compiler_4.5.1 farver_2.1.2
## [22] codetools_0.2-20 htmltools_0.5.8.1 class_7.3-23
## [25] sass_0.4.10 yaml_2.3.10 prodlim_2025.04.28
## [28] crayon_1.5.3 furrr_0.3.1 pillar_1.11.1
## [31] jquerylib_0.1.4 MASS_7.3-65 cachem_1.1.0
## [34] gower_1.0.2 rpart_4.1.24 parallelly_1.45.1
## [37] lava_1.8.1 tidyselect_1.2.1 digest_0.6.37
## [40] future_1.67.0 stringi_1.8.7 listenv_0.9.1
## [43] splines_4.5.1 fastmap_1.2.0 grid_4.5.1
## [46] cli_3.6.5 magrittr_2.0.3 survival_3.8-3
## [49] future.apply_1.20.0 withr_3.0.2 scales_1.4.0
## [52] bit64_4.6.0-1 timechange_0.3.0 rmarkdown_2.29
## [55] globals_0.18.0 bit_4.6.0 nnet_7.3-20
## [58] timeDate_4041.110 hms_1.1.3 evaluate_1.0.5
## [61] hardhat_1.4.2 rsample_1.3.1 rlang_1.1.6
## [64] Rcpp_1.1.0 glue_1.8.0 ipred_0.9-15
## [67] vroom_1.6.6 rstudioapi_0.17.1 jsonlite_2.0.0
## [70] R6_2.6.1