Introduction

This document analyzes Taiwan stock market data from the TEJ database covering the period from January 2, 2024 to June 30, 2025. The analysis includes data cleaning, transformation, return calculations, and identification of the largest market cap firms.

Load Required Packages

library(tidyquant)
library(tidyverse)
library(timetk)
library(knitr)

Question 1: Import Data

Import the data file using read_tsv() function and display the structure.

# Set the file path to Downloads folder
file_path <- "C:/Users/97688/Downloads/tej_day_price_2024_20250630.txt"

# Import tab-delimited data
data <- read_tsv(file_path)

# Show data structure
glimpse(data)
## Rows: 337,347
## Columns: 12
## $ CO_ID                 <chr> "1101 TCC", "1102 ACC", "1103 CHC", "1104 UCC", …
## $ Date                  <dbl> 20240102, 20240102, 20240102, 20240102, 20240102…
## $ `TSE ID`              <dbl> 1101, 1102, 1103, 1104, 1108, 1109, 1110, 1201, …
## $ `TSE Sector`          <chr> "01", "01", "01", "01", "01", "01", "01", "02", …
## $ `English Short Name`  <chr> "TCC", "ACC", "CHC", "UCC", "Lucky Cement", "HSI…
## $ `Open(NTD)`           <dbl> 32.5373, 37.2642, 17.7825, 26.0628, 14.1679, 16.…
## $ `High(NTD)`           <dbl> 32.5373, 37.4442, 17.7825, 26.1505, 14.1679, 16.…
## $ `Low(NTD)`            <dbl> 32.3038, 36.9492, 17.5953, 25.9750, 14.0343, 16.…
## $ `Close(NTD)`          <dbl> 32.3972, 37.0392, 17.6421, 26.0628, 14.0788, 16.…
## $ `Volume(1000S)`       <dbl> 14937, 6223, 171, 260, 442, 228, 57, 126, 48, 18…
## $ `Amount(NTD1000)`     <dbl> 518751, 256522, 3240, 7736, 6992, 4159, 1075, 24…
## $ `Market Cap.(NTD MN)` <dbl> 262026, 145941, 14896, 19995, 6395, 6209, 10754,…

The dataset contains 337347 rows and 12 columns with information about stock prices, trading volume, and market capitalization.


Question 2: Rename Columns

Select and rename columns 2, 3, 5, 9, and 12 to: “id”, “name”, “date”, “price”, and “cap”.

data_renamed <- data %>%
  select(id = `TSE ID`,
         name = `English Short Name`,
         date = Date,
         price = `Close(NTD)`,
         cap = `Market Cap.(NTD MN)`)

glimpse(data_renamed)
## Rows: 337,347
## Columns: 5
## $ id    <dbl> 1101, 1102, 1103, 1104, 1108, 1109, 1110, 1201, 1203, 1210, 1213…
## $ name  <chr> "TCC", "ACC", "CHC", "UCC", "Lucky Cement", "HSINGTA", "Tuna Cem…
## $ date  <dbl> 20240102, 20240102, 20240102, 20240102, 20240102, 20240102, 2024…
## $ price <dbl> 32.3972, 37.0392, 17.6421, 26.0628, 14.0788, 16.1807, 18.3336, 1…
## $ cap   <dbl> 262026, 145941, 14896, 19995, 6395, 6209, 10754, 9640, 13992, 52…

Question 3: Convert to Wide Format

Convert id to text, date to date format, and transform data from long to wide format.

data_wide <- data_renamed %>%
  mutate(id = as.character(id),
         date = ymd(date)) %>%
  select(id, date, price) %>%
  pivot_wider(names_from = id, values_from = price)

# Display first 10 rows and 10 columns
data_wide %>%
  select(1:10) %>%
  head(10) %>%
  kable(caption = "First 10 rows and 10 columns of wide format data")
First 10 rows and 10 columns of wide format data
date 1101 1102 1103 1104 1108 1109 1110 1201 1203
2024-01-02 32.3972 37.0392 17.6421 26.0628 14.0788 16.1807 18.3336 18.1893 55.3475
2024-01-03 31.9304 36.5892 17.5017 25.8873 14.0343 16.0918 18.1873 18.1893 54.6830
2024-01-04 31.9304 37.0392 17.5017 25.5363 14.0788 16.1362 18.3823 18.1416 54.1134
2024-01-05 32.0704 36.9942 17.5485 25.7995 14.0788 16.1362 18.3336 18.1416 54.8729
2024-01-08 31.9771 37.1742 17.5953 25.7118 14.1679 16.0918 18.3336 18.1416 54.5881
2024-01-09 31.7903 36.9042 17.5485 25.3607 13.8115 16.0473 18.4311 18.0939 54.5881
2024-01-10 31.4636 36.6342 17.4081 25.3607 13.6778 16.0473 18.4799 17.9984 53.4488
2024-01-11 31.4636 36.7242 17.5017 25.5801 13.7670 16.0918 18.4799 17.9984 55.0627
2024-01-12 31.5103 36.5892 17.5017 25.6240 13.8115 16.0473 18.4311 17.9984 54.3982
2024-01-15 31.4169 36.5892 17.4081 25.4485 13.7670 16.0029 18.3336 18.0461 53.8286

Dimensions: 358 rows × 947 columns


Question 4: Identify Stocks with NA Values

Count the number of NA values for each stock.

na_counts <- data_wide %>%
  select(-date) %>%
  summarise(across(everything(), ~sum(is.na(.)))) %>%
  pivot_longer(everything(), names_to = "key", values_to = "value") %>%
  filter(value > 0) %>%
  arrange(desc(value))

na_counts %>%
  head(10) %>%
  kable(caption = "Top 10 stocks with NA values")
Top 10 stocks with NA values
key value
7799 217
7788 198
7780 196
4585 177
3716 160
7765 151
7750 108
7736 53
7732 43
7722 18

Total stocks with NA values: 10


Question 5: Fill NA Values

Replace NA values with the closest available stock prices using na.locf().

data_filled <- data_wide %>%
  tk_xts(date_var = date) %>%
  na.locf() %>%
  tk_tbl(rename_index = "date")

# Display first 10 rows and 10 columns
data_filled %>%
  select(1:10) %>%
  head(10) %>%
  kable(caption = "Data after filling NA values")
Data after filling NA values
date 1101 1102 1103 1104 1108 1109 1110 1201 1203
2024-01-02 32.3972 37.0392 17.6421 26.0628 14.0788 16.1807 18.3336 18.1893 55.3475
2024-01-03 31.9304 36.5892 17.5017 25.8873 14.0343 16.0918 18.1873 18.1893 54.6830
2024-01-04 31.9304 37.0392 17.5017 25.5363 14.0788 16.1362 18.3823 18.1416 54.1134
2024-01-05 32.0704 36.9942 17.5485 25.7995 14.0788 16.1362 18.3336 18.1416 54.8729
2024-01-08 31.9771 37.1742 17.5953 25.7118 14.1679 16.0918 18.3336 18.1416 54.5881
2024-01-09 31.7903 36.9042 17.5485 25.3607 13.8115 16.0473 18.4311 18.0939 54.5881
2024-01-10 31.4636 36.6342 17.4081 25.3607 13.6778 16.0473 18.4799 17.9984 53.4488
2024-01-11 31.4636 36.7242 17.5017 25.5801 13.7670 16.0918 18.4799 17.9984 55.0627
2024-01-12 31.5103 36.5892 17.5017 25.6240 13.8115 16.0473 18.4311 17.9984 54.3982
2024-01-15 31.4169 36.5892 17.4081 25.4485 13.7670 16.0029 18.3336 18.0461 53.8286

Total rows after filling: 358


Question 6: Remove Stocks with NA Values

Delete stocks that contained NA values from the original data.

# Get stock IDs with NA values
stocks_with_na <- na_counts$key

# Remove these stocks
data_no_na <- data_wide %>%
  select(-all_of(stocks_with_na))

# Show dimensions
dimensions <- dim(data_no_na)
cat("Dimensions after removing stocks with NA:\n")
## Dimensions after removing stocks with NA:
cat("Rows:", dimensions[1], "\n")
## Rows: 358
cat("Columns:", dimensions[2], "\n")
## Columns: 937

Updated dimensions: 358 rows × 937 columns


Question 7: Calculate Daily Returns

Convert to xts format and calculate daily discrete returns.

# Convert to xts
data_xts <- data_no_na %>%
  tk_xts(date_var = date)

# Calculate daily returns
daily_returns <- Return.calculate(data_xts, method = "discrete")

# Remove first row and show first 5 stocks, first 5 days
daily_returns_clean <- daily_returns[-1, ]

daily_returns_clean[1:5, 1:5] %>%
  as.data.frame() %>%
  rownames_to_column("Date") %>%
  kable(caption = "Daily returns for first 5 stocks (first 5 days)", 
        digits = 6)
Daily returns for first 5 stocks (first 5 days)
Date 1101 1102 1103 1104 1108
2024-01-03 -0.014409 -0.012149 -0.007958 -0.006734 -0.003161
2024-01-04 0.000000 0.012299 0.000000 -0.013559 0.003171
2024-01-05 0.004385 -0.001215 0.002674 0.010307 0.000000
2024-01-08 -0.002909 0.004866 0.002667 -0.003399 0.006329
2024-01-09 -0.005842 -0.007263 -0.002660 -0.013655 -0.025155

Question 8: Calculate Monthly Returns

Convert to monthly frequency and calculate monthly returns.

# Convert to monthly and calculate returns
monthly_returns <- data_xts %>%
  to.monthly(indexAt = "lastof", OHLC = FALSE) %>%
  Return.calculate(method = "discrete")

# Remove first row and show first 5 stocks, first 5 months
monthly_returns_clean <- monthly_returns[-1, ]

monthly_returns_clean[1:5, 1:5] %>%
  as.data.frame() %>%
  rownames_to_column("Date") %>%
  kable(caption = "Monthly returns for first 5 stocks (first 5 months)", 
        digits = 6)
Monthly returns for first 5 stocks (first 5 months)
Date 1101 1102 1103 1104 1108
2024-02-29 0.006272 0.017608 -0.008404 0.018870 0.036185
2024-03-31 0.001555 0.021014 -0.016951 0.063972 0.003171
2024-04-30 -0.003108 0.058113 0.057470 0.112340 0.072790
2024-05-31 0.029639 -0.049201 0.032612 -0.032715 0.000000
2024-06-30 0.036365 0.055357 -0.036845 0.048528 -0.011805

Question 9: Top 20 Largest Cap Firms

Find the 20 largest market cap firms at year-end 2024 and mid-year 2025.

top20_firms <- data_renamed %>%
  mutate(date = ymd(date),
         id = as.character(id),
         year1 = year(date)) %>%
  filter((date == "2024-12-31") | (date == "2025-06-30")) %>%
  group_by(date, year1) %>%
  arrange(desc(cap)) %>%
  slice(1:20) %>%
  ungroup() %>%
  mutate(cap1 = paste0("$", format(cap, big.mark = ","))) %>%
  select(date, year1, cap, cap1, id, name)

Top 20 Firms at 2024-12-31

top20_2024 <- top20_firms %>% filter(year1 == 2024)

top20_2024 %>%
  select(Rank = id, Name = name, `Market Cap` = cap1) %>%
  mutate(Rank = row_number()) %>%
  kable(caption = "Top 20 largest cap firms at 2024-12-31")
Top 20 largest cap firms at 2024-12-31
Rank Name Market Cap
1 TSMC \(27,877,688 | | 2|Hon Hai |\) 2,556,073
3 MediaTek $ 2,266,389
4 Fubon Financial $ 1,234,015
5 DELTA $ 1,118,242
6 QCI $ 1,108,574
7 CATHAY FHC $ 1,001,907
8 CHT $ 958,045
9 CTBC Holding $ 767,154
10 ASEH $ 715,219
11 Mega FHC $ 574,052
12 UMC $ 540,739
13 EMC $ 487,135
14 Wiwynn $ 486,903
15 Uni-President $ 459,675
16 Asustek $ 457,540
17 Yuanta Group $ 440,057
18 Accton $ 433,744
19 E.S.F.H $ 431,087
20 TWM $ 422,590

Top 20 Firms at 2025-06-30

top20_2025 <- top20_firms %>% filter(year1 == 2025)

top20_2025 %>%
  select(ID = id, Name = name, `Market Cap` = cap1) %>%
  mutate(Rank = row_number()) %>%
  kable(caption = "Top 20 largest cap firms at 2025-06-30")
Top 20 largest cap firms at 2025-06-30
ID Name Market Cap Rank
2330 TSMC \(27,488,572 | 1| |2317 |Hon Hai |\) 2,236,570 2
2454 MediaTek $ 2,002,073 3
2881 Fubon Financial $ 1,193,018 4
2308 DELTA $ 1,072,785 5
2382 QCI $ 1,060,291 6
2412 CHT $ 1,047,255 7
2882 CATHAY FHC $ 921,226 8
2891 CTBC Holding $ 858,722 9
3711 ASEH $ 652,308 10
2886 Mega FHC $ 608,910 11
2303 UMC $ 554,998 12
2884 E.S.F.H $ 525,462 13
2357 Asustek $ 478,338 14
6669 Wiwynn $ 470,177 15
1216 Uni-President $ 460,243 16
2885 Yuanta Group $ 441,999 17
2603 EMC $ 430,844 18
3045 TWM $ 428,175 19
2345 Accton $ 409,616 20

Summary

This analysis successfully:

  1. ✅ Imported and explored the TEJ stock price dataset
  2. ✅ Cleaned and transformed column names
  3. ✅ Converted data from long to wide format
  4. ✅ Identified and handled missing values
  5. ✅ Calculated daily and monthly returns
  6. ✅ Identified the top 20 largest market cap firms

Key Findings:

  • The dataset covers 946 stocks over 358 trading days
  • TSMC (2330) dominates with the largest market capitalization in both periods
  • After removing stocks with missing data, 936 stocks remain for analysis

Session Information

sessionInfo()
## R version 4.5.1 (2025-06-13 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26100)
## 
## Matrix products: default
##   LAPACK version 3.12.1
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## time zone: Asia/Taipei
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] knitr_1.50                 timetk_2.9.1              
##  [3] lubridate_1.9.4            forcats_1.0.1             
##  [5] stringr_1.5.2              dplyr_1.1.4               
##  [7] purrr_1.1.0                readr_2.1.5               
##  [9] tidyr_1.3.1                tibble_3.3.0              
## [11] ggplot2_4.0.0              tidyverse_2.0.0           
## [13] PerformanceAnalytics_2.0.8 quantmod_0.4.28           
## [15] TTR_0.24.4                 xts_0.14.1                
## [17] zoo_1.8-14                 tidyquant_1.0.11          
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.6        xfun_0.53           bslib_0.9.0        
##  [4] recipes_1.3.1       lattice_0.22-7      tzdb_0.5.0         
##  [7] quadprog_1.5-8      vctrs_0.6.5         tools_4.5.1        
## [10] generics_0.1.4      parallel_4.5.1      curl_7.0.0         
## [13] RobStatTM_1.0.11    pkgconfig_2.0.3     Matrix_1.7-3       
## [16] data.table_1.17.8   RColorBrewer_1.1-3  S7_0.2.0           
## [19] lifecycle_1.0.4     compiler_4.5.1      farver_2.1.2       
## [22] codetools_0.2-20    htmltools_0.5.8.1   class_7.3-23       
## [25] sass_0.4.10         yaml_2.3.10         prodlim_2025.04.28 
## [28] crayon_1.5.3        furrr_0.3.1         pillar_1.11.1      
## [31] jquerylib_0.1.4     MASS_7.3-65         cachem_1.1.0       
## [34] gower_1.0.2         rpart_4.1.24        parallelly_1.45.1  
## [37] lava_1.8.1          tidyselect_1.2.1    digest_0.6.37      
## [40] future_1.67.0       stringi_1.8.7       listenv_0.9.1      
## [43] splines_4.5.1       fastmap_1.2.0       grid_4.5.1         
## [46] cli_3.6.5           magrittr_2.0.3      survival_3.8-3     
## [49] future.apply_1.20.0 withr_3.0.2         scales_1.4.0       
## [52] bit64_4.6.0-1       timechange_0.3.0    rmarkdown_2.29     
## [55] globals_0.18.0      bit_4.6.0           nnet_7.3-20        
## [58] timeDate_4041.110   hms_1.1.3           evaluate_1.0.5     
## [61] hardhat_1.4.2       rsample_1.3.1       rlang_1.1.6        
## [64] Rcpp_1.1.0          glue_1.8.0          ipred_0.9-15       
## [67] vroom_1.6.6         rstudioapi_0.17.1   jsonlite_2.0.0     
## [70] R6_2.6.1