Introduction

This report analyzes Taiwan stock market data from the TEJ database, covering the period from January 2024 to June 2025. The analysis includes data cleaning, transformation, and calculation of stock returns at both daily and monthly frequencies.


Question 1: Load Packages and Import Data

Load Required Packages

rm(list=ls())

# Install kableExtra if not already installed
if (!require("kableExtra")) {
  install.packages("kableExtra")
  library(kableExtra)
}

library(tidyverse)
library(tidyquant)
library(timetk)
library(zoo)
library(PerformanceAnalytics)
library(xts)
library(knitr)

Import Data

We’ll try three different import functions to read the tab-delimited file:

# Method 1: read_csv() - for comma-separated files
# data1 <- read_csv("tej_day_price_2024_20250630.txt")

# Method 2: read_tsv() - Best for tab-delimited files
data2 <- read_tsv("tej_day_price_2024_20250630.txt")

# Method 3: read_delim() - Generic delimiter specification
# data3 <- read_delim("tej_day_price_2024_20250630.txt", delim = "\t")

Show Imported Data Structure

glimpse(data2)
## Rows: 337,347
## Columns: 12
## $ CO_ID                 <chr> "1101 TCC", "1102 ACC", "1103 CHC", "1104 UCC", …
## $ Date                  <dbl> 20240102, 20240102, 20240102, 20240102, 20240102…
## $ `TSE ID`              <dbl> 1101, 1102, 1103, 1104, 1108, 1109, 1110, 1201, …
## $ `TSE Sector`          <chr> "01", "01", "01", "01", "01", "01", "01", "02", …
## $ `English Short Name`  <chr> "TCC", "ACC", "CHC", "UCC", "Lucky Cement", "HSI…
## $ `Open(NTD)`           <dbl> 32.5373, 37.2642, 17.7825, 26.0628, 14.1679, 16.…
## $ `High(NTD)`           <dbl> 32.5373, 37.4442, 17.7825, 26.1505, 14.1679, 16.…
## $ `Low(NTD)`            <dbl> 32.3038, 36.9492, 17.5953, 25.9750, 14.0343, 16.…
## $ `Close(NTD)`          <dbl> 32.3972, 37.0392, 17.6421, 26.0628, 14.0788, 16.…
## $ `Volume(1000S)`       <dbl> 14937, 6223, 171, 260, 442, 228, 57, 126, 48, 18…
## $ `Amount(NTD1000)`     <dbl> 518751, 256522, 3240, 7736, 6992, 4159, 1075, 24…
## $ `Market Cap.(NTD MN)` <dbl> 262026, 145941, 14896, 19995, 6395, 6209, 10754,…

Result: The data contains 337,347 rows and 12 columns, with stock price information including open, high, low, close prices, volume, and market capitalization.


Question 2: Rename Columns

Replace columns 2, 3, 5, 9, and 12 with new names: “date”, “id”, “name”, “price”, “cap”.

data <- data2 %>%
  rename(date = 2,          # Date
         id = 3,            # TSE ID
         name = 5,          # English Short Name
         price = 9,         # Close(NTD)
         cap = 12) %>%      # Market Cap.(NTD MN)
  select(id, name, date, price, cap)

glimpse(data)
## Rows: 337,347
## Columns: 5
## $ id    <dbl> 1101, 1102, 1103, 1104, 1108, 1109, 1110, 1201, 1203, 1210, 1213…
## $ name  <chr> "TCC", "ACC", "CHC", "UCC", "Lucky Cement", "HSINGTA", "Tuna Cem…
## $ date  <dbl> 20240102, 20240102, 20240102, 20240102, 20240102, 20240102, 2024…
## $ price <dbl> 32.3972, 37.0392, 17.6421, 26.0628, 14.0788, 16.1807, 18.3336, 1…
## $ cap   <dbl> 262026, 145941, 14896, 19995, 6395, 6209, 10754, 9640, 13992, 52…

Question 3: Convert to Wide Format

Select columns id, date, and price. Convert id to text, date to date format, and transform data from long to wide format.

data_wide <- data %>%
  select(id, date, price) %>%
  mutate(id = as.character(id),
         date = ymd(date)) %>%
  pivot_wider(names_from = id, values_from = price)

head(data_wide, 10)

Result: Data now has 358 rows (trading days) and 947 columns (date + stock IDs).


Question 4: Identify NA Values

Show stock IDs with NA values and compute the number of NAs for each stock.

na_summary <- data_wide %>%
  select(-date) %>%
  summarise(across(everything(), ~sum(is.na(.)))) %>%
  pivot_longer(everything(), names_to = "key", values_to = "value") %>%
  filter(value > 0) %>%
  arrange(key)

na_summary %>%
  kable(caption = "Stocks with Missing Values", 
        col.names = c("Stock ID", "Number of NAs")) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), 
                full_width = FALSE)
Stocks with Missing Values
Stock ID Number of NAs
3716 160
4585 177
7722 18
7732 43
7736 53
7750 108
7765 151
7780 196
7788 198
7799 217

Finding: 10 stocks contain NA values, with the number of missing observations ranging from 18 to 217.


Question 5: Fill NA Values

Replace NA values with the closest available stock prices using forward fill method.

data_filled <- data_wide %>%
  mutate(across(-date, ~na.locf(., na.rm = FALSE)))

head(data_filled, 10)

Method: Used na.locf() (Last Observation Carried Forward) to fill missing values with the most recent available price.


Question 6: Remove Stocks with NA Values

Delete stocks that contained NA values and show the updated dimensions.

stocks_to_remove <- na_summary$key

data_clean <- data_filled %>%
  select(-all_of(stocks_to_remove))

cat("Original dimensions:", dim(data_filled), "\n")
## Original dimensions: 358 947
cat("Cleaned dimensions:", dim(data_clean), "\n")
## Cleaned dimensions: 358 937
cat("Stocks removed:", length(stocks_to_remove), "\n")
## Stocks removed: 10

Result: After removing 10 stocks with missing values, the cleaned dataset has 358 rows and 937 columns (including date column).


Question 7: Daily Returns Calculation

Convert data to time series format (xts) and calculate daily returns.

# Convert to xts
data_xts <- data_clean %>%
  tk_xts(select = -date, date_var = date)

# Calculate daily returns (discrete method)
returns_daily <- Return.calculate(data_xts, method = "discrete")

# Show first 5 stocks and first 5 days (excluding first row with NA)
returns_daily[-1, 1:5] %>% 
  head(5) %>%
  as.data.frame() %>%
  kable(caption = "Daily Returns - First 5 Stocks, First 5 Days", 
        digits = 6) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), 
                full_width = FALSE)
Daily Returns - First 5 Stocks, First 5 Days
1101 1102 1103 1104 1108
2024-01-03 -0.014409 -0.012149 -0.007958 -0.006734 -0.003161
2024-01-04 0.000000 0.012299 0.000000 -0.013559 0.003171
2024-01-05 0.004385 -0.001215 0.002674 0.010307 0.000000
2024-01-08 -0.002909 0.004866 0.002667 -0.003399 0.006329
2024-01-09 -0.005842 -0.007263 -0.002660 -0.013655 -0.025155

Formula: Daily return = (Price_t - Price_{t-1}) / Price_{t-1}


Question 8: Monthly Returns Calculation

Compute monthly returns and show the first 5 stocks and first 5 months.

# Convert to monthly prices
returns_monthly <- to.monthly(data_xts, OHLC = FALSE)

# Calculate monthly returns
returns_monthly_calc <- Return.calculate(returns_monthly, method = "discrete")

# Show first 5 stocks and first 5 months (excluding first row)
returns_monthly_calc[-1, 1:5] %>%
  head(5) %>%
  as.data.frame() %>%
  kable(caption = "Monthly Returns - First 5 Stocks, First 5 Months", 
        digits = 6) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), 
                full_width = FALSE)
Monthly Returns - First 5 Stocks, First 5 Months
1101 1102 1103 1104 1108
Feb 2024 0.006272 0.017608 -0.008404 0.018870 0.036185
Mar 2024 0.001555 0.021014 -0.016951 0.063972 0.003171
Apr 2024 -0.003108 0.058113 0.057470 0.112340 0.072790
May 2024 0.029639 -0.049201 0.032612 -0.032715 0.000000
Jun 2024 0.036365 0.055357 -0.036845 0.048528 -0.011805

Question 9: Largest Market Cap Firms

Find the 20 largest market cap firms at year-end 2024 and mid-year 2025.

largest_caps <- data %>%
  mutate(date = ymd(date),
         id = as.character(id),
         year = year(date)) %>%
  filter(date == ymd("2024-12-31") | date == ymd("2025-06-30")) %>%
  group_by(year) %>%
  arrange(desc(cap)) %>%
  slice(1:20) %>%
  ungroup() %>%
  mutate(cap_formatted = scales::dollar(cap, prefix = "NT$", 
                                       suffix = " M", big.mark = ",")) %>%
  select(date, year, id, name, cap, cap_formatted)

# Display 2024 results
largest_caps %>%
  filter(year == 2024) %>%
  select(Rank = year, `Stock ID` = id, Name = name, 
         `Market Cap (NT$ Million)` = cap_formatted) %>%
  mutate(Rank = row_number()) %>%
  kable(caption = "Top 20 Firms by Market Cap - Year End 2024") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), 
                full_width = FALSE)
Top 20 Firms by Market Cap - Year End 2024
Rank Stock ID Name Market Cap (NT$ Million)
1 2330 TSMC NT$27,877,688 M
2 2317 Hon Hai NT$2,556,073 M
3 2454 MediaTek NT$2,266,389 M
4 2881 Fubon Financial NT$1,234,015 M
5 2308 DELTA NT$1,118,242 M
6 2382 QCI NT$1,108,574 M
7 2882 CATHAY FHC NT$1,001,907 M
8 2412 CHT NT$958,045 M
9 2891 CTBC Holding NT$767,154 M
10 3711 ASEH NT$715,219 M
11 2886 Mega FHC NT$574,052 M
12 2303 UMC NT$540,739 M
13 2603 EMC NT$487,135 M
14 6669 Wiwynn NT$486,903 M
15 1216 Uni-President NT$459,675 M
16 2357 Asustek NT$457,540 M
17 2885 Yuanta Group NT$440,057 M
18 2345 Accton NT$433,744 M
19 2884 E.S.F.H NT$431,087 M
20 3045 TWM NT$422,590 M
# Display 2025 results
largest_caps %>%
  filter(year == 2025) %>%
  select(Rank = year, `Stock ID` = id, Name = name, 
         `Market Cap (NT$ Million)` = cap_formatted) %>%
  mutate(Rank = row_number()) %>%
  kable(caption = "Top 20 Firms by Market Cap - Mid Year 2025") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), 
                full_width = FALSE)
Top 20 Firms by Market Cap - Mid Year 2025
Rank Stock ID Name Market Cap (NT$ Million)
1 2330 TSMC NT$27,488,572 M
2 2317 Hon Hai NT$2,236,570 M
3 2454 MediaTek NT$2,002,073 M
4 2881 Fubon Financial NT$1,193,018 M
5 2308 DELTA NT$1,072,785 M
6 2382 QCI NT$1,060,291 M
7 2412 CHT NT$1,047,255 M
8 2882 CATHAY FHC NT$921,226 M
9 2891 CTBC Holding NT$858,722 M
10 3711 ASEH NT$652,308 M
11 2886 Mega FHC NT$608,910 M
12 2303 UMC NT$554,998 M
13 2884 E.S.F.H NT$525,462 M
14 2357 Asustek NT$478,338 M
15 6669 Wiwynn NT$470,177 M
16 1216 Uni-President NT$460,243 M
17 2885 Yuanta Group NT$441,999 M
18 2603 EMC NT$430,844 M
19 3045 TWM NT$428,175 M
20 2345 Accton NT$409,616 M

Visualization of Top 20 Firms

largest_caps %>%
  mutate(name = reorder(paste(id, name), cap),
         year = as.factor(year)) %>%
  ggplot(aes(x = cap/1000, y = name, fill = year)) +
  geom_col(position = "dodge") +
  scale_fill_manual(values = c("2024" = "#2E86AB", "2025" = "#A23B72")) +
  labs(title = "Top 20 Firms by Market Capitalization",
       subtitle = "Comparison between Year-End 2024 and Mid-Year 2025",
       x = "Market Cap (NT$ Billion)",
       y = NULL,
       fill = "Period") +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold", size = 14),
        legend.position = "top")


Session Information

sessionInfo()
## R version 4.3.3 (2024-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.6 LTS
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.8.so;  LAPACK version 3.9.0
## 
## locale:
##  [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
##  [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
##  [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
## [10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   
## 
## time zone: UTC
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] knitr_1.43                 timetk_2.9.1              
##  [3] PerformanceAnalytics_2.0.8 quantmod_0.4.28           
##  [5] TTR_0.24.4                 xts_0.14.1                
##  [7] zoo_1.8-14                 tidyquant_1.0.11          
##  [9] lubridate_1.9.2            forcats_1.0.0             
## [11] stringr_1.5.0              dplyr_1.1.2               
## [13] purrr_1.0.2                readr_2.1.4               
## [15] tidyr_1.3.0                tibble_3.2.1              
## [17] ggplot2_3.4.2              tidyverse_2.0.0           
## [19] kableExtra_1.4.0          
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.2.0    viridisLite_0.4.2   timeDate_4051.111  
##  [4] farver_2.1.1        fastmap_1.1.1       digest_0.6.33      
##  [7] rpart_4.1.23        timechange_0.2.0    lifecycle_1.0.4    
## [10] survival_3.5-8      magrittr_2.0.3      compiler_4.3.3     
## [13] rlang_1.1.6         sass_0.4.7          tools_4.3.3        
## [16] utf8_1.2.3          yaml_2.3.7          data.table_1.14.8  
## [19] labeling_0.4.2      bit_4.0.5           curl_7.0.0         
## [22] xml2_1.3.5          withr_2.5.0         nnet_7.3-19        
## [25] grid_4.3.3          fansi_1.0.4         colorspace_2.1-0   
## [28] future_1.67.0       globals_0.18.0      scales_1.2.1       
## [31] MASS_7.3-60.0.1     cli_3.6.5           crayon_1.5.2       
## [34] rmarkdown_2.23      generics_0.1.3      rstudioapi_0.15.0  
## [37] future.apply_1.20.0 tzdb_0.5.0          cachem_1.0.8       
## [40] splines_4.3.3       parallel_4.3.3      vctrs_0.6.5        
## [43] hardhat_1.4.2       Matrix_1.6-5        jsonlite_1.8.7     
## [46] hms_1.1.3           bit64_4.0.5         RobStatTM_1.0.11   
## [49] listenv_0.9.1       systemfonts_1.3.1   gower_1.0.2        
## [52] jquerylib_0.1.4     recipes_1.3.1       glue_1.6.2         
## [55] parallelly_1.45.1   codetools_0.2-19    rsample_1.3.1      
## [58] stringi_1.7.12      gtable_0.3.3        quadprog_1.5-8     
## [61] munsell_0.5.0       furrr_0.3.1         pillar_1.9.0       
## [64] htmltools_0.5.8.1   ipred_0.9-15        lava_1.8.1         
## [67] R6_2.5.1            textshaping_0.3.6   vroom_1.6.3        
## [70] evaluate_0.21       lattice_0.22-5      highr_0.10         
## [73] bslib_0.5.0         class_7.3-22        Rcpp_1.1.0         
## [76] svglite_2.2.2       prodlim_2025.04.28  xfun_0.40          
## [79] pkgconfig_2.0.3