1. Load packages and import data
import pandas as pd
import numpy as np
import gzip
from pathlib import Path
gz_path = Path("tej_day_price_2024_20250630.txt.gz")
if not gz_path.exists():
raise FileNotFoundError("Could not find 'tej_day_price_2024_20250630.txt.gz' in the working directory.")
with gzip.open(gz_path, "rt", encoding="utf-8") as f:
df_raw = pd.read_csv(f, sep="\t")
print("Shape:", df_raw.shape)
## Shape: (337347, 12)
print(df_raw.dtypes)
## CO_ID object
## Date int64
## TSE ID int64
## TSE Sector int64
## English Short Name object
## Open(NTD) float64
## High(NTD) float64
## Low(NTD) float64
## Close(NTD) float64
## Volume(1000S) int64
## Amount(NTD1000) int64
## Market Cap.(NTD MN) int64
## dtype: object
print(df_raw.head())
## CO_ID Date ... Amount(NTD1000) Market Cap.(NTD MN)
## 0 1101 TCC 20240102 ... 518751 262026
## 1 1102 ACC 20240102 ... 256522 145941
## 2 1103 CHC 20240102 ... 3240 14896
## 3 1104 UCC 20240102 ... 7736 19995
## 4 1108 Lucky Cement 20240102 ... 6992 6395
##
## [5 rows x 12 columns]
2. Rename columns
orig_cols = list(df_raw.columns)
print("Original columns:", orig_cols)
## Original columns: ['CO_ID', 'Date', 'TSE ID', 'TSE Sector', 'English Short Name', 'Open(NTD)', 'High(NTD)', 'Low(NTD)', 'Close(NTD)', 'Volume(1000S)', 'Amount(NTD1000)', 'Market Cap.(NTD MN)']
df_sel = df_raw.iloc[:, [1, 2, 4, 8, 11]].copy()
df_sel.columns = ["date", "id", "name", "price", "cap"]
print(df_sel.head())
## date id name price cap
## 0 20240102 1101 TCC 32.3972 262026
## 1 20240102 1102 ACC 37.0392 145941
## 2 20240102 1103 CHC 17.6421 14896
## 3 20240102 1104 UCC 26.0628 19995
## 4 20240102 1108 Lucky Cement 14.0788 6395
3. Long → wide
df_prices = df_sel[["id", "date", "price"]].copy()
df_prices["id"] = df_prices["id"].astype(str)
df_prices["date"] = pd.to_datetime(df_prices["date"].astype(str), format="%Y%m%d")
price_wide = df_prices.pivot(index="date", columns="id", values="price").sort_index()
print(price_wide.shape)
## (358, 946)
print(price_wide.head(10))
## id 1101 1102 1103 ... 9946 9955 9958
## date ...
## 2024-01-02 32.3972 37.0392 17.6421 ... 19.4864 24.30 167.3796
## 2024-01-03 31.9304 36.5892 17.5017 ... 19.3937 24.30 171.2385
## 2024-01-04 31.9304 37.0392 17.5017 ... 19.8112 24.00 173.6503
## 2024-01-05 32.0704 36.9942 17.5485 ... 19.8112 24.35 177.0269
## 2024-01-08 31.9771 37.1742 17.5953 ... 19.8576 24.10 177.0269
## 2024-01-09 31.7903 36.9042 17.5485 ... 19.2081 23.70 179.9210
## 2024-01-10 31.4636 36.6342 17.4081 ... 19.0689 23.40 182.3328
## 2024-01-11 31.4636 36.7242 17.5017 ... 19.0225 23.55 184.7446
## 2024-01-12 31.5103 36.5892 17.5017 ... 19.0689 23.60 182.3328
## 2024-01-15 31.4169 36.5892 17.4081 ... 19.3937 23.85 196.3213
##
## [10 rows x 946 columns]
4. NAs
na_counts = price_wide.isna().sum()
na_counts = na_counts[na_counts > 0].sort_values(ascending=False)
na_df = na_counts.reset_index()
na_df.columns = ["id", "na_count"]
print(na_df.head(20))
## id na_count
## 0 7799 217
## 1 7788 198
## 2 7780 196
## 3 4585 177
## 4 3716 160
## 5 7765 151
## 6 7750 108
## 7 7736 53
## 8 7732 43
## 9 7722 18
5. LOCF-like fill
price_filled = price_wide.ffill().bfill()
print(price_filled.head(10))
## id 1101 1102 1103 ... 9946 9955 9958
## date ...
## 2024-01-02 32.3972 37.0392 17.6421 ... 19.4864 24.30 167.3796
## 2024-01-03 31.9304 36.5892 17.5017 ... 19.3937 24.30 171.2385
## 2024-01-04 31.9304 37.0392 17.5017 ... 19.8112 24.00 173.6503
## 2024-01-05 32.0704 36.9942 17.5485 ... 19.8112 24.35 177.0269
## 2024-01-08 31.9771 37.1742 17.5953 ... 19.8576 24.10 177.0269
## 2024-01-09 31.7903 36.9042 17.5485 ... 19.2081 23.70 179.9210
## 2024-01-10 31.4636 36.6342 17.4081 ... 19.0689 23.40 182.3328
## 2024-01-11 31.4636 36.7242 17.5017 ... 19.0225 23.55 184.7446
## 2024-01-12 31.5103 36.5892 17.5017 ... 19.0689 23.60 182.3328
## 2024-01-15 31.4169 36.5892 17.4081 ... 19.3937 23.85 196.3213
##
## [10 rows x 946 columns]
6. Drop cols with NA
cols_with_na = price_wide.columns[price_wide.isna().any()].tolist()
price_complete = price_filled.drop(columns=cols_with_na)
print("Original shape (wide):", price_wide.shape)
## Original shape (wide): (358, 946)
print("After dropping NA columns:", price_complete.shape)
## After dropping NA columns: (358, 936)
7. Daily returns
ret_daily = price_complete.pct_change()
ret_daily = ret_daily.iloc[1:, :]
first_5_cols = ret_daily.columns[:5]
print(ret_daily[first_5_cols].head(5))
## id 1101 1102 1103 1104 1108
## date
## 2024-01-03 -0.014409 -0.012149 -0.007958 -0.006734 -0.003161
## 2024-01-04 0.000000 0.012299 0.000000 -0.013559 0.003171
## 2024-01-05 0.004385 -0.001215 0.002674 0.010307 0.000000
## 2024-01-08 -0.002909 0.004866 0.002667 -0.003399 0.006329
## 2024-01-09 -0.005842 -0.007263 -0.002660 -0.013655 -0.025155
8. Monthly returns
monthly_prices = price_complete.resample("M").last()
ret_monthly = monthly_prices.pct_change()
ret_monthly = ret_monthly.iloc[1:, :]
first_5_cols_m = ret_monthly.columns[:5]
print(ret_monthly[first_5_cols_m].head(5))
## id 1101 1102 1103 1104 1108
## date
## 2024-02-29 0.006272 0.017608 -0.008404 0.018870 0.036185
## 2024-03-31 0.001555 0.021014 -0.016951 0.063972 0.003171
## 2024-04-30 -0.003108 0.058113 0.057470 0.112340 0.072790
## 2024-05-31 0.029639 -0.049201 0.032612 -0.032715 0.000000
## 2024-06-30 0.036365 0.055357 -0.036845 0.048528 -0.011805
9. 20 largest cap firms 2024 & 2025
df_cap = df_sel.copy()
df_cap["date"] = pd.to_datetime(df_cap["date"].astype(str), format="%Y%m%d")
target_dates = [pd.Timestamp("2024-12-31"), pd.Timestamp("2025-12-31")]
results = []
for d in target_dates:
tmp = df_cap[df_cap["date"] == d].copy()
if tmp.empty:
print(f"No data for {d.date()}")
continue
tmp = tmp.sort_values("cap", ascending=False).head(20)
tmp["year"] = d.year
results.append(tmp)
## No data for 2025-12-31
if results:
top_cap = pd.concat(results, ignore_index=True)
print(top_cap[["date", "year", "id", "name", "cap"]])
else:
print("No year-end data found for 2024 or 2025 in this file.")
## date year id name cap
## 0 2024-12-31 2024 2330 TSMC 27877688
## 1 2024-12-31 2024 2317 Hon Hai 2556073
## 2 2024-12-31 2024 2454 MediaTek 2266389
## 3 2024-12-31 2024 2881 Fubon Financial 1234015
## 4 2024-12-31 2024 2308 DELTA 1118242
## 5 2024-12-31 2024 2382 QCI 1108574
## 6 2024-12-31 2024 2882 CATHAY FHC 1001907
## 7 2024-12-31 2024 2412 CHT 958045
## 8 2024-12-31 2024 2891 CTBC Holding 767154
## 9 2024-12-31 2024 3711 ASEH 715219
## 10 2024-12-31 2024 2886 Mega FHC 574052
## 11 2024-12-31 2024 2303 UMC 540739
## 12 2024-12-31 2024 2603 EMC 487135
## 13 2024-12-31 2024 6669 Wiwynn 486903
## 14 2024-12-31 2024 1216 Uni-President 459675
## 15 2024-12-31 2024 2357 Asustek 457540
## 16 2024-12-31 2024 2885 Yuanta Group 440057
## 17 2024-12-31 2024 2345 Accton 433744
## 18 2024-12-31 2024 2884 E.S.F.H 431087
## 19 2024-12-31 2024 3045 TWM 422590