1. Load packages and import data

import pandas as pd
import numpy as np
import gzip
from pathlib import Path

gz_path = Path("tej_day_price_2024_20250630.txt.gz")
if not gz_path.exists():
    raise FileNotFoundError("Could not find 'tej_day_price_2024_20250630.txt.gz' in the working directory.")

with gzip.open(gz_path, "rt", encoding="utf-8") as f:
    df_raw = pd.read_csv(f, sep="\t")

print("Shape:", df_raw.shape)
## Shape: (337347, 12)
print(df_raw.dtypes)
## CO_ID                   object
## Date                     int64
## TSE ID                   int64
## TSE Sector               int64
## English Short Name      object
## Open(NTD)              float64
## High(NTD)              float64
## Low(NTD)               float64
## Close(NTD)             float64
## Volume(1000S)            int64
## Amount(NTD1000)          int64
## Market Cap.(NTD MN)      int64
## dtype: object
print(df_raw.head())
##                CO_ID      Date  ...  Amount(NTD1000)  Market Cap.(NTD MN)
## 0           1101 TCC  20240102  ...           518751               262026
## 1           1102 ACC  20240102  ...           256522               145941
## 2           1103 CHC  20240102  ...             3240                14896
## 3           1104 UCC  20240102  ...             7736                19995
## 4  1108 Lucky Cement  20240102  ...             6992                 6395
## 
## [5 rows x 12 columns]

2. Rename columns

orig_cols = list(df_raw.columns)
print("Original columns:", orig_cols)
## Original columns: ['CO_ID', 'Date', 'TSE ID', 'TSE Sector', 'English Short Name', 'Open(NTD)', 'High(NTD)', 'Low(NTD)', 'Close(NTD)', 'Volume(1000S)', 'Amount(NTD1000)', 'Market Cap.(NTD MN)']
df_sel = df_raw.iloc[:, [1, 2, 4, 8, 11]].copy()
df_sel.columns = ["date", "id", "name", "price", "cap"]

print(df_sel.head())
##        date    id          name    price     cap
## 0  20240102  1101           TCC  32.3972  262026
## 1  20240102  1102           ACC  37.0392  145941
## 2  20240102  1103           CHC  17.6421   14896
## 3  20240102  1104           UCC  26.0628   19995
## 4  20240102  1108  Lucky Cement  14.0788    6395

3. Long → wide

df_prices = df_sel[["id", "date", "price"]].copy()
df_prices["id"] = df_prices["id"].astype(str)
df_prices["date"] = pd.to_datetime(df_prices["date"].astype(str), format="%Y%m%d")

price_wide = df_prices.pivot(index="date", columns="id", values="price").sort_index()

print(price_wide.shape)
## (358, 946)
print(price_wide.head(10))
## id             1101     1102     1103  ...     9946   9955      9958
## date                                   ...                          
## 2024-01-02  32.3972  37.0392  17.6421  ...  19.4864  24.30  167.3796
## 2024-01-03  31.9304  36.5892  17.5017  ...  19.3937  24.30  171.2385
## 2024-01-04  31.9304  37.0392  17.5017  ...  19.8112  24.00  173.6503
## 2024-01-05  32.0704  36.9942  17.5485  ...  19.8112  24.35  177.0269
## 2024-01-08  31.9771  37.1742  17.5953  ...  19.8576  24.10  177.0269
## 2024-01-09  31.7903  36.9042  17.5485  ...  19.2081  23.70  179.9210
## 2024-01-10  31.4636  36.6342  17.4081  ...  19.0689  23.40  182.3328
## 2024-01-11  31.4636  36.7242  17.5017  ...  19.0225  23.55  184.7446
## 2024-01-12  31.5103  36.5892  17.5017  ...  19.0689  23.60  182.3328
## 2024-01-15  31.4169  36.5892  17.4081  ...  19.3937  23.85  196.3213
## 
## [10 rows x 946 columns]

4. NAs

na_counts = price_wide.isna().sum()
na_counts = na_counts[na_counts > 0].sort_values(ascending=False)
na_df = na_counts.reset_index()
na_df.columns = ["id", "na_count"]
print(na_df.head(20))
##      id  na_count
## 0  7799       217
## 1  7788       198
## 2  7780       196
## 3  4585       177
## 4  3716       160
## 5  7765       151
## 6  7750       108
## 7  7736        53
## 8  7732        43
## 9  7722        18

5. LOCF-like fill

price_filled = price_wide.ffill().bfill()
print(price_filled.head(10))
## id             1101     1102     1103  ...     9946   9955      9958
## date                                   ...                          
## 2024-01-02  32.3972  37.0392  17.6421  ...  19.4864  24.30  167.3796
## 2024-01-03  31.9304  36.5892  17.5017  ...  19.3937  24.30  171.2385
## 2024-01-04  31.9304  37.0392  17.5017  ...  19.8112  24.00  173.6503
## 2024-01-05  32.0704  36.9942  17.5485  ...  19.8112  24.35  177.0269
## 2024-01-08  31.9771  37.1742  17.5953  ...  19.8576  24.10  177.0269
## 2024-01-09  31.7903  36.9042  17.5485  ...  19.2081  23.70  179.9210
## 2024-01-10  31.4636  36.6342  17.4081  ...  19.0689  23.40  182.3328
## 2024-01-11  31.4636  36.7242  17.5017  ...  19.0225  23.55  184.7446
## 2024-01-12  31.5103  36.5892  17.5017  ...  19.0689  23.60  182.3328
## 2024-01-15  31.4169  36.5892  17.4081  ...  19.3937  23.85  196.3213
## 
## [10 rows x 946 columns]

6. Drop cols with NA

cols_with_na = price_wide.columns[price_wide.isna().any()].tolist()
price_complete = price_filled.drop(columns=cols_with_na)
print("Original shape (wide):", price_wide.shape)
## Original shape (wide): (358, 946)
print("After dropping NA columns:", price_complete.shape)
## After dropping NA columns: (358, 936)

7. Daily returns

ret_daily = price_complete.pct_change()
ret_daily = ret_daily.iloc[1:, :]
first_5_cols = ret_daily.columns[:5]
print(ret_daily[first_5_cols].head(5))
## id              1101      1102      1103      1104      1108
## date                                                        
## 2024-01-03 -0.014409 -0.012149 -0.007958 -0.006734 -0.003161
## 2024-01-04  0.000000  0.012299  0.000000 -0.013559  0.003171
## 2024-01-05  0.004385 -0.001215  0.002674  0.010307  0.000000
## 2024-01-08 -0.002909  0.004866  0.002667 -0.003399  0.006329
## 2024-01-09 -0.005842 -0.007263 -0.002660 -0.013655 -0.025155

8. Monthly returns

monthly_prices = price_complete.resample("M").last()
ret_monthly = monthly_prices.pct_change()
ret_monthly = ret_monthly.iloc[1:, :]
first_5_cols_m = ret_monthly.columns[:5]
print(ret_monthly[first_5_cols_m].head(5))
## id              1101      1102      1103      1104      1108
## date                                                        
## 2024-02-29  0.006272  0.017608 -0.008404  0.018870  0.036185
## 2024-03-31  0.001555  0.021014 -0.016951  0.063972  0.003171
## 2024-04-30 -0.003108  0.058113  0.057470  0.112340  0.072790
## 2024-05-31  0.029639 -0.049201  0.032612 -0.032715  0.000000
## 2024-06-30  0.036365  0.055357 -0.036845  0.048528 -0.011805

9. 20 largest cap firms 2024 & 2025

df_cap = df_sel.copy()
df_cap["date"] = pd.to_datetime(df_cap["date"].astype(str), format="%Y%m%d")

target_dates = [pd.Timestamp("2024-12-31"), pd.Timestamp("2025-12-31")]
results = []
for d in target_dates:
    tmp = df_cap[df_cap["date"] == d].copy()
    if tmp.empty:
        print(f"No data for {d.date()}")
        continue
    tmp = tmp.sort_values("cap", ascending=False).head(20)
    tmp["year"] = d.year
    results.append(tmp)
## No data for 2025-12-31
if results:
    top_cap = pd.concat(results, ignore_index=True)
    print(top_cap[["date", "year", "id", "name", "cap"]])
else:
    print("No year-end data found for 2024 or 2025 in this file.")
##          date  year    id             name       cap
## 0  2024-12-31  2024  2330             TSMC  27877688
## 1  2024-12-31  2024  2317          Hon Hai   2556073
## 2  2024-12-31  2024  2454         MediaTek   2266389
## 3  2024-12-31  2024  2881  Fubon Financial   1234015
## 4  2024-12-31  2024  2308            DELTA   1118242
## 5  2024-12-31  2024  2382              QCI   1108574
## 6  2024-12-31  2024  2882       CATHAY FHC   1001907
## 7  2024-12-31  2024  2412              CHT    958045
## 8  2024-12-31  2024  2891     CTBC Holding    767154
## 9  2024-12-31  2024  3711             ASEH    715219
## 10 2024-12-31  2024  2886         Mega FHC    574052
## 11 2024-12-31  2024  2303              UMC    540739
## 12 2024-12-31  2024  2603              EMC    487135
## 13 2024-12-31  2024  6669           Wiwynn    486903
## 14 2024-12-31  2024  1216    Uni-President    459675
## 15 2024-12-31  2024  2357          Asustek    457540
## 16 2024-12-31  2024  2885     Yuanta Group    440057
## 17 2024-12-31  2024  2345           Accton    433744
## 18 2024-12-31  2024  2884          E.S.F.H    431087
## 19 2024-12-31  2024  3045              TWM    422590