# Option 1: Use file.choose() to manually select the file (interactive)
# data <- read_tsv(file.choose())
# Option 2: Specify full file path (update this to your actual path)
# data <- read_tsv("C:/Users/dell/OneDrive/Desktop/tej_day_price_2024_20250630.txt")
# Option 3: Use the file if it's in the same directory as the .Rmd file
data <- read_tsv("tej_day_price_2024_20250630.txt")
# Show data structure
glimpse(data)## Rows: 337,347
## Columns: 12
## $ CO_ID <chr> "1101 TCC", "1102 ACC", "1103 CHC", "1104 UCC", …
## $ Date <dbl> 20240102, 20240102, 20240102, 20240102, 20240102…
## $ `TSE ID` <dbl> 1101, 1102, 1103, 1104, 1108, 1109, 1110, 1201, …
## $ `TSE Sector` <chr> "01", "01", "01", "01", "01", "01", "01", "02", …
## $ `English Short Name` <chr> "TCC", "ACC", "CHC", "UCC", "Lucky Cement", "HSI…
## $ `Open(NTD)` <dbl> 32.5373, 37.2642, 17.7825, 26.0628, 14.1679, 16.…
## $ `High(NTD)` <dbl> 32.5373, 37.4442, 17.7825, 26.1505, 14.1679, 16.…
## $ `Low(NTD)` <dbl> 32.3038, 36.9492, 17.5953, 25.9750, 14.0343, 16.…
## $ `Close(NTD)` <dbl> 32.3972, 37.0392, 17.6421, 26.0628, 14.0788, 16.…
## $ `Volume(1000S)` <dbl> 14937, 6223, 171, 260, 442, 228, 57, 126, 48, 18…
## $ `Amount(NTD1000)` <dbl> 518751, 256522, 3240, 7736, 6992, 4159, 1075, 24…
## $ `Market Cap.(NTD MN)` <dbl> 262026, 145941, 14896, 19995, 6395, 6209, 10754,…
# Replace specific column names
colnames(data)[c(2, 3, 5, 9, 12)] <- c("date", "id", "name", "price", "cap")
head(data)## # A tibble: 6 × 12
## CO_ID date id `TSE Sector` name `Open(NTD)` `High(NTD)` `Low(NTD)` price
## <chr> <dbl> <dbl> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 1101… 2.02e7 1101 01 TCC 32.5 32.5 32.3 32.4
## 2 1102… 2.02e7 1102 01 ACC 37.3 37.4 36.9 37.0
## 3 1103… 2.02e7 1103 01 CHC 17.8 17.8 17.6 17.6
## 4 1104… 2.02e7 1104 01 UCC 26.1 26.2 26.0 26.1
## 5 1108… 2.02e7 1108 01 Luck… 14.2 14.2 14.0 14.1
## 6 1109… 2.02e7 1109 01 HSIN… 16.2 16.3 16.1 16.2
## # ℹ 3 more variables: `Volume(1000S)` <dbl>, `Amount(NTD1000)` <dbl>, cap <dbl>
# Select relevant columns and transform to wide format
data_selected <- data %>%
select(id, date, price) %>%
mutate(
id = as.character(id),
date = ymd(date),
price = as.numeric(price)
) %>%
pivot_wider(names_from = id, values_from = price)
head(data_selected)## # A tibble: 6 × 947
## date `1101` `1102` `1103` `1104` `1108` `1109` `1110` `1201` `1203`
## <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2024-01-02 32.4 37.0 17.6 26.1 14.1 16.2 18.3 18.2 55.3
## 2 2024-01-03 31.9 36.6 17.5 25.9 14.0 16.1 18.2 18.2 54.7
## 3 2024-01-04 31.9 37.0 17.5 25.5 14.1 16.1 18.4 18.1 54.1
## 4 2024-01-05 32.1 37.0 17.5 25.8 14.1 16.1 18.3 18.1 54.9
## 5 2024-01-08 32.0 37.2 17.6 25.7 14.2 16.1 18.3 18.1 54.6
## 6 2024-01-09 31.8 36.9 17.5 25.4 13.8 16.0 18.4 18.1 54.6
## # ℹ 937 more variables: `1210` <dbl>, `1213` <dbl>, `1215` <dbl>, `1216` <dbl>,
## # `1217` <dbl>, `1218` <dbl>, `1219` <dbl>, `1220` <dbl>, `1225` <dbl>,
## # `1227` <dbl>, `1229` <dbl>, `1231` <dbl>, `1232` <dbl>, `1233` <dbl>,
## # `1234` <dbl>, `1235` <dbl>, `1236` <dbl>, `1301` <dbl>, `1303` <dbl>,
## # `1304` <dbl>, `1305` <dbl>, `1307` <dbl>, `1308` <dbl>, `1309` <dbl>,
## # `1310` <dbl>, `1312` <dbl>, `1313` <dbl>, `1314` <dbl>, `1315` <dbl>,
## # `1316` <dbl>, `1319` <dbl>, `1321` <dbl>, `1323` <dbl>, `1324` <dbl>, …
# Summarize NA values by stock
na_summary <- data_selected %>%
summarise(across(everything(), ~sum(is.na(.)))) %>%
pivot_longer(everything(), names_to = "stock_id", values_to = "na_count") %>%
filter(na_count > 0) %>%
arrange(stock_id)
print(na_summary)## # A tibble: 10 × 2
## stock_id na_count
## <chr> <int>
## 1 3716 160
## 2 4585 177
## 3 7722 18
## 4 7732 43
## 5 7736 53
## 6 7750 108
## 7 7765 151
## 8 7780 196
## 9 7788 198
## 10 7799 217
Total stocks with NA values: 10
# Replace NA values with last observation carried forward
data_filled <- data_selected %>%
mutate(across(-date, ~na.locf(., na.rm = FALSE)))
head(data_filled)## # A tibble: 6 × 947
## date `1101` `1102` `1103` `1104` `1108` `1109` `1110` `1201` `1203`
## <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2024-01-02 32.4 37.0 17.6 26.1 14.1 16.2 18.3 18.2 55.3
## 2 2024-01-03 31.9 36.6 17.5 25.9 14.0 16.1 18.2 18.2 54.7
## 3 2024-01-04 31.9 37.0 17.5 25.5 14.1 16.1 18.4 18.1 54.1
## 4 2024-01-05 32.1 37.0 17.5 25.8 14.1 16.1 18.3 18.1 54.9
## 5 2024-01-08 32.0 37.2 17.6 25.7 14.2 16.1 18.3 18.1 54.6
## 6 2024-01-09 31.8 36.9 17.5 25.4 13.8 16.0 18.4 18.1 54.6
## # ℹ 937 more variables: `1210` <dbl>, `1213` <dbl>, `1215` <dbl>, `1216` <dbl>,
## # `1217` <dbl>, `1218` <dbl>, `1219` <dbl>, `1220` <dbl>, `1225` <dbl>,
## # `1227` <dbl>, `1229` <dbl>, `1231` <dbl>, `1232` <dbl>, `1233` <dbl>,
## # `1234` <dbl>, `1235` <dbl>, `1236` <dbl>, `1301` <dbl>, `1303` <dbl>,
## # `1304` <dbl>, `1305` <dbl>, `1307` <dbl>, `1308` <dbl>, `1309` <dbl>,
## # `1310` <dbl>, `1312` <dbl>, `1313` <dbl>, `1314` <dbl>, `1315` <dbl>,
## # `1316` <dbl>, `1319` <dbl>, `1321` <dbl>, `1323` <dbl>, `1324` <dbl>, …
# Remove stocks that had NA values
stocks_with_na <- na_summary$stock_id
data_clean <- data_selected %>%
select(-any_of(stocks_with_na))
cat("Updated dimensions after removing stocks with NA:\n")## Updated dimensions after removing stocks with NA:
## Rows: 358
## Columns: 937
Final dataset: 358 rows × 937 columns
# Ensure date is properly formatted
data_clean <- data_clean %>%
mutate(date = as.Date(date))
# Convert to xts object
data_xts <- xts(data_clean[, -1], order.by = data_clean$date)
# Calculate daily returns
daily_returns <- Return.calculate(data_xts, method = "discrete")
# Remove first row (NA) and show first 5 stocks with first 5 days
daily_returns <- daily_returns[-1, ]
head(daily_returns[, 1:5], 5)## 1101 1102 1103 1104 1108
## 2024-01-03 -0.014408653 -0.012149290 -0.007958236 -0.006733735 -0.003160781
## 2024-01-04 0.000000000 0.012298711 0.000000000 -0.013558772 0.003170803
## 2024-01-05 0.004384536 -0.001214929 0.002674026 0.010306896 0.000000000
## 2024-01-08 -0.002909225 0.004865628 0.002666895 -0.003399291 0.006328664
## 2024-01-09 -0.005841680 -0.007263102 -0.002659801 -0.013655209 -0.025155457
# Get last day of each month
monthly_prices <- apply.monthly(data_xts, last)
# Calculate monthly returns
monthly_returns <- Return.calculate(monthly_prices, method = "discrete")
# Remove first row (NA) and show first 5 stocks with first 5 months
monthly_returns <- monthly_returns[-1, ]
head(monthly_returns[, 1:5], 5)## 1101 1102 1103 1104 1108
## 2024-02-29 0.006272034 0.01760804 -0.008404066 0.01887014 0.036185231
## 2024-03-29 0.001554899 0.02101398 -0.016950585 0.06397241 0.003170803
## 2024-04-30 -0.003108301 0.05811288 0.057470064 0.11234002 0.072790295
## 2024-05-31 0.029639309 -0.04920108 0.032611536 -0.03271487 0.000000000
## 2024-06-28 0.036364817 0.05535680 -0.036845213 0.04852830 -0.011805133
# Convert and clean the data
data_long <- data %>%
mutate(date = as.Date(as.character(date), format = "%Y%m%d")) %>% # Proper conversion
filter(!is.na(date), !is.na(cap))
# Year-end 2024
top20_2024 <- data_long %>%
filter(year(date) == 2024) %>%
group_by(id, name) %>%
slice_max(date, n = 1) %>%
ungroup() %>%
arrange(desc(cap)) %>%
slice(1:20) %>%
select(date, cap, id, name)
print(top20_2024)## # A tibble: 20 × 4
## date cap id name
## <date> <dbl> <dbl> <chr>
## 1 2024-12-31 27877688 2330 TSMC
## 2 2024-12-31 2556073 2317 Hon Hai
## 3 2024-12-31 2266389 2454 MediaTek
## 4 2024-12-31 1234015 2881 Fubon Financial
## 5 2024-12-31 1118242 2308 DELTA
## 6 2024-12-31 1108574 2382 QCI
## 7 2024-12-31 1001907 2882 CATHAY FHC
## 8 2024-12-31 958045 2412 CHT
## 9 2024-12-31 767154 2891 CTBC Holding
## 10 2024-12-31 715219 3711 ASEH
## 11 2024-12-31 574052 2886 Mega FHC
## 12 2024-12-31 540739 2303 UMC
## 13 2024-12-31 487135 2603 EMC
## 14 2024-12-31 486903 6669 Wiwynn
## 15 2024-12-31 459675 1216 Uni-President
## 16 2024-12-31 457540 2357 Asustek
## 17 2024-12-31 440057 2885 Yuanta Group
## 18 2024-12-31 433744 2345 Accton
## 19 2024-12-31 431087 2884 E.S.F.H
## 20 2024-12-31 422590 3045 TWM
# Year-end 2025
top20_2025 <- data_long %>%
filter(year(date) == 2025) %>%
group_by(id, name) %>%
slice_max(date, n = 1) %>%
ungroup() %>%
arrange(desc(cap)) %>%
slice(1:20) %>%
select(date, cap, id, name)
print(top20_2025)## # A tibble: 20 × 4
## date cap id name
## <date> <dbl> <dbl> <chr>
## 1 2025-06-30 27488572 2330 TSMC
## 2 2025-06-30 2236570 2317 Hon Hai
## 3 2025-06-30 2002073 2454 MediaTek
## 4 2025-06-30 1193018 2881 Fubon Financial
## 5 2025-06-30 1072785 2308 DELTA
## 6 2025-06-30 1060291 2382 QCI
## 7 2025-06-30 1047255 2412 CHT
## 8 2025-06-30 921226 2882 CATHAY FHC
## 9 2025-06-30 858722 2891 CTBC Holding
## 10 2025-06-30 652308 3711 ASEH
## 11 2025-06-30 608910 2886 Mega FHC
## 12 2025-06-30 554998 2303 UMC
## 13 2025-06-30 525462 2884 E.S.F.H
## 14 2025-06-30 478338 2357 Asustek
## 15 2025-06-30 470177 6669 Wiwynn
## 16 2025-06-30 460243 1216 Uni-President
## 17 2025-06-30 441999 2885 Yuanta Group
## 18 2025-06-30 430844 2603 EMC
## 19 2025-06-30 428175 3045 TWM
## 20 2025-06-30 409616 2345 Accton