Financial Database Analysis and Application

Setup

Clear environment and load required packages:

# Clear environment
rm(list=ls())

# Load required packages
library(tidyverse)
library(tidyquant)
library(timetk)
library(zoo)
library(knitr)
library(DT)

Question 1: Import Data

Task: Please load up packages tidyquant, tidyverse and timetk. Import the data file from Tronclass: tej_day_price_2024_20250630.txt. Try using functions read_csv(), read_tsv() and read_delim() to import data. Show me the imported results of by using one of the three functions: glimpse(), head() or str() to show the results.

# Method 1: read_csv()
data_csv <- read_csv("tej_day_price_2024_20250630.txt")

# Method 2: read_tsv() - recommended for tab-separated files
data_tsv <- read_tsv("tej_day_price_2024_20250630.txt")

# Method 3: read_delim() with explicit delimiter
data_delim <- read_delim("tej_day_price_2024_20250630.txt", delim = "\t")

# Show imported results using glimpse()
glimpse(data_tsv)

## Rows: 337,347
## Columns: 12
## $ CO_ID                 <chr> "1101 TCC", "1102 ACC", "1103 CHC", "1104 UCC", …
## $ Date                  <dbl> 20240102, 20240102, 20240102, 20240102, 20240102…
## $ `TSE ID`              <dbl> 1101, 1102, 1103, 1104, 1108, 1109, 1110, 1201, …
## $ `TSE Sector`          <chr> "01", "01", "01", "01", "01", "01", "01", "02", …
## $ `English Short Name`  <chr> "TCC", "ACC", "CHC", "UCC", "Lucky Cement", "HSI…
## $ `Open(NTD)`           <dbl> 32.5373, 37.2642, 17.7825, 26.0628, 14.1679, 16.…
## $ `High(NTD)`           <dbl> 32.5373, 37.4442, 17.7825, 26.1505, 14.1679, 16.…
## $ `Low(NTD)`            <dbl> 32.3038, 36.9492, 17.5953, 25.9750, 14.0343, 16.…
## $ `Close(NTD)`          <dbl> 32.3972, 37.0392, 17.6421, 26.0628, 14.0788, 16.…
## $ `Volume(1000S)`       <dbl> 14937, 6223, 171, 260, 442, 228, 57, 126, 48, 18…
## $ `Amount(NTD1000)`     <dbl> 518751, 256522, 3240, 7736, 6992, 4159, 1075, 24…
## $ `Market Cap.(NTD MN)` <dbl> 262026, 145941, 14896, 19995, 6395, 6209, 10754,…

# Use data_tsv for subsequent analysis
data <- data_tsv

Question 2: Replace Column Names

Task: Replace column 2, 3, 5, 9 and 12 with new column names: “date”, “id”, “name”, “price”, “cap”. Show your results.

# Replace column 2, 3, 5, 9, 12 with new names
# Column 2: Date → date
# Column 3: TSE ID → id  
# Column 5: English Short Name → name
# Column 9: Close(NTD) → price
# Column 12: Market Cap.(NTD MN) → cap

data_q2 <- data %>%
  select(
    date = 2,      # Date
    id = 3,        # TSE ID
    name = 5,      # English Short Name
    price = 9,     # Close(NTD)
    cap = 12       # Market Cap.(NTD MN)
  )

glimpse(data_q2)

## Rows: 337,347
## Columns: 5
## $ date  <dbl> 20240102, 20240102, 20240102, 20240102, 20240102, 20240102, 2024…
## $ id    <dbl> 1101, 1102, 1103, 1104, 1108, 1109, 1110, 1201, 1203, 1210, 1213…
## $ name  <chr> "TCC", "ACC", "CHC", "UCC", "Lucky Cement", "HSINGTA", "Tuna Cem…
## $ price <dbl> 32.3972, 37.0392, 17.6421, 26.0628, 14.0788, 16.1807, 18.3336, 1…
## $ cap   <dbl> 262026, 145941, 14896, 19995, 6395, 6209, 10754, 9640, 13992, 52…

Question 3: Convert to Wide Format

Task: Select column id, date, price, and change id format to text, date format to date. Also change the data format from long to wide and show your results.

# Select id, date, price and convert formats
data_q3 <- data_q2 %>%
  select(id, date, price) %>%
  mutate(
    id = as.character(id),           # Change id to text
    date = ymd(date)                 # Change date to Date format
  ) %>%
  pivot_wider(                       # Convert long to wide
    names_from = id,
    values_from = price
  )

# Show results
head(data_q3, 10)

## # A tibble: 10 × 947
##    date       `1101` `1102` `1103` `1104` `1108` `1109` `1110` `1201` `1203`
##    <date>      <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
##  1 2024-01-02   32.4   37.0   17.6   26.1   14.1   16.2   18.3   18.2   55.3
##  2 2024-01-03   31.9   36.6   17.5   25.9   14.0   16.1   18.2   18.2   54.7
##  3 2024-01-04   31.9   37.0   17.5   25.5   14.1   16.1   18.4   18.1   54.1
##  4 2024-01-05   32.1   37.0   17.5   25.8   14.1   16.1   18.3   18.1   54.9
##  5 2024-01-08   32.0   37.2   17.6   25.7   14.2   16.1   18.3   18.1   54.6
##  6 2024-01-09   31.8   36.9   17.5   25.4   13.8   16.0   18.4   18.1   54.6
##  7 2024-01-10   31.5   36.6   17.4   25.4   13.7   16.0   18.5   18.0   53.4
##  8 2024-01-11   31.5   36.7   17.5   25.6   13.8   16.1   18.5   18.0   55.1
##  9 2024-01-12   31.5   36.6   17.5   25.6   13.8   16.0   18.4   18.0   54.4
## 10 2024-01-15   31.4   36.6   17.4   25.4   13.8   16.0   18.3   18.0   53.8
## # ℹ 937 more variables: `1210` <dbl>, `1213` <dbl>, `1215` <dbl>, `1216` <dbl>,
## #   `1217` <dbl>, `1218` <dbl>, `1219` <dbl>, `1220` <dbl>, `1225` <dbl>,
## #   `1227` <dbl>, `1229` <dbl>, `1231` <dbl>, `1232` <dbl>, `1233` <dbl>,
## #   `1234` <dbl>, `1235` <dbl>, `1236` <dbl>, `1301` <dbl>, `1303` <dbl>,
## #   `1304` <dbl>, `1305` <dbl>, `1307` <dbl>, `1308` <dbl>, `1309` <dbl>,
## #   `1310` <dbl>, `1312` <dbl>, `1313` <dbl>, `1314` <dbl>, `1315` <dbl>,
## #   `1316` <dbl>, `1319` <dbl>, `1321` <dbl>, `1323` <dbl>, `1324` <dbl>, …

Question 4: Identify Stocks with NA Values

Task: Show the stock ids with NA values and compute the number of NA for each stock.

# Count NA values for each stock
na_summary <- data_q3 %>%
  select(-date) %>%                          # Exclude date column
  summarise(across(everything(), 
                   ~sum(is.na(.)))) %>%      # Count NAs per column
  pivot_longer(everything(),                 # Convert to long format
               names_to = "key",
               values_to = "value") %>%
  filter(value > 0) %>%                      # Keep only stocks with NA
  arrange(value)                             # Sort by number of NAs

# Display as interactive table
datatable(na_summary, 
          caption = "Stocks with NA Values",
          options = list(pageLength = 10))

Summary: There are 10 stocks with NA values.

Question 5: Fill NA Values

Task: Replace NA values with the closest available stock prices (using na.locf()).

# Fill NA values with last observation carried forward
data_q5 <- data_q3 %>%
  mutate(across(-date, ~na.locf(., na.rm = FALSE)))

# Show results
head(data_q5, 10)

## # A tibble: 10 × 947
##    date       `1101` `1102` `1103` `1104` `1108` `1109` `1110` `1201` `1203`
##    <date>      <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
##  1 2024-01-02   32.4   37.0   17.6   26.1   14.1   16.2   18.3   18.2   55.3
##  2 2024-01-03   31.9   36.6   17.5   25.9   14.0   16.1   18.2   18.2   54.7
##  3 2024-01-04   31.9   37.0   17.5   25.5   14.1   16.1   18.4   18.1   54.1
##  4 2024-01-05   32.1   37.0   17.5   25.8   14.1   16.1   18.3   18.1   54.9
##  5 2024-01-08   32.0   37.2   17.6   25.7   14.2   16.1   18.3   18.1   54.6
##  6 2024-01-09   31.8   36.9   17.5   25.4   13.8   16.0   18.4   18.1   54.6
##  7 2024-01-10   31.5   36.6   17.4   25.4   13.7   16.0   18.5   18.0   53.4
##  8 2024-01-11   31.5   36.7   17.5   25.6   13.8   16.1   18.5   18.0   55.1
##  9 2024-01-12   31.5   36.6   17.5   25.6   13.8   16.0   18.4   18.0   54.4
## 10 2024-01-15   31.4   36.6   17.4   25.4   13.8   16.0   18.3   18.0   53.8
## # ℹ 937 more variables: `1210` <dbl>, `1213` <dbl>, `1215` <dbl>, `1216` <dbl>,
## #   `1217` <dbl>, `1218` <dbl>, `1219` <dbl>, `1220` <dbl>, `1225` <dbl>,
## #   `1227` <dbl>, `1229` <dbl>, `1231` <dbl>, `1232` <dbl>, `1233` <dbl>,
## #   `1234` <dbl>, `1235` <dbl>, `1236` <dbl>, `1301` <dbl>, `1303` <dbl>,
## #   `1304` <dbl>, `1305` <dbl>, `1307` <dbl>, `1308` <dbl>, `1309` <dbl>,
## #   `1310` <dbl>, `1312` <dbl>, `1313` <dbl>, `1314` <dbl>, `1315` <dbl>,
## #   `1316` <dbl>, `1319` <dbl>, `1321` <dbl>, `1323` <dbl>, `1324` <dbl>, …

Question 6: Remove Stocks with NA

Task: Delete the stock which contains prices of NA in your data in question 4. Show the updated number of rows and columns in your filtered data.

# Get list of stock IDs that have NA values
stocks_with_na <- na_summary$key

# Remove these stocks from the dataset
data_q6 <- data_q3 %>%
  select(-all_of(stocks_with_na))

# Show updated dimensions
cat("Number of rows:", nrow(data_q6), "\n")

## Number of rows: 358

cat("Number of columns:", ncol(data_q6), "\n")

## Number of columns: 937

Results: After removing stocks with NA values: - Rows: 358 - Columns: 937 (1 date column + 936 stock columns) - Stocks removed: 10

Question 7: Calculate Daily Returns

Task: Convert data in Question 6 into time series data (xts). And calculate daily rate of returns (compute discrete returns). Delete the first row and show the first five stocks with first five days of returns.

# Convert to xts time series data
data_q7_xts <- data_q6 %>%
  tk_xts(date_var = date)

# Calculate daily rate of returns (discrete)
returns_daily <- Return.calculate(data_q7_xts, method = "discrete")

# Delete the first row (NA values from return calculation)
returns_daily <- returns_daily[-1, ]

# Show first 5 stocks with first 5 days of returns
head(returns_daily[, 1:5], 5)

##                    1101         1102         1103         1104         1108
## 2024-01-03 -0.014408653 -0.012149290 -0.007958236 -0.006733735 -0.003160781
## 2024-01-04  0.000000000  0.012298711  0.000000000 -0.013558772  0.003170803
## 2024-01-05  0.004384536 -0.001214929  0.002674026  0.010306896  0.000000000
## 2024-01-08 -0.002909225  0.004865628  0.002666895 -0.003399291  0.006328664
## 2024-01-09 -0.005841680 -0.007263102 -0.002659801 -0.013655209 -0.025155457

Question 8: Calculate Monthly Returns

Task: Compute monthly returns. Delete the first row and show the first five stocks with first five months of returns.

# Convert daily data to monthly using to.monthly()
data_monthly <- to.monthly(data_q7_xts, OHLC = FALSE)

# Calculate monthly returns (discrete)
returns_monthly <- Return.calculate(data_monthly, method = "discrete")

# Delete the first row
returns_monthly <- returns_monthly[-1, ]

# Show first 5 stocks with first 5 months of returns
head(returns_monthly[, 1:5], 5)

##                  1101        1102         1103        1104         1108
## Feb 2024  0.006272034  0.01760804 -0.008404066  0.01887014  0.036185231
## Mar 2024  0.001554899  0.02101398 -0.016950585  0.06397241  0.003170803
## Apr 2024 -0.003108301  0.05811288  0.057470064  0.11234002  0.072790295
## May 2024  0.029639309 -0.04920108  0.032611536 -0.03271487  0.000000000
## Jun 2024  0.036364817  0.05535680 -0.036845213  0.04852830 -0.011805133

Question 9: Top 20 Largest Cap Firms

Task: Find the 20 largest cap firms in the year end of 2024 and 2025. Show the results.

# Find 20 largest cap firms for year-end 2024 and 2025
largest_caps <- data_q2 %>%
  mutate(
    date = ymd(date),                        # Convert to date
    year1 = year(date),                      # Extract year
    id = as.character(id)                    # Convert id to text
  ) %>%
  filter(
    date == as.Date("2024-12-31") |          # Year-end 2024
    date == as.Date("2025-06-30")            # Year-end 2025 (last date)
  ) %>%
  group_by(year1) %>%                        # Group by year
  arrange(desc(cap)) %>%                     # Sort by cap descending
  slice(1:20) %>%                            # Take top 20
  ungroup() %>%                              # Remove grouping
  mutate(cap1 = scales::dollar(cap,          # Format cap with dollar sign
                               prefix = "$", 
                               big.mark = ",", 
                               accuracy = 1)) %>%
  select(date, year1, cap, cap1, id, name)   # Select relevant columns

# Display as interactive table
datatable(largest_caps, 
          caption = "Top 20 Largest Cap Firms (2024 & 2025)",
          options = list(pageLength = 20))

Top 10 Firms in 2024

largest_caps %>%
  filter(year1 == 2024) %>%
  head(10) %>%
  kable(caption = "Top 10 Largest Market Cap Firms - Year End 2024")

Top 10 Largest Market Cap Firms - Year End 2024
date	year1	cap	cap1	id	name
2024-12-31	2024	27877688	$27,877,688	2330	TSMC
2024-12-31	2024	2556073	$2,556,073	2317	Hon Hai
2024-12-31	2024	2266389	$2,266,389	2454	MediaTek
2024-12-31	2024	1234015	$1,234,015	2881	Fubon Financial
2024-12-31	2024	1118242	$1,118,242	2308	DELTA
2024-12-31	2024	1108574	$1,108,574	2382	QCI
2024-12-31	2024	1001907	$1,001,907	2882	CATHAY FHC
2024-12-31	2024	958045	$958,045	2412	CHT
2024-12-31	2024	767154	$767,154	2891	CTBC Holding
2024-12-31	2024	715219	$715,219	3711	ASEH

Top 10 Firms in 2025

largest_caps %>%
  filter(year1 == 2025) %>%
  head(10) %>%
  kable(caption = "Top 10 Largest Market Cap Firms - Year End 2025")

Top 10 Largest Market Cap Firms - Year End 2025
date	year1	cap	cap1	id	name
2025-06-30	2025	27488572	$27,488,572	2330	TSMC
2025-06-30	2025	2236570	$2,236,570	2317	Hon Hai
2025-06-30	2025	2002073	$2,002,073	2454	MediaTek
2025-06-30	2025	1193018	$1,193,018	2881	Fubon Financial
2025-06-30	2025	1072785	$1,072,785	2308	DELTA
2025-06-30	2025	1060291	$1,060,291	2382	QCI
2025-06-30	2025	1047255	$1,047,255	2412	CHT
2025-06-30	2025	921226	$921,226	2882	CATHAY FHC
2025-06-30	2025	858722	$858,722	2891	CTBC Holding
2025-06-30	2025	652308	$652,308	3711	ASEH

Summary Statistics

summary_stats <- data.frame(
  Metric = c("Original data rows", 
             "Original data columns",
             "After cleaning rows",
             "After cleaning columns",
             "Stocks removed",
             "Daily returns rows",
             "Daily returns columns",
             "Monthly returns rows",
             "Monthly returns columns"),
  Value = c(nrow(data),
            ncol(data),
            nrow(data_q6),
            ncol(data_q6),
            length(stocks_with_na),
            nrow(returns_daily),
            ncol(returns_daily),
            nrow(returns_monthly),
            ncol(returns_monthly))
)

kable(summary_stats, caption = "Analysis Summary")

Analysis Summary
Metric	Value
Original data rows	337347
Original data columns	12
After cleaning rows	358
After cleaning columns	937
Stocks removed	10
Daily returns rows	357
Daily returns columns	936
Monthly returns rows	17
Monthly returns columns	936

Conclusion

This analysis successfully:

✓ Imported and processed Taiwan stock market data
✓ Cleaned data by handling NA values
✓ Converted data formats (long to wide, regular to xts)
✓ Calculated daily and monthly returns
✓ Identified top firms by market capitalization

Total stocks analyzed: 936 stocks
Time period: 2024-01-02 to 2025-06-30
Total trading days: 358

Session Information

sessionInfo()

## R version 4.5.1 (2025-06-13)
## Platform: x86_64-apple-darwin20
## Running under: macOS Sonoma 14.7.2
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.5-x86_64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-x86_64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: Asia/Taipei
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] DT_0.34.0                  knitr_1.50                
##  [3] timetk_2.9.1               PerformanceAnalytics_2.0.8
##  [5] quantmod_0.4.28            TTR_0.24.4                
##  [7] xts_0.14.1                 zoo_1.8-14                
##  [9] tidyquant_1.0.11           lubridate_1.9.4           
## [11] forcats_1.0.1              stringr_1.5.2             
## [13] dplyr_1.1.4                purrr_1.1.0               
## [15] readr_2.1.5                tidyr_1.3.1               
## [17] tibble_3.3.0               ggplot2_4.0.0             
## [19] tidyverse_2.0.0           
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.2.1    timeDate_4041.110   farver_2.1.2       
##  [4] S7_0.2.0            fastmap_1.2.0       digest_0.6.37      
##  [7] rpart_4.1.24        timechange_0.3.0    lifecycle_1.0.4    
## [10] survival_3.8-3      magrittr_2.0.3      compiler_4.5.1     
## [13] rlang_1.1.6         sass_0.4.10         tools_4.5.1        
## [16] yaml_2.3.10         data.table_1.17.8   htmlwidgets_1.6.4  
## [19] bit_4.6.0           curl_7.0.0          RColorBrewer_1.1-3 
## [22] withr_3.0.2         nnet_7.3-20         grid_4.5.1         
## [25] future_1.67.0       globals_0.18.0      scales_1.4.0       
## [28] MASS_7.3-65         cli_3.6.5           crayon_1.5.3       
## [31] rmarkdown_2.29      generics_0.1.4      rstudioapi_0.17.1  
## [34] future.apply_1.20.0 tzdb_0.5.0          cachem_1.1.0       
## [37] splines_4.5.1       parallel_4.5.1      vctrs_0.6.5        
## [40] hardhat_1.4.2       Matrix_1.7-3        jsonlite_2.0.0     
## [43] hms_1.1.3           bit64_4.6.0-1       RobStatTM_1.0.11   
## [46] listenv_0.9.1       crosstalk_1.2.2     gower_1.0.2        
## [49] jquerylib_0.1.4     recipes_1.3.1       glue_1.8.0         
## [52] parallelly_1.45.1   codetools_0.2-20    rsample_1.3.1      
## [55] stringi_1.8.7       gtable_0.3.6        quadprog_1.5-8     
## [58] pillar_1.11.1       furrr_0.3.1         htmltools_0.5.8.1  
## [61] ipred_0.9-15        lava_1.8.1          R6_2.6.1           
## [64] vroom_1.6.6         evaluate_1.0.5      lattice_0.22-7     
## [67] bslib_0.9.0         class_7.3-23        Rcpp_1.1.0         
## [70] prodlim_2025.04.28  xfun_0.53           pkgconfig_2.0.3

Financial Database Analysis and Application - Midterm

Purevjav Odkhuu

2025/10/27