BAV demo

Author

Mike Nguyen

Code
library(reticulate)
py_require(
  c(
    "pandas",
    "seaborn",
    "matplotlib",
    "pymysql",
    "dotenv",
    "sqlalchemy",
    "fuzzywuzzy",
    "pymongo"
    # "urllib"
  )
)

Connect

Code
import pymysql
from dotenv import load_dotenv
import os

# Load .env file
load_dotenv()
True
Code

# Connect
conn = pymysql.connect(
    host=os.getenv('SQL_HOST'),
    port=int(os.getenv('SQL_PORT')),
    user=os.getenv('SQL_USER'),
    password=os.getenv('SQL_PASS'),
    database=os.getenv('SQL_DB')
)

cursor = conn.cursor()

# Show all tables
# cursor.execute("SHOW TABLES")
# tables = cursor.fetchall()
# print("Available tables:")
# for table in tables:
#     print(f"  - {table[0]}")

# Show info for each table
# for table in tables:
#     table_name = table[0]
#     cursor.execute(f"SELECT COUNT(*) FROM {table_name}")
#     count = cursor.fetchone()[0]
#     print(f"\n{table_name}: {count} rows")
#     cursor.execute(f"SELECT * FROM {table_name} LIMIT 3")
#     print(cursor.fetchall())

# conn.close()

Read Dataset

Code
import pandas as pd
df_historical_price = pd.read_sql("SELECT * FROM dataset_historical_price LIMIT 10", conn)
Code
# Get first 10 distinct symbols
symbols = pd.read_sql("SELECT DISTINCT symbol FROM dataset_historical_price LIMIT 10", conn)
symbol_list = symbols['symbol'].tolist()

# Get all data for those 100 symbols
df_historical_price = pd.read_sql(
    "SELECT * FROM dataset_historical_price WHERE symbol IN (%s)" % ','.join(['%s']*len(symbol_list)), 
    conn, 
    params=symbol_list
)


# print(f"Symbols: {symbol_list}")
# print(f"Loaded {len(df_historical_price)} rows")
# print(df_historical_price.head())

Visualization

Code
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Set style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Convert date to datetime
df_historical_price['date'] = pd.to_datetime(df_historical_price['date'])

df_historical_price = df_historical_price[df_historical_price['year'] == 2025]

# Create figure with multiple subplots
fig = plt.figure(figsize=(20, 12))

# 1. Price Trends by Symbol
plt.figure(figsize=(12, 6))
for symbol in df_historical_price['symbol'].unique():
    symbol_data = df_historical_price[df_historical_price['symbol'] == symbol].sort_values('date')
    plt.plot(symbol_data['date'], symbol_data['close_price'], label=symbol, linewidth=2)
plt.title('Stock Price Trends', fontsize=14, fontweight='bold')
plt.xlabel('Date')
plt.ylabel('Close Price')
plt.legend()
plt.xticks(rotation=45)
(array([20089., 20103., 20120., 20134., 20148., 20162., 20179., 20193.,
       20209., 20223.]), [Text(20089.0, 0, '2025-01-01'), Text(20103.0, 0, '2025-01-15'), Text(20120.0, 0, '2025-02-01'), Text(20134.0, 0, '2025-02-15'), Text(20148.0, 0, '2025-03-01'), Text(20162.0, 0, '2025-03-15'), Text(20179.0, 0, '2025-04-01'), Text(20193.0, 0, '2025-04-15'), Text(20209.0, 0, '2025-05-01'), Text(20223.0, 0, '2025-05-15')])
Code
plt.tight_layout()
plt.show()

Daily Returns Distribution

Code
# 2. Daily Returns Distribution
df_historical_price_sorted = df_historical_price.sort_values(['symbol', 'date'])
df_historical_price_sorted['returns'] = df_historical_price_sorted.groupby('symbol')['close_price'].pct_change() * 100
plt.figure(figsize=(10, 6))
plt.hist(df_historical_price_sorted['returns'].dropna(), bins=50, edgecolor='black', alpha=0.7)
plt.axvline(0, color='red', linestyle='--', linewidth=2)
plt.title('Daily Returns Distribution', fontsize=14, fontweight='bold')
plt.xlabel('Returns (%)')
plt.ylabel('Frequency')
plt.tight_layout()
plt.show()

Trading Volume by Symbol

Code
# 3. Trading Volume by Symbol
vol_by_symbol = df_historical_price.groupby('symbol')['vol_total'].sum().sort_values(ascending=False)
plt.figure(figsize=(10, 6))
plt.bar(range(len(vol_by_symbol)), vol_by_symbol.values, color='steelblue')
plt.xticks(range(len(vol_by_symbol)), vol_by_symbol.index, rotation=45)
([<matplotlib.axis.XTick object at 0x7fad40922610>, <matplotlib.axis.XTick object at 0x7fad40901ad0>, <matplotlib.axis.XTick object at 0x7fad40953b90>, <matplotlib.axis.XTick object at 0x7fad4095a110>, <matplotlib.axis.XTick object at 0x7fad4095c350>, <matplotlib.axis.XTick object at 0x7fad4095e650>, <matplotlib.axis.XTick object at 0x7fad4095ef90>, <matplotlib.axis.XTick object at 0x7fad40954f10>, <matplotlib.axis.XTick object at 0x7fad409571d0>, <matplotlib.axis.XTick object at 0x7fad40961550>], [Text(0, 0, 'AAA'), Text(1, 0, 'CACB2404'), Text(2, 0, 'CACB2504'), Text(3, 0, 'CACB2503'), Text(4, 0, 'CACB2502'), Text(5, 0, 'CFPT2404'), Text(6, 0, 'CACB2501'), Text(7, 0, 'CACB2506'), Text(8, 0, 'CACB2405'), Text(9, 0, 'CACB2505')])
Code
plt.title('Total Trading Volume by Symbol', fontsize=14, fontweight='bold')
plt.ylabel('Volume')
plt.tight_layout()
plt.show()

Foreign Trading Activity

Code
# 4. Foreign Trading Activity
df_historical_price['foreign_net'] = df_historical_price['buy_vol_foreign'] - df_historical_price['sell_vol_foreign']
foreign_by_symbol = df_historical_price.groupby('symbol')['foreign_net'].sum().sort_values()
colors = ['red' if x < 0 else 'green' for x in foreign_by_symbol.values]
plt.figure(figsize=(10, 6))
plt.barh(range(len(foreign_by_symbol)), foreign_by_symbol.values, color=colors)
plt.yticks(range(len(foreign_by_symbol)), foreign_by_symbol.index)
([<matplotlib.axis.YTick object at 0x7fad4079ed10>, <matplotlib.axis.YTick object at 0x7fad40799ad0>, <matplotlib.axis.YTick object at 0x7fad41194e90>, <matplotlib.axis.YTick object at 0x7fad407e6f50>, <matplotlib.axis.YTick object at 0x7fad407f1290>, <matplotlib.axis.YTick object at 0x7fad407f3590>, <matplotlib.axis.YTick object at 0x7fad407f59d0>, <matplotlib.axis.YTick object at 0x7fad407f6350>, <matplotlib.axis.YTick object at 0x7fad407f8550>, <matplotlib.axis.YTick object at 0x7fad407fa810>], [Text(0, 0, 'CFPT2404'), Text(0, 1, 'CACB2404'), Text(0, 2, 'CACB2501'), Text(0, 3, 'CACB2405'), Text(0, 4, 'CACB2504'), Text(0, 5, 'CACB2503'), Text(0, 6, 'CACB2505'), Text(0, 7, 'CACB2502'), Text(0, 8, 'CACB2506'), Text(0, 9, 'AAA')])
Code
plt.title('Foreign Net Trading (Buy - Sell)', fontsize=14, fontweight='bold')
plt.xlabel('Net Volume')
plt.axvline(0, color='black', linestyle='--', linewidth=1)
plt.tight_layout()
plt.show()

Price vs Volume Correlation

Code
# 5. Price vs Volume Correlation
plt.figure(figsize=(10, 6))
plt.scatter(df_historical_price['vol_total'], df_historical_price['close_price'], alpha=0.5)
plt.title('Price vs Trading Volume', fontsize=14, fontweight='bold')
plt.xlabel('Trading Volume')
plt.ylabel('Close Price')
correlation = df_historical_price[['vol_total', 'close_price']].corr().iloc[0, 1]
plt.text(0.05, 0.95, f'Correlation: {correlation:.3f}', transform=plt.gca().transAxes, 
         bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5), verticalalignment='top')
plt.tight_layout()
plt.show()

Volatility Analysis

Code
# 6. Volatility Analysis
df_historical_price['volatility'] = ((df_historical_price['high_price'] - df_historical_price['low_price']) / df_historical_price['open_price']) * 100
volatility_by_symbol = df_historical_price.groupby('symbol')['volatility'].mean().sort_values(ascending=False)
plt.figure(figsize=(10, 6))
plt.bar(range(len(volatility_by_symbol)), volatility_by_symbol.values, color='coral')
plt.xticks(range(len(volatility_by_symbol)), volatility_by_symbol.index, rotation=45)
([<matplotlib.axis.XTick object at 0x7fad3c427210>, <matplotlib.axis.XTick object at 0x7fad409a1450>, <matplotlib.axis.XTick object at 0x7fad3c422490>, <matplotlib.axis.XTick object at 0x7fad3c421b50>, <matplotlib.axis.XTick object at 0x7fad3c4214d0>, <matplotlib.axis.XTick object at 0x7fad40853a50>, <matplotlib.axis.XTick object at 0x7fad40851910>, <matplotlib.axis.XTick object at 0x7fad4084bfd0>, <matplotlib.axis.XTick object at 0x7fad407e27d0>, <matplotlib.axis.XTick object at 0x7fad4084ad90>], [Text(0, 0, 'CACB2506'), Text(1, 0, 'CACB2501'), Text(2, 0, 'CACB2504'), Text(3, 0, 'CACB2405'), Text(4, 0, 'CACB2503'), Text(5, 0, 'CACB2404'), Text(6, 0, 'CACB2502'), Text(7, 0, 'CFPT2404'), Text(8, 0, 'CACB2505'), Text(9, 0, 'AAA')])
Code
plt.title('Average Daily Volatility by Symbol', fontsize=14, fontweight='bold')
plt.ylabel('Volatility (%)')
plt.tight_layout()
plt.show()

Buy vs Sell Pressure

Code
# 7. Buy vs Sell Pressure
df_historical_price['buy_pressure'] = df_historical_price['buy_vol'] / (df_historical_price['buy_vol'] + df_historical_price['sell_vol']) * 100
plt.figure(figsize=(12, 6))
for symbol in df_historical_price['symbol'].unique():
    symbol_data = df_historical_price[df_historical_price['symbol'] == symbol].sort_values('date')
    plt.plot(symbol_data['date'], symbol_data['buy_pressure'], label=symbol, linewidth=2)
plt.axhline(50, color='red', linestyle='--', linewidth=2, alpha=0.5)
plt.title('Buy Pressure Over Time', fontsize=14, fontweight='bold')
plt.xlabel('Date')
plt.ylabel('Buy Pressure (%)')
plt.legend()
plt.xticks(rotation=45)
(array([20089., 20103., 20120., 20134., 20148., 20162., 20179., 20193.,
       20209., 20223.]), [Text(20089.0, 0, '2025-01-01'), Text(20103.0, 0, '2025-01-15'), Text(20120.0, 0, '2025-02-01'), Text(20134.0, 0, '2025-02-15'), Text(20148.0, 0, '2025-03-01'), Text(20162.0, 0, '2025-03-15'), Text(20179.0, 0, '2025-04-01'), Text(20193.0, 0, '2025-04-15'), Text(20209.0, 0, '2025-05-01'), Text(20223.0, 0, '2025-05-15')])
Code
plt.tight_layout()
plt.show()

Trading Activity Heatmap

Code
# 8. Trading Activity Heatmap
pivot_data = df_historical_price.pivot_table(values='vol_total', index='symbol', columns=df_historical_price['date'].dt.date, aggfunc='sum')
plt.figure(figsize=(14, 6))
sns.heatmap(pivot_data.fillna(0), cmap='YlOrRd', cbar_kws={'label': 'Volume'})
plt.title('Trading Activity Heatmap', fontsize=14, fontweight='bold')
plt.xlabel('Date')
plt.ylabel('Symbol')
plt.tight_layout()
plt.show()

Connect to Mongo DB

Code
from pymongo import MongoClient
import pandas as pd
from dotenv import load_dotenv
import os

# Load .env file
load_dotenv()
True
Code
# Connect to MongoDB
client = MongoClient(
    host=os.getenv('MONGO_HOST'),
    port=int(os.getenv('MONGO_PORT')),
    username=os.getenv('MONGO_USER'),
    password=os.getenv('MONGO_PASS'),
    authSource=os.getenv('MONGO_DB')
)

db = client[os.getenv('MONGO_DB')]

# Query fundamental_annual for those 10 symbols
df_fundamental = pd.DataFrame(list(
    db.fundamental_annual.find({'symbol': {'$in': symbol_list}})
))

client.close()

print(f"Loaded {len(df_fundamental)} rows for {len(symbol_list)} symbols")
Loaded 18 rows for 10 symbols
Code
print(df_fundamental.head())
                        _id nl_interest_share_prov  ...  ca_afs  na_fix_lease_fin
0  68623e2483cd31192d8fef84                   None  ...    None               0.0
1  68623e2483cd31192d8fef85                   None  ...    None               0.0
2  68623e2483cd31192d8fef86                   None  ...    None               0.0
3  68623e2483cd31192d8fef87                   None  ...    None               0.0
4  68623e2483cd31192d8fef88                   None  ...    None               0.0

[5 rows x 301 columns]

Merge Multiple Datasets

Code
# Merge the dataframes on symbol column
df_merged = pd.merge(
    df_historical_price, 
    df_fundamental, 
    on='symbol', 
    how='left'
)

print(f"Merged dataframe: {len(df_merged)} rows")
Merged dataframe: 2173 rows
Code
print(df_merged.head())
   id  adj_ratio  average_price  ...  is_net_revenue  ca_afs  na_fix_lease_fin
0  22        1.0       0.850371  ...             NaN     NaN               NaN
1  23        1.0       0.810073  ...             NaN     NaN               NaN
2  24        1.0       0.736440  ...             NaN     NaN               NaN
3  25        1.0       0.766328  ...             NaN     NaN               NaN
4  26        1.0       0.734751  ...             NaN     NaN               NaN

[5 rows x 332 columns]
Code
print(df_merged.columns)
Index(['id', 'adj_ratio', 'average_price', 'basic_price', 'buy_count',
       'buy_vol_foreign', 'buy_val_foreigh', 'buy_vol', 'close_price',
       'foreign_room',
       ...
       'na_total_other', 'cfo_other_operating_expense', 'pb', 'ca_htm',
       'e_develop_fund', 'pe', 'ca_fin_invest', 'is_net_revenue', 'ca_afs',
       'na_fix_lease_fin'],
      dtype='object', length=332)

Use R and Python interchangeably

Code
library(reticulate)

# Access your Python df directly
df_r <- py$df_historical_price

head(df_r)
   id adj_ratio average_price basic_price buy_count buy_vol_foreign
21 22         1      0.850371        0.86         0               0
22 23         1      0.810073        0.86         0               0
23 24         1      0.736440        0.82         0               0
24 25         1      0.766328        0.75         0               0
25 26         1      0.734751        0.77         0               0
26 27         1      0.725673        0.76         0               0
   buy_val_foreigh buy_vol close_price foreign_room                date
21               0       0        0.86      2.5e+07 2025-01-02 07:00:00
22               0       0        0.82      2.5e+07 2025-01-03 07:00:00
23               0       0        0.75      2.5e+07 2025-01-06 07:00:00
24               0       0        0.77      2.5e+07 2025-01-07 07:00:00
25               0       0        0.76      2.5e+07 2025-01-08 07:00:00
26               0       0        0.74      2.5e+07 2025-01-09 07:00:00
   high_price low_price open_price prop_trading_deal prop_trading_putth
21       0.90      0.83       0.87         -12279000                  0
22       0.86      0.79       0.86        -132411000                  0
23       0.78      0.72       0.78        -102404000                  0
24       0.77      0.75       0.77         -40350000                  0
25       0.77      0.72       0.76          31040000                  0
26       0.85      0.72       0.85        -120970000                  0
   prop_trading_net sell_count sell_vol_foreign sel_val_foreign sell_vol
21        -12279000          0                0               0        0
22       -132411000          0                0               0        0
23       -102404000          0                0               0        0
24        -40350000          0                0               0        0
25         31040000          0                0               0        0
26       -120970000          0                0               0        0
     symbol unit val_putth val_total vol_deal vol_putth vol_total year
21 CACB2404 1000         0 128406000   151000         0    151000 2025
22 CACB2404 1000         0 255254000   315100         0    315100 2025
23 CACB2404 1000         0 221153000   300300         0    300300 2025
24 CACB2404 1000         0  77629000   101300         0    101300 2025
25 CACB2404 1000         0 150477000   204800         0    204800 2025
26 CACB2404 1000         0 686487000   946000         0    946000 2025
   foreign_net volatility buy_pressure
21           0   8.045977          NaN
22           0   8.139535          NaN
23           0   7.692308          NaN
24           0   2.597403          NaN
25           0   6.578947          NaN
26           0  15.294118          NaN
Code
library(tidyverse)
df_r_mod <- df_r %>% 
  filter(id == 22)

head(df_r_mod)
   id adj_ratio average_price basic_price buy_count buy_vol_foreign
21 22         1      0.850371        0.86         0               0
   buy_val_foreigh buy_vol close_price foreign_room                date
21               0       0        0.86      2.5e+07 2025-01-02 07:00:00
   high_price low_price open_price prop_trading_deal prop_trading_putth
21        0.9      0.83       0.87         -12279000                  0
   prop_trading_net sell_count sell_vol_foreign sel_val_foreign sell_vol
21        -12279000          0                0               0        0
     symbol unit val_putth val_total vol_deal vol_putth vol_total year
21 CACB2404 1000         0 128406000   151000         0    151000 2025
   foreign_net volatility buy_pressure
21           0   8.045977          NaN
Code
# send it back to Python
py$df_r_mod <- df_r_mod
Code
# sanity check
df_r_mod.head()
      id  adj_ratio  average_price  ...  foreign_net  volatility  buy_pressure
21  22.0        1.0       0.850371  ...          0.0    8.045977           NaN

[1 rows x 32 columns]