LSTM Time Series Analysis Using Python
Building a Long Short-Term Memory (LSTM) model for forecasting time series data using Python.
Loading data
In [ ]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
In [ ]:
# fix random seed for reproducibility
tf.random.set_seed(7)
Loading data from the local folder
In [ ]:
# Open the text file
df = pd.read_csv('/Users/nnthieu/Downloads/archive-2/Data/Stocks/a.us.txt')
# Sort DataFrame by date
df = df.sort_values('Date')
print(df.head())
print(df.shape)
print(df.info())
         Date    Open    High     Low   Close    Volume  OpenInt
0  1999-11-18  30.713  33.754  27.002  29.702  66277506        0
1  1999-11-19  28.986  29.027  26.872  27.257  16142920        0
2  1999-11-22  27.886  29.702  27.044  29.702   6970266        0
3  1999-11-23  28.688  29.446  27.002  27.002   6332082        0
4  1999-11-24  27.083  28.309  27.002  27.717   5132147        0
(4521, 7)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4521 entries, 0 to 4520
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   Date     4521 non-null   object
 1   Open     4521 non-null   float64
 2   High     4521 non-null   float64
 3   Low      4521 non-null   float64
 4   Close    4521 non-null   float64
 5   Volume   4521 non-null   int64
 6   OpenInt  4521 non-null   int64
dtypes: float64(4), int64(2), object(1)
memory usage: 247.4+ KB
None
In [ ]:
df.describe()
Out[ ]:
|  | Open | High | Low | Close | Volume | OpenInt |
|---|---|---|---|---|---|---|
| count | 4521.000000 | 4521.000000 | 4521.000000 | 4521.000000 | 4.521000e+03 | 4521.0 |
| mean | 27.856296 | 28.270442 | 27.452486 | 27.871357 | 3.993503e+06 | 0.0 |
| std | 12.940880 | 13.176000 | 12.711735 | 12.944389 | 2.665730e+06 | 0.0 |
| min | 7.223100 | 7.513900 | 7.087800 | 7.323800 | 0.000000e+00 | 0.0 |
| 25% | 19.117000 | 19.435000 | 18.780000 | 19.089000 | 2.407862e+06 | 0.0 |
| 50% | 24.456000 | 24.809000 | 24.159000 | 24.490000 | 3.460621e+06 | 0.0 |
| 75% | 36.502000 | 37.046000 | 35.877000 | 36.521000 | 4.849809e+06 | 0.0 |
| max | 105.300000 | 109.370000 | 97.881000 | 107.320000 | 6.627751e+07 | 0.0 |
In [ ]:
# OpenInt is zero in every row, so drop it
df = df.drop('OpenInt', axis=1)
Checking the data by plotting the mid price over time, from 1999 to 2017.
In [ ]:
plt.figure(figsize = (18,9))
plt.plot(range(df.shape[0]),(df['Low']+df['High'])/2.0)
plt.xticks(range(0,df.shape[0],500),df['Date'].loc[::500],rotation=45)
plt.xlabel('Date',fontsize=18)
plt.ylabel('Mid Price',fontsize=18)
plt.title('Mid Price by Date',fontsize=18)
plt.show()
In [ ]:
# Calculate the mid prices as the average of the daily high and low
high_prices = df.loc[:, 'High']
low_prices = df.loc[:, 'Low']
df['mid_prices'] = (high_prices + low_prices) / 2.0
In [ ]:
dft = df[['Date','mid_prices']]
dft.head()
Out[ ]:
|  | Date | mid_prices |
|---|---|---|
| 0 | 1999-11-18 | 30.3780 |
| 1 | 1999-11-19 | 27.9495 |
| 2 | 1999-11-22 | 28.3730 |
| 3 | 1999-11-23 | 28.2240 |
| 4 | 1999-11-24 | 27.6555 |
In [ ]:
# Group by date (one row per date), which makes Date the index
dft = dft.groupby('Date').sum()
dft['Date'] = dft.index
dft = dft[['Date','mid_prices']]
dft['Date'] = pd.to_datetime(dft['Date'],format = '%Y-%m-%d')
dft.head()
Out[ ]:
| Date (index) | Date | mid_prices |
|---|---|---|
| 1999-11-18 | 1999-11-18 | 30.3780 |
| 1999-11-19 | 1999-11-19 | 27.9495 |
| 1999-11-22 | 1999-11-22 | 28.3730 |
| 1999-11-23 | 1999-11-23 | 28.2240 |
| 1999-11-24 | 1999-11-24 | 27.6555 |
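The groupby/reassign steps above just turn Date into the index while also keeping it as a column. A roughly equivalent, more direct construction is sketched below; it is not part of the original run:
# Sketch: build dft directly with Date as both index and column (assumes df from above)
dft = df[['Date', 'mid_prices']].copy()
dft['Date'] = pd.to_datetime(dft['Date'], format='%Y-%m-%d')
dft = dft.set_index('Date', drop=False)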
Convert the data to a NumPy array and set aside the first 95% of rows for training.
In [ ]:
data = dft[['mid_prices']]
# Convert the dataframe to a numpy array
dataset = data.values
training_data_len = int(np.ceil( len(dataset) * .95 ))
dataset
Out[ ]:
array([[30.378 ], [27.9495], [28.373 ], ..., [68.0505], [67.445 ], [67.14 ]])
In [ ]:
np.ceil(len(dataset))
Out[ ]:
4521.0
Normalizing data
In [ ]:
# Scale the data to the range [0, 1]
scaler = MinMaxScaler(feature_range=(0,1))
scaled_data = scaler.fit_transform(dataset)
scaled_data
Out[ ]:
array([[0.24636497], [0.22042029], [0.22494471], ..., [0.64883604], [0.64236723], [0.63910879]])
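One caveat: fit_transform above learns the min and max from the entire series, including the 5% later used for testing. A minimal variant that fits the scaler on the training rows only (using training_data_len from above) is sketched here; it is not part of the original run:
# Sketch: fit the scaler on training rows only to avoid look-ahead leakage
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(dataset[:training_data_len])   # min/max learned from training data only
scaled_data = scaler.transform(dataset)   # same transform applied to the full series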
Splitting the train and test (validation) sets and reshaping them for the LSTM input layer, which expects arrays of shape (samples, timesteps, features):
In [ ]:
# Create the training data set
# Create the scaled training data set
train_data = scaled_data[0:int(training_data_len), :]
# Split the data into x_train and y_train data sets
x_train = []
y_train = []
# use a 60-day lookback window: each sample is the previous 60 days, the target is the next day
for i in range(60, len(train_data)):
x_train.append(train_data[i-60:i, 0])
y_train.append(train_data[i, 0])
if i<= 61:
print(x_train)
print(y_train)
print()
# Convert the x_train and y_train to numpy arrays
x_train, y_train = np.array(x_train), np.array(y_train)
# Reshape the data
x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))
# x_train.shape
[array([0.24636497, 0.22042029, 0.22494471, 0.22335288, 0.21727936, 0.21840112, 0.22112005, 0.22427166, 0.22949051, 0.23981603, 0.24633826, 0.25222482, 0.24746002, 0.24790338, 0.25065436, 0.24884886, 0.24862451, 0.23712381, 0.22222578, 0.2436407 , 0.25558476, 0.2573956 , 0.25604949, 0.26030149, 0.27319637, 0.29233037, 0.33808211, 0.43385896, 0.47802955, 0.48504856, 0.44941936, 0.40366228, 0.37907439, 0.35496191, 0.37233315, 0.41647169, 0.40997618, 0.39801075, 0.4115413 , 0.41357649, 0.43025865, 0.43316988, 0.4207878 , 0.41156266, 0.42595857, 0.4124387 , 0.41629008, 0.41854428, 0.41559565, 0.39912716, 0.42033375, 0.45569586, 0.46833435, 0.4726184 , 0.4852462 , 0.50035789, 0.47849428, 0.47757016, 0.46854267, 0.4690074 ])]
[0.4971635524502419]

[array([0.24636497, 0.22042029, 0.22494471, 0.22335288, 0.21727936, 0.21840112, 0.22112005, 0.22427166, 0.22949051, 0.23981603, 0.24633826, 0.25222482, 0.24746002, 0.24790338, 0.25065436, 0.24884886, 0.24862451, 0.23712381, 0.22222578, 0.2436407 , 0.25558476, 0.2573956 , 0.25604949, 0.26030149, 0.27319637, 0.29233037, 0.33808211, 0.43385896, 0.47802955, 0.48504856, 0.44941936, 0.40366228, 0.37907439, 0.35496191, 0.37233315, 0.41647169, 0.40997618, 0.39801075, 0.4115413 , 0.41357649, 0.43025865, 0.43316988, 0.4207878 , 0.41156266, 0.42595857, 0.4124387 , 0.41629008, 0.41854428, 0.41559565, 0.39912716, 0.42033375, 0.45569586, 0.46833435, 0.4726184 , 0.4852462 , 0.50035789, 0.47849428, 0.47757016, 0.46854267, 0.4690074 ]), array([0.22042029, 0.22494471, 0.22335288, 0.21727936, 0.21840112, 0.22112005, 0.22427166, 0.22949051, 0.23981603, 0.24633826, 0.25222482, 0.24746002, 0.24790338, 0.25065436, 0.24884886, 0.24862451, 0.23712381, 0.22222578, 0.2436407 , 0.25558476, 0.2573956 , 0.25604949, 0.26030149, 0.27319637, 0.29233037, 0.33808211, 0.43385896, 0.47802955, 0.48504856, 0.44941936, 0.40366228, 0.37907439, 0.35496191, 0.37233315, 0.41647169, 0.40997618, 0.39801075, 0.4115413 , 0.41357649, 0.43025865, 0.43316988, 0.4207878 , 0.41156266, 0.42595857, 0.4124387 , 0.41629008, 0.41854428, 0.41559565, 0.39912716, 0.42033375, 0.45569586, 0.46833435, 0.4726184 , 0.4852462 , 0.50035789, 0.47849428, 0.47757016, 0.46854267, 0.4690074 , 0.49716355])]
[0.4971635524502419, 0.5033065179534844]
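The same windowing logic can be wrapped in a small helper so the 60-day lookback is defined in one place. A minimal sketch; create_windows is a hypothetical name, not from the original notebook:
# Sketch: reusable windowing helper, equivalent to the loop above
def create_windows(series, lookback=60):
    # series: array of shape (n, 1); returns (samples, lookback, 1) inputs and (samples,) targets
    x, y = [], []
    for i in range(lookback, len(series)):
        x.append(series[i - lookback:i, 0])
        y.append(series[i, 0])
    return np.array(x).reshape(-1, lookback, 1), np.array(y)

x_train, y_train = create_windows(scaled_data[:training_data_len])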
Data visualization
In [ ]:
data.plot()
Out[ ]:
<Axes: xlabel='Date'>
Prediction and Forecasting with LSTM
A neural network (deep learning) model: two stacked LSTM layers followed by two Dense layers.
In [ ]:
# Build the LSTM model
model = Sequential()
model.add(tf.keras.Input(shape=(x_train.shape[1], 1)))  # declare input shape up front
model.add(LSTM(128, return_sequences=True))
model.add(LSTM(64, return_sequences=False))
model.add(Dense(25))
model.add(Dense(1))
# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')
# Train the model
model.fit(x_train, y_train, batch_size=1, epochs=1)
4235/4235 ━━━━━━━━━━━━━━━━━━━━ 69s 16ms/step - loss: 0.0019
Out[ ]:
<keras.src.callbacks.history.History at 0x16846ddd0>
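A single epoch with batch_size=1 keeps the demo fast. For a more careful fit, a larger batch, more epochs, and early stopping on a validation split are typical; the settings below are illustrative assumptions, not from the original run:
# Sketch: longer training with early stopping (illustrative settings)
from tensorflow.keras.callbacks import EarlyStopping

early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
history = model.fit(
    x_train, y_train,
    batch_size=32,
    epochs=50,
    validation_split=0.1,   # hold out the last 10% of training windows
    callbacks=[early_stop],
)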
In [ ]:
# Create the testing data set
# Create a new array containing scaled values from index training_data_len - 60 to the end
test_data = scaled_data[training_data_len - 60: , :]
# Create the data sets x_test and y_test
x_test = []
y_test = dataset[training_data_len:, :]
for i in range(60, len(test_data)):
x_test.append(test_data[i-60:i, 0])
# Convert the data to a numpy array
x_test = np.array(x_test)
# Reshape the data
x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 1 ))
# Get the model's predicted price values
predictions = model.predict(x_test)
predictions = scaler.inverse_transform(predictions)
# Get the root mean squared error (RMSE)
rmse = np.sqrt(np.mean(((predictions - y_test) ** 2)))
rmse
8/8 ━━━━━━━━━━━━━━━━━━━━ 1s 54ms/step
Out[ ]:
2.193327947069509
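RMSE is in the same units as the mid price, so the typical test-window error is about $2.19. Complementary metrics can be computed from the same unscaled arrays; a minimal sketch, not part of the original run:
# Sketch: complementary error metrics on the unscaled predictions
mae = np.mean(np.abs(predictions - y_test))
mape = np.mean(np.abs((predictions - y_test) / y_test)) * 100
print(f'MAE:  {mae:.3f}')
print(f'MAPE: {mape:.2f}%')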
Evaluation
In [ ]:
# Plot the data
train = data[:training_data_len]
valid = data[training_data_len:].copy()  # copy to avoid SettingWithCopyWarning
valid['Predictions'] = predictions
# Visualize the data
plt.figure(figsize=(16, 6))
plt.title('Model')
plt.xlabel('Date', fontsize=18)
plt.ylabel('mid_prices', fontsize=18)
plt.plot(train['mid_prices'])
plt.plot(valid[['mid_prices', 'Predictions']])
plt.xticks(range(0,df.shape[0],500),df['Date'].loc[::500],rotation=45)
plt.legend(['Train', 'Val', 'Predictions'], loc='upper left')
plt.show()
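To extend the model beyond the last observed date, feed it the final 60 scaled values and invert the scaling on the output. A minimal one-step-ahead sketch using the variables defined above, not part of the original run:
# Sketch: one-step-ahead forecast from the last 60 days
last_window = scaled_data[-60:].reshape(1, 60, 1)
next_scaled = model.predict(last_window)
next_price = scaler.inverse_transform(next_scaled)
print('Next-day mid price forecast:', next_price[0, 0])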