LSTM Time Series Analysis Using Python
Building a Long Short-Term Memory (LSTM) model for forecasting time series data using Python.
Loading data
In [ ]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
In [ ]:
# fix random seed for reproducibility
tf.random.set_seed(7)
Loading data from the local folder
In [ ]:
# Open the text file
df = pd.read_csv('/Users/nnthieu/Downloads/archive-2/Data/Stocks/a.us.txt')
# Sort DataFrame by date
df = df.sort_values('Date')
print(df.head())
print(df.shape)
print(df.info())
         Date    Open    High     Low   Close    Volume  OpenInt
0  1999-11-18  30.713  33.754  27.002  29.702  66277506        0
1  1999-11-19  28.986  29.027  26.872  27.257  16142920        0
2  1999-11-22  27.886  29.702  27.044  29.702   6970266        0
3  1999-11-23  28.688  29.446  27.002  27.002   6332082        0
4  1999-11-24  27.083  28.309  27.002  27.717   5132147        0
(4521, 7)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4521 entries, 0 to 4520
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   Date     4521 non-null   object
 1   Open     4521 non-null   float64
 2   High     4521 non-null   float64
 3   Low      4521 non-null   float64
 4   Close    4521 non-null   float64
 5   Volume   4521 non-null   int64
 6   OpenInt  4521 non-null   int64
dtypes: float64(4), int64(2), object(1)
memory usage: 247.4+ KB
None
In [ ]:
df.describe()
Out[ ]:
|  | Open | High | Low | Close | Volume | OpenInt |
|---|---|---|---|---|---|---|
| count | 4521.000000 | 4521.000000 | 4521.000000 | 4521.000000 | 4.521000e+03 | 4521.0 |
| mean | 27.856296 | 28.270442 | 27.452486 | 27.871357 | 3.993503e+06 | 0.0 |
| std | 12.940880 | 13.176000 | 12.711735 | 12.944389 | 2.665730e+06 | 0.0 |
| min | 7.223100 | 7.513900 | 7.087800 | 7.323800 | 0.000000e+00 | 0.0 |
| 25% | 19.117000 | 19.435000 | 18.780000 | 19.089000 | 2.407862e+06 | 0.0 |
| 50% | 24.456000 | 24.809000 | 24.159000 | 24.490000 | 3.460621e+06 | 0.0 |
| 75% | 36.502000 | 37.046000 | 35.877000 | 36.521000 | 4.849809e+06 | 0.0 |
| max | 105.300000 | 109.370000 | 97.881000 | 107.320000 | 6.627751e+07 | 0.0 |
In [ ]:
# OpenInt is zero in every row, so drop it
df = df.drop('OpenInt', axis=1)
Checking the data by plotting the mid price over time, from 1999 to 2017.
In [ ]:
plt.figure(figsize = (18,9))
plt.plot(range(df.shape[0]),(df['Low']+df['High'])/2.0)
plt.xticks(range(0,df.shape[0],500),df['Date'].loc[::500],rotation=45)
plt.xlabel('Date',fontsize=18)
plt.ylabel('Mid Price',fontsize=18)
plt.title('Mid Price by Date',fontsize=18)
plt.show()
In [ ]:
# Calculate the mid prices as the average of the daily high and low
high_prices = df.loc[:, 'High']
low_prices = df.loc[:, 'Low']
df['mid_prices'] = (high_prices + low_prices) / 2.0
In [ ]:
dft = df[['Date','mid_prices']]
dft.head()
Out[ ]:
|  | Date | mid_prices |
|---|---|---|
| 0 | 1999-11-18 | 30.3780 |
| 1 | 1999-11-19 | 27.9495 |
| 2 | 1999-11-22 | 28.3730 |
| 3 | 1999-11-23 | 28.2240 |
| 4 | 1999-11-24 | 27.6555 |
In [ ]:
# Group by date (one row per date), which makes Date the index
dft = dft.groupby('Date').sum()
dft['Date'] = dft.index
dft = dft[['Date','mid_prices']]
dft['Date'] = pd.to_datetime(dft['Date'],format = '%Y-%m-%d')
dft.head()
Out[ ]:
| Date (index) | Date | mid_prices |
|---|---|---|
| 1999-11-18 | 1999-11-18 | 30.3780 |
| 1999-11-19 | 1999-11-19 | 27.9495 |
| 1999-11-22 | 1999-11-22 | 28.3730 |
| 1999-11-23 | 1999-11-23 | 28.2240 |
| 1999-11-24 | 1999-11-24 | 27.6555 |
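The groupby/reassign steps above just turn Date into the index while also keeping it as a column. A roughly equivalent, more direct construction is sketched below; it is not part of the original run:
# Sketch: build dft directly with Date as both index and column (assumes df from above)
dft = df[['Date', 'mid_prices']].copy()
dft['Date'] = pd.to_datetime(dft['Date'], format='%Y-%m-%d')
dft = dft.set_index('Date', drop=False)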
Convert the data to a NumPy array and set aside the first 95% of rows for training.
In [ ]:
data = dft[['mid_prices']]
# Convert the dataframe to a numpy array
dataset = data.values
training_data_len = int(np.ceil( len(dataset) * .95 ))
dataset
Out[ ]:
array([[30.378 ], [27.9495], [28.373 ], ..., [68.0505], [67.445 ], [67.14 ]])
In [ ]:
np.ceil(len(dataset))
Out[ ]:
4521.0
Normalizing data
In [ ]:
# Scale the data to the range [0, 1]
scaler = MinMaxScaler(feature_range=(0,1))
scaled_data = scaler.fit_transform(dataset)
scaled_data
Out[ ]:
array([[0.24636497], [0.22042029], [0.22494471], ..., [0.64883604], [0.64236723], [0.63910879]])
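One caveat: fit_transform above learns the min and max from the entire series, including the 5% later used for testing. A minimal variant that fits the scaler on the training rows only (using training_data_len from above) is sketched here; it is not part of the original run:
# Sketch: fit the scaler on training rows only to avoid look-ahead leakage
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(dataset[:training_data_len])   # min/max learned from training data only
scaled_data = scaler.transform(dataset)   # same transform applied to the full series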
Splitting the train and test (validation) sets and reshaping them for the LSTM input layer, which expects arrays of shape (samples, timesteps, features):
In [ ]:
# Create the training data set
# Create the scaled training data set
train_data = scaled_data[0:int(training_data_len), :]
# Split the data into x_train and y_train data sets
x_train = []
y_train = []
# use a 60-day lookback window: each sample is the previous 60 days, the target is the next day
for i in range(60, len(train_data)):
x_train.append(train_data[i-60:i, 0])
y_train.append(train_data[i, 0])
if i<= 61:
print(x_train)
print(y_train)
print()
# Convert the x_train and y_train to numpy arrays
x_train, y_train = np.array(x_train), np.array(y_train)
# Reshape the data
x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))
# x_train.shape
[array([0.24636497, 0.22042029, 0.22494471, 0.22335288, 0.21727936, 0.21840112, 0.22112005, 0.22427166, 0.22949051, 0.23981603, 0.24633826, 0.25222482, 0.24746002, 0.24790338, 0.25065436, 0.24884886, 0.24862451, 0.23712381, 0.22222578, 0.2436407 , 0.25558476, 0.2573956 , 0.25604949, 0.26030149, 0.27319637, 0.29233037, 0.33808211, 0.43385896, 0.47802955, 0.48504856, 0.44941936, 0.40366228, 0.37907439, 0.35496191, 0.37233315, 0.41647169, 0.40997618, 0.39801075, 0.4115413 , 0.41357649, 0.43025865, 0.43316988, 0.4207878 , 0.41156266, 0.42595857, 0.4124387 , 0.41629008, 0.41854428, 0.41559565, 0.39912716, 0.42033375, 0.45569586, 0.46833435, 0.4726184 , 0.4852462 , 0.50035789, 0.47849428, 0.47757016, 0.46854267, 0.4690074 ])]
[0.4971635524502419]

[array([0.24636497, 0.22042029, 0.22494471, 0.22335288, 0.21727936, 0.21840112, 0.22112005, 0.22427166, 0.22949051, 0.23981603, 0.24633826, 0.25222482, 0.24746002, 0.24790338, 0.25065436, 0.24884886, 0.24862451, 0.23712381, 0.22222578, 0.2436407 , 0.25558476, 0.2573956 , 0.25604949, 0.26030149, 0.27319637, 0.29233037, 0.33808211, 0.43385896, 0.47802955, 0.48504856, 0.44941936, 0.40366228, 0.37907439, 0.35496191, 0.37233315, 0.41647169, 0.40997618, 0.39801075, 0.4115413 , 0.41357649, 0.43025865, 0.43316988, 0.4207878 , 0.41156266, 0.42595857, 0.4124387 , 0.41629008, 0.41854428, 0.41559565, 0.39912716, 0.42033375, 0.45569586, 0.46833435, 0.4726184 , 0.4852462 , 0.50035789, 0.47849428, 0.47757016, 0.46854267, 0.4690074 ]), array([0.22042029, 0.22494471, 0.22335288, 0.21727936, 0.21840112, 0.22112005, 0.22427166, 0.22949051, 0.23981603, 0.24633826, 0.25222482, 0.24746002, 0.24790338, 0.25065436, 0.24884886, 0.24862451, 0.23712381, 0.22222578, 0.2436407 , 0.25558476, 0.2573956 , 0.25604949, 0.26030149, 0.27319637, 0.29233037, 0.33808211, 0.43385896, 0.47802955, 0.48504856, 0.44941936, 0.40366228, 0.37907439, 0.35496191, 0.37233315, 0.41647169, 0.40997618, 0.39801075, 0.4115413 , 0.41357649, 0.43025865, 0.43316988, 0.4207878 , 0.41156266, 0.42595857, 0.4124387 , 0.41629008, 0.41854428, 0.41559565, 0.39912716, 0.42033375, 0.45569586, 0.46833435, 0.4726184 , 0.4852462 , 0.50035789, 0.47849428, 0.47757016, 0.46854267, 0.4690074 , 0.49716355])]
[0.4971635524502419, 0.5033065179534844]
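The same windowing logic can be wrapped in a small helper so the 60-day lookback is defined in one place. A minimal sketch; create_windows is a hypothetical name, not from the original notebook:
# Sketch: reusable windowing helper, equivalent to the loop above
def create_windows(series, lookback=60):
    # series: array of shape (n, 1); returns (samples, lookback, 1) inputs and (samples,) targets
    x, y = [], []
    for i in range(lookback, len(series)):
        x.append(series[i - lookback:i, 0])
        y.append(series[i, 0])
    return np.array(x).reshape(-1, lookback, 1), np.array(y)

x_train, y_train = create_windows(scaled_data[:training_data_len])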
Data visualization
In [ ]:
data.plot()
Out[ ]:
<Axes: xlabel='Date'>
Prediction and Forecasting with LSTM
A neural network (deep learning) model: two stacked LSTM layers followed by two Dense layers.
In [ ]:
# Build the LSTM model
model = Sequential()
model.add(tf.keras.Input(shape=(x_train.shape[1], 1)))  # declare input shape up front
model.add(LSTM(128, return_sequences=True))
model.add(LSTM(64, return_sequences=False))
model.add(Dense(25))
model.add(Dense(1))
# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')
# Train the model
model.fit(x_train, y_train, batch_size=1, epochs=1)
4235/4235 ━━━━━━━━━━━━━━━━━━━━ 69s 16ms/step - loss: 0.0019
Out[ ]:
<keras.src.callbacks.history.History at 0x16846ddd0>
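A single epoch with batch_size=1 keeps the demo fast. For a more careful fit, a larger batch, more epochs, and early stopping on a validation split are typical; the settings below are illustrative assumptions, not from the original run:
# Sketch: longer training with early stopping (illustrative settings)
from tensorflow.keras.callbacks import EarlyStopping

early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
history = model.fit(
    x_train, y_train,
    batch_size=32,
    epochs=50,
    validation_split=0.1,   # hold out the last 10% of training windows
    callbacks=[early_stop],
)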
In [ ]:
# Create the testing data set
# Create a new array containing scaled values from index training_data_len - 60 to the end
test_data = scaled_data[training_data_len - 60: , :]
# Create the data sets x_test and y_test
x_test = []
y_test = dataset[training_data_len:, :]
for i in range(60, len(test_data)):
x_test.append(test_data[i-60:i, 0])
# Convert the data to a numpy array
x_test = np.array(x_test)
# Reshape the data
x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 1 ))
# Get the model's predicted price values
predictions = model.predict(x_test)
predictions = scaler.inverse_transform(predictions)
# Get the root mean squared error (RMSE)
rmse = np.sqrt(np.mean(((predictions - y_test) ** 2)))
rmse
8/8 ━━━━━━━━━━━━━━━━━━━━ 1s 54ms/step
Out[ ]:
2.193327947069509
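RMSE is in the same units as the mid price, so the typical test-window error is about $2.19. Complementary metrics can be computed from the same unscaled arrays; a minimal sketch, not part of the original run:
# Sketch: complementary error metrics on the unscaled predictions
mae = np.mean(np.abs(predictions - y_test))
mape = np.mean(np.abs((predictions - y_test) / y_test)) * 100
print(f'MAE:  {mae:.3f}')
print(f'MAPE: {mape:.2f}%')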
Evaluation
In [ ]:
# Plot the data
train = data[:training_data_len]
valid = data[training_data_len:].copy()  # copy to avoid SettingWithCopyWarning
valid['Predictions'] = predictions
# Visualize the data
plt.figure(figsize=(16, 6))
plt.title('Model')
plt.xlabel('Date', fontsize=18)
plt.ylabel('mid_prices', fontsize=18)
plt.plot(train['mid_prices'])
plt.plot(valid[['mid_prices', 'Predictions']])
plt.xticks(range(0,df.shape[0],500),df['Date'].loc[::500],rotation=45)
plt.legend(['Train', 'Val', 'Predictions'], loc='upper left')
plt.show()
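To extend the model beyond the last observed date, feed it the final 60 scaled values and invert the scaling on the output. A minimal one-step-ahead sketch using the variables defined above, not part of the original run:
# Sketch: one-step-ahead forecast from the last 60 days
last_window = scaled_data[-60:].reshape(1, 60, 1)
next_scaled = model.predict(last_window)
next_price = scaler.inverse_transform(next_scaled)
print('Next-day mid price forecast:', next_price[0, 0])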