LSTM Time Series Analysis Using Python

Building a Long Short-Term Memory (LSTM) model for forecasting time series data using Python.

https://www.kaggle.com/code/shedai/time-series-prediction-with-lstm

Loading data

InΒ [Β ]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
InΒ [Β ]:
# Fix random seeds for reproducibility.
# tf.random.set_seed() only seeds TensorFlow's global generator; the Keras
# helper below additionally seeds Python's `random` module and NumPy, so
# weight initialisation and any NumPy-based randomness are repeatable too.
tf.keras.utils.set_random_seed(7)

Loading data from the local folder

InΒ [Β ]:
# Load the daily price history from a local CSV-formatted text file.
# NOTE: this is an absolute path on the author's machine; adjust it to
# wherever the data set lives locally.
df = pd.read_csv('/Users/nnthieu/Downloads/archive-2/Data/Stocks/a.us.txt')
# Sort rows chronologically by the 'Date' column
df = df.sort_values('Date')

# Quick look at the first rows, the overall shape and the column dtypes
print(df.head())
print(df.shape)
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would emit a stray "None" line after the report.
df.info()
         Date    Open    High     Low   Close    Volume  OpenInt
0  1999-11-18  30.713  33.754  27.002  29.702  66277506        0
1  1999-11-19  28.986  29.027  26.872  27.257  16142920        0
2  1999-11-22  27.886  29.702  27.044  29.702   6970266        0
3  1999-11-23  28.688  29.446  27.002  27.002   6332082        0
4  1999-11-24  27.083  28.309  27.002  27.717   5132147        0
(4521, 7)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4521 entries, 0 to 4520
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Date     4521 non-null   object 
 1   Open     4521 non-null   float64
 2   High     4521 non-null   float64
 3   Low      4521 non-null   float64
 4   Close    4521 non-null   float64
 5   Volume   4521 non-null   int64  
 6   OpenInt  4521 non-null   int64  
dtypes: float64(4), int64(2), object(1)
memory usage: 247.4+ KB
None
InΒ [Β ]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()
Out[Β ]:
Open High Low Close Volume OpenInt
count 4521.000000 4521.000000 4521.000000 4521.000000 4.521000e+03 4521.0
mean 27.856296 28.270442 27.452486 27.871357 3.993503e+06 0.0
std 12.940880 13.176000 12.711735 12.944389 2.665730e+06 0.0
min 7.223100 7.513900 7.087800 7.323800 0.000000e+00 0.0
25% 19.117000 19.435000 18.780000 19.089000 2.407862e+06 0.0
50% 24.456000 24.809000 24.159000 24.490000 3.460621e+06 0.0
75% 36.502000 37.046000 35.877000 36.521000 4.849809e+06 0.0
max 105.300000 109.370000 97.881000 107.320000 6.627751e+07 0.0
InΒ [Β ]:
# Remove the 'OpenInt' column; it is not used in the rest of the analysis
df = df.drop(columns=['OpenInt'])

Checking the data by plotting the mid-price over time from 1999 to 2017

InΒ [Β ]:
# Plot the daily mid price (average of Low and High) against row position,
# labelling every 500th row with its date.
n_rows = df.shape[0]
mid_price = (df['Low'] + df['High']) / 2.0
tick_positions = range(0, n_rows, 500)
tick_labels = df['Date'].loc[::500]

plt.figure(figsize=(18, 9))
plt.plot(range(n_rows), mid_price)
plt.xticks(tick_positions, tick_labels, rotation=45)
plt.xlabel('Date', fontsize=18)
plt.ylabel('Mid Price', fontsize=18)
plt.title('Mid Price by Date', fontsize=18)
plt.show()
No description has been provided for this image
InΒ [Β ]:
import numpy as np

# Mid price of each day = mean of that day's highest and lowest price
high_prices = df['High']
low_prices = df['Low']
df['mid_prices'] = (high_prices + low_prices) / 2.0
InΒ [Β ]:
# Keep only the date and the derived mid price for the time-series work
dft = df.loc[:, ['Date', 'mid_prices']]
dft.head()
Out[Β ]:
Date mid_prices
0 1999-11-18 30.3780
1 1999-11-19 27.9495
2 1999-11-22 28.3730
3 1999-11-23 28.2240
4 1999-11-24 27.6555
InΒ [Β ]:
# Aggregate by date; the result is indexed by 'Date'
dft = dft.groupby('Date').sum()
# Re-expose the index as a leading 'Date' column, ahead of 'mid_prices'
dft.insert(0, 'Date', dft.index)

# Parse the ISO-formatted date strings of that column into datetimes
dft['Date'] = pd.to_datetime(dft['Date'], format='%Y-%m-%d')
dft.head()
Out[Β ]:
Date mid_prices
Date
1999-11-18 1999-11-18 30.3780
1999-11-19 1999-11-19 27.9495
1999-11-22 1999-11-22 28.3730
1999-11-23 1999-11-23 28.2240
1999-11-24 1999-11-24 27.6555

Convert data to numpy array

InΒ [Β ]:
# Single-column frame holding the series to model
data = dft[['mid_prices']]
# Same values as a 2-D NumPy array (one column)
dataset = data.to_numpy()
# The first 95% of the rows (rounded up) are used for training
training_data_len = int(np.ceil(0.95 * len(dataset)))
dataset
Out[Β ]:
array([[30.378 ],
       [27.9495],
       [28.373 ],
       ...,
       [68.0505],
       [67.445 ],
       [67.14  ]])
InΒ [Β ]:
# Total number of rows in the series (np.ceil returns it as a float)
np.ceil(len(dataset))
Out[Β ]:
4521.0

Normalizing data

InΒ [Β ]:
# Scale the data
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(feature_range=(0,1))
scaled_data = scaler.fit_transform(dataset)

scaled_data
Out[Β ]:
array([[0.24636497],
       [0.22042029],
       [0.22494471],
       ...,
       [0.64883604],
       [0.64236723],
       [0.63910879]])

Splitting the data into training and test (validation) sets and reshaping it for the LSTM input layer:

InΒ [Β ]:
# Scaled rows that belong to the training portion of the series
train_data = scaled_data[:int(training_data_len), :]

# Each sample is a look-back window of the 60 preceding values; its target
# is the value that immediately follows the window.
x_train = []
y_train = []

for end in range(60, len(train_data)):
    start = end - 60
    x_train.append(train_data[start:end, 0])
    y_train.append(train_data[end, 0])
    # Print the first two windows/targets as a sanity check
    if end <= 61:
        print(x_train)
        print(y_train)
        print()

# Stack the Python lists into NumPy arrays
x_train = np.array(x_train)
y_train = np.array(y_train)

# Add a trailing feature axis: (samples, timesteps) -> (samples, timesteps, 1)
x_train = x_train.reshape(x_train.shape[0], x_train.shape[1], 1)
[array([0.24636497, 0.22042029, 0.22494471, 0.22335288, 0.21727936,
       0.21840112, 0.22112005, 0.22427166, 0.22949051, 0.23981603,
       0.24633826, 0.25222482, 0.24746002, 0.24790338, 0.25065436,
       0.24884886, 0.24862451, 0.23712381, 0.22222578, 0.2436407 ,
       0.25558476, 0.2573956 , 0.25604949, 0.26030149, 0.27319637,
       0.29233037, 0.33808211, 0.43385896, 0.47802955, 0.48504856,
       0.44941936, 0.40366228, 0.37907439, 0.35496191, 0.37233315,
       0.41647169, 0.40997618, 0.39801075, 0.4115413 , 0.41357649,
       0.43025865, 0.43316988, 0.4207878 , 0.41156266, 0.42595857,
       0.4124387 , 0.41629008, 0.41854428, 0.41559565, 0.39912716,
       0.42033375, 0.45569586, 0.46833435, 0.4726184 , 0.4852462 ,
       0.50035789, 0.47849428, 0.47757016, 0.46854267, 0.4690074 ])]
[0.4971635524502419]

[array([0.24636497, 0.22042029, 0.22494471, 0.22335288, 0.21727936,
       0.21840112, 0.22112005, 0.22427166, 0.22949051, 0.23981603,
       0.24633826, 0.25222482, 0.24746002, 0.24790338, 0.25065436,
       0.24884886, 0.24862451, 0.23712381, 0.22222578, 0.2436407 ,
       0.25558476, 0.2573956 , 0.25604949, 0.26030149, 0.27319637,
       0.29233037, 0.33808211, 0.43385896, 0.47802955, 0.48504856,
       0.44941936, 0.40366228, 0.37907439, 0.35496191, 0.37233315,
       0.41647169, 0.40997618, 0.39801075, 0.4115413 , 0.41357649,
       0.43025865, 0.43316988, 0.4207878 , 0.41156266, 0.42595857,
       0.4124387 , 0.41629008, 0.41854428, 0.41559565, 0.39912716,
       0.42033375, 0.45569586, 0.46833435, 0.4726184 , 0.4852462 ,
       0.50035789, 0.47849428, 0.47757016, 0.46854267, 0.4690074 ]), array([0.22042029, 0.22494471, 0.22335288, 0.21727936, 0.21840112,
       0.22112005, 0.22427166, 0.22949051, 0.23981603, 0.24633826,
       0.25222482, 0.24746002, 0.24790338, 0.25065436, 0.24884886,
       0.24862451, 0.23712381, 0.22222578, 0.2436407 , 0.25558476,
       0.2573956 , 0.25604949, 0.26030149, 0.27319637, 0.29233037,
       0.33808211, 0.43385896, 0.47802955, 0.48504856, 0.44941936,
       0.40366228, 0.37907439, 0.35496191, 0.37233315, 0.41647169,
       0.40997618, 0.39801075, 0.4115413 , 0.41357649, 0.43025865,
       0.43316988, 0.4207878 , 0.41156266, 0.42595857, 0.4124387 ,
       0.41629008, 0.41854428, 0.41559565, 0.39912716, 0.42033375,
       0.45569586, 0.46833435, 0.4726184 , 0.4852462 , 0.50035789,
       0.47849428, 0.47757016, 0.46854267, 0.4690074 , 0.49716355])]
[0.4971635524502419, 0.5033065179534844]

Data visualization

InΒ [Β ]:
# Line plot of the 'mid_prices' column against the frame's 'Date' index
data.plot()
Out[Β ]:
<Axes: xlabel='Date'>
No description has been provided for this image

Prediction and Forecasting with LSTM

neural network (deep learning) model

InΒ [Β ]:
from keras.models import Sequential # type: ignore
from keras.layers import Dense, Input, LSTM # type: ignore

# Build the LSTM model.
# The input shape is declared with an explicit Input layer; passing
# `input_shape=` to the first LSTM layer triggers a Keras UserWarning.
model = Sequential()
model.add(Input(shape=(x_train.shape[1], 1)))  # (timesteps, features)
# First LSTM returns the full sequence so it can feed the second LSTM
model.add(LSTM(128, return_sequences=True))
# Second LSTM returns only its last output
model.add(LSTM(64, return_sequences=False))
model.add(Dense(25))
# Single output unit: the next (scaled) mid price
model.add(Dense(1))

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model (one pass over the data, one sample per gradient step)
model.fit(x_train, y_train, batch_size=1, epochs=1)
/Users/anaconda3/lib/python3.11/site-packages/keras/src/layers/rnn/rnn.py:204: UserWarning: Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.
  super().__init__(**kwargs)
4235/4235 ━━━━━━━━━━━━━━━━━━━━ 69s 16ms/step - loss: 0.0019
Out[Β ]:
<keras.src.callbacks.history.History at 0x16846ddd0>
InΒ [Β ]:
# Build the test inputs.
# Start 60 rows before the train/test split so that the first test target
# still has a full 60-step look-back window.
test_data = scaled_data[training_data_len - 60:, :]
# Targets are kept in the original (unscaled) price units
y_test = dataset[training_data_len:, :]
# One 60-step window per test target
x_test = np.array(
    [test_data[stop - 60:stop, 0] for stop in range(60, len(test_data))]
)

# Add a trailing feature axis: (samples, timesteps) -> (samples, timesteps, 1)
x_test = x_test.reshape(x_test.shape[0], x_test.shape[1], 1)

# Predict in scaled space, then map the predictions back to price units
predictions = model.predict(x_test)
predictions = scaler.inverse_transform(predictions)

# Root mean squared error (RMSE) between predictions and actual prices
squared_errors = (predictions - y_test) ** 2
rmse = np.sqrt(np.mean(squared_errors))
rmse
8/8 ━━━━━━━━━━━━━━━━━━━━ 1s 54ms/step
Out[Β ]:
2.193327947069509

Evaluation

InΒ [Β ]:
import matplotlib.pyplot as plt

# Split the original (unscaled) series at the same point used for training
train = data[:training_data_len]
# Take an explicit copy: `data[...]` is a slice of `data`, and adding a column
# to such a slice raises pandas' SettingWithCopyWarning.
valid = data[training_data_len:].copy()
valid.loc[:, 'Predictions'] = predictions

# Visualize training data, actual validation data and model predictions
plt.figure(figsize=(16, 6))
plt.title('Model')
plt.xlabel('Date', fontsize=18)
plt.ylabel('mid_prices', fontsize=18)
plt.plot(train['mid_prices'])
plt.plot(valid[['mid_prices', 'Predictions']])
# Label every 500th row with its date
plt.xticks(range(0,df.shape[0],500),df['Date'].loc[::500],rotation=45)
plt.legend(['Train', 'Val', 'Predictions'], loc='upper left')
plt.show()
/var/folders/cx/3wbhcqyd3cld6gvk_xjkvr_40000gn/T/ipykernel_883/1047648143.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid.loc[:, 'Predictions'] = predictions  # Use .loc[] to set values
No description has been provided for this image

Conclusion