EHR Data: Machine Learning Classification Models with Time Series Data¶

Loading data¶

I have 5 .txt file of time series data in a local folder. Each txt file for a patient with many rows and some columns for features. I load them into a dataset for all with 'patient_id' for each file.

This post imitates the time series data extracted form EHR.

In [ ]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
In [ ]:
# Step 1: Load and preprocess data
data_folder = '/Users/nnthieu/Downloads/Data'
patient_data = []

# Iterate over files in the data folder
for filename in os.listdir(data_folder):
    if filename.endswith('.txt'):  # Assuming patient data files have .csv extension
        file_path = os.path.join(data_folder, filename)
        patient_df = pd.read_csv(file_path)
        patient_id = int(filename.split('.')[0])  # Extract patient ID from file name
        patient_df['patient_id'] = patient_id
        patient_data.append(patient_df)

# Combine data for all patients into a single DataFrame
all_data = pd.concat(patient_data, ignore_index=True)
In [ ]:
print(all_data.head())
print(all_data.shape)
         Date   Open   High    Low  Close   Volume  OpenInt  patient_id
0  2014-10-02  20.00  20.10  17.60  18.50  2799073        0           5
1  2014-10-03  18.20  18.75  18.05  18.65   155562        0           5
2  2014-10-06  18.48  19.58  18.48  19.24   188229        0           5
3  2014-10-07  19.25  19.48  18.93  19.24   176606        0           5
4  2014-10-08  19.17  19.48  18.90  19.13    37046        0           5
(23319, 8)

Merging data with the patient's diagnosed disease

In [ ]:
target = {'patient_id':[1,2,3,4,5], 'class':[0,1,0,1,0]}
target = pd.DataFrame(target)
print(target.head())
print(target.shape)
   patient_id  class
0           1      0
1           2      1
2           3      0
3           4      1
4           5      0
(5, 2)
In [ ]:
merged_df = pd.merge(all_data, target, on='patient_id', how='left')
merged_df = merged_df.drop(['OpenInt', 'Date'], axis =1)

print(merged_df.head())
print(merged_df.shape)
print(merged_df['class'].unique())
print(merged_df['class'].value_counts())
    Open   High    Low  Close   Volume  patient_id  class
0  20.00  20.10  17.60  18.50  2799073           5      0
1  18.20  18.75  18.05  18.65   155562           5      0
2  18.48  19.58  18.48  19.24   188229           5      0
3  19.25  19.48  18.93  19.24   176606           5      0
4  19.17  19.48  18.90  19.13    37046           5      0
(23319, 7)
[0 1]
class
1    17508
0     5811
Name: count, dtype: int64
In [ ]:
merged_df['Time'] = merged_df.groupby('patient_id').cumcount() + 1
merged_df.groupby('patient_id').head(3)
Out[ ]:
Open High Low Close Volume patient_id class Time
0 20.0000 20.1000 17.6000 18.5000 2799073 5 0 1
1 18.2000 18.7500 18.0500 18.6500 155562 5 0 2
2 18.4800 19.5800 18.4800 19.2400 188229 5 0 3
785 1.0500 1.7900 1.0200 1.3800 408720000 4 1 1
786 1.4900 1.5000 1.2500 1.3400 79231200 4 1 2
787 1.3400 1.3400 1.1700 1.2000 48026400 4 1 3
6219 18.5000 25.9000 18.0000 24.5000 1584600 3 0 1
6220 24.2500 27.1200 22.5000 25.0000 83000 3 0 2
6221 25.4700 26.2000 24.5500 25.2600 67300 3 0 3
6724 2.2972 2.3178 2.2972 2.2972 26437 2 1 1
6725 2.2972 2.2972 2.2544 2.2759 29887 2 1 2
6726 2.2759 2.2759 2.2342 2.2342 106900 2 1 3
18798 30.7130 33.7540 27.0020 29.7020 66277506 1 0 1
18799 28.9860 29.0270 26.8720 27.2570 16142920 1 0 2
18800 27.8860 29.7020 27.0440 29.7020 6970266 1 0 3
In [ ]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Convert infinite values to NaN
merged_df.replace([np.inf, -np.inf], np.nan, inplace=True)

# Plot line graph
plt.figure(figsize=(10, 6))
sns.lineplot(data=merged_df, x='Time', y='Close', hue='patient_id', palette='Set1', errorbar=None)
plt.title('Close Price Over Time')
plt.xlabel('Time')
plt.ylabel('Close')
plt.legend(title='Patient ID', loc='upper right')
plt.show()
/Users/anaconda3/lib/python3.11/site-packages/seaborn/_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
  with pd.option_context('mode.use_inf_as_na', True):
/Users/anaconda3/lib/python3.11/site-packages/seaborn/_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
  with pd.option_context('mode.use_inf_as_na', True):
/Users/anaconda3/lib/python3.11/site-packages/seaborn/_oldcore.py:1075: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning.
  data_subset = grouped_data.get_group(pd_key)
/Users/anaconda3/lib/python3.11/site-packages/seaborn/_oldcore.py:1075: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning.
  data_subset = grouped_data.get_group(pd_key)
/Users/anaconda3/lib/python3.11/site-packages/seaborn/_oldcore.py:1075: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning.
  data_subset = grouped_data.get_group(pd_key)
/Users/anaconda3/lib/python3.11/site-packages/seaborn/_oldcore.py:1075: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning.
  data_subset = grouped_data.get_group(pd_key)
/Users/anaconda3/lib/python3.11/site-packages/seaborn/_oldcore.py:1075: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning.
  data_subset = grouped_data.get_group(pd_key)
No description has been provided for this image

Building classification model and validation¶

In [ ]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestRegressor

# Assuming the last column contains the target variable (disease outcome: 1 for diseased, 0 for non-diseased)
X = merged_df.drop(columns=['class'])
y = merged_df['class']

# Step 2: Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
In [ ]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Step 3: Model Selection and Training
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Step 4: Model Evaluation
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Generate classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Generate confusion matrix
predictions = model.predict(X_test)
cm = confusion_matrix(y_test, predictions, labels=model.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,  display_labels=model.classes_)
disp.plot()
Accuracy: 1.0
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1162
           1       1.00      1.00      1.00      3502

    accuracy                           1.00      4664
   macro avg       1.00      1.00      1.00      4664
weighted avg       1.00      1.00      1.00      4664

Out[ ]:
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x15392c250>
No description has been provided for this image
In [ ]: