import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Step 1: Load and preprocess data
# Folder holding one delimited text file per patient, named "<patient_id>.txt".
data_folder = '/Users/nnthieu/Downloads/Data'
patient_data = []

# Iterate over files in the data folder. os.listdir() returns entries in
# arbitrary order, so sort them to keep the row order of the combined frame
# (and therefore any seeded split made from it) reproducible across runs.
for filename in sorted(os.listdir(data_folder)):
    if filename.endswith('.txt'):  # Patient files are CSV-formatted text stored with a .txt extension
        file_path = os.path.join(data_folder, filename)
        patient_df = pd.read_csv(file_path)
        patient_id = int(filename.split('.')[0])  # Extract patient ID from file name
        patient_df['patient_id'] = patient_id  # Tag every row with the patient it came from
        patient_data.append(patient_df)

# pd.concat() fails with an opaque "No objects to concatenate" on an empty
# list; raise the same exception type with a message that names the folder.
if not patient_data:
    raise ValueError(f"No .txt patient files found in {data_folder!r}")

# Combine data for all patients into a single DataFrame
all_data = pd.concat(patient_data, ignore_index=True)

print(all_data.head())
print(all_data.shape)

         Date   Open   High    Low  Close   Volume  OpenInt  patient_id
0  2014-10-02  20.00  20.10  17.60  18.50  2799073        0           5
1  2014-10-03  18.20  18.75  18.05  18.65   155562        0           5
2  2014-10-06  18.48  19.58  18.48  19.24   188229        0           5
3  2014-10-07  19.25  19.48  18.93  19.24   176606        0           5
4  2014-10-08  19.17  19.48  18.90  19.13    37046        0           5
(23319, 8)

# Label table: one row per patient_id together with its binary class.
target = pd.DataFrame(
    {
        'patient_id': [1, 2, 3, 4, 5],
        'class': [0, 1, 0, 1, 0],
    }
)
print(target.head())
print(target.shape)

   patient_id  class
0           1      0
1           2      1
2           3      0
3           4      1
4           5      0
(5, 2)

# Attach each row's class label through its patient_id (a left join keeps
# every measurement row), then drop the 'OpenInt' and 'Date' columns.
merged_df = (
    all_data
    .merge(target, on='patient_id', how='left')
    .drop(columns=['OpenInt', 'Date'])
)

# Quick sanity checks: preview, dimensions, and the label distribution.
print(merged_df.head())
print(merged_df.shape)
print(merged_df['class'].unique())
print(merged_df['class'].value_counts())

    Open   High    Low  Close   Volume  patient_id  class
0  20.00  20.10  17.60  18.50  2799073           5      0
1  18.20  18.75  18.05  18.65   155562           5      0
2  18.48  19.58  18.48  19.24   188229           5      0
3  19.25  19.48  18.93  19.24   176606           5      0
4  19.17  19.48  18.90  19.13    37046           5      0
(23319, 7)
[0 1]
class
1    17508
0     5811
Name: count, dtype: int64

# 'Time' is the 1-based position of each row within its own patient's rows
# (cumcount() is 0-based, hence the +1 applied via .add(1)).
merged_df['Time'] = merged_df.groupby('patient_id').cumcount().add(1)
# Notebook preview: first three rows of every patient.
merged_df.groupby('patient_id').head(3)

import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Replace +/-inf with NaN in place before handing the frame to seaborn.
merged_df.replace(to_replace=[np.inf, -np.inf], value=np.nan, inplace=True)

# Draw one line per patient: 'Close' against the per-patient 'Time' index.
plt.figure(figsize=(10, 6))
ax = sns.lineplot(
    data=merged_df,
    x='Time',
    y='Close',
    hue='patient_id',
    palette='Set1',
    errorbar=None,
)
ax.set_title('Close Price Over Time')
ax.set_xlabel('Time')
ax.set_ylabel('Close')
ax.legend(title='Patient ID', loc='upper right')
plt.show()

/Users/anaconda3/lib/python3.11/site-packages/seaborn/_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
  with pd.option_context('mode.use_inf_as_na', True):
/Users/anaconda3/lib/python3.11/site-packages/seaborn/_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
  with pd.option_context('mode.use_inf_as_na', True):
/Users/anaconda3/lib/python3.11/site-packages/seaborn/_oldcore.py:1075: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning.
  data_subset = grouped_data.get_group(pd_key)
/Users/anaconda3/lib/python3.11/site-packages/seaborn/_oldcore.py:1075: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning.
  data_subset = grouped_data.get_group(pd_key)
/Users/anaconda3/lib/python3.11/site-packages/seaborn/_oldcore.py:1075: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning.
  data_subset = grouped_data.get_group(pd_key)
/Users/anaconda3/lib/python3.11/site-packages/seaborn/_oldcore.py:1075: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning.
  data_subset = grouped_data.get_group(pd_key)
/Users/anaconda3/lib/python3.11/site-packages/seaborn/_oldcore.py:1075: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning.
  data_subset = grouped_data.get_group(pd_key)

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestRegressor

# The label column is selected by name ('class'); the remaining columns,
# minus the patient identifier, are the features.
# 'patient_id' must not be a feature: labels are assigned per patient, so the
# ID maps one-to-one onto 'class' and lets the model simply look the answer up
# (label leakage), which produces a meaningless perfect score.
X = merged_df.drop(columns=['class', 'patient_id'])
y = merged_df['class']

# Step 2: Split data into training and testing sets
# NOTE(review): this is a row-level random split, so rows from the same
# patient end up in both train and test; scores are likely still optimistic.
# Consider a split grouped by patient_id - TODO confirm the intended protocol.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Step 3: Model Selection and Training
# Random forest of 100 trees with a fixed seed so repeated runs match.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Step 4: Model Evaluation
# Predict once and reuse the result for every metric below.
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Generate classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Generate confusion matrix
# 'predictions' is kept as an alias of y_pred so any later code that reads it
# still works; the second, identical model.predict() call is not needed.
predictions = y_pred
cm = confusion_matrix(y_test, predictions, labels=model.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,  display_labels=model.classes_)
disp.plot()
# Render the figure explicitly so it also appears when run outside a notebook.
plt.show()

Accuracy: 1.0
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1162
           1       1.00      1.00      1.00      3502

    accuracy                           1.00      4664
   macro avg       1.00      1.00      1.00      4664
weighted avg       1.00      1.00      1.00      4664

<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x15392c250>

	Open	High	Low	Close	Volume	patient_id	class	Time
0	20.0000	20.1000	17.6000	18.5000	2799073	5	0	1
1	18.2000	18.7500	18.0500	18.6500	155562	5	0	2
2	18.4800	19.5800	18.4800	19.2400	188229	5	0	3
785	1.0500	1.7900	1.0200	1.3800	408720000	4	1	1
786	1.4900	1.5000	1.2500	1.3400	79231200	4	1	2
787	1.3400	1.3400	1.1700	1.2000	48026400	4	1	3
6219	18.5000	25.9000	18.0000	24.5000	1584600	3	0	1
6220	24.2500	27.1200	22.5000	25.0000	83000	3	0	2
6221	25.4700	26.2000	24.5500	25.2600	67300	3	0	3
6724	2.2972	2.3178	2.2972	2.2972	26437	2	1	1
6725	2.2972	2.2972	2.2544	2.2759	29887	2	1	2
6726	2.2759	2.2759	2.2342	2.2342	106900	2	1	3
18798	30.7130	33.7540	27.0020	29.7020	66277506	1	0	1
18799	28.9860	29.0270	26.8720	27.2570	16142920	1	0	2
18800	27.8860	29.7020	27.0440	29.7020	6970266	1	0	3

EHR Data: Machine Learning Classification Models with Time Series Data

Loading data

Building classification model and validation