import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

import pandas as pd
# Load the encounter-level readmission dataset and print its schema
# (column names, non-null counts, dtypes).
csv_path = "/Users/nnthieu/Downloads/Data/Readmission_models.csv"
df = pd.read_csv(csv_path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 176 entries, 0 to 175
Data columns (total 43 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   176 non-null    object 
 1   start                176 non-null    object 
 2   stop                 176 non-null    object 
 3   patient              176 non-null    object 
 4   organization         176 non-null    object 
 5   provider             176 non-null    object 
 6   payer                176 non-null    object 
 7   encounterclass       176 non-null    object 
 8   code                 176 non-null    int64  
 9   description          176 non-null    object 
 10  base_encounter_cost  176 non-null    float64
 11  total_claim_cost     176 non-null    float64
 12  payer_coverage       176 non-null    float64
 13  reasoncode           176 non-null    int64  
 14  reasondescription    176 non-null    object 
 15  id-2                 176 non-null    object 
 16  birthdate            176 non-null    object 
 17  deathdate            130 non-null    object 
 18  ssn                  176 non-null    object 
 19  drivers              176 non-null    object 
 20  passport             176 non-null    object 
 21  prefix               176 non-null    object 
 22  first                176 non-null    object 
 23  middle               59 non-null     object 
 24  last                 176 non-null    object 
 25  suffix               1 non-null      object 
 26  maiden               30 non-null     object 
 27  marital              176 non-null    object 
 28  race                 176 non-null    object 
 29  ethnicity            176 non-null    object 
 30  gender               176 non-null    object 
 31  birthplace           176 non-null    object 
 32  address              176 non-null    object 
 33  city                 176 non-null    object 
 34  state                176 non-null    object 
 35  county               176 non-null    object 
 36  fips                 32 non-null     float64
 37  zip                  176 non-null    int64  
 38  lat                  176 non-null    float64
 39  lon                  176 non-null    float64
 40  healthcare_expenses  176 non-null    float64
 41  healthcare_coverage  176 non-null    float64
 42  income               176 non-null    int64  
dtypes: float64(8), int64(4), object(31)
memory usage: 59.3+ KB

# Parse the encounter start/stop timestamps and the patient's birthdate.
df['admitted_date'] = pd.to_datetime(df['start'])
df['discharged_date'] = pd.to_datetime(df['stop'])
df['birthdate'] = pd.to_datetime(df['birthdate'])

# Length of stay in whole days.
df['length_of_stay'] = (df['discharged_date'] - df['admitted_date']).dt.days

# Age at admission in whole years (a year is approximated as 365 days).
df['age'] = (df['admitted_date'] - df['birthdate']).dt.days // 365

# Order each patient's encounters chronologically so that "next row of the
# same patient" means "next admission of that patient".
df.sort_values(by=['patient', 'admitted_date'], inplace=True)

# Flag an encounter as 'readmitted' when the same patient's next admission
# starts within 30 days of this encounter's discharge.
#
# The previous row-by-row loop read rows positionally (df.iloc[i]) but wrote
# the flag by index label (df.at[i, ...]). After sort_values the labels no
# longer match row positions, so the flag was set on unrelated rows. A
# per-patient shift keeps everything aligned on the index and avoids the loop.
next_admission = df.groupby('patient')['admitted_date'].shift(-1)
days_to_next_admission = (next_admission - df['discharged_date']).dt.days
# The last encounter of each patient has no next admission (NaN gap), and
# NaN <= 30 evaluates to False, so it is labelled 0.
df['readmitted'] = (days_to_next_admission <= 30).astype(int)

df['readmitted'].value_counts()

readmitted
0    122
1     54
Name: count, dtype: int64

df.isnull().sum()

id                       0
start                    0
stop                     0
patient                  0
organization             0
provider                 0
payer                    0
encounterclass           0
code                     0
description              0
base_encounter_cost      0
total_claim_cost         0
payer_coverage           0
reasoncode               0
reasondescription        0
id-2                     0
birthdate                0
deathdate               46
ssn                      0
drivers                  0
passport                 0
prefix                   0
first                    0
middle                 117
last                     0
suffix                 175
maiden                 146
marital                  0
race                     0
ethnicity                0
gender                   0
birthplace               0
address                  0
city                     0
state                    0
county                   0
fips                   144
zip                      0
lat                      0
lon                      0
healthcare_expenses      0
healthcare_coverage      0
income                   0
admitted_date            0
discharged_date          0
length_of_stay           0
age                      0
readmitted               0
dtype: int64

# Encode gender as an integer feature: F -> 0, M -> 1.
# Series.replace on an object column relies on silent downcasting, which is
# deprecated (it emits a FutureWarning and will leave the column as object
# dtype in future pandas). Mapping through a lookup avoids that; values not in
# the lookup pass through unchanged, so re-running this cell is harmless.
gender_codes = {'F': 0, 'M': 1}
df['gender'] = df['gender'].map(lambda g: gender_codes.get(g, g))

/var/folders/cx/3wbhcqyd3cld6gvk_xjkvr_40000gn/T/ipykernel_59497/3294156851.py:1: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
  df['gender'] = df['gender'].replace({'F': 0, 'M': 1})

# Keep only the modelling columns: five predictors plus the target.
target_column = 'readmitted'
feature_columns = ['total_claim_cost', 'gender', 'income', 'length_of_stay', 'age']
df_f = df[feature_columns + [target_column]]

df_f.head()

# Separate the predictors (X) from the target (y).
X = df_f[feature_columns]
y = df_f[target_column]

# Hold out 30% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Fit a 100-tree random forest on the training portion.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

RandomForestClassifier(random_state=42)

RandomForestClassifier(random_state=42)

# Predict readmission labels for the held-out test rows.
y_pred = model.predict(X_test)

# Compute the evaluation metrics first, then report them in one place.
test_confusion = confusion_matrix(y_test, y_pred)
test_report = classification_report(y_test, y_pred)
test_accuracy = accuracy_score(y_test, y_pred)

print('Confusion Matrix:')
print(test_confusion)
print('Classification Report:')
print(test_report)
print('Accuracy:', test_accuracy)

Confusion Matrix:
[[33  5]
 [ 7  8]]
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.87      0.85        38
           1       0.62      0.53      0.57        15

    accuracy                           0.77        53
   macro avg       0.72      0.70      0.71        53
weighted avg       0.77      0.77      0.77        53

Accuracy: 0.7735849056603774

	total_claim_cost	gender	income	length_of_stay	age	readmitted
161	1012.63	0	77504	1	34	0
162	3480.38	0	77504	4	49	0
151	199646.60	1	29497	22	63	1
24	5750.55	0	47200	1	25	0
25	83463.56	0	47200	7	47	0