Heart Disease Logistic Regression Using Python¶

Data loading¶

In [104]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
df= pd.read_csv("/Users/nnthieu/Downloads/heart.csv")
df.head()
Out[104]:
Status DeathCause AgeCHDdiag Sex AgeAtStart Height Weight Diastolic Systolic MRW Smoking AgeAtDeath Cholesterol Chol_Status BP_Status Weight_Status Smoking_Status
0 Dead Other NaN Female 29 62.50 140.0 78 124 121.0 0.0 55.0 NaN NaN Normal Overweight Non-smoker
1 Dead Cancer NaN Female 41 59.75 194.0 92 144 183.0 0.0 57.0 181.0 Desirable High Overweight Non-smoker
2 Alive NaN NaN Female 57 62.25 132.0 90 170 114.0 10.0 NaN 250.0 High High Overweight Moderate (6-15)
3 Alive NaN NaN Female 39 65.75 158.0 80 128 123.0 0.0 NaN 242.0 High Normal Overweight Non-smoker
4 Alive NaN NaN Male 42 66.00 156.0 76 110 116.0 20.0 NaN 281.0 High Optimal Overweight Heavy (16-25)
In [102]:
df.shape
Out[102]:
(5209, 17)
In [103]:
df.isna().sum()
Out[103]:
Status               0
DeathCause        3218
AgeCHDdiag        3760
Sex                  0
AgeAtStart           0
Height               6
Weight               6
Diastolic            0
Systolic             0
MRW                  6
Smoking             36
AgeAtDeath        3218
Cholesterol        152
Chol_Status        152
BP_Status            0
Weight_Status        6
Smoking_Status      36
dtype: int64
In [52]:
df["Status"].value_counts()
Out[52]:
Alive    3218
Dead     1991
Name: Status, dtype: int64
In [53]:
df["Status"].isna().sum()
Out[53]:
0
In [54]:
df["DeathCause"].value_counts()
Out[54]:
Coronary Heart Disease       605
Cancer                       539
Cerebral Vascular Disease    378
Other                        357
Unknown                      112
Name: DeathCause, dtype: int64
In [55]:
df["DeathCause"].isna().sum()
Out[55]:
3218
In [56]:
df["Sex"].value_counts()
Out[56]:
Female    2873
Male      2336
Name: Sex, dtype: int64
In [57]:
df["Sex"].isna().sum()
Out[57]:
0
In [58]:
df["Chol_Status"].value_counts()
Out[58]:
Borderline    1861
High          1791
Desirable     1405
Name: Chol_Status, dtype: int64
In [59]:
df['Chol_Status'].isna().sum()
Out[59]:
152
In [60]:
df["BP_Status"].value_counts()
Out[60]:
High       2267
Normal     2143
Optimal     799
Name: BP_Status, dtype: int64
In [61]:
df['BP_Status'].isna().sum()
Out[61]:
0
In [62]:
df["Weight_Status"].value_counts()
Out[62]:
Overweight     3550
Normal         1472
Underweight     181
Name: Weight_Status, dtype: int64
In [63]:
df["Weight_Status"].isna().sum()
Out[63]:
6
In [64]:
df["Smoking_Status"].value_counts()
Out[64]:
Non-smoker           2501
Heavy (16-25)        1046
Light (1-5)           579
Moderate (6-15)       576
Very Heavy (> 25)     471
Name: Smoking_Status, dtype: int64
In [65]:
df['Smoking_Status'].isna().sum()
Out[65]:
36
In [66]:
df.describe()
Out[66]:
AgeCHDdiag AgeAtStart Height Weight Diastolic Systolic MRW Smoking AgeAtDeath Cholesterol
count 1449.000000 5209.000000 5203.000000 5203.000000 5209.000000 5209.000000 5203.000000 5173.000000 1991.000000 5057.000000
mean 63.302968 44.068727 64.813185 153.086681 85.358610 136.909580 119.957525 9.366518 70.536414 227.417441
std 10.018215 8.574954 3.582707 28.915426 12.973091 23.739596 19.983401 12.031451 10.559406 44.935524
min 32.000000 28.000000 51.500000 67.000000 50.000000 82.000000 67.000000 0.000000 36.000000 96.000000
25% 57.000000 37.000000 62.250000 132.000000 76.000000 120.000000 106.000000 0.000000 63.000000 196.000000
50% 63.000000 43.000000 64.500000 150.000000 84.000000 132.000000 118.000000 1.000000 71.000000 223.000000
75% 70.000000 51.000000 67.500000 172.000000 92.000000 148.000000 131.000000 20.000000 79.000000 255.000000
max 90.000000 62.000000 76.500000 300.000000 160.000000 300.000000 268.000000 60.000000 93.000000 568.000000

Cleaning data¶

Imputating the missing data in 'Chol_Status' as the proportions of other values in the variable.

In [112]:
proportions = df['Chol_Status'].value_counts(normalize=True, dropna=True)

missing_count = df['Chol_Status'].isna().sum()

missing_samples = np.random.choice(proportions.index, size=missing_count, p=proportions.values)

df.loc[df['Chol_Status'].isna(), 'Chol_Status'] = missing_samples
In [68]:
df["Chol_Status"].value_counts()
Out[68]:
Borderline    1911
High          1850
Desirable     1448
Name: Chol_Status, dtype: int64
In [69]:
df["Chol_Status"].isna().sum()
Out[69]:
0

Imputing 6 the missing data in 'Weight_Status' with 'Normal' value.

In [113]:
df.loc[df['Weight_Status'].isna(), 'Weight_Status'] = "Normal"
In [71]:
df['Weight_Status'].isna().sum()
Out[71]:
0
In [72]:
df["Weight_Status"].value_counts()
Out[72]:
Overweight     3550
Normal         1478
Underweight     181
Name: Weight_Status, dtype: int64
In [114]:
proportionsD = df['DeathCause'].value_counts(normalize=True, dropna=True)

missing_countD = df['DeathCause'].isna().sum()

missing_samplesD = np.random.choice(proportionsD.index, size=missing_countD, p=proportionsD.values)

df.loc[df['DeathCause'].isna(), 'DeathCause'] = missing_samplesD
In [97]:
df['DeathCause'].isna().sum()
Out[97]:
0
In [75]:
df['BP_Status'].isna().sum()
Out[75]:
0

Imputing the issing data in 'Smoking_Status' with 'Non-smoker'

In [115]:
df.loc[df['Smoking_Status'].isna(), 'Smoking_Status'] = "Non-smoker"
In [77]:
df['Smoking_Status'].isna().sum()
Out[77]:
0

Imputing numeric variabls with their means

In [110]:
# Calculate means for numeric columns
means = df.mean()

# Impute missing values with means using map function
for col in df.select_dtypes(include='number'):
    df[col] = df[col].fillna(means[col])
/var/folders/cx/3wbhcqyd3cld6gvk_xjkvr_40000gn/T/ipykernel_66411/385573716.py:2: FutureWarning: The default value of numeric_only in DataFrame.mean is deprecated. In a future version, it will default to False. In addition, specifying 'numeric_only=None' is deprecated. Select only valid columns or specify the value of numeric_only to silence this warning.
  means = df.mean()
In [79]:
df.describe()
Out[79]:
AgeCHDdiag AgeAtStart Height Weight Diastolic Systolic MRW Smoking AgeAtDeath Cholesterol
count 5209.000000 5209.000000 5209.000000 5209.000000 5209.000000 5209.000000 5209.000000 5209.000000 5209.000000 5209.000000
mean 63.302968 44.068727 64.813185 153.086681 85.358610 136.909580 119.957525 9.366518 70.536414 227.417441
std 5.282496 8.574954 3.580643 28.898765 12.973091 23.739596 19.971887 11.989796 6.527255 44.274927
min 32.000000 28.000000 51.500000 67.000000 50.000000 82.000000 67.000000 0.000000 36.000000 96.000000
25% 63.302968 37.000000 62.250000 132.000000 76.000000 120.000000 106.000000 0.000000 70.536414 197.000000
50% 63.302968 43.000000 64.500000 150.000000 84.000000 132.000000 118.000000 1.000000 70.536414 225.000000
75% 63.302968 51.000000 67.500000 172.000000 92.000000 148.000000 131.000000 20.000000 70.536414 251.000000
max 90.000000 62.000000 76.500000 300.000000 160.000000 300.000000 268.000000 60.000000 93.000000 568.000000

Decoding variables¶

For Status, 1 = 'Dead', 0 = 'Alive'

In [80]:
# Define mapping dictionary
status_mapping = {'Dead':1, 'Alive':0}

# Decode 'Status' column
df['Status'] = df['Status'].map(status_mapping)
In [81]:
df["Status"].value_counts()
Out[81]:
0    3218
1    1991
Name: Status, dtype: int64
In [82]:
# Define mapping dictionary
status_mappingD = {'Coronary Heart Disease':1, 'Cancer':2,'Cerebral Vascular Disease':3, 'Other':4 }

df['DeathCause'] = df['DeathCause'].map(status_mappingD)
In [83]:
df['DeathCause'].value_counts()
Out[83]:
1.0    1590
2.0    1398
3.0     982
4.0     957
Name: DeathCause, dtype: int64
In [84]:
df['DeathCause'].isna().sum()
Out[84]:
282
In [108]:
# Define mapping dictionary
status_mappingSex = {'Male':1, 'Female':0 }

df['Sex'] = df['Sex'].map(status_mappingSex)
In [86]:
df['Sex'].value_counts()
Out[86]:
0    2873
1    2336
Name: Sex, dtype: int64
In [87]:
# Define mapping dictionary
status_mappingS = {'Light (1-5)':1, 'Non-smoker':0,'Moderate (6-15)':2,'Heavy (16-25)':3,'Very Heavy (> 25)':4}

df['Smoking_Status'] = df['Smoking_Status'].map(status_mappingS)
In [88]:
df["Smoking_Status"].value_counts()
Out[88]:
0    2537
3    1046
1     579
2     576
4     471
Name: Smoking_Status, dtype: int64
In [89]:
status_mappingC = {'Borderline':1, 'Desirable':0,'High':2}

df['Chol_Status'] = df['Chol_Status'].map(status_mappingC)
In [90]:
df["Chol_Status"].value_counts()
Out[90]:
1    1911
2    1850
0    1448
Name: Chol_Status, dtype: int64
In [91]:
status_mappingBP = {'Normal':0, 'Optimal':1, 'High':2}

df['BP_Status'] = df['BP_Status'].map(status_mappingBP)
In [92]:
df["BP_Status"].value_counts()
Out[92]:
2    2267
0    2143
1     799
Name: BP_Status, dtype: int64
In [93]:
status_mappingW = {'Normal':0, 'Underweight':1, 'Overweight':2}
df['Weight_Status'] = df['Weight_Status'].map(status_mappingW)
In [94]:
df["Weight_Status"].value_counts()
Out[94]:
2    3550
0    1478
1     181
Name: Weight_Status, dtype: int64

Logistic regression with 'Status' is target variable¶

In [116]:
df.isna().sum()
Out[116]:
Status            0
DeathCause        0
AgeCHDdiag        0
Sex               0
AgeAtStart        0
Height            0
Weight            0
Diastolic         0
Systolic          0
MRW               0
Smoking           0
AgeAtDeath        0
Cholesterol       0
Chol_Status       0
BP_Status         0
Weight_Status     0
Smoking_Status    0
dtype: int64
In [ ]:
 
In [99]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix


# Split data into independent variables (X) and dependent variable (y)
X = df.drop('Status', axis=1)
y = df['Status']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize logistic regression model
model = LogisticRegression()

# Fit the model on the training data
model.fit(X_train, y_train)

# Predict on the testing data
y_pred = model.predict(X_test)

# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
Confusion Matrix:
[[536 111]
 [143 252]]

Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.83      0.81       647
           1       0.69      0.64      0.66       395

    accuracy                           0.76      1042
   macro avg       0.74      0.73      0.74      1042
weighted avg       0.75      0.76      0.75      1042

/opt/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py:814: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(

Select variables contributing most to the logistic model.

In [105]:
from sklearn.feature_selection import SelectKBest, chi2

# Assuming X_train, y_train are your training data
# Replace 5 with the desired number of features
k = 5
selector = SelectKBest(score_func=chi2, k=k)
X_train_selected = selector.fit_transform(X_train, y_train)

# Get the selected feature indices
selected_indices = selector.get_support(indices=True)

# Get the names of selected features
selected_features = X_train.columns[selected_indices]

print("Selected features:", selected_features)
Selected features: Index(['AgeAtStart', 'Diastolic', 'Systolic', 'Smoking', 'Cholesterol'], dtype='object')

Rewrite the model with selected variables. However, the accuracy is not improved more.

In [117]:
# Split data into independent variables (X) and dependent variable (y)
X = df.drop(['Status','DeathCause','AgeCHDdiag','Weight','Height','MRW','Chol_Status','BP_Status','Weight_Status','Smoking_Status'], axis=1)
y = df['Status']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize logistic regression model
model = LogisticRegression()

# Fit the model on the training data
model.fit(X_train, y_train)

# Predict on the testing data
y_pred = model.predict(X_test)

# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
Confusion Matrix:
[[537 110]
 [143 252]]

Classification Report:
              precision    recall  f1-score   support

       Alive       0.79      0.83      0.81       647
        Dead       0.70      0.64      0.67       395

    accuracy                           0.76      1042
   macro avg       0.74      0.73      0.74      1042
weighted avg       0.75      0.76      0.75      1042

/opt/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py:814: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
In [ ]: