import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Load the heart-study CSV into a DataFrame and take a first look at it.
df = pd.read_csv(filepath_or_buffer="/Users/nnthieu/Downloads/heart.csv")
df.head(n=5)  # first five rows


df.shape  # (number of rows, number of columns)

(5209, 17)


# Count the missing values in every column.
df.isnull().sum()

Status               0
DeathCause        3218
AgeCHDdiag        3760
Sex                  0
AgeAtStart           0
Height               6
Weight               6
Diastolic            0
Systolic             0
MRW                  6
Smoking             36
AgeAtDeath        3218
Cholesterol        152
Chol_Status        152
BP_Status            0
Weight_Status        6
Smoking_Status      36
dtype: int64


# Class balance of the Status column (NaN excluded).
df["Status"].value_counts(dropna=True)

Alive    3218
Dead     1991
Name: Status, dtype: int64


# Number of missing Status values.
df["Status"].isnull().sum()

0


# Frequency of each recorded cause of death (NaN excluded).
df["DeathCause"].value_counts(dropna=True)

Coronary Heart Disease       605
Cancer                       539
Cerebral Vascular Disease    378
Other                        357
Unknown                      112
Name: DeathCause, dtype: int64


# Number of rows with no DeathCause recorded.
df["DeathCause"].isnull().sum()

3218


# Frequency of each Sex category (NaN excluded).
df["Sex"].value_counts(dropna=True)

Female    2873
Male      2336
Name: Sex, dtype: int64


# Number of missing Sex values.
df["Sex"].isnull().sum()

0


# Frequency of each Chol_Status category (NaN excluded).
df["Chol_Status"].value_counts(dropna=True)

Borderline    1861
High          1791
Desirable     1405
Name: Chol_Status, dtype: int64


# Number of missing Chol_Status values.
df["Chol_Status"].isnull().sum()

152


# Frequency of each BP_Status category (NaN excluded).
df["BP_Status"].value_counts(dropna=True)

High       2267
Normal     2143
Optimal     799
Name: BP_Status, dtype: int64


# Number of missing BP_Status values.
df["BP_Status"].isnull().sum()

0


# Frequency of each Weight_Status category (NaN excluded).
df["Weight_Status"].value_counts(dropna=True)

Overweight     3550
Normal         1472
Underweight     181
Name: Weight_Status, dtype: int64


# Number of missing Weight_Status values.
df["Weight_Status"].isnull().sum()

6


# Frequency of each Smoking_Status category (NaN excluded).
df["Smoking_Status"].value_counts(dropna=True)

Non-smoker           2501
Heavy (16-25)        1046
Light (1-5)           579
Moderate (6-15)       576
Very Heavy (> 25)     471
Name: Smoking_Status, dtype: int64


# Number of missing Smoking_Status values.
df["Smoking_Status"].isnull().sum()

36


df.describe()


# Impute missing Chol_Status labels by sampling from the observed category
# frequencies, so the filled-in values follow the same class proportions as
# the non-missing ones.
#
# A seeded Generator makes the imputation reproducible: the previous unseeded
# np.random.choice call drew different labels on every run, so the counts
# below (and everything fitted on this column) changed between executions.
#
# NOTE(review): Cholesterol is missing in the same number of rows as
# Chol_Status - presumably the same rows. The labels drawn here are not tied
# to the Cholesterol values imputed later; confirm that is acceptable.
chol_rng = np.random.default_rng(42)

proportions = df['Chol_Status'].value_counts(normalize=True, dropna=True)

chol_is_missing = df['Chol_Status'].isna()
missing_count = chol_is_missing.sum()

missing_samples = chol_rng.choice(
    proportions.index.to_numpy(),
    size=missing_count,
    p=proportions.to_numpy(),
)

df.loc[chol_is_missing, 'Chol_Status'] = missing_samples


# Category counts after imputation.
df["Chol_Status"].value_counts()

Borderline    1911
High          1850
Desirable     1448
Name: Chol_Status, dtype: int64


# Confirm that no Chol_Status values are missing any more.
df["Chol_Status"].isnull().sum()

0


# Replace every missing Weight_Status label with the fixed category "Normal".
# NOTE(review): "Normal" is not the most frequent category in the counts
# shown earlier ("Overweight" is); confirm this fill value is intended.
df['Weight_Status'] = df['Weight_Status'].fillna("Normal")


# Confirm that no Weight_Status values are missing any more.
df['Weight_Status'].isnull().sum()

0


# Weight_Status category counts after filling the gaps.
df["Weight_Status"].value_counts(dropna=True)

Overweight     3550
Normal         1478
Underweight     181
Name: Weight_Status, dtype: int64


# Impute missing DeathCause labels by sampling from the observed category
# frequencies.
#
# A seeded Generator makes the imputation reproducible: the previous unseeded
# np.random.choice call produced a different assignment on every run.
#
# NOTE(review): the number of missing DeathCause values equals the number of
# 'Alive' rows reported earlier, so these gaps presumably belong to subjects
# who have not died. Sampling a cause of death for them is questionable -
# confirm this column should be imputed (or used as a predictor) at all.
death_cause_rng = np.random.default_rng(42)

proportionsD = df['DeathCause'].value_counts(normalize=True, dropna=True)

death_cause_is_missing = df['DeathCause'].isna()
missing_countD = death_cause_is_missing.sum()

missing_samplesD = death_cause_rng.choice(
    proportionsD.index.to_numpy(),
    size=missing_countD,
    p=proportionsD.to_numpy(),
)

df.loc[death_cause_is_missing, 'DeathCause'] = missing_samplesD


# Confirm that no DeathCause values are missing any more.
df['DeathCause'].isna().sum()

0


# BP_Status needs no imputation; re-check that nothing is missing.
df["BP_Status"].isnull().sum()

0


# Replace every missing Smoking_Status label with the category "Non-smoker".
df['Smoking_Status'] = df['Smoking_Status'].fillna("Non-smoker")


# Confirm that no Smoking_Status values are missing any more.
df['Smoking_Status'].isnull().sum()

0


# Impute missing values in the numeric columns with each column's mean.
#
# The means are computed on the numeric columns only. A bare df.mean() on a
# frame that still holds string columns emits a FutureWarning on older pandas
# and raises TypeError on newer releases.
numeric_cols = df.select_dtypes(include='number').columns
means = df[numeric_cols].mean()

# NOTE(review): AgeAtDeath is missing for exactly as many rows as there are
# 'Alive' subjects, and AgeCHDdiag is missing for most rows. These gaps are
# presumably structural rather than random; confirm that mean imputation is
# appropriate before using those columns as predictors.
df[numeric_cols] = df[numeric_cols].fillna(means)

/var/folders/cx/3wbhcqyd3cld6gvk_xjkvr_40000gn/T/ipykernel_66411/385573716.py:2: FutureWarning: The default value of numeric_only in DataFrame.mean is deprecated. In a future version, it will default to False. In addition, specifying 'numeric_only=None' is deprecated. Select only valid columns or specify the value of numeric_only to silence this warning.
  means = df.mean()


df.describe()


# Binary-encode the target column: 'Dead' -> 1, 'Alive' -> 0.
status_mapping = {
    'Alive': 0,
    'Dead': 1,
}
df['Status'] = df['Status'].map(status_mapping)


# Class counts after encoding.
df["Status"].value_counts(dropna=True)

0    3218
1    1991
Name: Status, dtype: int64


# Integer-encode DeathCause.
#
# Every label that occurs in the column must have an entry here, because
# Series.map turns any label absent from the dictionary into NaN. 'Unknown'
# was previously left out, which put NaN values into the encoded column and,
# from there, into the feature matrix used for modelling.
status_mappingD = {
    'Coronary Heart Disease': 1,
    'Cancer': 2,
    'Cerebral Vascular Disease': 3,
    'Other': 4,
    'Unknown': 5,
}

df['DeathCause'] = df['DeathCause'].map(status_mappingD)


# Category counts after encoding.
df['DeathCause'].value_counts()

1.0    1590
2.0    1398
3.0     982
4.0     957
Name: DeathCause, dtype: int64


# Count DeathCause entries that are NaN after the encoding step
# (labels without an entry in the mapping become NaN).
df["DeathCause"].isnull().sum()

282


# Binary-encode Sex: 'Male' -> 1, 'Female' -> 0.
status_mappingSex = {
    'Female': 0,
    'Male': 1,
}
df['Sex'] = df['Sex'].map(status_mappingSex)


# Category counts after encoding.
df["Sex"].value_counts(dropna=True)

0    2873
1    2336
Name: Sex, dtype: int64


# Integer-encode Smoking_Status; codes rise with the cigarette range named
# in each label (0 = non-smoker ... 4 = very heavy).
status_mappingS = {
    'Non-smoker': 0,
    'Light (1-5)': 1,
    'Moderate (6-15)': 2,
    'Heavy (16-25)': 3,
    'Very Heavy (> 25)': 4,
}
df['Smoking_Status'] = df['Smoking_Status'].map(status_mappingS)


# Category counts after encoding.
df["Smoking_Status"].value_counts(dropna=True)

0    2537
3    1046
1     579
2     576
4     471
Name: Smoking_Status, dtype: int64


# Integer-encode Chol_Status: 'Desirable' -> 0, 'Borderline' -> 1, 'High' -> 2.
status_mappingC = {
    'Desirable': 0,
    'Borderline': 1,
    'High': 2,
}
df['Chol_Status'] = df['Chol_Status'].map(status_mappingC)


# Category counts after encoding.
df["Chol_Status"].value_counts(dropna=True)

1    1911
2    1850
0    1448
Name: Chol_Status, dtype: int64


# Integer-encode BP_Status: 'Normal' -> 0, 'Optimal' -> 1, 'High' -> 2.
# NOTE(review): these codes place 'Optimal' between 'Normal' and 'High'; if
# the column is meant to be an ordinal scale, confirm this order is intended.
status_mappingBP = {
    'Normal': 0,
    'Optimal': 1,
    'High': 2,
}
df['BP_Status'] = df['BP_Status'].map(status_mappingBP)


# Category counts after encoding.
df["BP_Status"].value_counts(dropna=True)

2    2267
0    2143
1     799
Name: BP_Status, dtype: int64


# Integer-encode Weight_Status: 'Normal' -> 0, 'Underweight' -> 1,
# 'Overweight' -> 2.
# NOTE(review): these codes place 'Underweight' between 'Normal' and
# 'Overweight'; if the column is meant to be an ordinal scale, confirm this
# order is intended.
status_mappingW = {
    'Normal': 0,
    'Underweight': 1,
    'Overweight': 2,
}
df['Weight_Status'] = df['Weight_Status'].map(status_mappingW)


# Category counts after encoding.
df["Weight_Status"].value_counts(dropna=True)

2    3550
0    1478
1     181
Name: Weight_Status, dtype: int64


# Final check: missing-value count per column after cleaning and encoding.
df.isnull().sum()

Status            0
DeathCause        0
AgeCHDdiag        0
Sex               0
AgeAtStart        0
Height            0
Weight            0
Diastolic         0
Systolic          0
MRW               0
Smoking           0
AgeAtDeath        0
Cholesterol       0
Chol_Status       0
BP_Status         0
Weight_Status     0
Smoking_Status    0
dtype: int64


import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix


# Baseline model: predict Status from every other column.
# X holds the predictors (all columns except the target); y is the target.
X = df.drop(columns='Status')
y = df['Status']

# NOTE(review): X still contains DeathCause and AgeAtDeath. Before imputation
# both were missing for exactly as many rows as there are 'Alive' subjects,
# so they are presumably derived from the outcome - confirm they are
# legitimate predictors and not target leakage.

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# With the default iteration cap, lbfgs stopped before converging on these
# unscaled features (ConvergenceWarning). max_iter is only an upper bound,
# so a generous cap lets the solver finish without changing the model.
model = LogisticRegression(max_iter=5000)

# Fit on the training split, then predict the held-out rows.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Report the confusion matrix and the per-class precision/recall/F1.
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Confusion Matrix:
[[536 111]
 [143 252]]

Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.83      0.81       647
           1       0.69      0.64      0.66       395

    accuracy                           0.76      1042
   macro avg       0.74      0.73      0.74      1042
weighted avg       0.75      0.76      0.75      1042

/opt/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py:814: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


from sklearn.feature_selection import SelectKBest, chi2

# Univariate feature selection on the training split: score every predictor
# against the target with the chi-squared statistic and keep the top k.
# NOTE(review): chi2 is documented for non-negative, count-like features;
# confirm it is a suitable score for the continuous columns used here.
k = 5  # number of features to keep
selector = SelectKBest(score_func=chi2, k=k)
selector.fit(X_train, y_train)
X_train_selected = selector.transform(X_train)

# Column positions of the retained features, then their names.
selected_indices = np.flatnonzero(selector.get_support())
selected_features = X_train.columns[selected_indices]

print("Selected features:", selected_features)

Selected features: Index(['AgeAtStart', 'Diastolic', 'Systolic', 'Smoking', 'Cholesterol'], dtype='object')


# Reduced model: drop the target plus the columns excluded from the feature
# set, and predict Status from what remains.
X = df.drop(
    columns=[
        'Status', 'DeathCause', 'AgeCHDdiag', 'Weight', 'Height', 'MRW',
        'Chol_Status', 'BP_Status', 'Weight_Status', 'Smoking_Status',
    ]
)
y = df['Status']

# NOTE(review): AgeAtDeath is still part of X. Before imputation it was
# missing for exactly as many rows as there are 'Alive' subjects, so it is
# presumably derived from the outcome - confirm it is not target leakage.

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# With the default iteration cap, lbfgs stopped before converging on these
# unscaled features (ConvergenceWarning). max_iter is only an upper bound,
# so a generous cap lets the solver finish without changing the model.
model = LogisticRegression(max_iter=5000)

# Fit on the training split, then predict the held-out rows.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Report the confusion matrix and the per-class precision/recall/F1.
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Confusion Matrix:
[[537 110]
 [143 252]]

Classification Report:
              precision    recall  f1-score   support

       Alive       0.79      0.83      0.81       647
        Dead       0.70      0.64      0.67       395

    accuracy                           0.76      1042
   macro avg       0.74      0.73      0.74      1042
weighted avg       0.75      0.76      0.75      1042

/opt/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py:814: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(

	AgeCHDdiag	AgeAtStart	Height	Weight	Diastolic	Systolic	MRW	Smoking	AgeAtDeath	Cholesterol
count	1449.000000	5209.000000	5203.000000	5203.000000	5209.000000	5209.000000	5203.000000	5173.000000	1991.000000	5057.000000
mean	63.302968	44.068727	64.813185	153.086681	85.358610	136.909580	119.957525	9.366518	70.536414	227.417441
std	10.018215	8.574954	3.582707	28.915426	12.973091	23.739596	19.983401	12.031451	10.559406	44.935524
min	32.000000	28.000000	51.500000	67.000000	50.000000	82.000000	67.000000	0.000000	36.000000	96.000000
25%	57.000000	37.000000	62.250000	132.000000	76.000000	120.000000	106.000000	0.000000	63.000000	196.000000
50%	63.000000	43.000000	64.500000	150.000000	84.000000	132.000000	118.000000	1.000000	71.000000	223.000000
75%	70.000000	51.000000	67.500000	172.000000	92.000000	148.000000	131.000000	20.000000	79.000000	255.000000
max	90.000000	62.000000	76.500000	300.000000	160.000000	300.000000	268.000000	60.000000	93.000000	568.000000

	AgeCHDdiag	AgeAtStart	Height	Weight	Diastolic	Systolic	MRW	Smoking	AgeAtDeath	Cholesterol
count	5209.000000	5209.000000	5209.000000	5209.000000	5209.000000	5209.000000	5209.000000	5209.000000	5209.000000	5209.000000
mean	63.302968	44.068727	64.813185	153.086681	85.358610	136.909580	119.957525	9.366518	70.536414	227.417441
std	5.282496	8.574954	3.580643	28.898765	12.973091	23.739596	19.971887	11.989796	6.527255	44.274927
min	32.000000	28.000000	51.500000	67.000000	50.000000	82.000000	67.000000	0.000000	36.000000	96.000000
25%	63.302968	37.000000	62.250000	132.000000	76.000000	120.000000	106.000000	0.000000	70.536414	197.000000
50%	63.302968	43.000000	64.500000	150.000000	84.000000	132.000000	118.000000	1.000000	70.536414	225.000000
75%	63.302968	51.000000	67.500000	172.000000	92.000000	148.000000	131.000000	20.000000	70.536414	251.000000
max	90.000000	62.000000	76.500000	300.000000	160.000000	300.000000	268.000000	60.000000	93.000000	568.000000

Heart Disease Logistic Regression Using Python

Data loading

Cleaning data

Encoding variables

Logistic regression with 'Status' as the target variable

	Status	DeathCause	AgeCHDdiag	Sex	AgeAtStart	Height	Weight	Diastolic	Systolic	MRW	Smoking	AgeAtDeath	Cholesterol	Chol_Status	BP_Status	Weight_Status	Smoking_Status
0	Dead	Other	NaN	Female	29	62.50	140.0	78	124	121.0	0.0	55.0	NaN	NaN	Normal	Overweight	Non-smoker
1	Dead	Cancer	NaN	Female	41	59.75	194.0	92	144	183.0	0.0	57.0	181.0	Desirable	High	Overweight	Non-smoker
2	Alive	NaN	NaN	Female	57	62.25	132.0	90	170	114.0	10.0	NaN	250.0	High	High	Overweight	Moderate (6-15)
3	Alive	NaN	NaN	Female	39	65.75	158.0	80	128	123.0	0.0	NaN	242.0	High	Normal	Overweight	Non-smoker
4	Alive	NaN	NaN	Male	42	66.00	156.0	76	110	116.0	20.0	NaN	281.0	High	Optimal	Overweight	Heavy (16-25)