import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
df = pd.read_csv("/Users/nnthieu/Downloads/heart.csv")
df.head()
| | Status | DeathCause | AgeCHDdiag | Sex | AgeAtStart | Height | Weight | Diastolic | Systolic | MRW | Smoking | AgeAtDeath | Cholesterol | Chol_Status | BP_Status | Weight_Status | Smoking_Status |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Dead | Other | NaN | Female | 29 | 62.50 | 140.0 | 78 | 124 | 121.0 | 0.0 | 55.0 | NaN | NaN | Normal | Overweight | Non-smoker |
| 1 | Dead | Cancer | NaN | Female | 41 | 59.75 | 194.0 | 92 | 144 | 183.0 | 0.0 | 57.0 | 181.0 | Desirable | High | Overweight | Non-smoker |
| 2 | Alive | NaN | NaN | Female | 57 | 62.25 | 132.0 | 90 | 170 | 114.0 | 10.0 | NaN | 250.0 | High | High | Overweight | Moderate (6-15) |
| 3 | Alive | NaN | NaN | Female | 39 | 65.75 | 158.0 | 80 | 128 | 123.0 | 0.0 | NaN | 242.0 | High | Normal | Overweight | Non-smoker |
| 4 | Alive | NaN | NaN | Male | 42 | 66.00 | 156.0 | 76 | 110 | 116.0 | 20.0 | NaN | 281.0 | High | Optimal | Overweight | Heavy (16-25) |
df.shape
(5209, 17)
df.isna().sum()
Status               0
DeathCause        3218
AgeCHDdiag        3760
Sex                  0
AgeAtStart           0
Height               6
Weight               6
Diastolic            0
Systolic             0
MRW                  6
Smoking             36
AgeAtDeath        3218
Cholesterol        152
Chol_Status        152
BP_Status            0
Weight_Status        6
Smoking_Status      36
dtype: int64
df["Status"].value_counts()
Alive    3218
Dead     1991
Name: Status, dtype: int64
df["Status"].isna().sum()
0
df["DeathCause"].value_counts()
Coronary Heart Disease       605
Cancer                       539
Cerebral Vascular Disease    378
Other                        357
Unknown                      112
Name: DeathCause, dtype: int64
df["DeathCause"].isna().sum()
3218
df["Sex"].value_counts()
Female    2873
Male      2336
Name: Sex, dtype: int64
df["Sex"].isna().sum()
0
df["Chol_Status"].value_counts()
Borderline    1861
High          1791
Desirable     1405
Name: Chol_Status, dtype: int64
df['Chol_Status'].isna().sum()
152
df["BP_Status"].value_counts()
High       2267
Normal     2143
Optimal     799
Name: BP_Status, dtype: int64
df['BP_Status'].isna().sum()
0
df["Weight_Status"].value_counts()
Overweight     3550
Normal         1472
Underweight     181
Name: Weight_Status, dtype: int64
df["Weight_Status"].isna().sum()
6
df["Smoking_Status"].value_counts()
Non-smoker           2501
Heavy (16-25)        1046
Light (1-5)           579
Moderate (6-15)       576
Very Heavy (> 25)     471
Name: Smoking_Status, dtype: int64
df['Smoking_Status'].isna().sum()
36
df.describe()
| | AgeCHDdiag | AgeAtStart | Height | Weight | Diastolic | Systolic | MRW | Smoking | AgeAtDeath | Cholesterol |
|---|---|---|---|---|---|---|---|---|---|---|
| count | 1449.000000 | 5209.000000 | 5203.000000 | 5203.000000 | 5209.000000 | 5209.000000 | 5203.000000 | 5173.000000 | 1991.000000 | 5057.000000 |
| mean | 63.302968 | 44.068727 | 64.813185 | 153.086681 | 85.358610 | 136.909580 | 119.957525 | 9.366518 | 70.536414 | 227.417441 |
| std | 10.018215 | 8.574954 | 3.582707 | 28.915426 | 12.973091 | 23.739596 | 19.983401 | 12.031451 | 10.559406 | 44.935524 |
| min | 32.000000 | 28.000000 | 51.500000 | 67.000000 | 50.000000 | 82.000000 | 67.000000 | 0.000000 | 36.000000 | 96.000000 |
| 25% | 57.000000 | 37.000000 | 62.250000 | 132.000000 | 76.000000 | 120.000000 | 106.000000 | 0.000000 | 63.000000 | 196.000000 |
| 50% | 63.000000 | 43.000000 | 64.500000 | 150.000000 | 84.000000 | 132.000000 | 118.000000 | 1.000000 | 71.000000 | 223.000000 |
| 75% | 70.000000 | 51.000000 | 67.500000 | 172.000000 | 92.000000 | 148.000000 | 131.000000 | 20.000000 | 79.000000 | 255.000000 |
| max | 90.000000 | 62.000000 | 76.500000 | 300.000000 | 160.000000 | 300.000000 | 268.000000 | 60.000000 | 93.000000 | 568.000000 |
Impute the missing values in 'Chol_Status' by sampling replacement categories in proportion to the observed (non-missing) values.
# Observed category proportions among non-missing values
proportions = df['Chol_Status'].value_counts(normalize=True, dropna=True)
missing_count = df['Chol_Status'].isna().sum()
# Sample replacements according to those proportions
# (note: without a fixed seed, e.g. np.random.seed(42), the counts below vary between runs)
missing_samples = np.random.choice(proportions.index, size=missing_count, p=proportions.values)
df.loc[df['Chol_Status'].isna(), 'Chol_Status'] = missing_samples
df["Chol_Status"].value_counts()
Borderline    1911
High          1850
Desirable     1448
Name: Chol_Status, dtype: int64
df["Chol_Status"].isna().sum()
0
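The same proportional sampling is reused for 'DeathCause' below, so it is worth wrapping in a small helper. A minimal sketch; the function name impute_proportional is my own, and the fixed seed is an assumption added for reproducibility:
def impute_proportional(series, seed=42):
    # Fill NaNs by sampling from the observed category proportions
    rng = np.random.default_rng(seed)  # fixed seed (assumption) so runs are repeatable
    proportions = series.value_counts(normalize=True, dropna=True)
    filled = series.copy()
    mask = filled.isna()
    filled[mask] = rng.choice(proportions.index, size=mask.sum(), p=proportions.values)
    return filled

# Usage: df['DeathCause'] = impute_proportional(df['DeathCause'])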
Impute the 6 missing values in 'Weight_Status' with 'Normal'.
df.loc[df['Weight_Status'].isna(), 'Weight_Status'] = "Normal"
df['Weight_Status'].isna().sum()
0
df["Weight_Status"].value_counts()
Overweight     3550
Normal         1478
Underweight     181
Name: Weight_Status, dtype: int64
Impute the missing data in 'DeathCause' the same way, by sampling from the observed proportions. Note that this assigns a hypothetical cause of death to the 3,218 people who are still alive, so 'DeathCause' should be read as a generic categorical feature from here on.
proportionsD = df['DeathCause'].value_counts(normalize=True, dropna=True)
missing_countD = df['DeathCause'].isna().sum()
missing_samplesD = np.random.choice(proportionsD.index, size=missing_countD, p=proportionsD.values)
df.loc[df['DeathCause'].isna(), 'DeathCause'] = missing_samplesD
df['DeathCause'].isna().sum()
0
df['BP_Status'].isna().sum()
0
Impute the missing data in 'Smoking_Status' with 'Non-smoker'.
df.loc[df['Smoking_Status'].isna(), 'Smoking_Status'] = "Non-smoker"
df['Smoking_Status'].isna().sum()
0
Impute the numeric variables with their means.
# Calculate means for the numeric columns only (avoids the pandas FutureWarning about numeric_only)
means = df.mean(numeric_only=True)
# Impute missing values in each numeric column with that column's mean
for col in df.select_dtypes(include='number'):
    df[col] = df[col].fillna(means[col])
df.describe()
| | AgeCHDdiag | AgeAtStart | Height | Weight | Diastolic | Systolic | MRW | Smoking | AgeAtDeath | Cholesterol |
|---|---|---|---|---|---|---|---|---|---|---|
| count | 5209.000000 | 5209.000000 | 5209.000000 | 5209.000000 | 5209.000000 | 5209.000000 | 5209.000000 | 5209.000000 | 5209.000000 | 5209.000000 |
| mean | 63.302968 | 44.068727 | 64.813185 | 153.086681 | 85.358610 | 136.909580 | 119.957525 | 9.366518 | 70.536414 | 227.417441 |
| std | 5.282496 | 8.574954 | 3.580643 | 28.898765 | 12.973091 | 23.739596 | 19.971887 | 11.989796 | 6.527255 | 44.274927 |
| min | 32.000000 | 28.000000 | 51.500000 | 67.000000 | 50.000000 | 82.000000 | 67.000000 | 0.000000 | 36.000000 | 96.000000 |
| 25% | 63.302968 | 37.000000 | 62.250000 | 132.000000 | 76.000000 | 120.000000 | 106.000000 | 0.000000 | 70.536414 | 197.000000 |
| 50% | 63.302968 | 43.000000 | 64.500000 | 150.000000 | 84.000000 | 132.000000 | 118.000000 | 1.000000 | 70.536414 | 225.000000 |
| 75% | 63.302968 | 51.000000 | 67.500000 | 172.000000 | 92.000000 | 148.000000 | 131.000000 | 20.000000 | 70.536414 | 251.000000 |
| max | 90.000000 | 62.000000 | 76.500000 | 300.000000 | 160.000000 | 300.000000 | 268.000000 | 60.000000 | 93.000000 | 568.000000 |
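Mean imputation fills every gap with a single value, which is why the standard deviations of the heavily-missing columns (AgeCHDdiag, AgeAtDeath) shrink in the table above and their quartiles collapse onto the mean. A median-based alternative using scikit-learn's SimpleImputer, as a sketch:
from sklearn.impute import SimpleImputer

# Median is more robust to the outliers visible in describe()
# (e.g. Weight max 300, Cholesterol max 568); it still understates the variance, though.
num_cols = df.select_dtypes(include='number').columns
df[num_cols] = SimpleImputer(strategy='median').fit_transform(df[num_cols])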
Encode 'Status' as 1 = 'Dead', 0 = 'Alive'.
# Define mapping dictionary
status_mapping = {'Dead': 1, 'Alive': 0}
# Encode 'Status' column
df['Status'] = df['Status'].map(status_mapping)
df["Status"].value_counts()
0    3218
1    1991
Name: Status, dtype: int64
# Define mapping dictionary ('Unknown' is not mapped, so those rows become NaN)
status_mappingD = {'Coronary Heart Disease': 1, 'Cancer': 2, 'Cerebral Vascular Disease': 3, 'Other': 4}
df['DeathCause'] = df['DeathCause'].map(status_mappingD)
df['DeathCause'].value_counts()
1.0    1590
2.0    1398
3.0     982
4.0     957
Name: DeathCause, dtype: int64
df['DeathCause'].isna().sum()
282
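Those 282 NaNs are the 'Unknown' causes that the mapping dropped. The step that removes them is not shown in the original; one way that matches the zero-NaN check further down is to give 'Unknown' its own code:
df['DeathCause'] = df['DeathCause'].fillna(5)  # 5 = Unknown (assumed code; the original cell is not shown)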
# Define mapping dictionary
status_mappingSex = {'Male':1, 'Female':0 }
df['Sex'] = df['Sex'].map(status_mappingSex)
df['Sex'].value_counts()
0    2873
1    2336
Name: Sex, dtype: int64
# Define mapping dictionary
status_mappingS = {'Light (1-5)':1, 'Non-smoker':0,'Moderate (6-15)':2,'Heavy (16-25)':3,'Very Heavy (> 25)':4}
df['Smoking_Status'] = df['Smoking_Status'].map(status_mappingS)
df["Smoking_Status"].value_counts()
0    2537
3    1046
1     579
2     576
4     471
Name: Smoking_Status, dtype: int64
status_mappingC = {'Borderline':1, 'Desirable':0,'High':2}
df['Chol_Status'] = df['Chol_Status'].map(status_mappingC)
df["Chol_Status"].value_counts()
1    1911
2    1850
0    1448
Name: Chol_Status, dtype: int64
status_mappingBP = {'Normal':0, 'Optimal':1, 'High':2}
df['BP_Status'] = df['BP_Status'].map(status_mappingBP)
df["BP_Status"].value_counts()
2    2267
0    2143
1     799
Name: BP_Status, dtype: int64
status_mappingW = {'Normal':0, 'Underweight':1, 'Overweight':2}
df['Weight_Status'] = df['Weight_Status'].map(status_mappingW)
df["Weight_Status"].value_counts()
2    3550
0    1478
1     181
Name: Weight_Status, dtype: int64
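The column-by-column mappings above can also be applied in one pass with DataFrame.replace, which keeps all the codes in one place. A sketch assuming the same encodings:
encodings = {
    'Status': {'Dead': 1, 'Alive': 0},
    'Sex': {'Male': 1, 'Female': 0},
    'Smoking_Status': {'Non-smoker': 0, 'Light (1-5)': 1, 'Moderate (6-15)': 2,
                       'Heavy (16-25)': 3, 'Very Heavy (> 25)': 4},
    'Chol_Status': {'Desirable': 0, 'Borderline': 1, 'High': 2},
    'BP_Status': {'Normal': 0, 'Optimal': 1, 'High': 2},
    'Weight_Status': {'Normal': 0, 'Underweight': 1, 'Overweight': 2},
}
df = df.replace(encodings)  # nested dict: {column: {old_value: new_value}}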
df.isna().sum()
Status            0
DeathCause        0
AgeCHDdiag        0
Sex               0
AgeAtStart        0
Height            0
Weight            0
Diastolic         0
Systolic          0
MRW               0
AgeAtDeath        0
Smoking           0
Cholesterol       0
Chol_Status       0
BP_Status         0
Weight_Status     0
Smoking_Status    0
dtype: int64
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
# Split data into independent variables (X) and dependent variable (y)
X = df.drop('Status', axis=1)
y = df['Status']
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Initialize logistic regression model; raise max_iter so lbfgs can converge on unscaled data
model = LogisticRegression(max_iter=1000)
# Fit the model on the training data
model.fit(X_train, y_train)
# Predict on the testing data
y_pred = model.predict(X_test)
# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
Confusion Matrix:
[[536 111]
 [143 252]]

Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.83      0.81       647
           1       0.69      0.64      0.66       395

    accuracy                           0.76      1042
   macro avg       0.74      0.73      0.74      1042
weighted avg       0.75      0.76      0.75      1042
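lbfgs converges slowly here because the features sit on very different scales (Systolic runs to 300 while Sex is 0/1). Standardizing inside a pipeline is the more idiomatic fix that scikit-learn's convergence warning points to; a sketch:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Scale features to zero mean / unit variance before the logistic fit
scaled_model = make_pipeline(StandardScaler(), LogisticRegression())
scaled_model.fit(X_train, y_train)
print(classification_report(y_test, scaled_model.predict(X_test)))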
Select variables contributing most to the logistic model.
from sklearn.feature_selection import SelectKBest, chi2
# Assuming X_train, y_train are your training data
# Replace 5 with the desired number of features
k = 5
selector = SelectKBest(score_func=chi2, k=k)  # chi2 requires non-negative features, which holds after encoding
X_train_selected = selector.fit_transform(X_train, y_train)
# Get the selected feature indices
selected_indices = selector.get_support(indices=True)
# Get the names of selected features
selected_features = X_train.columns[selected_indices]
print("Selected features:", selected_features)
Selected features: Index(['AgeAtStart', 'Diastolic', 'Systolic', 'Smoking', 'Cholesterol'], dtype='object')
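Note that fit_transform only produced the reduced training matrix; the fitted selector (or the selected column names) applies the same reduction to the held-out data:
# Apply the same feature selection to the test set
X_test_selected = selector.transform(X_test)
# equivalently: X_test[selected_features]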
Refit the model with a reduced set of variables. The accuracy, however, does not improve.
# Split data into independent variables (X) and dependent variable (y)
X = df.drop(['Status','DeathCause','AgeCHDdiag','Weight','Height','MRW','Chol_Status','BP_Status','Weight_Status','Smoking_Status'], axis=1)
y = df['Status']
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Initialize logistic regression model (again with max_iter raised for convergence)
model = LogisticRegression(max_iter=1000)
# Fit the model on the training data
model.fit(X_train, y_train)
# Predict on the testing data
y_pred = model.predict(X_test)
# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
Confusion Matrix:
[[537 110]
 [143 252]]

Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.83      0.81       647
           1       0.70      0.64      0.67       395

    accuracy                           0.76      1042
   macro avg       0.74      0.73      0.74      1042
weighted avg       0.75      0.76      0.75      1042
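A single 80/20 split gives a noisy estimate, so matching accuracies here do not prove the two feature sets are equivalent. Cross-validation gives a steadier comparison; a sketch:
from sklearn.model_selection import cross_val_score

# 5-fold CV accuracy for the reduced-feature model
scores = cross_val_score(LogisticRegression(max_iter=1000), X, y, cv=5, scoring='accuracy')
print("CV accuracy: %.3f +/- %.3f" % (scores.mean(), scores.std()))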