Cholesterol Machine Learning Models Using Python¶
Building machine learning models for the target variable 'Cholesterol' using Python. In this post, I use a random forest model inside a scikit-learn pipeline. The categorical variables are not manually recoded as numeric codes; instead, the pipeline is set up to impute them with SimpleImputer and one-hot encode them with OneHotEncoder.
Data loading¶
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
df = pd.read_csv("/Users/nnthieu/Downloads/heart.csv")
Summary¶
This dataset consists of 5209 rows and 17 variables. It covers 'Cholesterol' and other health variables such as blood pressure, smoking, weight, and height.
df.shape
(5209, 17)
df.head()
 | Status | DeathCause | AgeCHDdiag | Sex | AgeAtStart | Height | Weight | Diastolic | Systolic | MRW | Smoking | AgeAtDeath | Cholesterol | Chol_Status | BP_Status | Weight_Status | Smoking_Status
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | Dead | Other | NaN | Female | 29 | 62.50 | 140.0 | 78 | 124 | 121.0 | 0.0 | 55.0 | NaN | NaN | Normal | Overweight | Non-smoker |
1 | Dead | Cancer | NaN | Female | 41 | 59.75 | 194.0 | 92 | 144 | 183.0 | 0.0 | 57.0 | 181.0 | Desirable | High | Overweight | Non-smoker |
2 | Alive | NaN | NaN | Female | 57 | 62.25 | 132.0 | 90 | 170 | 114.0 | 10.0 | NaN | 250.0 | High | High | Overweight | Moderate (6-15) |
3 | Alive | NaN | NaN | Female | 39 | 65.75 | 158.0 | 80 | 128 | 123.0 | 0.0 | NaN | 242.0 | High | Normal | Overweight | Non-smoker |
4 | Alive | NaN | NaN | Male | 42 | 66.00 | 156.0 | 76 | 110 | 116.0 | 20.0 | NaN | 281.0 | High | Optimal | Overweight | Heavy (16-25) |
df.isna().sum()
Status               0
DeathCause        3218
AgeCHDdiag        3760
Sex                  0
AgeAtStart           0
Height               6
Weight               6
Diastolic            0
Systolic             0
MRW                  6
Smoking             36
AgeAtDeath        3218
Cholesterol        152
Chol_Status        152
BP_Status            0
Weight_Status        6
Smoking_Status      36
dtype: int64
df.describe()
 | AgeCHDdiag | AgeAtStart | Height | Weight | Diastolic | Systolic | MRW | Smoking | AgeAtDeath | Cholesterol
---|---|---|---|---|---|---|---|---|---|---
count | 1449.000000 | 5209.000000 | 5203.000000 | 5203.000000 | 5209.000000 | 5209.000000 | 5203.000000 | 5173.000000 | 1991.000000 | 5057.000000 |
mean | 63.302968 | 44.068727 | 64.813185 | 153.086681 | 85.358610 | 136.909580 | 119.957525 | 9.366518 | 70.536414 | 227.417441 |
std | 10.018215 | 8.574954 | 3.582707 | 28.915426 | 12.973091 | 23.739596 | 19.983401 | 12.031451 | 10.559406 | 44.935524 |
min | 32.000000 | 28.000000 | 51.500000 | 67.000000 | 50.000000 | 82.000000 | 67.000000 | 0.000000 | 36.000000 | 96.000000 |
25% | 57.000000 | 37.000000 | 62.250000 | 132.000000 | 76.000000 | 120.000000 | 106.000000 | 0.000000 | 63.000000 | 196.000000 |
50% | 63.000000 | 43.000000 | 64.500000 | 150.000000 | 84.000000 | 132.000000 | 118.000000 | 1.000000 | 71.000000 | 223.000000 |
75% | 70.000000 | 51.000000 | 67.500000 | 172.000000 | 92.000000 | 148.000000 | 131.000000 | 20.000000 | 79.000000 | 255.000000 |
max | 90.000000 | 62.000000 | 76.500000 | 300.000000 | 160.000000 | 300.000000 | 268.000000 | 60.000000 | 93.000000 | 568.000000 |
Cleaning data¶
Missing data will be replaced with appropriate values. First, I drop the 152 rows with missing values in the "Cholesterol" column.
df2 = df.dropna(subset="Cholesterol").copy()  # .copy() makes df2 independent of df for the assignments below
df2.head()
df2.isna().sum()
Status               0
DeathCause        3135
AgeCHDdiag        3649
Sex                  0
AgeAtStart           0
Height               6
Weight               6
Diastolic            0
Systolic             0
MRW                  6
Smoking              8
AgeAtDeath        3135
Cholesterol          0
Chol_Status          0
BP_Status            0
Weight_Status        6
Smoking_Status       8
dtype: int64
df2.shape
(5057, 17)
df2.loc[df2["AgeAtDeath"].isnull(), 'AgeAtDeath'] = 71
df2.loc[df2["AgeCHDdiag"].isnull(), 'AgeCHDdiag'] = 63.4
df2.loc[df2["Height"].isnull(), 'Height'] = 64.8
df2.loc[df2["Weight"].isnull(), 'Weight'] = 153.1
df2.loc[df2["MRW"].isnull(), 'MRW'] = 119.9
df2.loc[df2["Smoking"].isnull(), 'Smoking'] = 9.4
df2.loc[df2['DeathCause'].isna(), 'DeathCause'] = "Alive"
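The numeric fill values above are the rounded column means taken from df.describe(). A minimal alternative sketch, computing the imputation values from the data so they stay in sync if the dataset changes:

# Illustrative alternative to the hard-coded fills above:
# impute each numeric column with its own mean
for col in ["AgeAtDeath", "AgeCHDdiag", "Height", "Weight", "MRW", "Smoking"]:
    df2[col] = df2[col].fillna(df2[col].mean())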
df2["DeathCause"].value_counts()
DeathCause
Alive                        3135
Coronary Heart Disease        583
Cancer                        521
Cerebral Vascular Disease     363
Other                         347
Unknown                       108
Name: count, dtype: int64
df2["Weight_Status"].unique()
array(['Overweight', 'Normal', 'Underweight', nan], dtype=object)
df2.loc[df2['Weight_Status'].isna(), 'Weight_Status'] = "Normal"
df2["Weight_Status"].value_counts()
Weight_Status
Overweight     3445
Normal         1436
Underweight     176
Name: count, dtype: int64
df2.loc[df2['Smoking_Status'].isna(), 'Smoking_Status'] = "Non-smoker"
df2["Smoking_Status"].value_counts()
Smoking_Status
Non-smoker           2444
Heavy (16-25)        1029
Moderate (6-15)       563
Light (1-5)           563
Very Heavy (> 25)     458
Name: count, dtype: int64
df2.isna().sum()
Status            0
DeathCause        0
AgeCHDdiag        0
Sex               0
AgeAtStart        0
Height            0
Weight            0
Diastolic         0
Systolic          0
MRW               0
Smoking           0
AgeAtDeath        0
Cholesterol       0
Chol_Status       0
BP_Status         0
Weight_Status     0
Smoking_Status    0
dtype: int64
Now the dataset has no missing data, which makes it ready for building models.
df2.shape
(5057, 17)
df2.head()
 | Status | DeathCause | AgeCHDdiag | Sex | AgeAtStart | Height | Weight | Diastolic | Systolic | MRW | Smoking | AgeAtDeath | Cholesterol | Chol_Status | BP_Status | Weight_Status | Smoking_Status
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
1 | Dead | Cancer | 63.4 | Female | 41 | 59.75 | 194.0 | 92 | 144 | 183.0 | 0.0 | 57.0 | 181.0 | Desirable | High | Overweight | Non-smoker |
2 | Alive | Alive | 63.4 | Female | 57 | 62.25 | 132.0 | 90 | 170 | 114.0 | 10.0 | 71.0 | 250.0 | High | High | Overweight | Moderate (6-15) |
3 | Alive | Alive | 63.4 | Female | 39 | 65.75 | 158.0 | 80 | 128 | 123.0 | 0.0 | 71.0 | 242.0 | High | Normal | Overweight | Non-smoker |
4 | Alive | Alive | 63.4 | Male | 42 | 66.00 | 156.0 | 76 | 110 | 116.0 | 20.0 | 71.0 | 281.0 | High | Optimal | Overweight | Heavy (16-25) |
5 | Alive | Alive | 63.4 | Female | 58 | 61.75 | 131.0 | 92 | 176 | 117.0 | 0.0 | 71.0 | 196.0 | Desirable | High | Overweight | Non-smoker |
df2.describe()
 | AgeCHDdiag | AgeAtStart | Height | Weight | Diastolic | Systolic | MRW | Smoking | AgeAtDeath | Cholesterol
---|---|---|---|---|---|---|---|---|---|---
count | 5057.000000 | 5057.000000 | 5057.000000 | 5057.000000 | 5057.000000 | 5057.00000 | 5057.000000 | 5057.000000 | 5057.000000 | 5057.000000 |
mean | 63.397785 | 44.072573 | 64.826043 | 153.068934 | 85.392130 | 136.95966 | 119.871149 | 9.383271 | 70.866917 | 227.417441 |
std | 5.268384 | 8.571096 | 3.584462 | 28.888501 | 12.984148 | 23.75323 | 19.840459 | 12.008497 | 6.473159 | 44.935524 |
min | 32.000000 | 28.000000 | 51.500000 | 67.000000 | 50.000000 | 82.00000 | 67.000000 | 0.000000 | 36.000000 | 96.000000 |
25% | 63.400000 | 37.000000 | 62.250000 | 132.000000 | 76.000000 | 120.00000 | 106.000000 | 0.000000 | 71.000000 | 196.000000 |
50% | 63.400000 | 43.000000 | 64.500000 | 150.000000 | 84.000000 | 132.00000 | 118.000000 | 1.000000 | 71.000000 | 223.000000 |
75% | 63.400000 | 51.000000 | 67.500000 | 172.000000 | 92.000000 | 148.00000 | 131.000000 | 20.000000 | 71.000000 | 255.000000 |
max | 90.000000 | 62.000000 | 76.500000 | 300.000000 | 160.000000 | 300.00000 | 268.000000 | 60.000000 | 93.000000 | 568.000000 |
Building models¶
Preparing the data for modeling: splitting the data and transforming variables.
import pandas as pd
from sklearn.model_selection import train_test_split
# Separate target from predictors
y = df2.Cholesterol
X = df2.drop(['Cholesterol','Chol_Status'], axis=1)
# Divide data into training and validation subsets
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2,
random_state=0)
# "Cardinality" means the number of unique values in a column
# Select categorical columns with relatively low cardinality (convenient but arbitrary).
# Note: read_csv loads string columns with dtype "object", not "category", so this
# check selects no columns here and the model below uses only the numeric features;
# change the test to == "object" to include the categoricals in the pipeline.
categorical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].nunique() < 10 and
                    X_train_full[cname].dtype == "category"]
# Select numerical columns
numerical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].dtype in ['int64', 'float64']]
# Keep selected columns only
my_cols = categorical_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()
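Printing the selected columns makes the effect of the dtype check above explicit (with this CSV, categorical_cols comes back empty, so only numeric features go into the model):

print("Categorical columns:", categorical_cols)
print("Numerical columns:", numerical_cols)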
X_train.head()
 | AgeCHDdiag | AgeAtStart | Height | Weight | Diastolic | Systolic | MRW | Smoking | AgeAtDeath
---|---|---|---|---|---|---|---|---|---
1525 | 63.4 | 37 | 67.00 | 148.0 | 72 | 102 | 109.0 | 5.0 | 71.0 |
4352 | 63.4 | 33 | 63.00 | 117.0 | 72 | 118 | 98.0 | 0.0 | 71.0 |
1892 | 59.0 | 39 | 60.75 | 131.0 | 88 | 158 | 120.0 | 0.0 | 71.0 |
3207 | 78.0 | 52 | 65.00 | 140.0 | 120 | 210 | 109.0 | 5.0 | 80.0 |
4922 | 68.0 | 56 | 65.50 | 188.0 | 78 | 132 | 144.0 | 0.0 | 88.0 |
y_train.head()
1525    200.0
4352    167.0
1892    200.0
3207    250.0
4922    204.0
Name: Cholesterol, dtype: float64
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
# Preprocessing for numerical data
numerical_transformer = SimpleImputer(strategy='constant')  # fills numeric NaNs with 0, the default fill_value
# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='most_frequent')),
('onehot', OneHotEncoder(handle_unknown='ignore'))
])
# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
transformers=[
('num', numerical_transformer, numerical_cols),
('cat', categorical_transformer, categorical_cols)
])
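As a quick sanity check (illustrative, not part of the original workflow), the preprocessor can be fitted on its own to inspect the transformed feature matrix:

# Fit the ColumnTransformer alone and check the shape of its output
print(preprocessor.fit_transform(X_train).shape)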
Building the random forest model
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(n_estimators=100, random_state=0)
Building a pipeline and computing the MAE.
from sklearn.metrics import mean_absolute_error
# Bundle preprocessing and modeling code in a pipeline
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
('model', model)
])
# Preprocessing of training data, fit model
my_pipeline.fit(X_train, y_train)
# Preprocessing of validation data, get predictions
preds = my_pipeline.predict(X_valid)
# Evaluate the model
score = mean_absolute_error(y_valid, preds)
print('MAE:', score)
MAE: 32.56641304347827
The MAE for predicting "Cholesterol" is 32.6. That is reasonably good compared to the cholesterol mean of 227.4 (about 14.3%).
Feature importances
model.feature_importances_
coefficients = pd.DataFrame({'feature': X_train.columns,
                             'coef': model.feature_importances_})
coefficients
 | feature | coef
---|---|---
0 | AgeCHDdiag | 0.078315 |
1 | AgeAtStart | 0.167187 |
2 | Height | 0.135412 |
3 | Weight | 0.138190 |
4 | Diastolic | 0.104912 |
5 | Systolic | 0.124986 |
6 | MRW | 0.121181 |
7 | Smoking | 0.059044 |
8 | AgeAtDeath | 0.070773 |
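A bar chart makes these importances easier to compare; a minimal sketch using the matplotlib import from the top of the post:

# Sort by importance and plot as a horizontal bar chart
coefficients.sort_values('coef').plot.barh(x='feature', y='coef', legend=False)
plt.xlabel('Importance')
plt.tight_layout()
plt.show()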
R²
from sklearn.metrics import r2_score
r2_score(y_valid, preds)
0.06910983474898058
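An R² of about 0.07 means the model explains only about 7% of the variance in cholesterol: even though the MAE looks moderate relative to the mean, these predictors carry limited signal for this target.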
Detecting outliers in the dataset.
# Calculate absolute residuals on the training set, so that the outlier
# labels refer to training rows (these are dropped from X_train further below)
train_preds = my_pipeline.predict(X_train)
residuals = np.abs(train_preds - y_train)
# Define a threshold for identifying outliers
threshold = 3 * np.std(residuals) # You can adjust the threshold as needed
# Identify outliers based on the threshold
outliers_indices = np.where(residuals > threshold)[0]
outliers = X_train.iloc[outliers_indices]
# Print or further analyze the outliers
print("Number of outliers:", len(outliers_indices))
print("Indices of outliers:", outliers_indices)
print("Outliers:", outliers)
Number of outliers: 75
Indices of outliers: [  14   35   54   56   58   59   79   93  121  171  228  244  248  254
  258  259  276  303  315  327  333  354  378  391  414  415  420  426
  433  465  469  472  496  526  528  538  540  566  574  583  591  609
  613  626  629  645  664  670  671  679  683  723  742  777  781  786
  820  823  824  836  861  868  876  887  896  908  918  943  947  971
  985  995  999 1002 1010]
Outliers:
      AgeCHDdiag  AgeAtStart  Height  Weight  Diastolic  Systolic    MRW  Smoking  AgeAtDeath
128         63.4          53   64.00   195.0         92       170  157.0      0.0        63.0
431         59.0          53   66.00   151.0         98       168  112.0     30.0        61.0
3342        63.4          43   73.00   182.0         80       130  110.0      0.0        71.0
4036        78.0          58   67.25   174.0         92       142  128.0      0.0        84.0
3626        63.4          60   61.50   185.0        110       196  165.0      0.0        70.0
...          ...         ...     ...     ...        ...       ...    ...      ...         ...
894         63.4          41   61.00   107.0         84       170   96.0     20.0        71.0
812         63.4          34   60.25   121.0         72       112  111.0     20.0        71.0
17          56.0          56   67.25   122.0         72       120   87.0     15.0        72.0
3946        63.4          36   66.00   136.0         76       124  103.0     20.0        71.0
415         63.4          33   72.25   198.0         68       118  123.0      9.4        71.0

[75 rows x 9 columns]
Dropping the outliers from the X_train and y_train datasets.
# Drop outliers from X_train and y_train
X_train_clean = X_train.drop(outliers.index)
y_train_clean = y_train.drop(outliers.index)
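A natural follow-up, sketched here but not run above, is to refit the pipeline on the cleaned training set and check whether the validation MAE improves:

# Refit on the cleaned training data and re-evaluate on the validation set
my_pipeline.fit(X_train_clean, y_train_clean)
preds_clean = my_pipeline.predict(X_valid)
print('MAE after dropping outliers:', mean_absolute_error(y_valid, preds_clean))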
Validating models¶
Using 5 folds for cross-validation: the MAE scores do not differ much across folds, so the model is fairly stable when the validation data changes.
from sklearn.model_selection import cross_val_score
# Multiply by -1 since sklearn calculates *negative* MAE
scores = -1 * cross_val_score(my_pipeline, X, y,
cv=5,
scoring='neg_mean_absolute_error')
print("MAE scores:\n", scores)
MAE scores:
 [33.65152174 34.50763834 34.4574184 32.43346192 32.33155292]
print("Average MAE score (across experiments):")
print(scores.mean())
Average MAE score (across experiments):
33.47631866269455
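The fold-to-fold spread behind the stability claim can be quantified directly; a one-line check (illustrative):

print("Standard deviation of MAE across folds:", scores.std())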
XGBoost¶
import xgboost
Now I try a gradient boosting model.
from xgboost import XGBRegressor
my_model = XGBRegressor()
my_model.fit(X_train, y_train)
XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None,
             device=None, early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, feature_types=None, gamma=None, grow_policy=None,
             importance_type=None, interaction_constraints=None, learning_rate=None,
             max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=None, n_jobs=None,
             num_parallel_tree=None, random_state=None, ...)
from sklearn.metrics import mean_absolute_error
predictions = my_model.predict(X_valid)
print("Mean Absolute Error: " + str(mean_absolute_error(predictions, y_valid)))
Mean Absolute Error: 34.80325125894056
Tuning the model to find the settings with the smallest MAE.
my_model = XGBRegressor(n_estimators=500)
my_model.fit(X_train, y_train)
predictions = my_model.predict(X_valid)
print("Mean Absolute Error: " + str(mean_absolute_error(predictions, y_valid)))
Mean Absolute Error: 37.16936144621476
my_model = XGBRegressor(n_estimators=500, early_stopping_rounds=5)
my_model.fit(X_train, y_train,
eval_set=[(X_valid, y_valid)],
verbose=False)
predictions = my_model.predict(X_valid)
print("Mean Absolute Error: " + str(mean_absolute_error(predictions, y_valid)))
Mean Absolute Error: 31.817632893799793
my_model = XGBRegressor(n_estimators=1000, learning_rate=0.05, early_stopping_rounds=5)
my_model.fit(X_train, y_train,
eval_set=[(X_valid, y_valid)],
verbose=False)
predictions = my_model.predict(X_valid)
print("Mean Absolute Error: " + str(mean_absolute_error(predictions, y_valid)))
Mean Absolute Error: 31.661847668674152
my_model = XGBRegressor(n_estimators=1000, learning_rate=0.05, n_jobs=4, early_stopping_rounds=5)
my_model.fit(X_train, y_train,
eval_set=[(X_valid, y_valid)],
verbose=False)
predictions = my_model.predict(X_valid)
print("Mean Absolute Error: " + str(mean_absolute_error(predictions, y_valid)))
Mean Absolute Error: 31.661847668674152
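Rather than trying settings one at a time, a small grid search automates the comparison. A sketch with illustrative parameter values (early stopping is left out because GridSearchCV handles its own validation folds):

from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 500, 1000],
    'learning_rate': [0.05, 0.1, 0.3],
}
search = GridSearchCV(XGBRegressor(n_jobs=4, random_state=0),
                      param_grid,
                      scoring='neg_mean_absolute_error',
                      cv=5)
search.fit(X_train, y_train)
print('Best parameters:', search.best_params_)
print('Best MAE:', -search.best_score_)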
Conclusions¶
The last two models are the best, with the smallest MAEs (about 31.7, versus 32.6 for the random forest pipeline).