Use the data of heart disease in which some indicators may be related to the concentration of cholesterol such as weight, age, height, smoking ...
Building a linear regression model with cholesterol as the target variable and other variables as independent variables.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
df= pd.read_csv("/Users/nnthieu/Downloads/heart.csv")
df.head()
| Status | DeathCause | AgeCHDdiag | Sex | AgeAtStart | Height | Weight | Diastolic | Systolic | MRW | Smoking | AgeAtDeath | Cholesterol | Chol_Status | BP_Status | Weight_Status | Smoking_Status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Dead | Other | NaN | Female | 29 | 62.50 | 140.0 | 78 | 124 | 121.0 | 0.0 | 55.0 | NaN | NaN | Normal | Overweight | Non-smoker |
| 1 | Dead | Cancer | NaN | Female | 41 | 59.75 | 194.0 | 92 | 144 | 183.0 | 0.0 | 57.0 | 181.0 | Desirable | High | Overweight | Non-smoker |
| 2 | Alive | NaN | NaN | Female | 57 | 62.25 | 132.0 | 90 | 170 | 114.0 | 10.0 | NaN | 250.0 | High | High | Overweight | Moderate (6-15) |
| 3 | Alive | NaN | NaN | Female | 39 | 65.75 | 158.0 | 80 | 128 | 123.0 | 0.0 | NaN | 242.0 | High | Normal | Overweight | Non-smoker |
| 4 | Alive | NaN | NaN | Male | 42 | 66.00 | 156.0 | 76 | 110 | 116.0 | 20.0 | NaN | 281.0 | High | Optimal | Overweight | Heavy (16-25) |
df.shape
(5209, 17)
df.isna().sum()
Status 0 DeathCause 3218 AgeCHDdiag 3760 Sex 0 AgeAtStart 0 Height 6 Weight 6 Diastolic 0 Systolic 0 MRW 6 Smoking 36 AgeAtDeath 3218 Cholesterol 152 Chol_Status 152 BP_Status 0 Weight_Status 6 Smoking_Status 36 dtype: int64
look at the data distribution of the numeric variables:
import matplotlib.pyplot as plt
def plot_histograms(df, bins=10, alpha=0.5, colors=None):
"""
Plot histograms for all numeric variables in the DataFrame.
Parameters:
df (DataFrame): The DataFrame containing numeric variables.
bins (int): Number of bins for the histograms. Default is 10.
alpha (float): Transparency level of the histograms. Default is 0.5.
colors (list): List of colors for the histograms. If None, default colors will be used.
Returns:
None
"""
if colors is None:
colors = plt.cm.tab10.colors # Default color palette
num_variables = df.select_dtypes(include='number').shape[1]
num_rows = (num_variables + 1) // 2
num_cols = 2
plt.figure(figsize=(12, 6 * num_rows))
for i, col in enumerate(df.select_dtypes(include='number'), start=1):
plt.subplot(num_rows, num_cols, i)
plt.hist(df[col], bins=bins, alpha=alpha, color=colors[i % len(colors)])
plt.title(f'Histogram of {col}')
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.grid(True)
plt.tight_layout()
plt.show()
# Example usage:
# Assuming df is your DataFrame containing numeric variables
plot_histograms(df)
proportionsD = df['DeathCause'].value_counts(normalize=True, dropna=True)
missing_countD = df['DeathCause'].isna().sum()
missing_samplesD = np.random.choice(proportionsD.index, size=missing_countD, p=proportionsD.values)
df.loc[df['DeathCause'].isna(), 'DeathCause'] = missing_samplesD
df.loc[df['Weight_Status'].isna(), 'Weight_Status'] = "Normal"
df.loc[df['Smoking_Status'].isna(), 'Smoking_Status'] = "Non-smoker"
proportions = df['Chol_Status'].value_counts(normalize=True, dropna=True)
missing_count = df['Chol_Status'].isna().sum()
missing_samples = np.random.choice(proportions.index, size=missing_count, p=proportions.values)
df.loc[df['Chol_Status'].isna(), 'Chol_Status'] = missing_samples
# Calculate means for numeric columns
means = df.mean()
# Impute missing values with means using map function
for col in df.select_dtypes(include='number'):
df[col] = df[col].fillna(means[col])
/var/folders/cx/3wbhcqyd3cld6gvk_xjkvr_40000gn/T/ipykernel_85659/385573716.py:2: FutureWarning: The default value of numeric_only in DataFrame.mean is deprecated. In a future version, it will default to False. In addition, specifying 'numeric_only=None' is deprecated. Select only valid columns or specify the value of numeric_only to silence this warning. means = df.mean()
df.describe()
| AgeCHDdiag | AgeAtStart | Height | Weight | Diastolic | Systolic | MRW | Smoking | AgeAtDeath | Cholesterol | |
|---|---|---|---|---|---|---|---|---|---|---|
| count | 5209.000000 | 5209.000000 | 5209.000000 | 5209.000000 | 5209.000000 | 5209.000000 | 5209.000000 | 5209.000000 | 5209.000000 | 5209.000000 |
| mean | 63.302968 | 44.068727 | 64.813185 | 153.086681 | 85.358610 | 136.909580 | 119.957525 | 9.366518 | 70.536414 | 227.417441 |
| std | 5.282496 | 8.574954 | 3.580643 | 28.898765 | 12.973091 | 23.739596 | 19.971887 | 11.989796 | 6.527255 | 44.274927 |
| min | 32.000000 | 28.000000 | 51.500000 | 67.000000 | 50.000000 | 82.000000 | 67.000000 | 0.000000 | 36.000000 | 96.000000 |
| 25% | 63.302968 | 37.000000 | 62.250000 | 132.000000 | 76.000000 | 120.000000 | 106.000000 | 0.000000 | 70.536414 | 197.000000 |
| 50% | 63.302968 | 43.000000 | 64.500000 | 150.000000 | 84.000000 | 132.000000 | 118.000000 | 1.000000 | 70.536414 | 225.000000 |
| 75% | 63.302968 | 51.000000 | 67.500000 | 172.000000 | 92.000000 | 148.000000 | 131.000000 | 20.000000 | 70.536414 | 251.000000 |
| max | 90.000000 | 62.000000 | 76.500000 | 300.000000 | 160.000000 | 300.000000 | 268.000000 | 60.000000 | 93.000000 | 568.000000 |
df.isna().sum()
Status 0 DeathCause 0 AgeCHDdiag 0 Sex 0 AgeAtStart 0 Height 0 Weight 0 Diastolic 0 Systolic 0 MRW 0 Smoking 0 AgeAtDeath 0 Cholesterol 0 Chol_Status 0 BP_Status 0 Weight_Status 0 Smoking_Status 0 dtype: int64
# Define mapping dictionary
status_mapping = {'Dead':1, 'Alive':0}
# Decode 'Status' column
df['Status'] = df['Status'].map(status_mapping)
# Define mapping dictionary
status_mappingD = {'Coronary Heart Disease':1, 'Cancer':2,'Cerebral Vascular Disease':3, 'Other':4 }
df['DeathCause'] = df['DeathCause'].map(status_mappingD)
# Define mapping dictionary
status_mappingSex = {'Male':1, 'Female':0 }
df['Sex'] = df['Sex'].map(status_mappingSex)
# Define mapping dictionary
status_mappingS = {'Light (1-5)':1, 'Non-smoker':0,'Moderate (6-15)':2,'Heavy (16-25)':3,'Very Heavy (> 25)':4}
df['Smoking_Status'] = df['Smoking_Status'].map(status_mappingS)
status_mappingC = {'Borderline':1, 'Desirable':0,'High':2}
df['Chol_Status'] = df['Chol_Status'].map(status_mappingC)
status_mappingBP = {'Normal':0, 'Optimal':1, 'High':2}
df['BP_Status'] = df['BP_Status'].map(status_mappingBP)
status_mappingW = {'Normal':0, 'Underweight':1, 'Overweight':2}
df['Weight_Status'] = df['Weight_Status'].map(status_mappingW)
# Function to detect outliers using IQR method
def detect_outliers(df, variable):
Q1 = df[variable].quantile(0.25)
Q3 = df[variable].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
outliers = df[(df[variable] < lower_bound) | (df[variable] > upper_bound)]
return outliers
# Define numeric variables to detect outliers
numeric_variables = ['Cholesterol', 'AgeCHDdiag', 'AgeAtStart', 'Height', 'Weight', 'Smoking']
# Create a dictionary to store outliers for each variable
outliers_dict = {}
# Detect outliers in each numeric variable
for col in numeric_variables:
outliers_dict[col] = detect_outliers(df, col)
# Print outliers for each variable
for col, outliers in outliers_dict.items():
print(f"Outliers in {col}:")
print(outliers)
print()
Outliers in Cholesterol:
Status DeathCause AgeCHDdiag Sex AgeAtStart Height Weight \
89 0 2.0 63.302968 0 52 62.00 135.0
123 1 1.0 73.000000 0 47 62.00 124.0
143 0 1.0 63.302968 1 45 68.50 160.0
187 0 4.0 76.000000 0 54 64.25 146.0
197 1 1.0 61.000000 1 59 67.25 164.0
... ... ... ... ... ... ... ...
4874 1 1.0 53.000000 1 47 64.50 143.0
5022 1 3.0 63.302968 0 62 60.50 108.0
5138 1 1.0 59.000000 0 55 59.75 148.0
5146 0 NaN 63.302968 1 56 65.00 145.0
5161 1 4.0 63.302968 0 57 62.00 159.0
Diastolic Systolic MRW Smoking AgeAtDeath Cholesterol \
89 82 144 116.0 0.0 70.536414 339.0
123 80 140 107.0 0.0 77.000000 347.0
143 86 136 111.0 0.0 70.536414 347.0
187 96 148 118.0 0.0 70.536414 418.0
197 100 152 117.0 0.0 81.000000 334.0
... ... ... ... ... ... ...
4874 92 158 112.0 15.0 71.000000 386.0
5022 72 128 99.0 15.0 82.000000 350.0
5138 90 124 140.0 0.0 61.000000 400.0
5146 90 132 111.0 20.0 70.536414 360.0
5161 80 124 137.0 0.0 81.000000 334.0
Chol_Status BP_Status Weight_Status Smoking_Status
89 2 2 2 0
123 2 0 0 0
143 2 0 2 0
187 2 2 2 0
197 2 2 2 0
... ... ... ... ...
4874 2 2 2 2
5022 2 0 0 2
5138 2 2 2 0
5146 2 2 2 3
5161 2 0 2 0
[108 rows x 17 columns]
Outliers in AgeCHDdiag:
Status DeathCause AgeCHDdiag Sex AgeAtStart Height Weight \
11 0 3.0 57.0 1 33 64.25 151.0
12 0 3.0 55.0 1 33 70.00 174.0
13 0 2.0 79.0 1 57 67.25 165.0
14 0 4.0 66.0 1 44 69.00 155.0
17 1 2.0 56.0 1 56 67.25 122.0
... ... ... ... ... ... ... ...
5200 1 1.0 55.0 1 47 67.00 186.0
5201 0 1.0 61.0 1 59 66.50 156.0
5202 1 1.0 59.0 1 59 71.00 177.0
5204 1 1.0 79.0 1 49 64.50 173.0
5207 1 1.0 50.0 1 36 68.25 164.0
Diastolic Systolic MRW Smoking AgeAtDeath Cholesterol \
11 68 108 118.0 0.0 70.536414 221.000000
12 90 142 114.0 0.0 70.536414 188.000000
13 76 128 118.0 15.0 70.536414 227.417441
14 90 130 105.0 30.0 70.536414 292.000000
17 72 120 87.0 15.0 72.000000 194.000000
... ... ... ... ... ... ...
5200 105 155 133.0 5.0 57.000000 199.000000
5201 84 124 116.0 20.0 70.536414 223.000000
5202 68 108 113.0 25.0 65.000000 246.000000
5204 80 110 135.0 20.0 81.000000 228.000000
5207 64 108 114.0 40.0 64.000000 238.000000
Chol_Status BP_Status Weight_Status Smoking_Status
11 1 1 2 0
12 0 2 2 0
13 2 0 2 2
14 2 2 0 4
17 0 0 1 2
... ... ... ... ...
5200 0 2 2 1
5201 1 0 2 3
5202 2 1 2 3
5204 1 0 2 3
5207 1 1 2 4
[1449 rows x 17 columns]
Outliers in AgeAtStart:
Empty DataFrame
Columns: [Status, DeathCause, AgeCHDdiag, Sex, AgeAtStart, Height, Weight, Diastolic, Systolic, MRW, Smoking, AgeAtDeath, Cholesterol, Chol_Status, BP_Status, Weight_Status, Smoking_Status]
Index: []
Outliers in Height:
Status DeathCause AgeCHDdiag Sex AgeAtStart Height Weight \
418 0 NaN 63.302968 1 44 75.50 177.0
678 0 4.0 55.000000 1 29 75.50 204.0
1893 1 2.0 63.302968 1 34 76.00 220.0
2249 1 4.0 63.302968 0 51 51.50 72.0
3355 0 2.0 63.302968 1 44 76.00 169.0
3508 1 4.0 63.302968 0 60 53.75 119.0
3882 0 1.0 63.302968 1 33 76.50 221.0
Diastolic Systolic MRW Smoking AgeAtDeath Cholesterol \
418 62 122 101.0 0.0 70.536414 199.0
678 90 140 116.0 0.0 70.536414 246.0
1893 80 138 122.0 10.0 62.000000 192.0
2249 92 126 88.0 1.0 79.000000 234.0
3355 70 102 93.0 5.0 70.536414 205.0
3508 100 190 135.0 0.0 76.000000 250.0
3882 100 155 122.0 20.0 70.536414 225.0
Chol_Status BP_Status Weight_Status Smoking_Status
418 0 0 0 0
678 2 2 2 0
1893 0 0 2 2
2249 1 2 1 1
3355 1 1 0 1
3508 2 2 2 0
3882 1 2 2 3
Outliers in Weight:
Status DeathCause AgeCHDdiag Sex AgeAtStart Height Weight \
154 0 4.0 63.000000 0 37 63.00 236.0
436 1 3.0 63.302968 0 50 62.75 241.0
491 1 3.0 63.302968 1 52 72.00 247.0
671 0 3.0 63.302968 1 29 70.25 243.0
765 1 4.0 63.302968 0 33 56.25 71.0
836 1 3.0 63.302968 1 48 66.00 250.0
1236 1 1.0 59.000000 1 39 73.50 244.0
1623 0 1.0 63.302968 0 51 65.25 239.0
1647 0 3.0 32.000000 1 32 73.00 239.0
1664 1 NaN 60.000000 1 34 72.50 245.0
1679 1 1.0 75.000000 1 51 74.00 239.0
1772 0 1.0 72.000000 1 44 73.25 238.0
1778 1 2.0 62.000000 1 40 71.50 240.0
1796 0 NaN 57.000000 1 31 69.00 238.0
1913 0 1.0 33.000000 1 33 64.75 260.0
1944 0 2.0 63.302968 1 44 68.25 244.0
2099 0 2.0 63.302968 0 38 59.50 242.0
2119 1 3.0 63.302968 1 59 68.25 276.0
2307 1 4.0 63.302968 1 60 69.25 234.0
2348 1 1.0 49.000000 1 33 70.50 235.0
2437 1 1.0 75.000000 0 53 60.25 271.0
2547 1 4.0 63.302968 1 50 66.50 237.0
2576 0 NaN 43.000000 0 43 60.25 235.0
2592 1 3.0 63.302968 1 38 72.25 273.0
2609 0 2.0 63.302968 1 33 66.25 241.0
2761 0 1.0 63.302968 1 38 72.25 234.0
2913 0 3.0 63.302968 1 39 67.25 237.0
2946 0 2.0 63.302968 1 47 71.75 236.0
2991 1 3.0 63.302968 0 60 64.00 235.0
3123 1 1.0 71.000000 0 49 62.00 261.0
3124 1 1.0 64.000000 1 52 72.00 236.0
3251 0 4.0 63.302968 0 57 67.00 293.0
3314 0 NaN 63.302968 1 36 70.50 244.0
3359 1 2.0 63.302968 0 36 63.75 300.0
3615 1 4.0 63.302968 0 47 60.00 238.0
3660 1 1.0 51.000000 1 51 69.00 239.0
3844 0 2.0 63.302968 0 39 65.50 250.0
3871 0 2.0 63.302968 1 33 73.75 245.0
4239 1 NaN 63.302968 0 56 60.50 269.0
4532 0 1.0 63.302968 1 38 73.00 246.0
4703 0 1.0 63.000000 0 47 61.00 300.0
4857 1 3.0 63.302968 0 59 62.75 281.0
4960 1 1.0 41.000000 1 35 70.00 235.0
5005 1 1.0 58.000000 1 54 68.00 256.0
5062 0 4.0 63.302968 1 32 68.00 236.0
5097 1 4.0 63.302968 0 46 57.75 67.0
5133 0 4.0 63.302968 0 32 61.00 275.0
5199 1 1.0 64.000000 1 58 72.75 255.0
Diastolic Systolic MRW Smoking AgeAtDeath Cholesterol \
154 96 178 197.0 0.0 70.536414 227.417441
436 150 242 208.0 0.0 58.000000 213.000000
491 104 154 153.0 20.0 68.000000 188.000000
671 90 162 160.0 20.0 70.536414 163.000000
765 90 116 73.0 0.0 61.000000 192.000000
836 96 170 185.0 0.0 72.000000 180.000000
1236 100 144 147.0 20.0 71.000000 276.000000
1623 88 150 187.0 1.0 70.536414 175.000000
1647 92 138 144.0 0.0 70.536414 171.000000
1664 90 140 152.0 0.0 64.000000 271.000000
1679 100 158 141.0 5.0 77.000000 243.000000
1772 100 150 143.0 20.0 70.536414 237.000000
1778 90 128 153.0 40.0 70.000000 228.000000
1796 88 140 161.0 0.0 70.536414 250.000000
1913 100 148 203.0 0.0 70.536414 227.417441
1944 126 190 169.0 0.0 70.536414 292.000000
2099 86 132 228.0 0.0 70.536414 157.000000
2119 80 124 192.0 0.0 81.000000 182.000000
2307 98 138 158.0 30.0 86.000000 175.000000
2348 96 146 155.0 25.0 51.000000 224.000000
2437 130 246 249.0 0.0 85.000000 200.000000
2547 100 156 176.0 5.0 76.000000 247.000000
2576 134 246 216.0 0.0 70.536414 300.000000
2592 120 180 170.0 0.0 58.000000 227.000000
2609 90 140 179.0 0.0 70.536414 259.000000
2761 84 134 145.0 0.0 70.536414 155.000000
2913 130 204 169.0 10.0 70.536414 204.000000
2946 88 130 150.0 0.0 70.536414 248.000000
2991 96 190 190.0 0.0 92.000000 227.417441
3123 86 150 225.0 0.0 77.000000 245.000000
3124 104 156 147.0 0.0 70.000000 255.000000
3251 108 170 215.0 0.0 70.536414 242.000000
3314 95 135 161.0 0.0 70.536414 204.000000
3359 108 182 250.0 20.0 54.000000 215.000000
3615 100 180 218.0 0.0 53.000000 227.417441
3660 122 228 161.0 40.0 61.000000 226.000000
3844 105 135 195.0 0.0 70.536414 242.000000
3871 90 150 148.0 0.0 70.536414 179.000000
4239 120 210 247.0 20.0 82.000000 150.000000
4532 94 124 148.0 0.0 70.536414 174.000000
4703 120 208 268.0 0.0 70.536414 185.000000
4857 100 152 242.0 0.0 69.000000 188.000000
4960 96 152 155.0 30.0 43.000000 285.000000
5005 100 182 178.0 45.0 60.000000 286.000000
5062 100 150 164.0 0.0 70.536414 226.000000
5097 80 124 67.0 0.0 56.000000 234.000000
5133 82 136 246.0 0.0 70.536414 154.000000
5199 88 130 158.0 40.0 80.000000 260.000000
Chol_Status BP_Status Weight_Status Smoking_Status
154 2 2 2 0
436 1 2 2 0
491 0 2 2 3
671 0 2 2 3
765 0 2 1 0
836 0 2 2 0
1236 2 2 2 3
1623 0 2 2 1
1647 0 2 2 0
1664 2 2 2 0
1679 2 2 2 1
1772 1 2 2 3
1778 1 2 2 4
1796 2 0 2 0
1913 1 2 2 0
1944 2 2 2 0
2099 0 0 2 0
2119 0 0 2 0
2307 0 2 2 4
2348 1 2 2 3
2437 1 2 2 0
2547 2 2 2 1
2576 2 2 2 0
2592 1 2 2 0
2609 2 2 2 0
2761 0 0 2 0
2913 1 2 2 2
2946 2 0 2 0
2991 1 2 2 0
3123 2 2 2 0
3124 2 2 2 0
3251 2 2 2 0
3314 1 2 2 0
3359 1 2 2 3
3615 1 2 2 0
3660 1 2 2 4
3844 2 2 2 0
3871 0 2 2 0
4239 0 2 2 3
4532 0 2 2 0
4703 0 2 2 0
4857 0 2 2 0
4960 2 2 2 4
5005 2 2 2 4
5062 1 2 2 0
5097 1 0 1 0
5133 0 0 2 0
5199 2 0 2 4
Outliers in Smoking:
Status DeathCause AgeCHDdiag Sex AgeAtStart Height Weight \
498 0 2.0 63.302968 1 34 65.75 137.000000
914 0 3.0 66.000000 1 38 68.00 174.000000
1093 0 2.0 63.302968 1 32 70.00 173.000000
1263 1 4.0 74.000000 1 54 68.00 153.086681
1699 1 1.0 57.000000 1 53 66.25 150.000000
2043 0 2.0 63.302968 1 32 70.50 203.000000
2903 0 NaN 55.000000 1 35 68.25 158.000000
2910 0 4.0 63.302968 1 34 70.50 164.000000
3468 1 1.0 40.000000 1 36 70.50 190.000000
4110 1 NaN 42.000000 1 32 67.25 222.000000
4127 0 3.0 63.302968 1 36 68.25 152.000000
4296 1 1.0 66.000000 1 46 67.00 141.000000
4303 0 2.0 63.302968 1 51 62.75 134.000000
4360 1 1.0 62.000000 1 42 67.50 184.000000
4368 1 1.0 41.000000 1 35 68.50 185.000000
4376 0 4.0 63.302968 1 37 67.00 176.000000
4917 1 2.0 63.302968 1 40 64.00 126.000000
Diastolic Systolic MRW Smoking AgeAtDeath Cholesterol \
498 80 116 105.000000 60.0 70.536414 227.417441
914 88 148 121.000000 60.0 70.536414 305.000000
1093 80 120 114.000000 60.0 70.536414 234.000000
1263 90 132 119.957525 60.0 80.000000 307.000000
1699 86 120 111.000000 60.0 67.000000 209.000000
2043 90 140 134.000000 60.0 70.536414 146.000000
2903 80 130 110.000000 60.0 70.536414 177.000000
2910 90 115 108.000000 60.0 70.536414 159.000000
3468 100 140 125.000000 60.0 54.000000 362.000000
4110 105 150 159.000000 55.0 52.000000 220.000000
4127 70 124 106.000000 55.0 70.536414 163.000000
4296 70 118 101.000000 60.0 68.000000 246.000000
4303 100 150 110.000000 60.0 70.536414 314.000000
4360 82 135 131.000000 60.0 64.000000 345.000000
4368 104 176 128.000000 60.0 51.000000 263.000000
4376 86 118 126.000000 60.0 70.536414 210.000000
4917 66 98 98.000000 60.0 64.000000 215.000000
Chol_Status BP_Status Weight_Status Smoking_Status
498 1 0 0 4
914 2 2 2 4
1093 1 0 2 4
1263 2 2 0 4
1699 1 0 2 4
2043 0 2 2 4
2903 0 0 2 4
2910 0 2 0 4
3468 2 2 2 4
4110 1 2 2 4
4127 0 0 0 4
4296 2 1 0 4
4303 2 2 2 4
4360 2 0 2 4
4368 2 2 2 4
4376 1 0 2 4
4917 1 1 0 4
# Drop outliers from df
for col, outliers in outliers_dict.items():
try:
df.drop(outliers.index, inplace=True)
except KeyError:
# Handle KeyError if the column doesn't exist in the DataFrame
pass
# Reset index after dropping outliers
df.reset_index(drop=True, inplace=True)
# Verify the DataFrame after dropping outliers
print(df)
Status DeathCause AgeCHDdiag Sex AgeAtStart Height Weight \
0 1 4.0 63.302968 0 29 62.50 140.0
1 1 2.0 63.302968 0 41 59.75 194.0
2 0 4.0 63.302968 0 57 62.25 132.0
3 0 2.0 63.302968 0 39 65.75 158.0
4 0 1.0 63.302968 1 42 66.00 156.0
... ... ... ... ... ... ... ...
5041 1 1.0 79.000000 1 49 64.50 173.0
5042 0 2.0 63.302968 0 42 60.00 141.0
5043 0 2.0 63.302968 0 51 58.25 123.0
5044 1 1.0 50.000000 1 36 68.25 164.0
5045 0 2.0 63.302968 1 36 70.50 177.0
Diastolic Systolic MRW Smoking AgeAtDeath Cholesterol \
0 78 124 121.0 0.0 55.000000 227.417441
1 92 144 183.0 0.0 57.000000 181.000000
2 90 170 114.0 10.0 70.536414 250.000000
3 80 128 123.0 0.0 70.536414 242.000000
4 76 110 116.0 20.0 70.536414 281.000000
... ... ... ... ... ... ...
5041 80 110 135.0 20.0 81.000000 228.000000
5042 76 124 129.0 5.0 70.536414 209.000000
5043 90 152 119.0 1.0 70.536414 197.000000
5044 64 108 114.0 40.0 64.000000 238.000000
5045 68 94 116.0 50.0 70.536414 240.000000
Chol_Status BP_Status Weight_Status Smoking_Status
0 2 0 2 0
1 0 2 2 0
2 2 2 2 2
3 2 0 2 0
4 2 1 2 3
... ... ... ... ...
5041 1 0 2 3
5042 1 0 2 1
5043 0 2 2 1
5044 1 1 2 4
5045 2 1 2 4
[5046 rows x 17 columns]
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
# Extract independent variables (features) and target variable (cholesterol)
X = df.drop(['Cholesterol','Chol_Status'], axis=1)
y = df['Cholesterol']
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Initialize linear regression model
model = LinearRegression()
# Fit the model on the training data
model.fit(X_train, y_train)
# Make predictions on the testing data
y_pred = model.predict(X_test)
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)
r2 = r2_score(y_test, y_pred)
print("R-squared (R2) score:", r2)
Mean Squared Error: 1421.387817864974 R-squared (R2) score: 0.08504489192920817
# Extract coefficients
coefficients = model.coef_
# Match coefficients with feature names
feature_names = X.columns
coefficients_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})
# Sort coefficients by absolute value
coefficients_df['Absolute Coefficient'] = coefficients_df['Coefficient'].abs()
sorted_coefficients_df = coefficients_df.sort_values(by='Absolute Coefficient', ascending=False)
# Print the top contributing features
print("Top contributing features:")
print(sorted_coefficients_df)
Top contributing features:
Feature Coefficient Absolute Coefficient
3 Sex -4.994471 4.994471
13 Weight_Status 2.317034 2.317034
5 Height -2.234509 2.234509
14 Smoking_Status 2.134919 2.134919
0 Status 1.662626 1.662626
4 AgeAtStart 1.173081 1.173081
1 DeathCause -0.777538 0.777538
12 BP_Status 0.495003 0.495003
6 Weight 0.411385 0.411385
9 MRW -0.408968 0.408968
2 AgeCHDdiag -0.322008 0.322008
7 Diastolic 0.203016 0.203016
11 AgeAtDeath -0.120730 0.120730
8 Systolic 0.032830 0.032830
10 Smoking -0.012846 0.012846
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
# Extract independent variables (features) and target variable (cholesterol)
X = df.drop(['Cholesterol','Chol_Status'], axis=1)
y = df['Cholesterol']
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Initialize linear regression model
model = LinearRegression()
# Fit the model on the training data
model.fit(X_train, y_train)
# Extract coefficients
coefficients = model.coef_
# Match coefficients with feature names
feature_names = X.columns
coefficients_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})
# Sort coefficients by absolute value
coefficients_df['Absolute Coefficient'] = coefficients_df['Coefficient'].abs()
sorted_coefficients_df = coefficients_df.sort_values(by='Absolute Coefficient', ascending=False)
# Select top 7 features
top_features = sorted_coefficients_df.iloc[:7]['Feature'].tolist()
# Fit the model on training data with top 7 features
X_train_top = X_train[top_features]
model.fit(X_train_top, y_train)
# Evaluate the model on testing data
X_test_top = X_test[top_features]
score = model.score(X_test_top, y_test)
print("R-squared (R2) score using top 7 features:", score)
R-squared (R2) score using top 7 features: 0.07588634596412047
import seaborn as sns
import matplotlib.pyplot as plt
#Plot actual vs. predicted cholesterol values
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred, color='blue')
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='red', linestyle='--')
plt.xlabel('Actual Cholesterol')
plt.ylabel('Predicted Cholesterol')
plt.title('Actual vs. Predicted Cholesterol')
plt.grid(True)
plt.show()
'DeathCause' should be removed from the model because that it is consequence not casual of the cholesterol concentration.
# Calculate mean cholesterol for each DeathCause
mean_cholesterol = df.groupby('DeathCause')['Cholesterol'].mean()
# Create bar plot
plt.figure(figsize=(10, 6))
mean_cholesterol.plot(kind='bar', color='skyblue')
plt.title('Mean Cholesterol by Death Cause')
plt.xlabel('Death Cause')
plt.ylabel('Mean Cholesterol')
plt.xticks(rotation=45) # Rotate x-axis labels for better visibility
# Add mean values to the bars
for index, value in enumerate(mean_cholesterol):
plt.text(index, value, str(round(value, 2)), ha='center', va='bottom')
plt.tight_layout()
plt.show()
Cholesterol average is not significantly different between causes of death. So, this variable must be removed from the linear regression model of cholesterol.
import statsmodels.api as sma
X_train = sma.add_constant(X_train) ## let's add an intercept (beta_0) to our model
X_test = sma.add_constant(X_test)
import statsmodels.api as sm
lm2 = sm.OLS(y_train,X_train).fit()
lm2.summary()
| Dep. Variable: | Cholesterol | R-squared: | 0.109 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared: | 0.106 |
| Method: | Least Squares | F-statistic: | 32.81 |
| Date: | Sun, 14 Apr 2024 | Prob (F-statistic): | 1.39e-89 |
| Time: | 15:18:19 | Log-Likelihood: | -20347. |
| No. Observations: | 4036 | AIC: | 4.073e+04 |
| Df Residuals: | 4020 | BIC: | 4.083e+04 |
| Df Model: | 15 | ||
| Covariance Type: | nonrobust |
| coef | std err | t | P>|t| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| const | 308.3604 | 59.269 | 5.203 | 0.000 | 192.161 | 424.560 |
| Status | 1.6626 | 1.435 | 1.159 | 0.247 | -1.150 | 4.476 |
| DeathCause | -0.7775 | 0.538 | -1.446 | 0.148 | -1.832 | 0.277 |
| AgeCHDdiag | -0.3220 | 0.124 | -2.591 | 0.010 | -0.566 | -0.078 |
| Sex | -4.9945 | 2.031 | -2.459 | 0.014 | -8.977 | -1.012 |
| AgeAtStart | 1.1731 | 0.092 | 12.698 | 0.000 | 0.992 | 1.354 |
| Height | -2.2345 | 0.915 | -2.443 | 0.015 | -4.028 | -0.441 |
| Weight | 0.4114 | 0.195 | 2.115 | 0.035 | 0.030 | 0.793 |
| Diastolic | 0.2030 | 0.080 | 2.542 | 0.011 | 0.046 | 0.360 |
| Systolic | 0.0328 | 0.044 | 0.740 | 0.459 | -0.054 | 0.120 |
| MRW | -0.4090 | 0.238 | -1.722 | 0.085 | -0.875 | 0.057 |
| Smoking | -0.0128 | 0.170 | -0.076 | 0.940 | -0.346 | 0.320 |
| AgeAtDeath | -0.1207 | 0.105 | -1.149 | 0.251 | -0.327 | 0.085 |
| BP_Status | 0.4950 | 0.806 | 0.614 | 0.539 | -1.086 | 2.076 |
| Weight_Status | 2.3170 | 0.895 | 2.589 | 0.010 | 0.562 | 4.072 |
| Smoking_Status | 2.1349 | 1.392 | 1.534 | 0.125 | -0.593 | 4.863 |
| Omnibus: | 47.131 | Durbin-Watson: | 2.047 |
|---|---|---|---|
| Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 47.829 |
| Skew: | 0.255 | Prob(JB): | 4.11e-11 |
| Kurtosis: | 2.845 | Cond. No. | 2.83e+04 |
influence = lm2.get_influence()
resid_student = influence.resid_studentized_external
resid = pd.concat([X_train,pd.Series(resid_student,name = "Studentized Residuals")],axis = 1)
resid.head()
| const | Status | DeathCause | AgeCHDdiag | Sex | AgeAtStart | Height | Weight | Diastolic | Systolic | MRW | Smoking | AgeAtDeath | BP_Status | Weight_Status | Smoking_Status | Studentized Residuals | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 3404 | 1.0 | 0.0 | 1.0 | 63.302968 | 1.0 | 48.0 | 68.00 | 190.0 | 102.0 | 154.0 | 132.0 | 0.0 | 70.536414 | 2.0 | 2.0 | 0.0 | 0.260895 |
| 463 | 1.0 | 1.0 | 1.0 | 55.000000 | 1.0 | 41.0 | 69.75 | 199.0 | 108.0 | 162.0 | 134.0 | 0.0 | 57.000000 | 2.0 | 2.0 | 0.0 | -0.262146 |
| 2373 | 1.0 | 0.0 | 3.0 | 52.000000 | 0.0 | 36.0 | 62.25 | 150.0 | 70.0 | 115.0 | 129.0 | 10.0 | 70.536414 | 1.0 | 2.0 | 2.0 | 0.386980 |
| 3470 | 1.0 | 1.0 | 1.0 | 74.000000 | 0.0 | 54.0 | 60.50 | 160.0 | 102.0 | 186.0 | 147.0 | 10.0 | 76.000000 | 2.0 | 2.0 | 2.0 | 0.029897 |
| 4831 | 1.0 | 0.0 | 2.0 | 67.000000 | 0.0 | 37.0 | 59.25 | 202.0 | 88.0 | 120.0 | 191.0 | 0.0 | 70.536414 | 0.0 | 2.0 | 0.0 | NaN |
resid.loc[np.absolute(resid["Studentized Residuals"]) > 3,:]
| const | Status | DeathCause | AgeCHDdiag | Sex | AgeAtStart | Height | Weight | Diastolic | Systolic | MRW | Smoking | AgeAtDeath | BP_Status | Weight_Status | Smoking_Status | Studentized Residuals | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 3951 | 1.0 | 0.0 | 4.0 | 63.302968 | 0.0 | 36.0 | 59.0 | 137.0 | 90.0 | 140.0 | 129.0 | 0.0 | 70.536414 | 2.0 | 2.0 | 0.0 | 3.386411 |
| 1631 | 1.0 | 0.0 | 1.0 | 63.302968 | 0.0 | 53.0 | 64.0 | 145.0 | 88.0 | 130.0 | 117.0 | 0.0 | 70.536414 | 0.0 | 2.0 | 0.0 | 3.004549 |
| 766 | 1.0 | 0.0 | 3.0 | 61.000000 | 0.0 | 31.0 | 63.0 | 108.0 | 74.0 | 116.0 | 90.0 | 0.0 | 70.536414 | 1.0 | 1.0 | 0.0 | 3.006256 |
ind = resid.loc[np.absolute(resid["Studentized Residuals"]) > 3,:].index
ind
Int64Index([3951, 1631, 766], dtype='int64')
y_train.drop(ind,axis = 0,inplace = True)
X_train.drop(ind,axis = 0,inplace = True) #Intercept column is there
from statsmodels.stats.outliers_influence import variance_inflation_factor
[variance_inflation_factor(X_train.values, j) for j in range(X_train.shape[1])]
[10076.96450825812, 1.39726232315385, 1.0053644012541958, 1.2038020419953424, 2.9230720513123316, 1.803760234929722, 29.928615338015483, 81.69961400686279, 3.049245691697844, 3.125390428579405, 58.53830049938579, 11.78698053542615, 1.3524705474598187, 1.5654284427754148, 1.8889839119632796, 11.765412868907452]
We create a function to remove the collinear variables. We choose a threshold of 5 which means if VIF is more than 5 for a particular variable then that variable will be removed.
def calculate_vif(x):
thresh = 5.0
output = pd.DataFrame()
k = x.shape[1]
vif = [variance_inflation_factor(x.values, j) for j in range(x.shape[1])]
for i in range(1,k):
print("Iteration no.")
print(i)
print(vif)
a = np.argmax(vif)
print("Max VIF is for variable no.:")
print(a)
if vif[a] <= thresh :
break
if i == 1 :
output = x.drop(x.columns[a], axis = 1)
vif = [variance_inflation_factor(output.values, j) for j in range(output.shape[1])]
elif i > 1 :
output = output.drop(output.columns[a],axis = 1)
vif = [variance_inflation_factor(output.values, j) for j in range(output.shape[1])]
return(output)
train_out = calculate_vif(X_train)
train_out.head()
Iteration no. 1 [10076.96450825812, 1.39726232315385, 1.0053644012541958, 1.2038020419953424, 2.9230720513123316, 1.803760234929722, 29.928615338015483, 81.69961400686279, 3.049245691697844, 3.125390428579405, 58.53830049938579, 11.78698053542615, 1.3524705474598187, 1.5654284427754148, 1.8889839119632796, 11.765412868907452] Max VIF is for variable no.: 0 Iteration no. 2 [2.2680498769708857, 5.149110964561775, 176.82044595057542, 4.649332188872404, 49.2469808444578, 315.0171692885646, 265.6236458740845, 134.17015476982604, 107.76448893577012, 246.04717480793195, 19.088826024637964, 157.8717427232369, 3.489293916740208, 6.169409804587098, 21.18440362788717] Max VIF is for variable no.: 5 Iteration no. 3 [2.262635788435497, 5.081824736490531, 124.41869062271418, 4.626422881493438, 48.897284279602296, 173.46508836912358, 125.60382004306241, 107.66305900182297, 207.2352634160289, 19.088622808797087, 133.3033514041389, 3.3563561031028994, 5.8698907122947, 21.049054319376673] Max VIF is for variable no.: 8 Iteration no. 4 [2.2616786161253035, 5.08098710347792, 123.45478216243256, 2.8600599588315094, 48.52879141558591, 60.837399537318774, 124.83482318326554, 107.32456708382153, 19.06523850752273, 132.39881541965138, 3.3563016552995055, 5.267727538963271, 21.02929240184767] Max VIF is for variable no.: 9 Iteration no. 5 [2.1664387170234622, 5.046715432001964, 74.38589069594636, 2.855797469146641, 42.43294552405285, 59.044791898000476, 124.22052357503861, 107.2344041053779, 19.017411192284534, 3.3199762662978656, 5.219569377654603, 20.966049479003505] Max VIF is for variable no.: 6 Iteration no. 6 [2.1656391445269842, 5.0456168509455095, 70.35891807565832, 2.855605507493809, 42.402813122929714, 56.30802544065065, 53.274845467319075, 19.006754040187634, 3.1995014606046905, 5.219555772173678, 20.94132301732525] Max VIF is for variable no.: 2 Iteration no. 7 [2.022845552183428, 4.929164438927844, 2.7243301623714555, 31.113637801170885, 44.239055798919104, 47.37968401001749, 18.97091168595589, 3.018458559936772, 4.881825142420282, 20.786335930148596] Max VIF is for variable no.: 5 Iteration no. 8 [2.0226828330207645, 4.8535852047183194, 2.595141043618071, 22.151900363818545, 32.204607975426754, 18.962525934811897, 2.4829710496776674, 4.786450705712884, 20.743703120726828] Max VIF is for variable no.: 4 Iteration no. 9 [1.9150598219642052, 4.632084824282184, 2.1244727738654507, 9.229873214669365, 18.921457213227143, 2.4515496059020516, 3.517449764369447, 20.512890229520252] Max VIF is for variable no.: 7 Iteration no. 10 [1.9106524815596229, 4.609844017796759, 2.123452072332671, 8.986091436972144, 1.8456877931063589, 2.4498443291984833, 3.4930015685842766] Max VIF is for variable no.: 3 Iteration no. 11 [1.7480932699859404, 2.914224375241127, 2.073365152571798, 1.8309779894845701, 2.2545045236829746, 2.814688615351666] Max VIF is for variable no.: 1
| Status | DeathCause | Sex | Smoking | BP_Status | Weight_Status | |
|---|---|---|---|---|---|---|
| 3404 | 0 | 1.0 | 1 | 0.0 | 2 | 2 |
| 463 | 1 | 1.0 | 1 | 0.0 | 2 | 2 |
| 2373 | 0 | 3.0 | 0 | 10.0 | 1 | 2 |
| 3470 | 1 | 1.0 | 0 | 10.0 | 2 | 2 |
| 4831 | 0 | 2.0 | 0 | 0.0 | 0 | 2 |
import statsmodels.api as sma
import statsmodels.api as sm
train_out = sma.add_constant(train_out) ## let's add an intercept (beta_0) to our model
#X_test.drop(["Status"],axis = 1,inplace = True)
X_test = sma.add_constant(X_test)
lm2 = sm.OLS(y_train,train_out).fit()
lm2.summary()
| Dep. Variable: | Cholesterol | R-squared: | 0.055 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared: | 0.054 |
| Method: | Least Squares | F-statistic: | 39.38 |
| Date: | Sun, 14 Apr 2024 | Prob (F-statistic): | 8.80e-47 |
| Time: | 16:01:48 | Log-Likelihood: | -20449. |
| No. Observations: | 4033 | AIC: | 4.091e+04 |
| Df Residuals: | 4026 | BIC: | 4.096e+04 |
| Df Model: | 6 | ||
| Covariance Type: | nonrobust |
| coef | std err | t | P>|t| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| const | 211.8868 | 1.865 | 113.587 | 0.000 | 208.230 | 215.544 |
| Status | 11.3978 | 1.300 | 8.769 | 0.000 | 8.849 | 13.946 |
| DeathCause | -0.7294 | 0.552 | -1.320 | 0.187 | -1.813 | 0.354 |
| Sex | -4.3260 | 1.332 | -3.248 | 0.001 | -6.937 | -1.715 |
| Smoking | 0.0748 | 0.055 | 1.350 | 0.177 | -0.034 | 0.184 |
| BP_Status | 3.9552 | 0.690 | 5.735 | 0.000 | 2.603 | 5.307 |
| Weight_Status | 5.4018 | 0.688 | 7.853 | 0.000 | 4.053 | 6.750 |
| Omnibus: | 55.169 | Durbin-Watson: | 2.046 |
|---|---|---|---|
| Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 54.483 |
| Skew: | 0.262 | Prob(JB): | 1.48e-12 |
| Kurtosis: | 2.775 | Cond. No. | 49.3 |