Use the data of heart disease in which some indicators may be related to the concentration of cholesterol such as weight, age, height, smoking ...
Building a linear regression model with cholesterol as the target variable and other variables as independent variables.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
df= pd.read_csv("/Users/nnthieu/Downloads/heart.csv")
df.head()
Status | DeathCause | AgeCHDdiag | Sex | AgeAtStart | Height | Weight | Diastolic | Systolic | MRW | Smoking | AgeAtDeath | Cholesterol | Chol_Status | BP_Status | Weight_Status | Smoking_Status | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Dead | Other | NaN | Female | 29 | 62.50 | 140.0 | 78 | 124 | 121.0 | 0.0 | 55.0 | NaN | NaN | Normal | Overweight | Non-smoker |
1 | Dead | Cancer | NaN | Female | 41 | 59.75 | 194.0 | 92 | 144 | 183.0 | 0.0 | 57.0 | 181.0 | Desirable | High | Overweight | Non-smoker |
2 | Alive | NaN | NaN | Female | 57 | 62.25 | 132.0 | 90 | 170 | 114.0 | 10.0 | NaN | 250.0 | High | High | Overweight | Moderate (6-15) |
3 | Alive | NaN | NaN | Female | 39 | 65.75 | 158.0 | 80 | 128 | 123.0 | 0.0 | NaN | 242.0 | High | Normal | Overweight | Non-smoker |
4 | Alive | NaN | NaN | Male | 42 | 66.00 | 156.0 | 76 | 110 | 116.0 | 20.0 | NaN | 281.0 | High | Optimal | Overweight | Heavy (16-25) |
df.shape
(5209, 17)
df.isna().sum()
Status 0 DeathCause 3218 AgeCHDdiag 3760 Sex 0 AgeAtStart 0 Height 6 Weight 6 Diastolic 0 Systolic 0 MRW 6 Smoking 36 AgeAtDeath 3218 Cholesterol 152 Chol_Status 152 BP_Status 0 Weight_Status 6 Smoking_Status 36 dtype: int64
look at the data distribution of the numeric variables:
import matplotlib.pyplot as plt
def plot_histograms(df, bins=10, alpha=0.5, colors=None):
"""
Plot histograms for all numeric variables in the DataFrame.
Parameters:
df (DataFrame): The DataFrame containing numeric variables.
bins (int): Number of bins for the histograms. Default is 10.
alpha (float): Transparency level of the histograms. Default is 0.5.
colors (list): List of colors for the histograms. If None, default colors will be used.
Returns:
None
"""
if colors is None:
colors = plt.cm.tab10.colors # Default color palette
num_variables = df.select_dtypes(include='number').shape[1]
num_rows = (num_variables + 1) // 2
num_cols = 2
plt.figure(figsize=(12, 6 * num_rows))
for i, col in enumerate(df.select_dtypes(include='number'), start=1):
plt.subplot(num_rows, num_cols, i)
plt.hist(df[col], bins=bins, alpha=alpha, color=colors[i % len(colors)])
plt.title(f'Histogram of {col}')
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.grid(True)
plt.tight_layout()
plt.show()
# Example usage:
# Assuming df is your DataFrame containing numeric variables
plot_histograms(df)
proportionsD = df['DeathCause'].value_counts(normalize=True, dropna=True)
missing_countD = df['DeathCause'].isna().sum()
missing_samplesD = np.random.choice(proportionsD.index, size=missing_countD, p=proportionsD.values)
df.loc[df['DeathCause'].isna(), 'DeathCause'] = missing_samplesD
df.loc[df['Weight_Status'].isna(), 'Weight_Status'] = "Normal"
df.loc[df['Smoking_Status'].isna(), 'Smoking_Status'] = "Non-smoker"
proportions = df['Chol_Status'].value_counts(normalize=True, dropna=True)
missing_count = df['Chol_Status'].isna().sum()
missing_samples = np.random.choice(proportions.index, size=missing_count, p=proportions.values)
df.loc[df['Chol_Status'].isna(), 'Chol_Status'] = missing_samples
# Calculate means for numeric columns
means = df.mean()
# Impute missing values with means using map function
for col in df.select_dtypes(include='number'):
df[col] = df[col].fillna(means[col])
/var/folders/cx/3wbhcqyd3cld6gvk_xjkvr_40000gn/T/ipykernel_85659/385573716.py:2: FutureWarning: The default value of numeric_only in DataFrame.mean is deprecated. In a future version, it will default to False. In addition, specifying 'numeric_only=None' is deprecated. Select only valid columns or specify the value of numeric_only to silence this warning. means = df.mean()
df.describe()
AgeCHDdiag | AgeAtStart | Height | Weight | Diastolic | Systolic | MRW | Smoking | AgeAtDeath | Cholesterol | |
---|---|---|---|---|---|---|---|---|---|---|
count | 5209.000000 | 5209.000000 | 5209.000000 | 5209.000000 | 5209.000000 | 5209.000000 | 5209.000000 | 5209.000000 | 5209.000000 | 5209.000000 |
mean | 63.302968 | 44.068727 | 64.813185 | 153.086681 | 85.358610 | 136.909580 | 119.957525 | 9.366518 | 70.536414 | 227.417441 |
std | 5.282496 | 8.574954 | 3.580643 | 28.898765 | 12.973091 | 23.739596 | 19.971887 | 11.989796 | 6.527255 | 44.274927 |
min | 32.000000 | 28.000000 | 51.500000 | 67.000000 | 50.000000 | 82.000000 | 67.000000 | 0.000000 | 36.000000 | 96.000000 |
25% | 63.302968 | 37.000000 | 62.250000 | 132.000000 | 76.000000 | 120.000000 | 106.000000 | 0.000000 | 70.536414 | 197.000000 |
50% | 63.302968 | 43.000000 | 64.500000 | 150.000000 | 84.000000 | 132.000000 | 118.000000 | 1.000000 | 70.536414 | 225.000000 |
75% | 63.302968 | 51.000000 | 67.500000 | 172.000000 | 92.000000 | 148.000000 | 131.000000 | 20.000000 | 70.536414 | 251.000000 |
max | 90.000000 | 62.000000 | 76.500000 | 300.000000 | 160.000000 | 300.000000 | 268.000000 | 60.000000 | 93.000000 | 568.000000 |
df.isna().sum()
Status 0 DeathCause 0 AgeCHDdiag 0 Sex 0 AgeAtStart 0 Height 0 Weight 0 Diastolic 0 Systolic 0 MRW 0 Smoking 0 AgeAtDeath 0 Cholesterol 0 Chol_Status 0 BP_Status 0 Weight_Status 0 Smoking_Status 0 dtype: int64
# Define mapping dictionary
status_mapping = {'Dead':1, 'Alive':0}
# Decode 'Status' column
df['Status'] = df['Status'].map(status_mapping)
# Define mapping dictionary
status_mappingD = {'Coronary Heart Disease':1, 'Cancer':2,'Cerebral Vascular Disease':3, 'Other':4 }
df['DeathCause'] = df['DeathCause'].map(status_mappingD)
# Define mapping dictionary
status_mappingSex = {'Male':1, 'Female':0 }
df['Sex'] = df['Sex'].map(status_mappingSex)
# Define mapping dictionary
status_mappingS = {'Light (1-5)':1, 'Non-smoker':0,'Moderate (6-15)':2,'Heavy (16-25)':3,'Very Heavy (> 25)':4}
df['Smoking_Status'] = df['Smoking_Status'].map(status_mappingS)
status_mappingC = {'Borderline':1, 'Desirable':0,'High':2}
df['Chol_Status'] = df['Chol_Status'].map(status_mappingC)
status_mappingBP = {'Normal':0, 'Optimal':1, 'High':2}
df['BP_Status'] = df['BP_Status'].map(status_mappingBP)
status_mappingW = {'Normal':0, 'Underweight':1, 'Overweight':2}
df['Weight_Status'] = df['Weight_Status'].map(status_mappingW)
# Function to detect outliers using IQR method
def detect_outliers(df, variable):
Q1 = df[variable].quantile(0.25)
Q3 = df[variable].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
outliers = df[(df[variable] < lower_bound) | (df[variable] > upper_bound)]
return outliers
# Define numeric variables to detect outliers
numeric_variables = ['Cholesterol', 'AgeCHDdiag', 'AgeAtStart', 'Height', 'Weight', 'Smoking']
# Create a dictionary to store outliers for each variable
outliers_dict = {}
# Detect outliers in each numeric variable
for col in numeric_variables:
outliers_dict[col] = detect_outliers(df, col)
# Print outliers for each variable
for col, outliers in outliers_dict.items():
print(f"Outliers in {col}:")
print(outliers)
print()
Outliers in Cholesterol: Status DeathCause AgeCHDdiag Sex AgeAtStart Height Weight \ 89 0 2.0 63.302968 0 52 62.00 135.0 123 1 1.0 73.000000 0 47 62.00 124.0 143 0 1.0 63.302968 1 45 68.50 160.0 187 0 4.0 76.000000 0 54 64.25 146.0 197 1 1.0 61.000000 1 59 67.25 164.0 ... ... ... ... ... ... ... ... 4874 1 1.0 53.000000 1 47 64.50 143.0 5022 1 3.0 63.302968 0 62 60.50 108.0 5138 1 1.0 59.000000 0 55 59.75 148.0 5146 0 NaN 63.302968 1 56 65.00 145.0 5161 1 4.0 63.302968 0 57 62.00 159.0 Diastolic Systolic MRW Smoking AgeAtDeath Cholesterol \ 89 82 144 116.0 0.0 70.536414 339.0 123 80 140 107.0 0.0 77.000000 347.0 143 86 136 111.0 0.0 70.536414 347.0 187 96 148 118.0 0.0 70.536414 418.0 197 100 152 117.0 0.0 81.000000 334.0 ... ... ... ... ... ... ... 4874 92 158 112.0 15.0 71.000000 386.0 5022 72 128 99.0 15.0 82.000000 350.0 5138 90 124 140.0 0.0 61.000000 400.0 5146 90 132 111.0 20.0 70.536414 360.0 5161 80 124 137.0 0.0 81.000000 334.0 Chol_Status BP_Status Weight_Status Smoking_Status 89 2 2 2 0 123 2 0 0 0 143 2 0 2 0 187 2 2 2 0 197 2 2 2 0 ... ... ... ... ... 4874 2 2 2 2 5022 2 0 0 2 5138 2 2 2 0 5146 2 2 2 3 5161 2 0 2 0 [108 rows x 17 columns] Outliers in AgeCHDdiag: Status DeathCause AgeCHDdiag Sex AgeAtStart Height Weight \ 11 0 3.0 57.0 1 33 64.25 151.0 12 0 3.0 55.0 1 33 70.00 174.0 13 0 2.0 79.0 1 57 67.25 165.0 14 0 4.0 66.0 1 44 69.00 155.0 17 1 2.0 56.0 1 56 67.25 122.0 ... ... ... ... ... ... ... ... 5200 1 1.0 55.0 1 47 67.00 186.0 5201 0 1.0 61.0 1 59 66.50 156.0 5202 1 1.0 59.0 1 59 71.00 177.0 5204 1 1.0 79.0 1 49 64.50 173.0 5207 1 1.0 50.0 1 36 68.25 164.0 Diastolic Systolic MRW Smoking AgeAtDeath Cholesterol \ 11 68 108 118.0 0.0 70.536414 221.000000 12 90 142 114.0 0.0 70.536414 188.000000 13 76 128 118.0 15.0 70.536414 227.417441 14 90 130 105.0 30.0 70.536414 292.000000 17 72 120 87.0 15.0 72.000000 194.000000 ... ... ... ... ... ... ... 5200 105 155 133.0 5.0 57.000000 199.000000 5201 84 124 116.0 20.0 70.536414 223.000000 5202 68 108 113.0 25.0 65.000000 246.000000 5204 80 110 135.0 20.0 81.000000 228.000000 5207 64 108 114.0 40.0 64.000000 238.000000 Chol_Status BP_Status Weight_Status Smoking_Status 11 1 1 2 0 12 0 2 2 0 13 2 0 2 2 14 2 2 0 4 17 0 0 1 2 ... ... ... ... ... 5200 0 2 2 1 5201 1 0 2 3 5202 2 1 2 3 5204 1 0 2 3 5207 1 1 2 4 [1449 rows x 17 columns] Outliers in AgeAtStart: Empty DataFrame Columns: [Status, DeathCause, AgeCHDdiag, Sex, AgeAtStart, Height, Weight, Diastolic, Systolic, MRW, Smoking, AgeAtDeath, Cholesterol, Chol_Status, BP_Status, Weight_Status, Smoking_Status] Index: [] Outliers in Height: Status DeathCause AgeCHDdiag Sex AgeAtStart Height Weight \ 418 0 NaN 63.302968 1 44 75.50 177.0 678 0 4.0 55.000000 1 29 75.50 204.0 1893 1 2.0 63.302968 1 34 76.00 220.0 2249 1 4.0 63.302968 0 51 51.50 72.0 3355 0 2.0 63.302968 1 44 76.00 169.0 3508 1 4.0 63.302968 0 60 53.75 119.0 3882 0 1.0 63.302968 1 33 76.50 221.0 Diastolic Systolic MRW Smoking AgeAtDeath Cholesterol \ 418 62 122 101.0 0.0 70.536414 199.0 678 90 140 116.0 0.0 70.536414 246.0 1893 80 138 122.0 10.0 62.000000 192.0 2249 92 126 88.0 1.0 79.000000 234.0 3355 70 102 93.0 5.0 70.536414 205.0 3508 100 190 135.0 0.0 76.000000 250.0 3882 100 155 122.0 20.0 70.536414 225.0 Chol_Status BP_Status Weight_Status Smoking_Status 418 0 0 0 0 678 2 2 2 0 1893 0 0 2 2 2249 1 2 1 1 3355 1 1 0 1 3508 2 2 2 0 3882 1 2 2 3 Outliers in Weight: Status DeathCause AgeCHDdiag Sex AgeAtStart Height Weight \ 154 0 4.0 63.000000 0 37 63.00 236.0 436 1 3.0 63.302968 0 50 62.75 241.0 491 1 3.0 63.302968 1 52 72.00 247.0 671 0 3.0 63.302968 1 29 70.25 243.0 765 1 4.0 63.302968 0 33 56.25 71.0 836 1 3.0 63.302968 1 48 66.00 250.0 1236 1 1.0 59.000000 1 39 73.50 244.0 1623 0 1.0 63.302968 0 51 65.25 239.0 1647 0 3.0 32.000000 1 32 73.00 239.0 1664 1 NaN 60.000000 1 34 72.50 245.0 1679 1 1.0 75.000000 1 51 74.00 239.0 1772 0 1.0 72.000000 1 44 73.25 238.0 1778 1 2.0 62.000000 1 40 71.50 240.0 1796 0 NaN 57.000000 1 31 69.00 238.0 1913 0 1.0 33.000000 1 33 64.75 260.0 1944 0 2.0 63.302968 1 44 68.25 244.0 2099 0 2.0 63.302968 0 38 59.50 242.0 2119 1 3.0 63.302968 1 59 68.25 276.0 2307 1 4.0 63.302968 1 60 69.25 234.0 2348 1 1.0 49.000000 1 33 70.50 235.0 2437 1 1.0 75.000000 0 53 60.25 271.0 2547 1 4.0 63.302968 1 50 66.50 237.0 2576 0 NaN 43.000000 0 43 60.25 235.0 2592 1 3.0 63.302968 1 38 72.25 273.0 2609 0 2.0 63.302968 1 33 66.25 241.0 2761 0 1.0 63.302968 1 38 72.25 234.0 2913 0 3.0 63.302968 1 39 67.25 237.0 2946 0 2.0 63.302968 1 47 71.75 236.0 2991 1 3.0 63.302968 0 60 64.00 235.0 3123 1 1.0 71.000000 0 49 62.00 261.0 3124 1 1.0 64.000000 1 52 72.00 236.0 3251 0 4.0 63.302968 0 57 67.00 293.0 3314 0 NaN 63.302968 1 36 70.50 244.0 3359 1 2.0 63.302968 0 36 63.75 300.0 3615 1 4.0 63.302968 0 47 60.00 238.0 3660 1 1.0 51.000000 1 51 69.00 239.0 3844 0 2.0 63.302968 0 39 65.50 250.0 3871 0 2.0 63.302968 1 33 73.75 245.0 4239 1 NaN 63.302968 0 56 60.50 269.0 4532 0 1.0 63.302968 1 38 73.00 246.0 4703 0 1.0 63.000000 0 47 61.00 300.0 4857 1 3.0 63.302968 0 59 62.75 281.0 4960 1 1.0 41.000000 1 35 70.00 235.0 5005 1 1.0 58.000000 1 54 68.00 256.0 5062 0 4.0 63.302968 1 32 68.00 236.0 5097 1 4.0 63.302968 0 46 57.75 67.0 5133 0 4.0 63.302968 0 32 61.00 275.0 5199 1 1.0 64.000000 1 58 72.75 255.0 Diastolic Systolic MRW Smoking AgeAtDeath Cholesterol \ 154 96 178 197.0 0.0 70.536414 227.417441 436 150 242 208.0 0.0 58.000000 213.000000 491 104 154 153.0 20.0 68.000000 188.000000 671 90 162 160.0 20.0 70.536414 163.000000 765 90 116 73.0 0.0 61.000000 192.000000 836 96 170 185.0 0.0 72.000000 180.000000 1236 100 144 147.0 20.0 71.000000 276.000000 1623 88 150 187.0 1.0 70.536414 175.000000 1647 92 138 144.0 0.0 70.536414 171.000000 1664 90 140 152.0 0.0 64.000000 271.000000 1679 100 158 141.0 5.0 77.000000 243.000000 1772 100 150 143.0 20.0 70.536414 237.000000 1778 90 128 153.0 40.0 70.000000 228.000000 1796 88 140 161.0 0.0 70.536414 250.000000 1913 100 148 203.0 0.0 70.536414 227.417441 1944 126 190 169.0 0.0 70.536414 292.000000 2099 86 132 228.0 0.0 70.536414 157.000000 2119 80 124 192.0 0.0 81.000000 182.000000 2307 98 138 158.0 30.0 86.000000 175.000000 2348 96 146 155.0 25.0 51.000000 224.000000 2437 130 246 249.0 0.0 85.000000 200.000000 2547 100 156 176.0 5.0 76.000000 247.000000 2576 134 246 216.0 0.0 70.536414 300.000000 2592 120 180 170.0 0.0 58.000000 227.000000 2609 90 140 179.0 0.0 70.536414 259.000000 2761 84 134 145.0 0.0 70.536414 155.000000 2913 130 204 169.0 10.0 70.536414 204.000000 2946 88 130 150.0 0.0 70.536414 248.000000 2991 96 190 190.0 0.0 92.000000 227.417441 3123 86 150 225.0 0.0 77.000000 245.000000 3124 104 156 147.0 0.0 70.000000 255.000000 3251 108 170 215.0 0.0 70.536414 242.000000 3314 95 135 161.0 0.0 70.536414 204.000000 3359 108 182 250.0 20.0 54.000000 215.000000 3615 100 180 218.0 0.0 53.000000 227.417441 3660 122 228 161.0 40.0 61.000000 226.000000 3844 105 135 195.0 0.0 70.536414 242.000000 3871 90 150 148.0 0.0 70.536414 179.000000 4239 120 210 247.0 20.0 82.000000 150.000000 4532 94 124 148.0 0.0 70.536414 174.000000 4703 120 208 268.0 0.0 70.536414 185.000000 4857 100 152 242.0 0.0 69.000000 188.000000 4960 96 152 155.0 30.0 43.000000 285.000000 5005 100 182 178.0 45.0 60.000000 286.000000 5062 100 150 164.0 0.0 70.536414 226.000000 5097 80 124 67.0 0.0 56.000000 234.000000 5133 82 136 246.0 0.0 70.536414 154.000000 5199 88 130 158.0 40.0 80.000000 260.000000 Chol_Status BP_Status Weight_Status Smoking_Status 154 2 2 2 0 436 1 2 2 0 491 0 2 2 3 671 0 2 2 3 765 0 2 1 0 836 0 2 2 0 1236 2 2 2 3 1623 0 2 2 1 1647 0 2 2 0 1664 2 2 2 0 1679 2 2 2 1 1772 1 2 2 3 1778 1 2 2 4 1796 2 0 2 0 1913 1 2 2 0 1944 2 2 2 0 2099 0 0 2 0 2119 0 0 2 0 2307 0 2 2 4 2348 1 2 2 3 2437 1 2 2 0 2547 2 2 2 1 2576 2 2 2 0 2592 1 2 2 0 2609 2 2 2 0 2761 0 0 2 0 2913 1 2 2 2 2946 2 0 2 0 2991 1 2 2 0 3123 2 2 2 0 3124 2 2 2 0 3251 2 2 2 0 3314 1 2 2 0 3359 1 2 2 3 3615 1 2 2 0 3660 1 2 2 4 3844 2 2 2 0 3871 0 2 2 0 4239 0 2 2 3 4532 0 2 2 0 4703 0 2 2 0 4857 0 2 2 0 4960 2 2 2 4 5005 2 2 2 4 5062 1 2 2 0 5097 1 0 1 0 5133 0 0 2 0 5199 2 0 2 4 Outliers in Smoking: Status DeathCause AgeCHDdiag Sex AgeAtStart Height Weight \ 498 0 2.0 63.302968 1 34 65.75 137.000000 914 0 3.0 66.000000 1 38 68.00 174.000000 1093 0 2.0 63.302968 1 32 70.00 173.000000 1263 1 4.0 74.000000 1 54 68.00 153.086681 1699 1 1.0 57.000000 1 53 66.25 150.000000 2043 0 2.0 63.302968 1 32 70.50 203.000000 2903 0 NaN 55.000000 1 35 68.25 158.000000 2910 0 4.0 63.302968 1 34 70.50 164.000000 3468 1 1.0 40.000000 1 36 70.50 190.000000 4110 1 NaN 42.000000 1 32 67.25 222.000000 4127 0 3.0 63.302968 1 36 68.25 152.000000 4296 1 1.0 66.000000 1 46 67.00 141.000000 4303 0 2.0 63.302968 1 51 62.75 134.000000 4360 1 1.0 62.000000 1 42 67.50 184.000000 4368 1 1.0 41.000000 1 35 68.50 185.000000 4376 0 4.0 63.302968 1 37 67.00 176.000000 4917 1 2.0 63.302968 1 40 64.00 126.000000 Diastolic Systolic MRW Smoking AgeAtDeath Cholesterol \ 498 80 116 105.000000 60.0 70.536414 227.417441 914 88 148 121.000000 60.0 70.536414 305.000000 1093 80 120 114.000000 60.0 70.536414 234.000000 1263 90 132 119.957525 60.0 80.000000 307.000000 1699 86 120 111.000000 60.0 67.000000 209.000000 2043 90 140 134.000000 60.0 70.536414 146.000000 2903 80 130 110.000000 60.0 70.536414 177.000000 2910 90 115 108.000000 60.0 70.536414 159.000000 3468 100 140 125.000000 60.0 54.000000 362.000000 4110 105 150 159.000000 55.0 52.000000 220.000000 4127 70 124 106.000000 55.0 70.536414 163.000000 4296 70 118 101.000000 60.0 68.000000 246.000000 4303 100 150 110.000000 60.0 70.536414 314.000000 4360 82 135 131.000000 60.0 64.000000 345.000000 4368 104 176 128.000000 60.0 51.000000 263.000000 4376 86 118 126.000000 60.0 70.536414 210.000000 4917 66 98 98.000000 60.0 64.000000 215.000000 Chol_Status BP_Status Weight_Status Smoking_Status 498 1 0 0 4 914 2 2 2 4 1093 1 0 2 4 1263 2 2 0 4 1699 1 0 2 4 2043 0 2 2 4 2903 0 0 2 4 2910 0 2 0 4 3468 2 2 2 4 4110 1 2 2 4 4127 0 0 0 4 4296 2 1 0 4 4303 2 2 2 4 4360 2 0 2 4 4368 2 2 2 4 4376 1 0 2 4 4917 1 1 0 4
# Drop outliers from df
for col, outliers in outliers_dict.items():
try:
df.drop(outliers.index, inplace=True)
except KeyError:
# Handle KeyError if the column doesn't exist in the DataFrame
pass
# Reset index after dropping outliers
df.reset_index(drop=True, inplace=True)
# Verify the DataFrame after dropping outliers
print(df)
Status DeathCause AgeCHDdiag Sex AgeAtStart Height Weight \ 0 1 4.0 63.302968 0 29 62.50 140.0 1 1 2.0 63.302968 0 41 59.75 194.0 2 0 4.0 63.302968 0 57 62.25 132.0 3 0 2.0 63.302968 0 39 65.75 158.0 4 0 1.0 63.302968 1 42 66.00 156.0 ... ... ... ... ... ... ... ... 5041 1 1.0 79.000000 1 49 64.50 173.0 5042 0 2.0 63.302968 0 42 60.00 141.0 5043 0 2.0 63.302968 0 51 58.25 123.0 5044 1 1.0 50.000000 1 36 68.25 164.0 5045 0 2.0 63.302968 1 36 70.50 177.0 Diastolic Systolic MRW Smoking AgeAtDeath Cholesterol \ 0 78 124 121.0 0.0 55.000000 227.417441 1 92 144 183.0 0.0 57.000000 181.000000 2 90 170 114.0 10.0 70.536414 250.000000 3 80 128 123.0 0.0 70.536414 242.000000 4 76 110 116.0 20.0 70.536414 281.000000 ... ... ... ... ... ... ... 5041 80 110 135.0 20.0 81.000000 228.000000 5042 76 124 129.0 5.0 70.536414 209.000000 5043 90 152 119.0 1.0 70.536414 197.000000 5044 64 108 114.0 40.0 64.000000 238.000000 5045 68 94 116.0 50.0 70.536414 240.000000 Chol_Status BP_Status Weight_Status Smoking_Status 0 2 0 2 0 1 0 2 2 0 2 2 2 2 2 3 2 0 2 0 4 2 1 2 3 ... ... ... ... ... 5041 1 0 2 3 5042 1 0 2 1 5043 0 2 2 1 5044 1 1 2 4 5045 2 1 2 4 [5046 rows x 17 columns]
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
# Extract independent variables (features) and target variable (cholesterol)
X = df.drop(['Cholesterol','Chol_Status'], axis=1)
y = df['Cholesterol']
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Initialize linear regression model
model = LinearRegression()
# Fit the model on the training data
model.fit(X_train, y_train)
# Make predictions on the testing data
y_pred = model.predict(X_test)
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)
r2 = r2_score(y_test, y_pred)
print("R-squared (R2) score:", r2)
Mean Squared Error: 1421.387817864974 R-squared (R2) score: 0.08504489192920817
# Extract coefficients
coefficients = model.coef_
# Match coefficients with feature names
feature_names = X.columns
coefficients_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})
# Sort coefficients by absolute value
coefficients_df['Absolute Coefficient'] = coefficients_df['Coefficient'].abs()
sorted_coefficients_df = coefficients_df.sort_values(by='Absolute Coefficient', ascending=False)
# Print the top contributing features
print("Top contributing features:")
print(sorted_coefficients_df)
Top contributing features: Feature Coefficient Absolute Coefficient 3 Sex -4.994471 4.994471 13 Weight_Status 2.317034 2.317034 5 Height -2.234509 2.234509 14 Smoking_Status 2.134919 2.134919 0 Status 1.662626 1.662626 4 AgeAtStart 1.173081 1.173081 1 DeathCause -0.777538 0.777538 12 BP_Status 0.495003 0.495003 6 Weight 0.411385 0.411385 9 MRW -0.408968 0.408968 2 AgeCHDdiag -0.322008 0.322008 7 Diastolic 0.203016 0.203016 11 AgeAtDeath -0.120730 0.120730 8 Systolic 0.032830 0.032830 10 Smoking -0.012846 0.012846
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
# Extract independent variables (features) and target variable (cholesterol)
X = df.drop(['Cholesterol','Chol_Status'], axis=1)
y = df['Cholesterol']
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Initialize linear regression model
model = LinearRegression()
# Fit the model on the training data
model.fit(X_train, y_train)
# Extract coefficients
coefficients = model.coef_
# Match coefficients with feature names
feature_names = X.columns
coefficients_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})
# Sort coefficients by absolute value
coefficients_df['Absolute Coefficient'] = coefficients_df['Coefficient'].abs()
sorted_coefficients_df = coefficients_df.sort_values(by='Absolute Coefficient', ascending=False)
# Select top 7 features
top_features = sorted_coefficients_df.iloc[:7]['Feature'].tolist()
# Fit the model on training data with top 7 features
X_train_top = X_train[top_features]
model.fit(X_train_top, y_train)
# Evaluate the model on testing data
X_test_top = X_test[top_features]
score = model.score(X_test_top, y_test)
print("R-squared (R2) score using top 7 features:", score)
R-squared (R2) score using top 7 features: 0.07588634596412047
import seaborn as sns
import matplotlib.pyplot as plt
#Plot actual vs. predicted cholesterol values
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred, color='blue')
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='red', linestyle='--')
plt.xlabel('Actual Cholesterol')
plt.ylabel('Predicted Cholesterol')
plt.title('Actual vs. Predicted Cholesterol')
plt.grid(True)
plt.show()
'DeathCause' should be removed from the model because that it is consequence not casual of the cholesterol concentration.
# Calculate mean cholesterol for each DeathCause
mean_cholesterol = df.groupby('DeathCause')['Cholesterol'].mean()
# Create bar plot
plt.figure(figsize=(10, 6))
mean_cholesterol.plot(kind='bar', color='skyblue')
plt.title('Mean Cholesterol by Death Cause')
plt.xlabel('Death Cause')
plt.ylabel('Mean Cholesterol')
plt.xticks(rotation=45) # Rotate x-axis labels for better visibility
# Add mean values to the bars
for index, value in enumerate(mean_cholesterol):
plt.text(index, value, str(round(value, 2)), ha='center', va='bottom')
plt.tight_layout()
plt.show()
Cholesterol average is not significantly different between causes of death. So, this variable must be removed from the linear regression model of cholesterol.
import statsmodels.api as sma
X_train = sma.add_constant(X_train) ## let's add an intercept (beta_0) to our model
X_test = sma.add_constant(X_test)
import statsmodels.api as sm
lm2 = sm.OLS(y_train,X_train).fit()
lm2.summary()
Dep. Variable: | Cholesterol | R-squared: | 0.109 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.106 |
Method: | Least Squares | F-statistic: | 32.81 |
Date: | Sun, 14 Apr 2024 | Prob (F-statistic): | 1.39e-89 |
Time: | 15:18:19 | Log-Likelihood: | -20347. |
No. Observations: | 4036 | AIC: | 4.073e+04 |
Df Residuals: | 4020 | BIC: | 4.083e+04 |
Df Model: | 15 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
const | 308.3604 | 59.269 | 5.203 | 0.000 | 192.161 | 424.560 |
Status | 1.6626 | 1.435 | 1.159 | 0.247 | -1.150 | 4.476 |
DeathCause | -0.7775 | 0.538 | -1.446 | 0.148 | -1.832 | 0.277 |
AgeCHDdiag | -0.3220 | 0.124 | -2.591 | 0.010 | -0.566 | -0.078 |
Sex | -4.9945 | 2.031 | -2.459 | 0.014 | -8.977 | -1.012 |
AgeAtStart | 1.1731 | 0.092 | 12.698 | 0.000 | 0.992 | 1.354 |
Height | -2.2345 | 0.915 | -2.443 | 0.015 | -4.028 | -0.441 |
Weight | 0.4114 | 0.195 | 2.115 | 0.035 | 0.030 | 0.793 |
Diastolic | 0.2030 | 0.080 | 2.542 | 0.011 | 0.046 | 0.360 |
Systolic | 0.0328 | 0.044 | 0.740 | 0.459 | -0.054 | 0.120 |
MRW | -0.4090 | 0.238 | -1.722 | 0.085 | -0.875 | 0.057 |
Smoking | -0.0128 | 0.170 | -0.076 | 0.940 | -0.346 | 0.320 |
AgeAtDeath | -0.1207 | 0.105 | -1.149 | 0.251 | -0.327 | 0.085 |
BP_Status | 0.4950 | 0.806 | 0.614 | 0.539 | -1.086 | 2.076 |
Weight_Status | 2.3170 | 0.895 | 2.589 | 0.010 | 0.562 | 4.072 |
Smoking_Status | 2.1349 | 1.392 | 1.534 | 0.125 | -0.593 | 4.863 |
Omnibus: | 47.131 | Durbin-Watson: | 2.047 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 47.829 |
Skew: | 0.255 | Prob(JB): | 4.11e-11 |
Kurtosis: | 2.845 | Cond. No. | 2.83e+04 |
influence = lm2.get_influence()
resid_student = influence.resid_studentized_external
resid = pd.concat([X_train,pd.Series(resid_student,name = "Studentized Residuals")],axis = 1)
resid.head()
const | Status | DeathCause | AgeCHDdiag | Sex | AgeAtStart | Height | Weight | Diastolic | Systolic | MRW | Smoking | AgeAtDeath | BP_Status | Weight_Status | Smoking_Status | Studentized Residuals | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
3404 | 1.0 | 0.0 | 1.0 | 63.302968 | 1.0 | 48.0 | 68.00 | 190.0 | 102.0 | 154.0 | 132.0 | 0.0 | 70.536414 | 2.0 | 2.0 | 0.0 | 0.260895 |
463 | 1.0 | 1.0 | 1.0 | 55.000000 | 1.0 | 41.0 | 69.75 | 199.0 | 108.0 | 162.0 | 134.0 | 0.0 | 57.000000 | 2.0 | 2.0 | 0.0 | -0.262146 |
2373 | 1.0 | 0.0 | 3.0 | 52.000000 | 0.0 | 36.0 | 62.25 | 150.0 | 70.0 | 115.0 | 129.0 | 10.0 | 70.536414 | 1.0 | 2.0 | 2.0 | 0.386980 |
3470 | 1.0 | 1.0 | 1.0 | 74.000000 | 0.0 | 54.0 | 60.50 | 160.0 | 102.0 | 186.0 | 147.0 | 10.0 | 76.000000 | 2.0 | 2.0 | 2.0 | 0.029897 |
4831 | 1.0 | 0.0 | 2.0 | 67.000000 | 0.0 | 37.0 | 59.25 | 202.0 | 88.0 | 120.0 | 191.0 | 0.0 | 70.536414 | 0.0 | 2.0 | 0.0 | NaN |
resid.loc[np.absolute(resid["Studentized Residuals"]) > 3,:]
const | Status | DeathCause | AgeCHDdiag | Sex | AgeAtStart | Height | Weight | Diastolic | Systolic | MRW | Smoking | AgeAtDeath | BP_Status | Weight_Status | Smoking_Status | Studentized Residuals | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
3951 | 1.0 | 0.0 | 4.0 | 63.302968 | 0.0 | 36.0 | 59.0 | 137.0 | 90.0 | 140.0 | 129.0 | 0.0 | 70.536414 | 2.0 | 2.0 | 0.0 | 3.386411 |
1631 | 1.0 | 0.0 | 1.0 | 63.302968 | 0.0 | 53.0 | 64.0 | 145.0 | 88.0 | 130.0 | 117.0 | 0.0 | 70.536414 | 0.0 | 2.0 | 0.0 | 3.004549 |
766 | 1.0 | 0.0 | 3.0 | 61.000000 | 0.0 | 31.0 | 63.0 | 108.0 | 74.0 | 116.0 | 90.0 | 0.0 | 70.536414 | 1.0 | 1.0 | 0.0 | 3.006256 |
ind = resid.loc[np.absolute(resid["Studentized Residuals"]) > 3,:].index
ind
Int64Index([3951, 1631, 766], dtype='int64')
y_train.drop(ind,axis = 0,inplace = True)
X_train.drop(ind,axis = 0,inplace = True) #Intercept column is there
from statsmodels.stats.outliers_influence import variance_inflation_factor
[variance_inflation_factor(X_train.values, j) for j in range(X_train.shape[1])]
[10076.96450825812, 1.39726232315385, 1.0053644012541958, 1.2038020419953424, 2.9230720513123316, 1.803760234929722, 29.928615338015483, 81.69961400686279, 3.049245691697844, 3.125390428579405, 58.53830049938579, 11.78698053542615, 1.3524705474598187, 1.5654284427754148, 1.8889839119632796, 11.765412868907452]
We create a function to remove the collinear variables. We choose a threshold of 5 which means if VIF is more than 5 for a particular variable then that variable will be removed.
def calculate_vif(x):
thresh = 5.0
output = pd.DataFrame()
k = x.shape[1]
vif = [variance_inflation_factor(x.values, j) for j in range(x.shape[1])]
for i in range(1,k):
print("Iteration no.")
print(i)
print(vif)
a = np.argmax(vif)
print("Max VIF is for variable no.:")
print(a)
if vif[a] <= thresh :
break
if i == 1 :
output = x.drop(x.columns[a], axis = 1)
vif = [variance_inflation_factor(output.values, j) for j in range(output.shape[1])]
elif i > 1 :
output = output.drop(output.columns[a],axis = 1)
vif = [variance_inflation_factor(output.values, j) for j in range(output.shape[1])]
return(output)
train_out = calculate_vif(X_train)
train_out.head()
Iteration no. 1 [10076.96450825812, 1.39726232315385, 1.0053644012541958, 1.2038020419953424, 2.9230720513123316, 1.803760234929722, 29.928615338015483, 81.69961400686279, 3.049245691697844, 3.125390428579405, 58.53830049938579, 11.78698053542615, 1.3524705474598187, 1.5654284427754148, 1.8889839119632796, 11.765412868907452] Max VIF is for variable no.: 0 Iteration no. 2 [2.2680498769708857, 5.149110964561775, 176.82044595057542, 4.649332188872404, 49.2469808444578, 315.0171692885646, 265.6236458740845, 134.17015476982604, 107.76448893577012, 246.04717480793195, 19.088826024637964, 157.8717427232369, 3.489293916740208, 6.169409804587098, 21.18440362788717] Max VIF is for variable no.: 5 Iteration no. 3 [2.262635788435497, 5.081824736490531, 124.41869062271418, 4.626422881493438, 48.897284279602296, 173.46508836912358, 125.60382004306241, 107.66305900182297, 207.2352634160289, 19.088622808797087, 133.3033514041389, 3.3563561031028994, 5.8698907122947, 21.049054319376673] Max VIF is for variable no.: 8 Iteration no. 4 [2.2616786161253035, 5.08098710347792, 123.45478216243256, 2.8600599588315094, 48.52879141558591, 60.837399537318774, 124.83482318326554, 107.32456708382153, 19.06523850752273, 132.39881541965138, 3.3563016552995055, 5.267727538963271, 21.02929240184767] Max VIF is for variable no.: 9 Iteration no. 5 [2.1664387170234622, 5.046715432001964, 74.38589069594636, 2.855797469146641, 42.43294552405285, 59.044791898000476, 124.22052357503861, 107.2344041053779, 19.017411192284534, 3.3199762662978656, 5.219569377654603, 20.966049479003505] Max VIF is for variable no.: 6 Iteration no. 6 [2.1656391445269842, 5.0456168509455095, 70.35891807565832, 2.855605507493809, 42.402813122929714, 56.30802544065065, 53.274845467319075, 19.006754040187634, 3.1995014606046905, 5.219555772173678, 20.94132301732525] Max VIF is for variable no.: 2 Iteration no. 7 [2.022845552183428, 4.929164438927844, 2.7243301623714555, 31.113637801170885, 44.239055798919104, 47.37968401001749, 18.97091168595589, 3.018458559936772, 4.881825142420282, 20.786335930148596] Max VIF is for variable no.: 5 Iteration no. 8 [2.0226828330207645, 4.8535852047183194, 2.595141043618071, 22.151900363818545, 32.204607975426754, 18.962525934811897, 2.4829710496776674, 4.786450705712884, 20.743703120726828] Max VIF is for variable no.: 4 Iteration no. 9 [1.9150598219642052, 4.632084824282184, 2.1244727738654507, 9.229873214669365, 18.921457213227143, 2.4515496059020516, 3.517449764369447, 20.512890229520252] Max VIF is for variable no.: 7 Iteration no. 10 [1.9106524815596229, 4.609844017796759, 2.123452072332671, 8.986091436972144, 1.8456877931063589, 2.4498443291984833, 3.4930015685842766] Max VIF is for variable no.: 3 Iteration no. 11 [1.7480932699859404, 2.914224375241127, 2.073365152571798, 1.8309779894845701, 2.2545045236829746, 2.814688615351666] Max VIF is for variable no.: 1
Status | DeathCause | Sex | Smoking | BP_Status | Weight_Status | |
---|---|---|---|---|---|---|
3404 | 0 | 1.0 | 1 | 0.0 | 2 | 2 |
463 | 1 | 1.0 | 1 | 0.0 | 2 | 2 |
2373 | 0 | 3.0 | 0 | 10.0 | 1 | 2 |
3470 | 1 | 1.0 | 0 | 10.0 | 2 | 2 |
4831 | 0 | 2.0 | 0 | 0.0 | 0 | 2 |
import statsmodels.api as sma
import statsmodels.api as sm
train_out = sma.add_constant(train_out) ## let's add an intercept (beta_0) to our model
#X_test.drop(["Status"],axis = 1,inplace = True)
X_test = sma.add_constant(X_test)
lm2 = sm.OLS(y_train,train_out).fit()
lm2.summary()
Dep. Variable: | Cholesterol | R-squared: | 0.055 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.054 |
Method: | Least Squares | F-statistic: | 39.38 |
Date: | Sun, 14 Apr 2024 | Prob (F-statistic): | 8.80e-47 |
Time: | 16:01:48 | Log-Likelihood: | -20449. |
No. Observations: | 4033 | AIC: | 4.091e+04 |
Df Residuals: | 4026 | BIC: | 4.096e+04 |
Df Model: | 6 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
const | 211.8868 | 1.865 | 113.587 | 0.000 | 208.230 | 215.544 |
Status | 11.3978 | 1.300 | 8.769 | 0.000 | 8.849 | 13.946 |
DeathCause | -0.7294 | 0.552 | -1.320 | 0.187 | -1.813 | 0.354 |
Sex | -4.3260 | 1.332 | -3.248 | 0.001 | -6.937 | -1.715 |
Smoking | 0.0748 | 0.055 | 1.350 | 0.177 | -0.034 | 0.184 |
BP_Status | 3.9552 | 0.690 | 5.735 | 0.000 | 2.603 | 5.307 |
Weight_Status | 5.4018 | 0.688 | 7.853 | 0.000 | 4.053 | 6.750 |
Omnibus: | 55.169 | Durbin-Watson: | 2.046 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 54.483 |
Skew: | 0.262 | Prob(JB): | 1.48e-12 |
Kurtosis: | 2.775 | Cond. No. | 49.3 |