import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
df= pd.read_csv("/Users/nnthieu/Downloads/heart.csv")
df.head()


df.shape

(5209, 17)


df.isna().sum()

Status               0
DeathCause        3218
AgeCHDdiag        3760
Sex                  0
AgeAtStart           0
Height               6
Weight               6
Diastolic            0
Systolic             0
MRW                  6
Smoking             36
AgeAtDeath        3218
Cholesterol        152
Chol_Status        152
BP_Status            0
Weight_Status        6
Smoking_Status      36
dtype: int64


look at the data distribution of the numeric variables:


import matplotlib.pyplot as plt

def plot_histograms(df, bins=10, alpha=0.5, colors=None):
    """
    Plot histograms for all numeric variables in the DataFrame.

    Parameters:
        df (DataFrame): The DataFrame containing numeric variables.
        bins (int): Number of bins for the histograms. Default is 10.
        alpha (float): Transparency level of the histograms. Default is 0.5.
        colors (list): List of colors for the histograms. If None, default colors will be used.

    Returns:
        None
    """
    if colors is None:
        colors = plt.cm.tab10.colors  # Default color palette

    num_variables = df.select_dtypes(include='number').shape[1]
    num_rows = (num_variables + 1) // 2
    num_cols = 2
    
    plt.figure(figsize=(12, 6 * num_rows))

    for i, col in enumerate(df.select_dtypes(include='number'), start=1):
        plt.subplot(num_rows, num_cols, i)
        plt.hist(df[col], bins=bins, alpha=alpha, color=colors[i % len(colors)])
        plt.title(f'Histogram of {col}')
        plt.xlabel('Value')
        plt.ylabel('Frequency')
        plt.grid(True)

    plt.tight_layout()
    plt.show()

# Example usage:
# Assuming df is your DataFrame containing numeric variables
plot_histograms(df)


proportionsD = df['DeathCause'].value_counts(normalize=True, dropna=True)

missing_countD = df['DeathCause'].isna().sum()

missing_samplesD = np.random.choice(proportionsD.index, size=missing_countD, p=proportionsD.values)

df.loc[df['DeathCause'].isna(), 'DeathCause'] = missing_samplesD


df.loc[df['Weight_Status'].isna(), 'Weight_Status'] = "Normal"


df.loc[df['Smoking_Status'].isna(), 'Smoking_Status'] = "Non-smoker"


proportions = df['Chol_Status'].value_counts(normalize=True, dropna=True)

missing_count = df['Chol_Status'].isna().sum()

missing_samples = np.random.choice(proportions.index, size=missing_count, p=proportions.values)

df.loc[df['Chol_Status'].isna(), 'Chol_Status'] = missing_samples


# Calculate means for numeric columns
means = df.mean()

# Impute missing values with means using map function
for col in df.select_dtypes(include='number'):
    df[col] = df[col].fillna(means[col])

/var/folders/cx/3wbhcqyd3cld6gvk_xjkvr_40000gn/T/ipykernel_85659/385573716.py:2: FutureWarning: The default value of numeric_only in DataFrame.mean is deprecated. In a future version, it will default to False. In addition, specifying 'numeric_only=None' is deprecated. Select only valid columns or specify the value of numeric_only to silence this warning.
  means = df.mean()


df.describe()


df.isna().sum()

Status            0
DeathCause        0
AgeCHDdiag        0
Sex               0
AgeAtStart        0
Height            0
Weight            0
Diastolic         0
Systolic          0
MRW               0
Smoking           0
AgeAtDeath        0
Cholesterol       0
Chol_Status       0
BP_Status         0
Weight_Status     0
Smoking_Status    0
dtype: int64


# Define mapping dictionary
status_mapping = {'Dead':1, 'Alive':0}

# Decode 'Status' column
df['Status'] = df['Status'].map(status_mapping)


# Define mapping dictionary
status_mappingD = {'Coronary Heart Disease':1, 'Cancer':2,'Cerebral Vascular Disease':3, 'Other':4 }

df['DeathCause'] = df['DeathCause'].map(status_mappingD)


# Define mapping dictionary
status_mappingSex = {'Male':1, 'Female':0 }

df['Sex'] = df['Sex'].map(status_mappingSex)


# Define mapping dictionary
status_mappingS = {'Light (1-5)':1, 'Non-smoker':0,'Moderate (6-15)':2,'Heavy (16-25)':3,'Very Heavy (> 25)':4}

df['Smoking_Status'] = df['Smoking_Status'].map(status_mappingS)


status_mappingC = {'Borderline':1, 'Desirable':0,'High':2}

df['Chol_Status'] = df['Chol_Status'].map(status_mappingC)


status_mappingBP = {'Normal':0, 'Optimal':1, 'High':2}

df['BP_Status'] = df['BP_Status'].map(status_mappingBP)


status_mappingW = {'Normal':0, 'Underweight':1, 'Overweight':2}
df['Weight_Status'] = df['Weight_Status'].map(status_mappingW)


# Function to detect outliers using IQR method
def detect_outliers(df, variable):
    Q1 = df[variable].quantile(0.25)
    Q3 = df[variable].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = df[(df[variable] < lower_bound) | (df[variable] > upper_bound)]
    return outliers

# Define numeric variables to detect outliers
numeric_variables = ['Cholesterol', 'AgeCHDdiag', 'AgeAtStart', 'Height', 'Weight', 'Smoking']

# Create a dictionary to store outliers for each variable
outliers_dict = {}

# Detect outliers in each numeric variable
for col in numeric_variables:
    outliers_dict[col] = detect_outliers(df, col)

# Print outliers for each variable
for col, outliers in outliers_dict.items():
    print(f"Outliers in {col}:")
    print(outliers)
    print()

Outliers in Cholesterol:
      Status  DeathCause  AgeCHDdiag  Sex  AgeAtStart  Height  Weight  \
89         0         2.0   63.302968    0          52   62.00   135.0   
123        1         1.0   73.000000    0          47   62.00   124.0   
143        0         1.0   63.302968    1          45   68.50   160.0   
187        0         4.0   76.000000    0          54   64.25   146.0   
197        1         1.0   61.000000    1          59   67.25   164.0   
...      ...         ...         ...  ...         ...     ...     ...   
4874       1         1.0   53.000000    1          47   64.50   143.0   
5022       1         3.0   63.302968    0          62   60.50   108.0   
5138       1         1.0   59.000000    0          55   59.75   148.0   
5146       0         NaN   63.302968    1          56   65.00   145.0   
5161       1         4.0   63.302968    0          57   62.00   159.0   

      Diastolic  Systolic    MRW  Smoking  AgeAtDeath  Cholesterol  \
89           82       144  116.0      0.0   70.536414        339.0   
123          80       140  107.0      0.0   77.000000        347.0   
143          86       136  111.0      0.0   70.536414        347.0   
187          96       148  118.0      0.0   70.536414        418.0   
197         100       152  117.0      0.0   81.000000        334.0   
...         ...       ...    ...      ...         ...          ...   
4874         92       158  112.0     15.0   71.000000        386.0   
5022         72       128   99.0     15.0   82.000000        350.0   
5138         90       124  140.0      0.0   61.000000        400.0   
5146         90       132  111.0     20.0   70.536414        360.0   
5161         80       124  137.0      0.0   81.000000        334.0   

      Chol_Status  BP_Status  Weight_Status  Smoking_Status  
89              2          2              2               0  
123             2          0              0               0  
143             2          0              2               0  
187             2          2              2               0  
197             2          2              2               0  
...           ...        ...            ...             ...  
4874            2          2              2               2  
5022            2          0              0               2  
5138            2          2              2               0  
5146            2          2              2               3  
5161            2          0              2               0  

[108 rows x 17 columns]

Outliers in AgeCHDdiag:
      Status  DeathCause  AgeCHDdiag  Sex  AgeAtStart  Height  Weight  \
11         0         3.0        57.0    1          33   64.25   151.0   
12         0         3.0        55.0    1          33   70.00   174.0   
13         0         2.0        79.0    1          57   67.25   165.0   
14         0         4.0        66.0    1          44   69.00   155.0   
17         1         2.0        56.0    1          56   67.25   122.0   
...      ...         ...         ...  ...         ...     ...     ...   
5200       1         1.0        55.0    1          47   67.00   186.0   
5201       0         1.0        61.0    1          59   66.50   156.0   
5202       1         1.0        59.0    1          59   71.00   177.0   
5204       1         1.0        79.0    1          49   64.50   173.0   
5207       1         1.0        50.0    1          36   68.25   164.0   

      Diastolic  Systolic    MRW  Smoking  AgeAtDeath  Cholesterol  \
11           68       108  118.0      0.0   70.536414   221.000000   
12           90       142  114.0      0.0   70.536414   188.000000   
13           76       128  118.0     15.0   70.536414   227.417441   
14           90       130  105.0     30.0   70.536414   292.000000   
17           72       120   87.0     15.0   72.000000   194.000000   
...         ...       ...    ...      ...         ...          ...   
5200        105       155  133.0      5.0   57.000000   199.000000   
5201         84       124  116.0     20.0   70.536414   223.000000   
5202         68       108  113.0     25.0   65.000000   246.000000   
5204         80       110  135.0     20.0   81.000000   228.000000   
5207         64       108  114.0     40.0   64.000000   238.000000   

      Chol_Status  BP_Status  Weight_Status  Smoking_Status  
11              1          1              2               0  
12              0          2              2               0  
13              2          0              2               2  
14              2          2              0               4  
17              0          0              1               2  
...           ...        ...            ...             ...  
5200            0          2              2               1  
5201            1          0              2               3  
5202            2          1              2               3  
5204            1          0              2               3  
5207            1          1              2               4  

[1449 rows x 17 columns]

Outliers in AgeAtStart:
Empty DataFrame
Columns: [Status, DeathCause, AgeCHDdiag, Sex, AgeAtStart, Height, Weight, Diastolic, Systolic, MRW, Smoking, AgeAtDeath, Cholesterol, Chol_Status, BP_Status, Weight_Status, Smoking_Status]
Index: []

Outliers in Height:
      Status  DeathCause  AgeCHDdiag  Sex  AgeAtStart  Height  Weight  \
418        0         NaN   63.302968    1          44   75.50   177.0   
678        0         4.0   55.000000    1          29   75.50   204.0   
1893       1         2.0   63.302968    1          34   76.00   220.0   
2249       1         4.0   63.302968    0          51   51.50    72.0   
3355       0         2.0   63.302968    1          44   76.00   169.0   
3508       1         4.0   63.302968    0          60   53.75   119.0   
3882       0         1.0   63.302968    1          33   76.50   221.0   

      Diastolic  Systolic    MRW  Smoking  AgeAtDeath  Cholesterol  \
418          62       122  101.0      0.0   70.536414        199.0   
678          90       140  116.0      0.0   70.536414        246.0   
1893         80       138  122.0     10.0   62.000000        192.0   
2249         92       126   88.0      1.0   79.000000        234.0   
3355         70       102   93.0      5.0   70.536414        205.0   
3508        100       190  135.0      0.0   76.000000        250.0   
3882        100       155  122.0     20.0   70.536414        225.0   

      Chol_Status  BP_Status  Weight_Status  Smoking_Status  
418             0          0              0               0  
678             2          2              2               0  
1893            0          0              2               2  
2249            1          2              1               1  
3355            1          1              0               1  
3508            2          2              2               0  
3882            1          2              2               3  

Outliers in Weight:
      Status  DeathCause  AgeCHDdiag  Sex  AgeAtStart  Height  Weight  \
154        0         4.0   63.000000    0          37   63.00   236.0   
436        1         3.0   63.302968    0          50   62.75   241.0   
491        1         3.0   63.302968    1          52   72.00   247.0   
671        0         3.0   63.302968    1          29   70.25   243.0   
765        1         4.0   63.302968    0          33   56.25    71.0   
836        1         3.0   63.302968    1          48   66.00   250.0   
1236       1         1.0   59.000000    1          39   73.50   244.0   
1623       0         1.0   63.302968    0          51   65.25   239.0   
1647       0         3.0   32.000000    1          32   73.00   239.0   
1664       1         NaN   60.000000    1          34   72.50   245.0   
1679       1         1.0   75.000000    1          51   74.00   239.0   
1772       0         1.0   72.000000    1          44   73.25   238.0   
1778       1         2.0   62.000000    1          40   71.50   240.0   
1796       0         NaN   57.000000    1          31   69.00   238.0   
1913       0         1.0   33.000000    1          33   64.75   260.0   
1944       0         2.0   63.302968    1          44   68.25   244.0   
2099       0         2.0   63.302968    0          38   59.50   242.0   
2119       1         3.0   63.302968    1          59   68.25   276.0   
2307       1         4.0   63.302968    1          60   69.25   234.0   
2348       1         1.0   49.000000    1          33   70.50   235.0   
2437       1         1.0   75.000000    0          53   60.25   271.0   
2547       1         4.0   63.302968    1          50   66.50   237.0   
2576       0         NaN   43.000000    0          43   60.25   235.0   
2592       1         3.0   63.302968    1          38   72.25   273.0   
2609       0         2.0   63.302968    1          33   66.25   241.0   
2761       0         1.0   63.302968    1          38   72.25   234.0   
2913       0         3.0   63.302968    1          39   67.25   237.0   
2946       0         2.0   63.302968    1          47   71.75   236.0   
2991       1         3.0   63.302968    0          60   64.00   235.0   
3123       1         1.0   71.000000    0          49   62.00   261.0   
3124       1         1.0   64.000000    1          52   72.00   236.0   
3251       0         4.0   63.302968    0          57   67.00   293.0   
3314       0         NaN   63.302968    1          36   70.50   244.0   
3359       1         2.0   63.302968    0          36   63.75   300.0   
3615       1         4.0   63.302968    0          47   60.00   238.0   
3660       1         1.0   51.000000    1          51   69.00   239.0   
3844       0         2.0   63.302968    0          39   65.50   250.0   
3871       0         2.0   63.302968    1          33   73.75   245.0   
4239       1         NaN   63.302968    0          56   60.50   269.0   
4532       0         1.0   63.302968    1          38   73.00   246.0   
4703       0         1.0   63.000000    0          47   61.00   300.0   
4857       1         3.0   63.302968    0          59   62.75   281.0   
4960       1         1.0   41.000000    1          35   70.00   235.0   
5005       1         1.0   58.000000    1          54   68.00   256.0   
5062       0         4.0   63.302968    1          32   68.00   236.0   
5097       1         4.0   63.302968    0          46   57.75    67.0   
5133       0         4.0   63.302968    0          32   61.00   275.0   
5199       1         1.0   64.000000    1          58   72.75   255.0   

      Diastolic  Systolic    MRW  Smoking  AgeAtDeath  Cholesterol  \
154          96       178  197.0      0.0   70.536414   227.417441   
436         150       242  208.0      0.0   58.000000   213.000000   
491         104       154  153.0     20.0   68.000000   188.000000   
671          90       162  160.0     20.0   70.536414   163.000000   
765          90       116   73.0      0.0   61.000000   192.000000   
836          96       170  185.0      0.0   72.000000   180.000000   
1236        100       144  147.0     20.0   71.000000   276.000000   
1623         88       150  187.0      1.0   70.536414   175.000000   
1647         92       138  144.0      0.0   70.536414   171.000000   
1664         90       140  152.0      0.0   64.000000   271.000000   
1679        100       158  141.0      5.0   77.000000   243.000000   
1772        100       150  143.0     20.0   70.536414   237.000000   
1778         90       128  153.0     40.0   70.000000   228.000000   
1796         88       140  161.0      0.0   70.536414   250.000000   
1913        100       148  203.0      0.0   70.536414   227.417441   
1944        126       190  169.0      0.0   70.536414   292.000000   
2099         86       132  228.0      0.0   70.536414   157.000000   
2119         80       124  192.0      0.0   81.000000   182.000000   
2307         98       138  158.0     30.0   86.000000   175.000000   
2348         96       146  155.0     25.0   51.000000   224.000000   
2437        130       246  249.0      0.0   85.000000   200.000000   
2547        100       156  176.0      5.0   76.000000   247.000000   
2576        134       246  216.0      0.0   70.536414   300.000000   
2592        120       180  170.0      0.0   58.000000   227.000000   
2609         90       140  179.0      0.0   70.536414   259.000000   
2761         84       134  145.0      0.0   70.536414   155.000000   
2913        130       204  169.0     10.0   70.536414   204.000000   
2946         88       130  150.0      0.0   70.536414   248.000000   
2991         96       190  190.0      0.0   92.000000   227.417441   
3123         86       150  225.0      0.0   77.000000   245.000000   
3124        104       156  147.0      0.0   70.000000   255.000000   
3251        108       170  215.0      0.0   70.536414   242.000000   
3314         95       135  161.0      0.0   70.536414   204.000000   
3359        108       182  250.0     20.0   54.000000   215.000000   
3615        100       180  218.0      0.0   53.000000   227.417441   
3660        122       228  161.0     40.0   61.000000   226.000000   
3844        105       135  195.0      0.0   70.536414   242.000000   
3871         90       150  148.0      0.0   70.536414   179.000000   
4239        120       210  247.0     20.0   82.000000   150.000000   
4532         94       124  148.0      0.0   70.536414   174.000000   
4703        120       208  268.0      0.0   70.536414   185.000000   
4857        100       152  242.0      0.0   69.000000   188.000000   
4960         96       152  155.0     30.0   43.000000   285.000000   
5005        100       182  178.0     45.0   60.000000   286.000000   
5062        100       150  164.0      0.0   70.536414   226.000000   
5097         80       124   67.0      0.0   56.000000   234.000000   
5133         82       136  246.0      0.0   70.536414   154.000000   
5199         88       130  158.0     40.0   80.000000   260.000000   

      Chol_Status  BP_Status  Weight_Status  Smoking_Status  
154             2          2              2               0  
436             1          2              2               0  
491             0          2              2               3  
671             0          2              2               3  
765             0          2              1               0  
836             0          2              2               0  
1236            2          2              2               3  
1623            0          2              2               1  
1647            0          2              2               0  
1664            2          2              2               0  
1679            2          2              2               1  
1772            1          2              2               3  
1778            1          2              2               4  
1796            2          0              2               0  
1913            1          2              2               0  
1944            2          2              2               0  
2099            0          0              2               0  
2119            0          0              2               0  
2307            0          2              2               4  
2348            1          2              2               3  
2437            1          2              2               0  
2547            2          2              2               1  
2576            2          2              2               0  
2592            1          2              2               0  
2609            2          2              2               0  
2761            0          0              2               0  
2913            1          2              2               2  
2946            2          0              2               0  
2991            1          2              2               0  
3123            2          2              2               0  
3124            2          2              2               0  
3251            2          2              2               0  
3314            1          2              2               0  
3359            1          2              2               3  
3615            1          2              2               0  
3660            1          2              2               4  
3844            2          2              2               0  
3871            0          2              2               0  
4239            0          2              2               3  
4532            0          2              2               0  
4703            0          2              2               0  
4857            0          2              2               0  
4960            2          2              2               4  
5005            2          2              2               4  
5062            1          2              2               0  
5097            1          0              1               0  
5133            0          0              2               0  
5199            2          0              2               4  

Outliers in Smoking:
      Status  DeathCause  AgeCHDdiag  Sex  AgeAtStart  Height      Weight  \
498        0         2.0   63.302968    1          34   65.75  137.000000   
914        0         3.0   66.000000    1          38   68.00  174.000000   
1093       0         2.0   63.302968    1          32   70.00  173.000000   
1263       1         4.0   74.000000    1          54   68.00  153.086681   
1699       1         1.0   57.000000    1          53   66.25  150.000000   
2043       0         2.0   63.302968    1          32   70.50  203.000000   
2903       0         NaN   55.000000    1          35   68.25  158.000000   
2910       0         4.0   63.302968    1          34   70.50  164.000000   
3468       1         1.0   40.000000    1          36   70.50  190.000000   
4110       1         NaN   42.000000    1          32   67.25  222.000000   
4127       0         3.0   63.302968    1          36   68.25  152.000000   
4296       1         1.0   66.000000    1          46   67.00  141.000000   
4303       0         2.0   63.302968    1          51   62.75  134.000000   
4360       1         1.0   62.000000    1          42   67.50  184.000000   
4368       1         1.0   41.000000    1          35   68.50  185.000000   
4376       0         4.0   63.302968    1          37   67.00  176.000000   
4917       1         2.0   63.302968    1          40   64.00  126.000000   

      Diastolic  Systolic         MRW  Smoking  AgeAtDeath  Cholesterol  \
498          80       116  105.000000     60.0   70.536414   227.417441   
914          88       148  121.000000     60.0   70.536414   305.000000   
1093         80       120  114.000000     60.0   70.536414   234.000000   
1263         90       132  119.957525     60.0   80.000000   307.000000   
1699         86       120  111.000000     60.0   67.000000   209.000000   
2043         90       140  134.000000     60.0   70.536414   146.000000   
2903         80       130  110.000000     60.0   70.536414   177.000000   
2910         90       115  108.000000     60.0   70.536414   159.000000   
3468        100       140  125.000000     60.0   54.000000   362.000000   
4110        105       150  159.000000     55.0   52.000000   220.000000   
4127         70       124  106.000000     55.0   70.536414   163.000000   
4296         70       118  101.000000     60.0   68.000000   246.000000   
4303        100       150  110.000000     60.0   70.536414   314.000000   
4360         82       135  131.000000     60.0   64.000000   345.000000   
4368        104       176  128.000000     60.0   51.000000   263.000000   
4376         86       118  126.000000     60.0   70.536414   210.000000   
4917         66        98   98.000000     60.0   64.000000   215.000000   

      Chol_Status  BP_Status  Weight_Status  Smoking_Status  
498             1          0              0               4  
914             2          2              2               4  
1093            1          0              2               4  
1263            2          2              0               4  
1699            1          0              2               4  
2043            0          2              2               4  
2903            0          0              2               4  
2910            0          2              0               4  
3468            2          2              2               4  
4110            1          2              2               4  
4127            0          0              0               4  
4296            2          1              0               4  
4303            2          2              2               4  
4360            2          0              2               4  
4368            2          2              2               4  
4376            1          0              2               4  
4917            1          1              0               4


# Drop outliers from df
for col, outliers in outliers_dict.items():
    try:
        df.drop(outliers.index, inplace=True)
    except KeyError:
        # Handle KeyError if the column doesn't exist in the DataFrame
        pass

# Reset index after dropping outliers
df.reset_index(drop=True, inplace=True)

# Verify the DataFrame after dropping outliers
print(df)

      Status  DeathCause  AgeCHDdiag  Sex  AgeAtStart  Height  Weight  \
0          1         4.0   63.302968    0          29   62.50   140.0   
1          1         2.0   63.302968    0          41   59.75   194.0   
2          0         4.0   63.302968    0          57   62.25   132.0   
3          0         2.0   63.302968    0          39   65.75   158.0   
4          0         1.0   63.302968    1          42   66.00   156.0   
...      ...         ...         ...  ...         ...     ...     ...   
5041       1         1.0   79.000000    1          49   64.50   173.0   
5042       0         2.0   63.302968    0          42   60.00   141.0   
5043       0         2.0   63.302968    0          51   58.25   123.0   
5044       1         1.0   50.000000    1          36   68.25   164.0   
5045       0         2.0   63.302968    1          36   70.50   177.0   

      Diastolic  Systolic    MRW  Smoking  AgeAtDeath  Cholesterol  \
0            78       124  121.0      0.0   55.000000   227.417441   
1            92       144  183.0      0.0   57.000000   181.000000   
2            90       170  114.0     10.0   70.536414   250.000000   
3            80       128  123.0      0.0   70.536414   242.000000   
4            76       110  116.0     20.0   70.536414   281.000000   
...         ...       ...    ...      ...         ...          ...   
5041         80       110  135.0     20.0   81.000000   228.000000   
5042         76       124  129.0      5.0   70.536414   209.000000   
5043         90       152  119.0      1.0   70.536414   197.000000   
5044         64       108  114.0     40.0   64.000000   238.000000   
5045         68        94  116.0     50.0   70.536414   240.000000   

      Chol_Status  BP_Status  Weight_Status  Smoking_Status  
0               2          0              2               0  
1               0          2              2               0  
2               2          2              2               2  
3               2          0              2               0  
4               2          1              2               3  
...           ...        ...            ...             ...  
5041            1          0              2               3  
5042            1          0              2               1  
5043            0          2              2               1  
5044            1          1              2               4  
5045            2          1              2               4  

[5046 rows x 17 columns]


import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score


# Extract independent variables (features) and target variable (cholesterol)
X = df.drop(['Cholesterol','Chol_Status'], axis=1)
y = df['Cholesterol']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize linear regression model
model = LinearRegression()

# Fit the model on the training data
model.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)


r2 = r2_score(y_test, y_pred)
print("R-squared (R2) score:", r2)

Mean Squared Error: 1421.387817864974
R-squared (R2) score: 0.08504489192920817


# Extract coefficients
coefficients = model.coef_

# Match coefficients with feature names
feature_names = X.columns
coefficients_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})

# Sort coefficients by absolute value
coefficients_df['Absolute Coefficient'] = coefficients_df['Coefficient'].abs()
sorted_coefficients_df = coefficients_df.sort_values(by='Absolute Coefficient', ascending=False)

# Print the top contributing features
print("Top contributing features:")
print(sorted_coefficients_df)

Top contributing features:
           Feature  Coefficient  Absolute Coefficient
3              Sex    -4.994471              4.994471
13   Weight_Status     2.317034              2.317034
5           Height    -2.234509              2.234509
14  Smoking_Status     2.134919              2.134919
0           Status     1.662626              1.662626
4       AgeAtStart     1.173081              1.173081
1       DeathCause    -0.777538              0.777538
12       BP_Status     0.495003              0.495003
6           Weight     0.411385              0.411385
9              MRW    -0.408968              0.408968
2       AgeCHDdiag    -0.322008              0.322008
7        Diastolic     0.203016              0.203016
11      AgeAtDeath    -0.120730              0.120730
8         Systolic     0.032830              0.032830
10         Smoking    -0.012846              0.012846


import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression


# Extract independent variables (features) and target variable (cholesterol)
X = df.drop(['Cholesterol','Chol_Status'], axis=1)
y = df['Cholesterol']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize linear regression model
model = LinearRegression()

# Fit the model on the training data
model.fit(X_train, y_train)

# Extract coefficients
coefficients = model.coef_

# Match coefficients with feature names
feature_names = X.columns
coefficients_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})

# Sort coefficients by absolute value
coefficients_df['Absolute Coefficient'] = coefficients_df['Coefficient'].abs()
sorted_coefficients_df = coefficients_df.sort_values(by='Absolute Coefficient', ascending=False)

# Select top 7 features
top_features = sorted_coefficients_df.iloc[:7]['Feature'].tolist()

# Fit the model on training data with top 7 features
X_train_top = X_train[top_features]
model.fit(X_train_top, y_train)

# Evaluate the model on testing data
X_test_top = X_test[top_features]
score = model.score(X_test_top, y_test)
print("R-squared (R2) score using top 7 features:", score)

R-squared (R2) score using top 7 features: 0.07588634596412047


import seaborn as sns
import matplotlib.pyplot as plt

#Plot actual vs. predicted cholesterol values
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred, color='blue')
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='red', linestyle='--')
plt.xlabel('Actual Cholesterol')
plt.ylabel('Predicted Cholesterol')
plt.title('Actual vs. Predicted Cholesterol')
plt.grid(True)
plt.show()


# Calculate mean cholesterol for each DeathCause
mean_cholesterol = df.groupby('DeathCause')['Cholesterol'].mean()

# Create bar plot
plt.figure(figsize=(10, 6))
mean_cholesterol.plot(kind='bar', color='skyblue')
plt.title('Mean Cholesterol by Death Cause')
plt.xlabel('Death Cause')
plt.ylabel('Mean Cholesterol')
plt.xticks(rotation=45)  # Rotate x-axis labels for better visibility

# Add mean values to the bars
for index, value in enumerate(mean_cholesterol):
    plt.text(index, value, str(round(value, 2)), ha='center', va='bottom')

plt.tight_layout()
plt.show()


import statsmodels.api as sma
X_train = sma.add_constant(X_train) ## let's add an intercept (beta_0) to our model
X_test = sma.add_constant(X_test)


import statsmodels.api as sm
lm2 = sm.OLS(y_train,X_train).fit()
lm2.summary()


influence = lm2.get_influence()  
resid_student = influence.resid_studentized_external
resid = pd.concat([X_train,pd.Series(resid_student,name = "Studentized Residuals")],axis = 1)
resid.head()


resid.loc[np.absolute(resid["Studentized Residuals"]) > 3,:]


ind = resid.loc[np.absolute(resid["Studentized Residuals"]) > 3,:].index
ind

Int64Index([3951, 1631, 766], dtype='int64')


y_train.drop(ind,axis = 0,inplace = True)
X_train.drop(ind,axis = 0,inplace = True)  #Intercept column is there


from statsmodels.stats.outliers_influence import variance_inflation_factor
[variance_inflation_factor(X_train.values, j) for j in range(X_train.shape[1])]

[10076.96450825812,
 1.39726232315385,
 1.0053644012541958,
 1.2038020419953424,
 2.9230720513123316,
 1.803760234929722,
 29.928615338015483,
 81.69961400686279,
 3.049245691697844,
 3.125390428579405,
 58.53830049938579,
 11.78698053542615,
 1.3524705474598187,
 1.5654284427754148,
 1.8889839119632796,
 11.765412868907452]


def calculate_vif(x):
    thresh = 5.0
    output = pd.DataFrame()
    k = x.shape[1]
    vif = [variance_inflation_factor(x.values, j) for j in range(x.shape[1])]
    for i in range(1,k):
        print("Iteration no.")
        print(i)
        print(vif)
        a = np.argmax(vif)
        print("Max VIF is for variable no.:")
        print(a)
        if vif[a] <= thresh :
            break
        if i == 1 :          
            output = x.drop(x.columns[a], axis = 1)
            vif = [variance_inflation_factor(output.values, j) for j in range(output.shape[1])]
        elif i > 1 :
            output = output.drop(output.columns[a],axis = 1)
            vif = [variance_inflation_factor(output.values, j) for j in range(output.shape[1])]
    return(output)
train_out = calculate_vif(X_train)
train_out.head()

Iteration no.
1
[10076.96450825812, 1.39726232315385, 1.0053644012541958, 1.2038020419953424, 2.9230720513123316, 1.803760234929722, 29.928615338015483, 81.69961400686279, 3.049245691697844, 3.125390428579405, 58.53830049938579, 11.78698053542615, 1.3524705474598187, 1.5654284427754148, 1.8889839119632796, 11.765412868907452]
Max VIF is for variable no.:
0
Iteration no.
2
[2.2680498769708857, 5.149110964561775, 176.82044595057542, 4.649332188872404, 49.2469808444578, 315.0171692885646, 265.6236458740845, 134.17015476982604, 107.76448893577012, 246.04717480793195, 19.088826024637964, 157.8717427232369, 3.489293916740208, 6.169409804587098, 21.18440362788717]
Max VIF is for variable no.:
5
Iteration no.
3
[2.262635788435497, 5.081824736490531, 124.41869062271418, 4.626422881493438, 48.897284279602296, 173.46508836912358, 125.60382004306241, 107.66305900182297, 207.2352634160289, 19.088622808797087, 133.3033514041389, 3.3563561031028994, 5.8698907122947, 21.049054319376673]
Max VIF is for variable no.:
8
Iteration no.
4
[2.2616786161253035, 5.08098710347792, 123.45478216243256, 2.8600599588315094, 48.52879141558591, 60.837399537318774, 124.83482318326554, 107.32456708382153, 19.06523850752273, 132.39881541965138, 3.3563016552995055, 5.267727538963271, 21.02929240184767]
Max VIF is for variable no.:
9
Iteration no.
5
[2.1664387170234622, 5.046715432001964, 74.38589069594636, 2.855797469146641, 42.43294552405285, 59.044791898000476, 124.22052357503861, 107.2344041053779, 19.017411192284534, 3.3199762662978656, 5.219569377654603, 20.966049479003505]
Max VIF is for variable no.:
6
Iteration no.
6
[2.1656391445269842, 5.0456168509455095, 70.35891807565832, 2.855605507493809, 42.402813122929714, 56.30802544065065, 53.274845467319075, 19.006754040187634, 3.1995014606046905, 5.219555772173678, 20.94132301732525]
Max VIF is for variable no.:
2
Iteration no.
7
[2.022845552183428, 4.929164438927844, 2.7243301623714555, 31.113637801170885, 44.239055798919104, 47.37968401001749, 18.97091168595589, 3.018458559936772, 4.881825142420282, 20.786335930148596]
Max VIF is for variable no.:
5
Iteration no.
8
[2.0226828330207645, 4.8535852047183194, 2.595141043618071, 22.151900363818545, 32.204607975426754, 18.962525934811897, 2.4829710496776674, 4.786450705712884, 20.743703120726828]
Max VIF is for variable no.:
4
Iteration no.
9
[1.9150598219642052, 4.632084824282184, 2.1244727738654507, 9.229873214669365, 18.921457213227143, 2.4515496059020516, 3.517449764369447, 20.512890229520252]
Max VIF is for variable no.:
7
Iteration no.
10
[1.9106524815596229, 4.609844017796759, 2.123452072332671, 8.986091436972144, 1.8456877931063589, 2.4498443291984833, 3.4930015685842766]
Max VIF is for variable no.:
3
Iteration no.
11
[1.7480932699859404, 2.914224375241127, 2.073365152571798, 1.8309779894845701, 2.2545045236829746, 2.814688615351666]
Max VIF is for variable no.:
1


import statsmodels.api as sma
import statsmodels.api as sm
train_out = sma.add_constant(train_out) ## let's add an intercept (beta_0) to our model
#X_test.drop(["Status"],axis = 1,inplace = True)
X_test = sma.add_constant(X_test)
lm2 = sm.OLS(y_train,train_out).fit()
lm2.summary()

Dep. Variable:	Cholesterol	R-squared:	0.109
Model:	OLS	Adj. R-squared:	0.106
Method:	Least Squares	F-statistic:	32.81
Date:	Sun, 14 Apr 2024	Prob (F-statistic):	1.39e-89
Time:	15:18:19	Log-Likelihood:	-20347.
No. Observations:	4036	AIC:	4.073e+04
Df Residuals:	4020	BIC:	4.083e+04
Df Model:	15
Covariance Type:	nonrobust

	coef	std err	t	P>\|t\|	[0.025	0.975]
const	308.3604	59.269	5.203	0.000	192.161	424.560
Status	1.6626	1.435	1.159	0.247	-1.150	4.476
DeathCause	-0.7775	0.538	-1.446	0.148	-1.832	0.277
AgeCHDdiag	-0.3220	0.124	-2.591	0.010	-0.566	-0.078
Sex	-4.9945	2.031	-2.459	0.014	-8.977	-1.012
AgeAtStart	1.1731	0.092	12.698	0.000	0.992	1.354
Height	-2.2345	0.915	-2.443	0.015	-4.028	-0.441
Weight	0.4114	0.195	2.115	0.035	0.030	0.793
Diastolic	0.2030	0.080	2.542	0.011	0.046	0.360
Systolic	0.0328	0.044	0.740	0.459	-0.054	0.120
MRW	-0.4090	0.238	-1.722	0.085	-0.875	0.057
Smoking	-0.0128	0.170	-0.076	0.940	-0.346	0.320
AgeAtDeath	-0.1207	0.105	-1.149	0.251	-0.327	0.085
BP_Status	0.4950	0.806	0.614	0.539	-1.086	2.076
Weight_Status	2.3170	0.895	2.589	0.010	0.562	4.072
Smoking_Status	2.1349	1.392	1.534	0.125	-0.593	4.863

Omnibus:	47.131	Durbin-Watson:	2.047
Prob(Omnibus):	0.000	Jarque-Bera (JB):	47.829
Skew:	0.255	Prob(JB):	4.11e-11
Kurtosis:	2.845	Cond. No.	2.83e+04

Dep. Variable:	Cholesterol	R-squared:	0.055
Model:	OLS	Adj. R-squared:	0.054
Method:	Least Squares	F-statistic:	39.38
Date:	Sun, 14 Apr 2024	Prob (F-statistic):	8.80e-47
Time:	16:01:48	Log-Likelihood:	-20449.
No. Observations:	4033	AIC:	4.091e+04
Df Residuals:	4026	BIC:	4.096e+04
Df Model:	6
Covariance Type:	nonrobust

	coef	std err	t	P>\|t\|	[0.025	0.975]
const	211.8868	1.865	113.587	0.000	208.230	215.544
Status	11.3978	1.300	8.769	0.000	8.849	13.946
DeathCause	-0.7294	0.552	-1.320	0.187	-1.813	0.354
Sex	-4.3260	1.332	-3.248	0.001	-6.937	-1.715
Smoking	0.0748	0.055	1.350	0.177	-0.034	0.184
BP_Status	3.9552	0.690	5.735	0.000	2.603	5.307
Weight_Status	5.4018	0.688	7.853	0.000	4.053	6.750

Cholesterol Linear Regression Using Python¶

Data loading¶

Data cleaning¶

NA replace¶

Recoding data¶

Detect and drop the outliers in numeric variables¶

Linear regression model¶

select variables contributing most to the model¶

rewrite the model with top 7 variables most contribute to the model¶

graphic¶

Running linear regression using statsmodels¶

Detecting and Removing Multicollinearity¶

Running linear regression again on our new training set (without multicollinearity)¶

	Status	DeathCause	AgeCHDdiag	Sex	AgeAtStart	Height	Weight	Diastolic	Systolic	MRW	Smoking	AgeAtDeath	Cholesterol	Chol_Status	BP_Status	Weight_Status	Smoking_Status
0	Dead	Other	NaN	Female	29	62.50	140.0	78	124	121.0	0.0	55.0	NaN	NaN	Normal	Overweight	Non-smoker
1	Dead	Cancer	NaN	Female	41	59.75	194.0	92	144	183.0	0.0	57.0	181.0	Desirable	High	Overweight	Non-smoker
2	Alive	NaN	NaN	Female	57	62.25	132.0	90	170	114.0	10.0	NaN	250.0	High	High	Overweight	Moderate (6-15)
3	Alive	NaN	NaN	Female	39	65.75	158.0	80	128	123.0	0.0	NaN	242.0	High	Normal	Overweight	Non-smoker
4	Alive	NaN	NaN	Male	42	66.00	156.0	76	110	116.0	20.0	NaN	281.0	High	Optimal	Overweight	Heavy (16-25)

	AgeCHDdiag	AgeAtStart	Height	Weight	Diastolic	Systolic	MRW	Smoking	AgeAtDeath	Cholesterol
count	5209.000000	5209.000000	5209.000000	5209.000000	5209.000000	5209.000000	5209.000000	5209.000000	5209.000000	5209.000000
mean	63.302968	44.068727	64.813185	153.086681	85.358610	136.909580	119.957525	9.366518	70.536414	227.417441
std	5.282496	8.574954	3.580643	28.898765	12.973091	23.739596	19.971887	11.989796	6.527255	44.274927
min	32.000000	28.000000	51.500000	67.000000	50.000000	82.000000	67.000000	0.000000	36.000000	96.000000
25%	63.302968	37.000000	62.250000	132.000000	76.000000	120.000000	106.000000	0.000000	70.536414	197.000000
50%	63.302968	43.000000	64.500000	150.000000	84.000000	132.000000	118.000000	1.000000	70.536414	225.000000
75%	63.302968	51.000000	67.500000	172.000000	92.000000	148.000000	131.000000	20.000000	70.536414	251.000000
max	90.000000	62.000000	76.500000	300.000000	160.000000	300.000000	268.000000	60.000000	93.000000	568.000000

	const	Status	DeathCause	AgeCHDdiag	Sex	AgeAtStart	Height	Weight	Diastolic	Systolic	MRW	Smoking	AgeAtDeath	BP_Status	Weight_Status	Smoking_Status	Studentized Residuals
3404	1.0	0.0	1.0	63.302968	1.0	48.0	68.00	190.0	102.0	154.0	132.0	0.0	70.536414	2.0	2.0	0.0	0.260895
463	1.0	1.0	1.0	55.000000	1.0	41.0	69.75	199.0	108.0	162.0	134.0	0.0	57.000000	2.0	2.0	0.0	-0.262146
2373	1.0	0.0	3.0	52.000000	0.0	36.0	62.25	150.0	70.0	115.0	129.0	10.0	70.536414	1.0	2.0	2.0	0.386980
3470	1.0	1.0	1.0	74.000000	0.0	54.0	60.50	160.0	102.0	186.0	147.0	10.0	76.000000	2.0	2.0	2.0	0.029897
4831	1.0	0.0	2.0	67.000000	0.0	37.0	59.25	202.0	88.0	120.0	191.0	0.0	70.536414	0.0	2.0	0.0	NaN

	const	DeathCause	AgeCHDdiag	AgeAtStart	Height	Weight	Diastolic	Systolic	MRW	AgeAtDeath	BP_Status	Weight_Status	Studentized Residuals
3951	1.0	4.0	63.302968	36.0	59.0	137.0	90.0	140.0	129.0	70.536414	2.0	2.0	3.386411
1631	1.0	1.0	63.302968	53.0	64.0	145.0	88.0	130.0	117.0	70.536414	0.0	2.0	3.004549
766	1.0	3.0	61.000000	31.0	63.0	108.0	74.0	116.0	90.0	70.536414	1.0	1.0	3.006256

Omnibus:	55.169	Durbin-Watson:	2.046
Prob(Omnibus):	0.000	Jarque-Bera (JB):	54.483
Skew:	0.262	Prob(JB):	1.48e-12
Kurtosis:	2.775	Cond. No.	49.3