# Load the necessary packages 
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(corrplot)
## corrplot 0.92 loaded

Loading the data

This data set provides different factors that may or may not influence the chances of having a heart attack. Since there are many possible predictors that may explain the risk of having a heart attack, I am going to use a multiple regression model:

# Read the heart-attack data set straight from GitHub. The measured columns
# used in the numeric analysis are pinned to double so they cannot be parsed as
# text; columns not named in cols() keep readr's default handling.
Heart_Attack <- read_csv(
  file = "https://raw.githubusercontent.com/SalouaDaouki/Data605/main/heart_attack_prediction_dataset2.csv",
  col_types = cols(
    # Demographics and vitals
    Age = col_double(),
    Cholesterol = col_double(),
    `Heart Rate` = col_double(),
    BMI = col_double(),
    Triglycerides = col_double(),
    # 0/1 indicator columns
    Diabetes = col_double(),
    `Family History` = col_double(),
    Smoking = col_double(),
    Obesity = col_double(),
    `Alcohol Consumption` = col_double(),
    # Lifestyle measurements
    `Exercise Hours Per Week` = col_double(),
    `Physical Activity Days Per Week` = col_double(),
    `Sleep Hours Per Day` = col_double(),
    # Response variable
    `Heart Attack Risk` = col_double()
  )
)

# Preview the first rows to confirm the import and the parsed column types
head(Heart_Attack)
## # A tibble: 6 × 26
##   `Patient ID`   Age Sex    Cholesterol `Blood Pressure` `Heart Rate` Diabetes
##   <chr>        <dbl> <chr>        <dbl> <chr>                   <dbl>    <dbl>
## 1 BMW7812         67 Male           208 158/88                     72        0
## 2 CZE1114         21 Male           389 165/93                     98        1
## 3 BNI9906         21 Female         324 174/99                     72        1
## 4 JLN3497         84 Male           383 163/100                    73        1
## 5 GFO8847         66 Male           318 91/88                      93        1
## 6 ZOO7941         54 Female         297 172/86                     48        1
## # ℹ 19 more variables: `Family History` <dbl>, Smoking <dbl>, Obesity <dbl>,
## #   `Alcohol Consumption` <dbl>, `Exercise Hours Per Week` <dbl>, Diet <chr>,
## #   `Previous Heart Problems` <dbl>, `Medication Use` <dbl>,
## #   `Stress Level` <dbl>, `Sedentary Hours Per Day` <dbl>, Income <dbl>,
## #   BMI <dbl>, Triglycerides <dbl>, `Physical Activity Days Per Week` <dbl>,
## #   `Sleep Hours Per Day` <dbl>, Country <chr>, Continent <chr>,
## #   Hemisphere <chr>, `Heart Attack Risk` <dbl>
# Preview the last rows as well, to confirm the file was read through to the end
tail(Heart_Attack)
## # A tibble: 6 × 26
##   `Patient ID`   Age Sex    Cholesterol `Blood Pressure` `Heart Rate` Diabetes
##   <chr>        <dbl> <chr>        <dbl> <chr>                   <dbl>    <dbl>
## 1 YDX2478         59 Female         378 93/78                      99        0
## 2 MSV9918         60 Male           121 94/76                      61        1
## 3 QSV6764         28 Female         120 157/102                    73        1
## 4 XKA5925         47 Male           250 161/75                    105        0
## 5 EPE6801         36 Male           178 119/67                     60        1
## 6 ZWN9666         25 Female         356 138/67                     75        1
## # ℹ 19 more variables: `Family History` <dbl>, Smoking <dbl>, Obesity <dbl>,
## #   `Alcohol Consumption` <dbl>, `Exercise Hours Per Week` <dbl>, Diet <chr>,
## #   `Previous Heart Problems` <dbl>, `Medication Use` <dbl>,
## #   `Stress Level` <dbl>, `Sedentary Hours Per Day` <dbl>, Income <dbl>,
## #   BMI <dbl>, Triglycerides <dbl>, `Physical Activity Days Per Week` <dbl>,
## #   `Sleep Hours Per Day` <dbl>, Country <chr>, Continent <chr>,
## #   Hemisphere <chr>, `Heart Attack Risk` <dbl>

The data has 26 variables with 8763 observations.

Visualizing the relationships in the Heart Attack data

First, let’s take a look at the description of the data:

# Per-column summary: five-number summary and mean for numeric columns,
# length/class/mode for character columns
summary(Heart_Attack)
##   Patient ID             Age            Sex             Cholesterol   
##  Length:8763        Min.   :18.00   Length:8763        Min.   :120.0  
##  Class :character   1st Qu.:35.00   Class :character   1st Qu.:192.0  
##  Mode  :character   Median :54.00   Mode  :character   Median :259.0  
##                     Mean   :53.71                      Mean   :259.9  
##                     3rd Qu.:72.00                      3rd Qu.:330.0  
##                     Max.   :90.00                      Max.   :400.0  
##  Blood Pressure       Heart Rate        Diabetes      Family History 
##  Length:8763        Min.   : 40.00   Min.   :0.0000   Min.   :0.000  
##  Class :character   1st Qu.: 57.00   1st Qu.:0.0000   1st Qu.:0.000  
##  Mode  :character   Median : 75.00   Median :1.0000   Median :0.000  
##                     Mean   : 75.02   Mean   :0.6523   Mean   :0.493  
##                     3rd Qu.: 93.00   3rd Qu.:1.0000   3rd Qu.:1.000  
##                     Max.   :110.00   Max.   :1.0000   Max.   :1.000  
##     Smoking          Obesity       Alcohol Consumption Exercise Hours Per Week
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000      Min.   : 0.002442      
##  1st Qu.:1.0000   1st Qu.:0.0000   1st Qu.:0.0000      1st Qu.: 4.981579      
##  Median :1.0000   Median :1.0000   Median :1.0000      Median :10.069559      
##  Mean   :0.8968   Mean   :0.5014   Mean   :0.5981      Mean   :10.014284      
##  3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000      3rd Qu.:15.050018      
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000      Max.   :19.998709      
##      Diet           Previous Heart Problems Medication Use    Stress Level  
##  Length:8763        Min.   :0.0000          Min.   :0.0000   Min.   : 1.00  
##  Class :character   1st Qu.:0.0000          1st Qu.:0.0000   1st Qu.: 3.00  
##  Mode  :character   Median :0.0000          Median :0.0000   Median : 5.00  
##                     Mean   :0.4958          Mean   :0.4983   Mean   : 5.47  
##                     3rd Qu.:1.0000          3rd Qu.:1.0000   3rd Qu.: 8.00  
##                     Max.   :1.0000          Max.   :1.0000   Max.   :10.00  
##  Sedentary Hours Per Day     Income            BMI        Triglycerides  
##  Min.   : 0.001263       Min.   : 20062   Min.   :18.00   Min.   : 30.0  
##  1st Qu.: 2.998794       1st Qu.: 88310   1st Qu.:23.42   1st Qu.:225.5  
##  Median : 5.933622       Median :157866   Median :28.77   Median :417.0  
##  Mean   : 5.993690       Mean   :158263   Mean   :28.89   Mean   :417.7  
##  3rd Qu.: 9.019124       3rd Qu.:227749   3rd Qu.:34.32   3rd Qu.:612.0  
##  Max.   :11.999313       Max.   :299954   Max.   :40.00   Max.   :800.0  
##  Physical Activity Days Per Week Sleep Hours Per Day   Country         
##  Min.   :0.00                    Min.   : 4.000      Length:8763       
##  1st Qu.:2.00                    1st Qu.: 5.000      Class :character  
##  Median :3.00                    Median : 7.000      Mode  :character  
##  Mean   :3.49                    Mean   : 7.024                        
##  3rd Qu.:5.00                    3rd Qu.: 9.000                        
##  Max.   :7.00                    Max.   :10.000                        
##   Continent          Hemisphere        Heart Attack Risk
##  Length:8763        Length:8763        Min.   :0.0000   
##  Class :character   Class :character   1st Qu.:0.0000   
##  Mode  :character   Mode  :character   Median :0.0000   
##                                        Mean   :0.3582   
##                                        3rd Qu.:1.0000   
##                                        Max.   :1.0000

Then, we need to check for any missing data values to see if we need to do any necessary cleaning:

# Count the NA cells across the whole data frame (0 means nothing to clean)
Heart_Attack %>%
  is.na() %>%
  sum()
## [1] 0

Perfect! No missing values. However, the “Blood Pressure” column needs some manipulation before it can be included in the analysis: the original values are stored as text in the form Systolic Pressure/Diastolic Pressure (e.g., 160/70), so let’s separate the column into two columns:

# Split "Blood Pressure" (text such as "158/88") at the slash into systolic and
# diastolic readings, then turn both new text columns into numbers so they can
# take part in the correlation and regression steps.
Heart_Attack <- Heart_Attack %>%
  separate(
    `Blood Pressure`,
    into = c("Systolic_Pressure", "Diastolic_Pressure"),
    sep = "/"
  ) %>%
  mutate(
    Systolic_Pressure = as.numeric(Systolic_Pressure),
    Diastolic_Pressure = as.numeric(Diastolic_Pressure)
  )

Let’s compare all the variables in the data set and get a glimpse of the relationships between them. Since the data includes qualitative variables as well, we need to select only the numerical columns to compare their relationships; otherwise we’ll encounter an error:

# Keep only the numeric columns of Heart_Attack; pairs() cannot plot the
# character columns (IDs, sex, diet, country, ...).
# select(where(...)) replaces the superseded select_if().
numerical_data <- select(Heart_Attack, where(is.numeric))

# Scatter-plot matrix of every numeric variable against every other one
pairs(numerical_data, gap = 0.5)

Since the plots are very tiny and hard to see, let’s select only the numerical variables from the “Heart_Attack” data and assign them to a subset, “numerical_var”:

# Splitting the data into subdata sets for better visualization
# Selecting only numerical variables. vapply() guarantees a logical vector with
# one entry per column, whereas sapply()'s return type depends on its input.
numerical_var <- Heart_Attack[, vapply(Heart_Attack, is.numeric, logical(1))]

# View the subset. View() opens a data viewer, so only call it in an
# interactive session; it has nothing to show when the document is knitted.
if (interactive()) {
  View(numerical_var)
}

Then, split the “numerical_var” data into sub-data sets that each include only 3 to 4 variables plus the heart attack risk, and apply the pairs() function to each one so we can see the plots better:

# Split the numeric columns into small groups so each scatter-plot matrix stays
# readable. Every group also carries the response, "Heart Attack Risk", which is
# located by name once instead of repeating a hard-coded position (it is
# column 21 in the current data).
outcome_col <- match("Heart Attack Risk", names(numerical_var))
stopifnot(!is.na(outcome_col))

# Selecting columns from 1 to 3 and the response column
subdata <- numerical_var[, c(1:3, outcome_col)]
pairs(subdata)

# Selecting columns from 4 to 6 and the response column
subdata1 <- numerical_var[, c(4:6, outcome_col)]
pairs(subdata1)

# Selecting columns from 7 to 9 and the response column
subdata2 <- numerical_var[, c(7:9, outcome_col)]
pairs(subdata2)

# Selecting columns from 10 to 13 and the response column
subdata3 <- numerical_var[, c(10:13, outcome_col)]
pairs(subdata3)

# Selecting columns from 14 to 17 and the response column
subdata4 <- numerical_var[, c(14:17, outcome_col)]
pairs(subdata4)

# Selecting columns from 18 to 20 and the response column
subdata5 <- numerical_var[, c(18:20, outcome_col)]
pairs(subdata5)

# Pairwise correlations between all numeric columns (response included)
correlation_matrix <- numerical_var %>%
  cor()

# Show the full matrix of coefficients
print(correlation_matrix)
##                                           Age   Cholesterol Systolic_Pressure
## Age                              1.0000000000 -9.107011e-03      0.0030702087
## Cholesterol                     -0.0091070106  1.000000e+00      0.0001326976
## Systolic_Pressure                0.0030702087  1.326976e-04      1.0000000000
## Diastolic_Pressure              -0.0098256171  2.083335e-03      0.0133370093
## Heart Rate                      -0.0038440129  3.149083e-04      0.0084818882
## Diabetes                        -0.0141052134 -1.342760e-02     -0.0053061373
## Family History                   0.0083525467 -2.160793e-02     -0.0097617900
## Smoking                          0.3948909143  1.634161e-02     -0.0095344334
## Obesity                         -0.0081402534 -1.484265e-02     -0.0019181865
## Alcohol Consumption             -0.0066655960 -7.261001e-03      0.0107636116
## Exercise Hours Per Week          0.0012056390  2.151714e-02     -0.0095055610
## Previous Heart Problems          0.0008683616 -6.070179e-03     -0.0119258199
## Medication Use                   0.0009803087 -9.054425e-04     -0.0011815384
## Stress Level                     0.0183066457 -2.448711e-02      0.0178478307
## Sedentary Hours Per Day          0.0172801340  1.891449e-02      0.0033926071
## Income                          -0.0017327899  6.750208e-06      0.0104143552
## BMI                             -0.0026118462  1.729187e-02      0.0042790432
## Triglycerides                    0.0034149567 -5.453721e-03      0.0051206953
## Physical Activity Days Per Week  0.0013836679  1.605594e-02     -0.0075739574
## Sleep Hours Per Day             -0.0021847037  4.456229e-03     -0.0046277633
## Heart Attack Risk                0.0064031871  1.933968e-02      0.0185852077
##                                 Diastolic_Pressure    Heart Rate      Diabetes
## Age                                   -0.009825617 -0.0038440129 -0.0141052134
## Cholesterol                            0.002083335  0.0003149083 -0.0134276004
## Systolic_Pressure                      0.013337009  0.0084818882 -0.0053061373
## Diastolic_Pressure                     1.000000000 -0.0181130570 -0.0005115390
## Heart Rate                            -0.018113057  1.0000000000  0.0067637485
## Diabetes                              -0.000511539  0.0067637485  1.0000000000
## Family History                         0.017817782 -0.0134695874 -0.0138436970
## Smoking                               -0.012293212 -0.0123306434  0.0005265323
## Obesity                               -0.020574211  0.0127248822  0.0128657609
## Alcohol Consumption                   -0.007282476  0.0034586719  0.0055512063
## Exercise Hours Per Week               -0.003468859  0.0082763293 -0.0070138312
## Previous Heart Problems                0.008813417 -0.0049559066  0.0008667311
## Medication Use                         0.004607425  0.0092439990 -0.0026558450
## Stress Level                          -0.008445057 -0.0045467686  0.0067191335
## Sedentary Hours Per Day               -0.006606069 -0.0102320484  0.0047052959
## Income                                 0.008816276  0.0048734774 -0.0007589881
## BMI                                    0.000805537  0.0052985748 -0.0028518365
## Triglycerides                          0.000544911  0.0122436948  0.0104313583
## Physical Activity Days Per Week        0.016294383  0.0008343817 -0.0024111151
## Sleep Hours Per Day                    0.010679456  0.0018112469 -0.0124572712
## Heart Attack Risk                     -0.007509192 -0.0042512016  0.0172252957
##                                 Family History       Smoking      Obesity
## Age                               0.0083525467  0.3948909143 -0.008140253
## Cholesterol                      -0.0216079303  0.0163416079 -0.014842648
## Systolic_Pressure                -0.0097617900 -0.0095344334 -0.001918186
## Diastolic_Pressure                0.0178177823 -0.0122932117 -0.020574211
## Heart Rate                       -0.0134695874 -0.0123306434  0.012724882
## Diabetes                         -0.0138436970  0.0005265323  0.012865761
## Family History                    1.0000000000  0.0117482875 -0.001443614
## Smoking                           0.0117482875  1.0000000000  0.003968986
## Obesity                          -0.0014436141  0.0039689860  1.000000000
## Alcohol Consumption               0.0127013167  0.0127537498 -0.024195420
## Exercise Hours Per Week          -0.0063777562 -0.0001498847  0.002098949
## Previous Heart Problems          -0.0045680682 -0.0005743184  0.005159194
## Medication Use                    0.0009806963 -0.0108769653 -0.006267008
## Stress Level                      0.0156365541 -0.0017567282  0.010625928
## Sedentary Hours Per Day           0.0025611570  0.0153110447 -0.001333256
## Income                           -0.0004012910  0.0030964480 -0.003870406
## BMI                              -0.0114921470  0.0076700789 -0.006058378
## Triglycerides                    -0.0019035518  0.0046500312  0.001466947
## Physical Activity Days Per Week   0.0095614489 -0.0064654906  0.005337385
## Sleep Hours Per Day              -0.0111987966 -0.0054244718 -0.005313827
## Heart Attack Risk                -0.0016519219 -0.0040512792 -0.013317553
##                                 Alcohol Consumption Exercise Hours Per Week
## Age                                   -0.0066655960            0.0012056390
## Cholesterol                           -0.0072610006            0.0215171356
## Systolic_Pressure                      0.0107636116           -0.0095055610
## Diastolic_Pressure                    -0.0072824758           -0.0034688589
## Heart Rate                             0.0034586719            0.0082763293
## Diabetes                               0.0055512063           -0.0070138312
## Family History                         0.0127013167           -0.0063777562
## Smoking                                0.0127537498           -0.0001498847
## Obesity                               -0.0241954196            0.0020989489
## Alcohol Consumption                    1.0000000000           -0.0085143204
## Exercise Hours Per Week               -0.0085143204            1.0000000000
## Previous Heart Problems                0.0103951916            0.0052525457
## Medication Use                         0.0033387438           -0.0071193499
## Stress Level                          -0.0050229973           -0.0091024194
## Sedentary Hours Per Day               -0.0128281458            0.0087556011
## Income                                -0.0223957093           -0.0234138473
## BMI                                    0.0105624546            0.0037769215
## Triglycerides                          0.0061686073            0.0017169491
## Physical Activity Days Per Week        0.0015934493            0.0077251861
## Sleep Hours Per Day                   -0.0008434151           -0.0012453363
## Heart Attack Risk                     -0.0137776983            0.0111328240
##                                 Previous Heart Problems Medication Use
## Age                                        0.0008683616   0.0009803087
## Cholesterol                               -0.0060701787  -0.0009054425
## Systolic_Pressure                         -0.0119258199  -0.0011815384
## Diastolic_Pressure                         0.0088134173   0.0046074246
## Heart Rate                                -0.0049559066   0.0092439990
## Diabetes                                   0.0008667311  -0.0026558450
## Family History                            -0.0045680682   0.0009806963
## Smoking                                   -0.0005743184  -0.0108769653
## Obesity                                    0.0051591937  -0.0062670079
## Alcohol Consumption                        0.0103951916   0.0033387438
## Exercise Hours Per Week                    0.0052525457  -0.0071193499
## Previous Heart Problems                    1.0000000000   0.0053361057
## Medication Use                             0.0053361057   1.0000000000
## Stress Level                              -0.0176285817   0.0008628768
## Sedentary Hours Per Day                   -0.0026944572   0.0225130353
## Income                                    -0.0032805602  -0.0034639224
## BMI                                        0.0157179207   0.0095140704
## Triglycerides                             -0.0190293799  -0.0110953595
## Physical Activity Days Per Week            0.0085367268  -0.0111388955
## Sleep Hours Per Day                        0.0044604459  -0.0203925832
## Heart Attack Risk                          0.0002735645   0.0022344065
##                                  Stress Level Sedentary Hours Per Day
## Age                              0.0183066457            1.728013e-02
## Cholesterol                     -0.0244871111            1.891449e-02
## Systolic_Pressure                0.0178478307            3.392607e-03
## Diastolic_Pressure              -0.0084450573           -6.606069e-03
## Heart Rate                      -0.0045467686           -1.023205e-02
## Diabetes                         0.0067191335            4.705296e-03
## Family History                   0.0156365541            2.561157e-03
## Smoking                         -0.0017567282            1.531104e-02
## Obesity                          0.0106259284           -1.333256e-03
## Alcohol Consumption             -0.0050229973           -1.282815e-02
## Exercise Hours Per Week         -0.0091024194            8.755601e-03
## Previous Heart Problems         -0.0176285817           -2.694457e-03
## Medication Use                   0.0008628768            2.251304e-02
## Stress Level                     1.0000000000           -5.397241e-03
## Sedentary Hours Per Day         -0.0053972409            1.000000e+00
## Income                          -0.0027604514            3.510621e-03
## BMI                             -0.0032504472           -2.356074e-05
## Triglycerides                   -0.0039213025           -5.784609e-03
## Physical Activity Days Per Week  0.0074046302           -6.178012e-03
## Sleep Hours Per Day             -0.0142054068            4.792013e-03
## Heart Attack Risk               -0.0041113217           -5.612975e-03
##                                        Income           BMI Triglycerides
## Age                             -1.732790e-03 -2.611846e-03   0.003414957
## Cholesterol                      6.750208e-06  1.729187e-02  -0.005453721
## Systolic_Pressure                1.041436e-02  4.279043e-03   0.005120695
## Diastolic_Pressure               8.816276e-03  8.055370e-04   0.000544911
## Heart Rate                       4.873477e-03  5.298575e-03   0.012243695
## Diabetes                        -7.589881e-04 -2.851837e-03   0.010431358
## Family History                  -4.012910e-04 -1.149215e-02  -0.001903552
## Smoking                          3.096448e-03  7.670079e-03   0.004650031
## Obesity                         -3.870406e-03 -6.058378e-03   0.001466947
## Alcohol Consumption             -2.239571e-02  1.056245e-02   0.006168607
## Exercise Hours Per Week         -2.341385e-02  3.776921e-03   0.001716949
## Previous Heart Problems         -3.280560e-03  1.571792e-02  -0.019029380
## Medication Use                  -3.463922e-03  9.514070e-03  -0.011095359
## Stress Level                    -2.760451e-03 -3.250447e-03  -0.003921303
## Sedentary Hours Per Day          3.510621e-03 -2.356074e-05  -0.005784609
## Income                           1.000000e+00  8.835838e-03   0.010738559
## BMI                              8.835838e-03  1.000000e+00  -0.005963607
## Triglycerides                    1.073856e-02 -5.963607e-03   1.000000000
## Physical Activity Days Per Week  1.302733e-04  8.110375e-03  -0.007556419
## Sleep Hours Per Day             -6.598343e-03 -1.003041e-02  -0.029215971
## Heart Attack Risk                9.627602e-03  2.027903e-05   0.010471454
##                                 Physical Activity Days Per Week
## Age                                                0.0013836679
## Cholesterol                                        0.0160559355
## Systolic_Pressure                                 -0.0075739574
## Diastolic_Pressure                                 0.0162943827
## Heart Rate                                         0.0008343817
## Diabetes                                          -0.0024111151
## Family History                                     0.0095614489
## Smoking                                           -0.0064654906
## Obesity                                            0.0053373847
## Alcohol Consumption                                0.0015934493
## Exercise Hours Per Week                            0.0077251861
## Previous Heart Problems                            0.0085367268
## Medication Use                                    -0.0111388955
## Stress Level                                       0.0074046302
## Sedentary Hours Per Day                           -0.0061780115
## Income                                             0.0001302733
## BMI                                                0.0081103748
## Triglycerides                                     -0.0075564192
## Physical Activity Days Per Week                    1.0000000000
## Sleep Hours Per Day                                0.0140334379
## Heart Attack Risk                                 -0.0050135111
##                                 Sleep Hours Per Day Heart Attack Risk
## Age                                   -0.0021847037      6.403187e-03
## Cholesterol                            0.0044562286      1.933968e-02
## Systolic_Pressure                     -0.0046277633      1.858521e-02
## Diastolic_Pressure                     0.0106794559     -7.509192e-03
## Heart Rate                             0.0018112469     -4.251202e-03
## Diabetes                              -0.0124572712      1.722530e-02
## Family History                        -0.0111987966     -1.651922e-03
## Smoking                               -0.0054244718     -4.051279e-03
## Obesity                               -0.0053138266     -1.331755e-02
## Alcohol Consumption                   -0.0008434151     -1.377770e-02
## Exercise Hours Per Week               -0.0012453363      1.113282e-02
## Previous Heart Problems                0.0044604459      2.735645e-04
## Medication Use                        -0.0203925832      2.234407e-03
## Stress Level                          -0.0142054068     -4.111322e-03
## Sedentary Hours Per Day                0.0047920126     -5.612975e-03
## Income                                -0.0065983433      9.627602e-03
## BMI                                   -0.0100304097      2.027903e-05
## Triglycerides                         -0.0292159711      1.047145e-02
## Physical Activity Days Per Week        0.0140334379     -5.013511e-03
## Sleep Hours Per Day                    1.0000000000     -1.852822e-02
## Heart Attack Risk                     -0.0185282174      1.000000e+00

That’s a lot of numbers to process by eye; let’s visualize this in a better way:

# Compute the correlation matrix 1 (numeric columns 1-3 plus Heart Attack Risk)
correlation_matrix1 <- cor(subdata)

# Create a heatmap of the correlation matrix with annotations. The invisible
# return value is kept: order = "hclust" re-orders the variables, so the plotted
# position of each cell must be read from `corrPos`, not from matrix indices.
heatmap1 <- corrplot(
  correlation_matrix1,
  method = "color", type = "upper", order = "hclust",
  tl.col = "black", tl.srt = 45,
  mar = c(0, 0, 0, 5),  # Increase spacing on the right side
  tl.cex = 0.8,         # Adjust label size
  cl.ratio = 0.2        # Adjust the width of color legend
)

# Indices (in the original matrix) of strong correlations (absolute correlation
# coefficient > 0.7). The diagonal is excluded by position rather than by
# comparing floating-point values with 1.
high_correlation_indices1 <- which(
  abs(correlation_matrix1) > 0.7 &
    row(correlation_matrix1) != col(correlation_matrix1),
  arr.ind = TRUE
)

# Highlight the strong pairs at the coordinates where corrplot drew them
strong_cells1 <- heatmap1$corrPos[
  abs(heatmap1$corrPos$corr) > 0.7 &
    as.character(heatmap1$corrPos$xName) != as.character(heatmap1$corrPos$yName),
  ,
  drop = FALSE
]
points(strong_cells1$x, strong_cells1$y, pch = 16, col = "red")

In this group, Cholesterol is the predictor with the slightly stronger correlation with heart attack risk, although every correlation here is very close to zero.

# Compute the correlation matrix 2 (numeric columns 4-6 plus Heart Attack Risk)
correlation_matrix2 <- cor(subdata1)

# Create a heatmap of the correlation matrix with annotations. The invisible
# return value is kept: order = "hclust" re-orders the variables, so the plotted
# position of each cell must be read from `corrPos`, not from matrix indices.
heatmap2 <- corrplot(
  correlation_matrix2,
  method = "color", type = "upper", order = "hclust",
  tl.col = "black", tl.srt = 45,
  mar = c(0, 0, 0, 5),  # Increase spacing on the right side
  tl.cex = 0.8,         # Adjust label size
  cl.ratio = 0.2        # Adjust the width of color legend
)

# Indices (in the original matrix) of strong correlations (absolute correlation
# coefficient > 0.7). The diagonal is excluded by position rather than by
# comparing floating-point values with 1.
high_correlation_indices2 <- which(
  abs(correlation_matrix2) > 0.7 &
    row(correlation_matrix2) != col(correlation_matrix2),
  arr.ind = TRUE
)

# Highlight the strong pairs at the coordinates where corrplot drew them
strong_cells2 <- heatmap2$corrPos[
  abs(heatmap2$corrPos$corr) > 0.7 &
    as.character(heatmap2$corrPos$xName) != as.character(heatmap2$corrPos$yName),
  ,
  drop = FALSE
]
points(strong_cells2$x, strong_cells2$y, pch = 16, col = "red")

Based on this heatmap, diabetes is the predictor in this group with the strongest correlation with heart attack risk, though it is still very weak.

# Compute the correlation matrix 3 (numeric columns 7-9 plus Heart Attack Risk)
correlation_matrix3 <- cor(subdata2)

# Create a heatmap of the correlation matrix with annotations. The invisible
# return value is kept: order = "hclust" re-orders the variables, so the plotted
# position of each cell must be read from `corrPos`, not from matrix indices.
heatmap3 <- corrplot(
  correlation_matrix3,
  method = "color", type = "upper", order = "hclust",
  tl.col = "black", tl.srt = 45,
  mar = c(0, 0, 0, 5),  # Increase spacing on the right side
  tl.cex = 0.8,         # Adjust label size
  cl.ratio = 0.2        # Adjust the width of color legend
)

# Indices (in the original matrix) of strong correlations (absolute correlation
# coefficient > 0.7). The diagonal is excluded by position rather than by
# comparing floating-point values with 1.
high_correlation_indices3 <- which(
  abs(correlation_matrix3) > 0.7 &
    row(correlation_matrix3) != col(correlation_matrix3),
  arr.ind = TRUE
)

# Highlight the strong pairs at the coordinates where corrplot drew them
strong_cells3 <- heatmap3$corrPos[
  abs(heatmap3$corrPos$corr) > 0.7 &
    as.character(heatmap3$corrPos$xName) != as.character(heatmap3$corrPos$yName),
  ,
  drop = FALSE
]
points(strong_cells3$x, strong_cells3$y, pch = 16, col = "red")

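Heart attack risk may be associated with obesity as well, although the correlation is very weak (and a correlation alone does not show that obesity influences the risk).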

# Compute the correlation matrix 4 (numeric columns 10-13 plus Heart Attack Risk)
correlation_matrix4 <- cor(subdata3)

# Create a heatmap of the correlation matrix with annotations. The invisible
# return value is kept: order = "hclust" re-orders the variables, so the plotted
# position of each cell must be read from `corrPos`, not from matrix indices.
heatmap4 <- corrplot(
  correlation_matrix4,
  method = "color", type = "upper", order = "hclust",
  tl.col = "black", tl.srt = 45,
  mar = c(0, 0, 0, 5),  # Increase spacing on the right side
  tl.cex = 0.8,         # Adjust label size
  cl.ratio = 0.2        # Adjust the width of color legend
)

# Indices (in the original matrix) of strong correlations (absolute correlation
# coefficient > 0.7). The diagonal is excluded by position rather than by
# comparing floating-point values with 1.
high_correlation_indices4 <- which(
  abs(correlation_matrix4) > 0.7 &
    row(correlation_matrix4) != col(correlation_matrix4),
  arr.ind = TRUE
)

# Highlight the strong pairs at the coordinates where corrplot drew them
strong_cells4 <- heatmap4$corrPos[
  abs(heatmap4$corrPos$corr) > 0.7 &
    as.character(heatmap4$corrPos$xName) != as.character(heatmap4$corrPos$yName),
  ,
  drop = FALSE
]
points(strong_cells4$x, strong_cells4$y, pch = 16, col = "red")

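One might expect that the number of hours one exercises lowers the heart attack risk; in this data, however, the correlation between exercise hours per week and heart attack risk is very weak and slightly positive (r ≈ 0.011), so the data does not show that effect.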

# Compute the correlation matrix 5 (numeric columns 14-17 plus Heart Attack Risk)
correlation_matrix5 <- cor(subdata4)

# Create a heatmap of the correlation matrix with annotations. The invisible
# return value is kept: order = "hclust" re-orders the variables, so the plotted
# position of each cell must be read from `corrPos`, not from matrix indices.
heatmap5 <- corrplot(
  correlation_matrix5,
  method = "color", type = "upper", order = "hclust",
  tl.col = "black", tl.srt = 45,
  mar = c(0, 0, 0, 5),  # Increase spacing on the right side
  tl.cex = 0.8,         # Adjust label size
  cl.ratio = 0.2        # Adjust the width of color legend
)

# Indices (in the original matrix) of strong correlations (absolute correlation
# coefficient > 0.7). The diagonal is excluded by position rather than by
# comparing floating-point values with 1.
high_correlation_indices5 <- which(
  abs(correlation_matrix5) > 0.7 &
    row(correlation_matrix5) != col(correlation_matrix5),
  arr.ind = TRUE
)

# Highlight the strong pairs at the coordinates where corrplot drew them
strong_cells5 <- heatmap5$corrPos[
  abs(heatmap5$corrPos$corr) > 0.7 &
    as.character(heatmap5$corrPos$xName) != as.character(heatmap5$corrPos$yName),
  ,
  drop = FALSE
]
points(strong_cells5$x, strong_cells5$y, pch = 16, col = "red")

There is a very weak association between heart attack risk and income.

# Compute the correlation matrix 6 (numeric columns 18-21, ending with Heart Attack Risk)
correlation_matrix6 <- cor(subdata5)

# Create a heatmap of the correlation matrix with annotations. The invisible
# return value is kept: order = "hclust" re-orders the variables, so the plotted
# position of each cell must be read from `corrPos`, not from matrix indices.
heatmap6 <- corrplot(
  correlation_matrix6,
  method = "color", type = "upper", order = "hclust",
  tl.col = "black", tl.srt = 45,
  mar = c(0, 0, 0, 5),  # Increase spacing on the right side
  tl.cex = 0.8,         # Adjust label size
  cl.ratio = 0.2        # Adjust the width of color legend
)

# Indices (in the original matrix) of strong correlations (absolute correlation
# coefficient > 0.7). The diagonal is excluded by position rather than by
# comparing floating-point values with 1.
high_correlation_indices6 <- which(
  abs(correlation_matrix6) > 0.7 &
    row(correlation_matrix6) != col(correlation_matrix6),
  arr.ind = TRUE
)

# Highlight the strong pairs at the coordinates where corrplot drew them
strong_cells6 <- heatmap6$corrPos[
  abs(heatmap6$corrPos$corr) > 0.7 &
    as.character(heatmap6$corrPos$xName) != as.character(heatmap6$corrPos$yName),
  ,
  drop = FALSE
]
points(strong_cells6$x, strong_cells6$y, pch = 16, col = "red")

Let’s look at the numbers using the full regression model across all numerical variables of the data set;

Identifying Potential Predictors:

Let’s define the regression model to all numerical variables in the data:

# Full model: ordinary least squares regression of Heart Attack Risk on all 20
# numeric predictors (including the two blood-pressure columns derived above).
# NOTE(review): the response only takes values between 0 and 1 in summary();
# if it is a 0/1 indicator, a logistic model may be a better fit - confirm.
Heart_Attack_lm.Full <- lm(
  `Heart Attack Risk` ~
    Age + Cholesterol + Systolic_Pressure + Diastolic_Pressure +
    `Heart Rate` + Diabetes + `Family History` + Smoking + Obesity +
    `Alcohol Consumption` + `Exercise Hours Per Week` +
    `Previous Heart Problems` + `Medication Use` + `Stress Level` +
    `Sedentary Hours Per Day` + Income + BMI + Triglycerides +
    `Physical Activity Days Per Week` + `Sleep Hours Per Day`,
  data = Heart_Attack
)

Now, let’s look at the summary of the model:

# Coefficient table, residual summary, R-squared and overall F-test for the
# full model; used to decide which predictors to drop first
summary(Heart_Attack_lm.Full)
## 
## Call:
## lm(formula = `Heart Attack Risk` ~ Age + Cholesterol + Systolic_Pressure + 
##     Diastolic_Pressure + `Heart Rate` + Diabetes + `Family History` + 
##     Smoking + Obesity + `Alcohol Consumption` + `Exercise Hours Per Week` + 
##     `Previous Heart Problems` + `Medication Use` + `Stress Level` + 
##     `Sedentary Hours Per Day` + Income + BMI + Triglycerides + 
##     `Physical Activity Days Per Week` + `Sleep Hours Per Day`, 
##     data = Heart_Attack)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.4375 -0.3641 -0.3393  0.6279  0.7080 
## 
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                        3.317e-01  6.378e-02   5.201 2.03e-07 ***
## Age                                2.208e-04  2.626e-04   0.841   0.4004    
## Cholesterol                        1.156e-04  6.346e-05   1.822   0.0684 .  
## Systolic_Pressure                  3.421e-04  1.945e-04   1.758   0.0787 .  
## Diastolic_Pressure                -2.657e-04  3.494e-04  -0.761   0.4469    
## `Heart Rate`                      -1.132e-04  2.494e-04  -0.454   0.6499    
## Diabetes                           1.794e-02  1.076e-02   1.667   0.0956 .  
## `Family History`                  -5.067e-04  1.026e-02  -0.049   0.9606    
## Smoking                           -1.271e-02  1.835e-02  -0.693   0.4884    
## Obesity                           -1.303e-02  1.025e-02  -1.270   0.2040    
## `Alcohol Consumption`             -1.371e-02  1.046e-02  -1.311   0.1900    
## `Exercise Hours Per Week`          9.257e-04  8.864e-04   1.044   0.2963    
## `Previous Heart Problems`          9.846e-04  1.025e-02   0.096   0.9235    
## `Medication Use`                   2.130e-03  1.025e-02   0.208   0.8354    
## `Stress Level`                    -7.319e-04  1.794e-03  -0.408   0.6832    
## `Sedentary Hours Per Day`         -9.079e-04  1.479e-03  -0.614   0.5394    
## Income                             5.523e-08  6.362e-08   0.868   0.3854    
## BMI                               -3.231e-05  8.112e-04  -0.040   0.9682    
## Triglycerides                      2.096e-05  2.292e-05   0.914   0.3605    
## `Physical Activity Days Per Week` -9.919e-04  2.246e-03  -0.442   0.6587    
## `Sleep Hours Per Day`             -4.314e-03  2.579e-03  -1.673   0.0944 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4795 on 8742 degrees of freedom
## Multiple R-squared:  0.002317,   Adjusted R-squared:  3.408e-05 
## F-statistic: 1.015 on 20 and 8742 DF,  p-value: 0.4396

Based on the summary of the full regression model, no predictor is statistically significant at the 0.05 level. The only predictors that show even weak evidence of an association with heart attack risk (p-values between 0.05 and 0.10) are: Cholesterol, Systolic_Pressure, Diabetes, and Sleep Hours Per Day. Now, let’s perform backward elimination:

# Backward elimination: refit the full model without Age
Heart_Attack.lm.2 <- update(
  Heart_Attack_lm.Full,
  . ~ . - Age,
  data = Heart_Attack
)
summary(Heart_Attack.lm.2)
## 
## Call:
## lm(formula = `Heart Attack Risk` ~ Cholesterol + Systolic_Pressure + 
##     Diastolic_Pressure + `Heart Rate` + Diabetes + `Family History` + 
##     Smoking + Obesity + `Alcohol Consumption` + `Exercise Hours Per Week` + 
##     `Previous Heart Problems` + `Medication Use` + `Stress Level` + 
##     `Sedentary Hours Per Day` + Income + BMI + Triglycerides + 
##     `Physical Activity Days Per Week` + `Sleep Hours Per Day`, 
##     data = Heart_Attack)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.4369 -0.3639 -0.3396  0.6280  0.7132 
## 
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                        3.383e-01  6.330e-02   5.345 9.29e-08 ***
## Cholesterol                        1.147e-04  6.345e-05   1.808   0.0706 .  
## Systolic_Pressure                  3.433e-04  1.945e-04   1.765   0.0777 .  
## Diastolic_Pressure                -2.674e-04  3.494e-04  -0.765   0.4441    
## `Heart Rate`                      -1.129e-04  2.494e-04  -0.453   0.6507    
## Diabetes                           1.779e-02  1.076e-02   1.654   0.0982 .  
## `Family History`                  -4.778e-04  1.026e-02  -0.047   0.9628    
## Smoking                           -6.615e-03  1.685e-02  -0.393   0.6947    
## Obesity                           -1.312e-02  1.025e-02  -1.280   0.2006    
## `Alcohol Consumption`             -1.382e-02  1.046e-02  -1.322   0.1863    
## `Exercise Hours Per Week`          9.269e-04  8.864e-04   1.046   0.2957    
## `Previous Heart Problems`          1.001e-03  1.025e-02   0.098   0.9222    
## `Medication Use`                   2.178e-03  1.025e-02   0.212   0.8318    
## `Stress Level`                    -7.014e-04  1.793e-03  -0.391   0.6957    
## `Sedentary Hours Per Day`         -8.924e-04  1.479e-03  -0.603   0.5463    
## Income                             5.504e-08  6.362e-08   0.865   0.3870    
## BMI                               -3.629e-05  8.111e-04  -0.045   0.9643    
## Triglycerides                      2.100e-05  2.292e-05   0.916   0.3596    
## `Physical Activity Days Per Week` -9.829e-04  2.246e-03  -0.438   0.6616    
## `Sleep Hours Per Day`             -4.314e-03  2.579e-03  -1.673   0.0944 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4795 on 8743 degrees of freedom
## Multiple R-squared:  0.002236,   Adjusted R-squared:  6.756e-05 
## F-statistic: 1.031 on 19 and 8743 DF,  p-value: 0.4197

There is a very small improvement in the p-value of the overall F-test; it goes from 0.4396 to 0.4197. Let’s continue with backward elimination:

# Backward elimination: refit the previous model without Diastolic_Pressure
Heart_Attack.lm.3 <- update(
  Heart_Attack.lm.2,
  . ~ . - Diastolic_Pressure,
  data = Heart_Attack
)
summary(Heart_Attack.lm.3)
## 
## Call:
## lm(formula = `Heart Attack Risk` ~ Cholesterol + Systolic_Pressure + 
##     `Heart Rate` + Diabetes + `Family History` + Smoking + Obesity + 
##     `Alcohol Consumption` + `Exercise Hours Per Week` + `Previous Heart Problems` + 
##     `Medication Use` + `Stress Level` + `Sedentary Hours Per Day` + 
##     Income + BMI + Triglycerides + `Physical Activity Days Per Week` + 
##     `Sleep Hours Per Day`, data = Heart_Attack)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.4328 -0.3640 -0.3400  0.6286  0.7095 
## 
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                        3.156e-01  5.595e-02   5.642 1.74e-08 ***
## Cholesterol                        1.146e-04  6.345e-05   1.807   0.0708 .  
## Systolic_Pressure                  3.412e-04  1.945e-04   1.754   0.0795 .  
## `Heart Rate`                      -1.095e-04  2.494e-04  -0.439   0.6607    
## Diabetes                           1.779e-02  1.076e-02   1.653   0.0983 .  
## `Family History`                  -6.201e-04  1.025e-02  -0.060   0.9518    
## Smoking                           -6.458e-03  1.685e-02  -0.383   0.7016    
## Obesity                           -1.296e-02  1.025e-02  -1.265   0.2061    
## `Alcohol Consumption`             -1.376e-02  1.046e-02  -1.316   0.1883    
## `Exercise Hours Per Week`          9.290e-04  8.864e-04   1.048   0.2946    
## `Previous Heart Problems`          9.315e-04  1.025e-02   0.091   0.9276    
## `Medication Use`                   2.138e-03  1.025e-02   0.209   0.8348    
## `Stress Level`                    -6.894e-04  1.793e-03  -0.385   0.7006    
## `Sedentary Hours Per Day`         -8.846e-04  1.479e-03  -0.598   0.5498    
## Income                             5.462e-08  6.362e-08   0.859   0.3906    
## BMI                               -3.677e-05  8.111e-04  -0.045   0.9638    
## Triglycerides                      2.097e-05  2.292e-05   0.915   0.3601    
## `Physical Activity Days Per Week` -1.011e-03  2.245e-03  -0.450   0.6526    
## `Sleep Hours Per Day`             -4.335e-03  2.579e-03  -1.681   0.0928 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4795 on 8744 degrees of freedom
## Multiple R-squared:  0.002169,   Adjusted R-squared:  0.0001149 
## F-statistic: 1.056 on 18 and 8744 DF,  p-value: 0.3916
# Backward elimination: refit the previous model without Heart Rate
Heart_Attack.lm.4 <- update(
  Heart_Attack.lm.3,
  . ~ . - `Heart Rate`,
  data = Heart_Attack
)
summary(Heart_Attack.lm.4)
## 
## Call:
## lm(formula = `Heart Attack Risk` ~ Cholesterol + Systolic_Pressure + 
##     Diabetes + `Family History` + Smoking + Obesity + `Alcohol Consumption` + 
##     `Exercise Hours Per Week` + `Previous Heart Problems` + `Medication Use` + 
##     `Stress Level` + `Sedentary Hours Per Day` + Income + BMI + 
##     Triglycerides + `Physical Activity Days Per Week` + `Sleep Hours Per Day`, 
##     data = Heart_Attack)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.4353 -0.3639 -0.3402  0.6285  0.7100 
## 
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                        3.076e-01  5.287e-02   5.818 6.16e-09 ***
## Cholesterol                        1.146e-04  6.344e-05   1.807   0.0708 .  
## Systolic_Pressure                  3.405e-04  1.945e-04   1.751   0.0801 .  
## Diabetes                           1.776e-02  1.076e-02   1.651   0.0989 .  
## `Family History`                  -5.618e-04  1.025e-02  -0.055   0.9563    
## Smoking                           -6.368e-03  1.685e-02  -0.378   0.7055    
## Obesity                           -1.302e-02  1.025e-02  -1.270   0.2040    
## `Alcohol Consumption`             -1.378e-02  1.046e-02  -1.317   0.1877    
## `Exercise Hours Per Week`          9.257e-04  8.863e-04   1.044   0.2963    
## `Previous Heart Problems`          9.545e-04  1.025e-02   0.093   0.9258    
## `Medication Use`                   2.094e-03  1.025e-02   0.204   0.8381    
## `Stress Level`                    -6.858e-04  1.793e-03  -0.382   0.7021    
## `Sedentary Hours Per Day`         -8.779e-04  1.479e-03  -0.594   0.5528    
## Income                             5.448e-08  6.362e-08   0.856   0.3918    
## BMI                               -3.864e-05  8.111e-04  -0.048   0.9620    
## Triglycerides                      2.085e-05  2.291e-05   0.910   0.3628    
## `Physical Activity Days Per Week` -1.012e-03  2.245e-03  -0.451   0.6523    
## `Sleep Hours Per Day`             -4.337e-03  2.579e-03  -1.682   0.0926 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4795 on 8745 degrees of freedom
## Multiple R-squared:  0.002147,   Adjusted R-squared:  0.0002072 
## F-statistic: 1.107 on 17 and 8745 DF,  p-value: 0.3394
# Backward elimination: refit the previous model without Family History
Heart_Attack.lm.5 <- update(
  Heart_Attack.lm.4,
  . ~ . - `Family History`,
  data = Heart_Attack
)
summary(Heart_Attack.lm.5)
## 
## Call:
## lm(formula = `Heart Attack Risk` ~ Cholesterol + Systolic_Pressure + 
##     Diabetes + Smoking + Obesity + `Alcohol Consumption` + `Exercise Hours Per Week` + 
##     `Previous Heart Problems` + `Medication Use` + `Stress Level` + 
##     `Sedentary Hours Per Day` + Income + BMI + Triglycerides + 
##     `Physical Activity Days Per Week` + `Sleep Hours Per Day`, 
##     data = Heart_Attack)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.4355 -0.3640 -0.3401  0.6286  0.7103 
## 
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                        3.073e-01  5.254e-02   5.848 5.15e-09 ***
## Cholesterol                        1.147e-04  6.343e-05   1.808   0.0706 .  
## Systolic_Pressure                  3.406e-04  1.945e-04   1.751   0.0799 .  
## Diabetes                           1.777e-02  1.076e-02   1.652   0.0986 .  
## Smoking                           -6.379e-03  1.685e-02  -0.379   0.7049    
## Obesity                           -1.302e-02  1.025e-02  -1.270   0.2040    
## `Alcohol Consumption`             -1.378e-02  1.046e-02  -1.318   0.1874    
## `Exercise Hours Per Week`          9.260e-04  8.862e-04   1.045   0.2961    
## `Previous Heart Problems`          9.570e-04  1.025e-02   0.093   0.9256    
## `Medication Use`                   2.094e-03  1.025e-02   0.204   0.8381    
## `Stress Level`                    -6.873e-04  1.793e-03  -0.383   0.7014    
## `Sedentary Hours Per Day`         -8.781e-04  1.479e-03  -0.594   0.5526    
## Income                             5.448e-08  6.361e-08   0.857   0.3917    
## BMI                               -3.813e-05  8.110e-04  -0.047   0.9625    
## Triglycerides                      2.085e-05  2.291e-05   0.910   0.3627    
## `Physical Activity Days Per Week` -1.013e-03  2.245e-03  -0.451   0.6519    
## `Sleep Hours Per Day`             -4.336e-03  2.578e-03  -1.682   0.0927 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4794 on 8746 degrees of freedom
## Multiple R-squared:  0.002147,   Adjusted R-squared:  0.0003212 
## F-statistic: 1.176 on 16 and 8746 DF,  p-value: 0.2786
# Backward elimination: refit the previous model without Smoking
Heart_Attack.lm.6 <- update(
  Heart_Attack.lm.5,
  . ~ . - Smoking,
  data = Heart_Attack
)
summary(Heart_Attack.lm.6)
## 
## Call:
## lm(formula = `Heart Attack Risk` ~ Cholesterol + Systolic_Pressure + 
##     Diabetes + Obesity + `Alcohol Consumption` + `Exercise Hours Per Week` + 
##     `Previous Heart Problems` + `Medication Use` + `Stress Level` + 
##     `Sedentary Hours Per Day` + Income + BMI + Triglycerides + 
##     `Physical Activity Days Per Week` + `Sleep Hours Per Day`, 
##     data = Heart_Attack)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.4363 -0.3641 -0.3402  0.6285  0.7097 
## 
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                        3.017e-01  5.040e-02   5.985 2.25e-09 ***
## Cholesterol                        1.143e-04  6.341e-05   1.803   0.0715 .  
## Systolic_Pressure                  3.413e-04  1.945e-04   1.755   0.0793 .  
## Diabetes                           1.777e-02  1.076e-02   1.652   0.0987 .  
## Obesity                           -1.304e-02  1.025e-02  -1.272   0.2033    
## `Alcohol Consumption`             -1.384e-02  1.046e-02  -1.323   0.1857    
## `Exercise Hours Per Week`          9.262e-04  8.862e-04   1.045   0.2960    
## `Previous Heart Problems`          9.594e-04  1.025e-02   0.094   0.9254    
## `Medication Use`                   2.138e-03  1.025e-02   0.209   0.8347    
## `Stress Level`                    -6.866e-04  1.793e-03  -0.383   0.7017    
## `Sedentary Hours Per Day`         -8.868e-04  1.478e-03  -0.600   0.5486    
## Income                             5.441e-08  6.361e-08   0.855   0.3924    
## BMI                               -4.041e-05  8.109e-04  -0.050   0.9603    
## Triglycerides                      2.082e-05  2.291e-05   0.909   0.3636    
## `Physical Activity Days Per Week` -1.007e-03  2.245e-03  -0.449   0.6537    
## `Sleep Hours Per Day`             -4.330e-03  2.578e-03  -1.680   0.0931 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4794 on 8747 degrees of freedom
## Multiple R-squared:  0.00213,    Adjusted R-squared:  0.0004191 
## F-statistic: 1.245 on 15 and 8747 DF,  p-value: 0.2292
# Backward elimination: refit the previous model without Obesity
Heart_Attack.lm.7 <- update(
  Heart_Attack.lm.6,
  . ~ . - Obesity,
  data = Heart_Attack
)
summary(Heart_Attack.lm.7)
## 
## Call:
## lm(formula = `Heart Attack Risk` ~ Cholesterol + Systolic_Pressure + 
##     Diabetes + `Alcohol Consumption` + `Exercise Hours Per Week` + 
##     `Previous Heart Problems` + `Medication Use` + `Stress Level` + 
##     `Sedentary Hours Per Day` + Income + BMI + Triglycerides + 
##     `Physical Activity Days Per Week` + `Sleep Hours Per Day`, 
##     data = Heart_Attack)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.4295 -0.3635 -0.3411  0.6290  0.7161 
## 
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                        2.946e-01  5.009e-02   5.880 4.25e-09 ***
## Cholesterol                        1.155e-04  6.341e-05   1.821   0.0686 .  
## Systolic_Pressure                  3.417e-04  1.945e-04   1.757   0.0789 .  
## Diabetes                           1.759e-02  1.076e-02   1.636   0.1020    
## `Alcohol Consumption`             -1.351e-02  1.045e-02  -1.293   0.1961    
## `Exercise Hours Per Week`          9.237e-04  8.862e-04   1.042   0.2973    
## `Previous Heart Problems`          8.868e-04  1.025e-02   0.087   0.9311    
## `Medication Use`                   2.219e-03  1.025e-02   0.216   0.8286    
## `Stress Level`                    -7.095e-04  1.793e-03  -0.396   0.6923    
## `Sedentary Hours Per Day`         -8.846e-04  1.478e-03  -0.598   0.5496    
## Income                             5.475e-08  6.361e-08   0.861   0.3894    
## BMI                               -3.468e-05  8.109e-04  -0.043   0.9659    
## Triglycerides                      2.077e-05  2.291e-05   0.907   0.3646    
## `Physical Activity Days Per Week` -1.023e-03  2.245e-03  -0.456   0.6486    
## `Sleep Hours Per Day`             -4.313e-03  2.578e-03  -1.673   0.0944 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4794 on 8748 degrees of freedom
## Multiple R-squared:  0.001946,   Adjusted R-squared:  0.0003484 
## F-statistic: 1.218 on 14 and 8748 DF,  p-value: 0.2536
# Backward elimination: refit the previous model without Alcohol Consumption
Heart_Attack.lm.8 <- update(
  Heart_Attack.lm.7,
  . ~ . - `Alcohol Consumption`,
  data = Heart_Attack
)
summary(Heart_Attack.lm.8)
## 
## Call:
## lm(formula = `Heart Attack Risk` ~ Cholesterol + Systolic_Pressure + 
##     Diabetes + `Exercise Hours Per Week` + `Previous Heart Problems` + 
##     `Medication Use` + `Stress Level` + `Sedentary Hours Per Day` + 
##     Income + BMI + Triglycerides + `Physical Activity Days Per Week` + 
##     `Sleep Hours Per Day`, data = Heart_Attack)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.4244 -0.3633 -0.3417  0.6302  0.7109 
## 
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                        2.866e-01  4.972e-02   5.765 8.44e-09 ***
## Cholesterol                        1.161e-04  6.341e-05   1.830   0.0672 .  
## Systolic_Pressure                  3.389e-04  1.945e-04   1.743   0.0814 .  
## Diabetes                           1.752e-02  1.076e-02   1.628   0.1035    
## `Exercise Hours Per Week`          9.338e-04  8.862e-04   1.054   0.2920    
## `Previous Heart Problems`          7.510e-04  1.025e-02   0.073   0.9416    
## `Medication Use`                   2.173e-03  1.025e-02   0.212   0.8321    
## `Stress Level`                    -6.969e-04  1.793e-03  -0.389   0.6974    
## `Sedentary Hours Per Day`         -8.605e-04  1.478e-03  -0.582   0.5606    
## Income                             5.663e-08  6.360e-08   0.890   0.3732    
## BMI                               -4.589e-05  8.109e-04  -0.057   0.9549    
## Triglycerides                      2.058e-05  2.291e-05   0.898   0.3690    
## `Physical Activity Days Per Week` -1.028e-03  2.245e-03  -0.458   0.6470    
## `Sleep Hours Per Day`             -4.311e-03  2.578e-03  -1.672   0.0945 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4794 on 8749 degrees of freedom
## Multiple R-squared:  0.001755,   Adjusted R-squared:  0.0002717 
## F-statistic: 1.183 on 13 and 8749 DF,  p-value: 0.2844
# Backward elimination: refit the previous model without Exercise Hours Per Week
Heart_Attack.lm.9 <- update(
  Heart_Attack.lm.8,
  . ~ . - `Exercise Hours Per Week`,
  data = Heart_Attack
)
summary(Heart_Attack.lm.9)
## 
## Call:
## lm(formula = `Heart Attack Risk` ~ Cholesterol + Systolic_Pressure + 
##     Diabetes + `Previous Heart Problems` + `Medication Use` + 
##     `Stress Level` + `Sedentary Hours Per Day` + Income + BMI + 
##     Triglycerides + `Physical Activity Days Per Week` + `Sleep Hours Per Day`, 
##     data = Heart_Attack)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.4198 -0.3634 -0.3426  0.6304  0.7020 
## 
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                        2.961e-01  4.891e-02   6.053 1.48e-09 ***
## Cholesterol                        1.175e-04  6.340e-05   1.853   0.0640 .  
## Systolic_Pressure                  3.370e-04  1.945e-04   1.733   0.0831 .  
## Diabetes                           1.744e-02  1.076e-02   1.621   0.1050    
## `Previous Heart Problems`          8.055e-04  1.025e-02   0.079   0.9374    
## `Medication Use`                   2.093e-03  1.025e-02   0.204   0.8382    
## `Stress Level`                    -7.127e-04  1.793e-03  -0.398   0.6909    
## `Sedentary Hours Per Day`         -8.469e-04  1.478e-03  -0.573   0.5667    
## Income                             5.506e-08  6.358e-08   0.866   0.3865    
## BMI                               -4.286e-05  8.109e-04  -0.053   0.9578    
## Triglycerides                      2.064e-05  2.291e-05   0.901   0.3677    
## `Physical Activity Days Per Week` -1.011e-03  2.245e-03  -0.450   0.6526    
## `Sleep Hours Per Day`             -4.317e-03  2.578e-03  -1.674   0.0941 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4794 on 8750 degrees of freedom
## Multiple R-squared:  0.001628,   Adjusted R-squared:  0.0002591 
## F-statistic: 1.189 on 12 and 8750 DF,  p-value: 0.2839
# Backward elimination: refit the previous model without Previous Heart Problems
Heart_Attack.lm.10 <- update(
  Heart_Attack.lm.9,
  . ~ . - `Previous Heart Problems`,
  data = Heart_Attack
)
summary(Heart_Attack.lm.10)
## 
## Call:
## lm(formula = `Heart Attack Risk` ~ Cholesterol + Systolic_Pressure + 
##     Diabetes + `Medication Use` + `Stress Level` + `Sedentary Hours Per Day` + 
##     Income + BMI + Triglycerides + `Physical Activity Days Per Week` + 
##     `Sleep Hours Per Day`, data = Heart_Attack)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.4194 -0.3634 -0.3426  0.6305  0.7024 
## 
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                        2.965e-01  4.861e-02   6.099 1.11e-09 ***
## Cholesterol                        1.174e-04  6.339e-05   1.852   0.0640 .  
## Systolic_Pressure                  3.369e-04  1.944e-04   1.733   0.0832 .  
## Diabetes                           1.744e-02  1.076e-02   1.621   0.1050    
## `Medication Use`                   2.097e-03  1.025e-02   0.205   0.8379    
## `Stress Level`                    -7.152e-04  1.792e-03  -0.399   0.6898    
## `Sedentary Hours Per Day`         -8.473e-04  1.478e-03  -0.573   0.5666    
## Income                             5.504e-08  6.358e-08   0.866   0.3866    
## BMI                               -4.186e-05  8.108e-04  -0.052   0.9588    
## Triglycerides                      2.060e-05  2.291e-05   0.899   0.3684    
## `Physical Activity Days Per Week` -1.009e-03  2.245e-03  -0.450   0.6530    
## `Sleep Hours Per Day`             -4.316e-03  2.578e-03  -1.674   0.0942 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4794 on 8751 degrees of freedom
## Multiple R-squared:  0.001628,   Adjusted R-squared:  0.0003727 
## F-statistic: 1.297 on 11 and 8751 DF,  p-value: 0.2188
# Backward elimination: refit the previous model without Medication Use
Heart_Attack.lm.11 <- update(
  Heart_Attack.lm.10,
  . ~ . - `Medication Use`,
  data = Heart_Attack
)
summary(Heart_Attack.lm.11)
## 
## Call:
## lm(formula = `Heart Attack Risk` ~ Cholesterol + Systolic_Pressure + 
##     Diabetes + `Stress Level` + `Sedentary Hours Per Day` + Income + 
##     BMI + Triglycerides + `Physical Activity Days Per Week` + 
##     `Sleep Hours Per Day`, data = Heart_Attack)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.4183 -0.3635 -0.3426  0.6303  0.7014 
## 
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                        2.976e-01  4.831e-02   6.160 7.61e-10 ***
## Cholesterol                        1.174e-04  6.339e-05   1.852   0.0640 .  
## Systolic_Pressure                  3.368e-04  1.944e-04   1.732   0.0832 .  
## Diabetes                           1.743e-02  1.076e-02   1.621   0.1051    
## `Stress Level`                    -7.149e-04  1.792e-03  -0.399   0.6899    
## `Sedentary Hours Per Day`         -8.404e-04  1.478e-03  -0.569   0.5696    
## Income                             5.500e-08  6.357e-08   0.865   0.3870    
## BMI                               -4.031e-05  8.107e-04  -0.050   0.9603    
## Triglycerides                      2.055e-05  2.290e-05   0.897   0.3696    
## `Physical Activity Days Per Week` -1.014e-03  2.244e-03  -0.452   0.6514    
## `Sleep Hours Per Day`             -4.327e-03  2.578e-03  -1.679   0.0933 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4794 on 8752 degrees of freedom
## Multiple R-squared:  0.001623,   Adjusted R-squared:  0.0004821 
## F-statistic: 1.423 on 10 and 8752 DF,  p-value: 0.1631
# Backward elimination: refit the previous model without Stress Level
Heart_Attack.lm.12 <- update(
  Heart_Attack.lm.11,
  . ~ . - `Stress Level`,
  data = Heart_Attack
)
summary(Heart_Attack.lm.12)
## 
## Call:
## lm(formula = `Heart Attack Risk` ~ Cholesterol + Systolic_Pressure + 
##     Diabetes + `Sedentary Hours Per Day` + Income + BMI + Triglycerides + 
##     `Physical Activity Days Per Week` + `Sleep Hours Per Day`, 
##     data = Heart_Attack)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.4156 -0.3632 -0.3427  0.6305  0.7010 
## 
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                        2.936e-01  4.725e-02   6.213 5.43e-10 ***
## Cholesterol                        1.180e-04  6.337e-05   1.863   0.0626 .  
## Systolic_Pressure                  3.354e-04  1.944e-04   1.726   0.0845 .  
## Diabetes                           1.741e-02  1.076e-02   1.618   0.1056    
## `Sedentary Hours Per Day`         -8.375e-04  1.478e-03  -0.567   0.5709    
## Income                             5.507e-08  6.357e-08   0.866   0.3863    
## BMI                               -3.930e-05  8.107e-04  -0.048   0.9613    
## Triglycerides                      2.059e-05  2.290e-05   0.899   0.3686    
## `Physical Activity Days Per Week` -1.021e-03  2.244e-03  -0.455   0.6490    
## `Sleep Hours Per Day`             -4.312e-03  2.577e-03  -1.673   0.0943 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4794 on 8753 degrees of freedom
## Multiple R-squared:  0.001605,   Adjusted R-squared:  0.0005781 
## F-statistic: 1.563 on 9 and 8753 DF,  p-value: 0.1201
# Backward elimination: refit the previous model without Income
Heart_Attack.lm.13 <- update(
  Heart_Attack.lm.12,
  . ~ . - Income,
  data = Heart_Attack
)
summary(Heart_Attack.lm.13)
## 
## Call:
## lm(formula = `Heart Attack Risk` ~ Cholesterol + Systolic_Pressure + 
##     Diabetes + `Sedentary Hours Per Day` + BMI + Triglycerides + 
##     `Physical Activity Days Per Week` + `Sleep Hours Per Day`, 
##     data = Heart_Attack)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.4154 -0.3632 -0.3426  0.6308  0.6966 
## 
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                        3.019e-01  4.627e-02   6.524 7.22e-11 ***
## Cholesterol                        1.180e-04  6.337e-05   1.862   0.0626 .  
## Systolic_Pressure                  3.371e-04  1.944e-04   1.735   0.0828 .  
## Diabetes                           1.740e-02  1.076e-02   1.618   0.1058    
## `Sedentary Hours Per Day`         -8.330e-04  1.478e-03  -0.564   0.5730    
## BMI                               -3.313e-05  8.106e-04  -0.041   0.9674    
## Triglycerides                      2.080e-05  2.290e-05   0.908   0.3637    
## `Physical Activity Days Per Week` -1.021e-03  2.244e-03  -0.455   0.6492    
## `Sleep Hours Per Day`             -4.326e-03  2.577e-03  -1.679   0.0933 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4794 on 8754 degrees of freedom
## Multiple R-squared:  0.001519,   Adjusted R-squared:  0.0006066 
## F-statistic: 1.665 on 8 and 8754 DF,  p-value: 0.1015
# Backward elimination: refit the previous model without BMI
Heart_Attack.lm.14 <- update(
  Heart_Attack.lm.13,
  . ~ . - BMI,
  data = Heart_Attack
)
summary(Heart_Attack.lm.14)
## 
## Call:
## lm(formula = `Heart Attack Risk` ~ Cholesterol + Systolic_Pressure + 
##     Diabetes + `Sedentary Hours Per Day` + Triglycerides + `Physical Activity Days Per Week` + 
##     `Sleep Hours Per Day`, data = Heart_Attack)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.4156 -0.3632 -0.3427  0.6307  0.6966 
## 
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                        3.009e-01  4.001e-02   7.520    6e-14 ***
## Cholesterol                        1.180e-04  6.335e-05   1.862   0.0626 .  
## Systolic_Pressure                  3.371e-04  1.944e-04   1.735   0.0829 .  
## Diabetes                           1.740e-02  1.075e-02   1.618   0.1057    
## `Sedentary Hours Per Day`         -8.329e-04  1.478e-03  -0.564   0.5730    
## Triglycerides                      2.081e-05  2.290e-05   0.909   0.3635    
## `Physical Activity Days Per Week` -1.022e-03  2.244e-03  -0.455   0.6489    
## `Sleep Hours Per Day`             -4.325e-03  2.577e-03  -1.678   0.0933 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4793 on 8755 degrees of freedom
## Multiple R-squared:  0.001519,   Adjusted R-squared:  0.0007206 
## F-statistic: 1.903 on 7 and 8755 DF,  p-value: 0.06487
# Backward elimination: refit the previous model without
# Physical Activity Days Per Week
Heart_Attack.lm.15 <- update(
  Heart_Attack.lm.14,
  . ~ . - `Physical Activity Days Per Week`,
  data = Heart_Attack
)
summary(Heart_Attack.lm.15)
## 
## Call:
## lm(formula = `Heart Attack Risk` ~ Cholesterol + Systolic_Pressure + 
##     Diabetes + `Sedentary Hours Per Day` + Triglycerides + `Sleep Hours Per Day`, 
##     data = Heart_Attack)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.4148 -0.3633 -0.3428  0.6311  0.6976 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                2.974e-01  3.927e-02   7.573    4e-14 ***
## Cholesterol                1.175e-04  6.334e-05   1.855   0.0636 .  
## Systolic_Pressure          3.378e-04  1.943e-04   1.738   0.0822 .  
## Diabetes                   1.741e-02  1.075e-02   1.619   0.1055    
## `Sedentary Hours Per Day` -8.285e-04  1.478e-03  -0.561   0.5750    
## Triglycerides              2.088e-05  2.290e-05   0.912   0.3618    
## `Sleep Hours Per Day`     -4.341e-03  2.576e-03  -1.685   0.0921 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4793 on 8756 degrees of freedom
## Multiple R-squared:  0.001495,   Adjusted R-squared:  0.000811 
## F-statistic: 2.185 on 6 and 8756 DF,  p-value: 0.04139

The p-value of the overall F-test is now less than 0.05 (0.04139), which is better. So, we can stop the backward elimination and use this model, even though every predictor that is left has an individual p-value greater than 0.05. The coefficient of each predictor (the slope, or rate of change) is very small. That indicates that a one-unit change in any of those predictors is associated with only a small change in the heart attack risk. Also, the Multiple R-squared value of 0.001495 indicates that only about 0.15% of the variance in Heart Attack Risk is explained by the predictor variables included in the model. In addition, the Adjusted R-squared value of 0.000811 means that, after adjusting for the number of predictors, approximately 0.0811% of the variance in the dependent variable (Heart Attack Risk) is explained by the model.

Residual Analysis:

# Draw the lm diagnostic plots for the final model in a 2 x 2 grid.
# par() returns the previous settings; restoring them afterwards keeps later
# plots from being squeezed into the same grid when this script is run
# outside of knitr.
old_par <- par(mfrow = c(2, 2))
plot(Heart_Attack.lm.15)
par(old_par)

# Define the formula with additional terms: a quadratic Cholesterol term and a
# Cholesterol-by-Diabetes interaction.
# Diabetes enters as a main effect only once. Adding as.factor(Diabetes) on top
# of the numeric 0/1 Diabetes column duplicates that column, and lm() then
# reports the extra term as NA ("not defined because of singularities").
formula <- formula(
  `Heart Attack Risk` ~ Cholesterol + Systolic_Pressure + Diabetes +
    `Sedentary Hours Per Day` + Triglycerides + `Sleep Hours Per Day` +
    I(Cholesterol^2) + Cholesterol:as.factor(Diabetes)
)

# Fit the linear regression model
model <- lm(formula, data = Heart_Attack)

# Print the summary of the model
summary(model)
## 
## Call:
## lm(formula = formula, data = Heart_Attack)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.4116 -0.3638 -0.3433  0.6308  0.7029 
## 
## Coefficients:
##                                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       2.848e-01  6.973e-02   4.084 4.47e-05 ***
## Cholesterol                       1.797e-04  4.642e-04   0.387   0.6987    
## Systolic_Pressure                 3.380e-04  1.944e-04   1.739   0.0821 .  
## Diabetes                          3.321e-02  3.633e-02   0.914   0.3607    
## `Sedentary Hours Per Day`        -8.418e-04  1.478e-03  -0.570   0.5690    
## Triglycerides                     2.070e-05  2.290e-05   0.904   0.3662    
## `Sleep Hours Per Day`            -4.370e-03  2.578e-03  -1.695   0.0901 .  
## I(Cholesterol^2)                 -4.336e-08  8.653e-07  -0.050   0.9600    
## Cholesterol:as.factor(Diabetes)1 -6.064e-05  1.332e-04  -0.455   0.6489    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4794 on 8754 degrees of freedom
## Multiple R-squared:  0.001519,   Adjusted R-squared:  0.0006066 
## F-statistic: 1.665 on 8 and 8754 DF,  p-value: 0.1015
# Residual analysis: pull the residuals out of the fitted extended model
residuals <- resid(model)

# Scatter the residuals against the fitted values to look for leftover structure
plot(x = fitted(model), y = residuals)

Based on this residual plot, I think that either the predictors already available in the data are not sufficient to explain heart attack risk, or the linear model is not a good enough fit for this data.

Different Models:

# Regress heart attack risk on a second-degree polynomial in cholesterol
poly_model <- lm(
  `Heart Attack Risk` ~ poly(Cholesterol, 2),
  data = Heart_Attack
)

# Prediction input: the observed cholesterol values, as a one-column data frame
new_data <- data.frame(Cholesterol = Heart_Attack[["Cholesterol"]])

# Model predictions for every observed cholesterol value
predictions <- predict(object = poly_model, newdata = new_data)

# Compare the observed risk (x-axis) with the predicted risk (y-axis)
plot(
  x = Heart_Attack[["Heart Attack Risk"]],
  y = predictions,
  main = "Actual vs. Predicted Heart Attack Risk",
  xlab = "Actual Heart Attack Risk",
  ylab = "Predicted Heart Attack Risk"
)
# Red 45-degree reference line (intercept 0, slope 1)
abline(0, 1, col = "red")

Conclusion:

In conclusion, based on the analysis, it seems that the predictors do not explain the variability in heart attack risk. Since the R-squared value is low, the predictors explain only a small percentage of the variance in the response variable.