Loading the dataset

library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.3     ✓ dplyr   1.0.3
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ── Conflicts ───────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
# Load the heart data set; readr auto-names the CSV's unnamed first column X1.
heartdata1 <- read_csv(file = "workableheartdataset.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   X1 = col_double(),
##   age = col_double(),
##   sex = col_double(),
##   cp = col_double(),
##   trestbps = col_double(),
##   chol = col_double(),
##   fbs = col_double(),
##   restecg = col_double(),
##   thalach = col_double(),
##   exang = col_double(),
##   oldpeak = col_double(),
##   slope = col_double(),
##   ca = col_double(),
##   thal = col_double(),
##   target = col_double(),
##   sexfactor = col_character(),
##   riskfactor = col_character(),
##   exangfactor = col_character(),
##   fbsfactor = col_character()
## )
# Preview the first rows to confirm the import.
heartdata1 %>% head()
## # A tibble: 6 x 19
##      X1   age   sex    cp trestbps  chol   fbs restecg thalach exang oldpeak
##   <dbl> <dbl> <dbl> <dbl>    <dbl> <dbl> <dbl>   <dbl>   <dbl> <dbl>   <dbl>
## 1     1    63     1     3      145   233     1       0     150     0     2.3
## 2     2    37     1     2      130   250     0       1     187     0     3.5
## 3     3    41     0     1      130   204     0       0     172     0     1.4
## 4     4    56     1     1      120   236     0       1     178     0     0.8
## 5     5    57     0     0      120   354     0       1     163     1     0.6
## 6     6    57     1     0      140   192     0       1     148     0     0.4
## # … with 8 more variables: slope <dbl>, ca <dbl>, thal <dbl>, target <dbl>,
## #   sexfactor <chr>, riskfactor <chr>, exangfactor <chr>, fbsfactor <chr>
# Recode the 0/1 fasting-blood-sugar indicator as a labelled factor,
# replacing the character column of the same name that came from the CSV.
heartdata1[["fbsfactor"]] <- with(
  heartdata1,
  factor(fbs, levels = c(0, 1), labels = c("No", "Yes"))
)

Numeric Response Variable: Blood Pressure: mmHg (millimeters of mercury)

Numeric Predictor Variable: Age: years

Categorical Predictor Variable: Fasting Blood Sugar: 0 = Fasted Blood Sugar <120 mg/dl, 1 = Fasted Blood Sugar >120 mg/dl

Simple Linear Model

# Simple linear regression of resting blood pressure on age.
BPmod <- lm(formula = trestbps ~ age, data = heartdata1)
BPmod %>% summary()
## 
## Call:
## lm(formula = trestbps ~ age, data = heartdata1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -38.439 -11.499  -1.044  10.192  67.495 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 102.2961     5.8906  17.366  < 2e-16 ***
## age           0.5394     0.1069   5.048 7.76e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16.87 on 301 degrees of freedom
## Multiple R-squared:  0.07804,    Adjusted R-squared:  0.07497 
## F-statistic: 25.48 on 1 and 301 DF,  p-value: 7.762e-07
# Scatterplot of blood pressure against age with the least-squares line.
ggplot(data = heartdata1, mapping = aes(x = age, y = trestbps)) +
  geom_point() +
  geom_smooth(se = FALSE, method = "lm") +
  labs(
    title = "Age and Blood Pressure",
    x = "Age (years)",
    y = "Resting Blood Pressure (mmHg)"
  ) +
  theme_classic()
## `geom_smooth()` using formula 'y ~ x'

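The positive linear relationship between age and resting blood pressure is statistically significant (p = 7.76E-07): resting blood pressure increases by about 0.54 mmHg per year of age, although age alone explains only about 7.8% of its variance (R-squared = 0.078).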

Model with Categorical Variable

# Regress resting blood pressure on the fasting-blood-sugar factor, then show
# the factor's dummy coding ("No" is the reference level).
BPmod2 <- lm(formula = trestbps ~ fbsfactor, data = heartdata1)
contrasts(heartdata1[["fbsfactor"]])
##     Yes
## No    0
## Yes   1
# Coefficient table for the categorical-predictor model.
BPmod2 %>% summary()
## 
## Call:
## lm(formula = trestbps ~ fbsfactor, data = heartdata1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -38.067 -10.326  -0.326   9.674  61.674 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   130.326      1.076  121.08  < 2e-16 ***
## fbsfactorYes    8.741      2.793    3.13  0.00192 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17.29 on 301 degrees of freedom
## Multiple R-squared:  0.03152,    Adjusted R-squared:  0.0283 
## F-statistic: 9.795 on 1 and 301 DF,  p-value: 0.001921
# ANOVA table for the categorical-predictor model.
BPmod2 %>% anova()
## Analysis of Variance Table
## 
## Response: trestbps
##            Df Sum Sq Mean Sq F value   Pr(>F)   
## fbsfactor   1   2928 2927.66  9.7954 0.001921 **
## Residuals 301  89963  298.88                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Boxplots of resting blood pressure for each fasting-blood-sugar group.
ggplot(heartdata1, aes(fbsfactor, trestbps, fill = fbsfactor)) +
  geom_boxplot() +
  scale_fill_manual(
    values = c("#0099FF", "#FF0033"),
    breaks = c("No", "Yes")
  ) +
  labs(
    title = "Fasted Blood Sugar and Blood Pressure",
    x = "Fasted Blood Sugar >120 mg/dl",
    y = "Resting Blood Pressure (mmHg)"
  ) +
  theme_classic()

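The difference in mean resting blood pressure between the two groups is statistically significant (p = 0.001921): the group with fasted blood sugar above 120 mg/dl averages about 8.74 mmHg higher than the group below 120 mg/dl.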

Multiple Linear Regression

# Parallel-lines model: one common age slope, a separate intercept per group.
multiBPmod <- lm(formula = trestbps ~ age + fbsfactor, data = heartdata1)
print(multiBPmod)
## 
## Call:
## lm(formula = trestbps ~ age + fbsfactor, data = heartdata1)
## 
## Coefficients:
##  (Intercept)           age  fbsfactorYes  
##     103.0867        0.5053        7.1782
# Scatterplot coloured by group, overlaid with the two parallel fitted lines.
# coef(multiBPmod) is ordered: (Intercept), age, fbsfactorYes.
ggplot(
  data = heartdata1,
  mapping = aes(x = age, y = trestbps, color = fbsfactor)
) +
  geom_point() +
  scale_color_manual(values = c("blue", "red")) +
  # "No" group (reference level): baseline intercept.
  geom_abline(
    intercept = coef(multiBPmod)[1],
    slope = coef(multiBPmod)[2],
    color = "blue", lwd = 1
  ) +
  # "Yes" group: intercept shifted by the fbsfactorYes coefficient, same slope.
  geom_abline(
    intercept = coef(multiBPmod)[1] + coef(multiBPmod)[3],
    slope = coef(multiBPmod)[2],
    color = "red", lwd = 1
  ) +
  labs(
    title = "Age, Fasted Blood Sugar, and Blood Pressure",
    x = "Age (years)",
    y = "Resting Blood Pressure (mmHg)",
    color = "Fasted Blood Sugar >120 mg/dl"
  ) +
  theme_classic()

Fasted Blood Sugar Below 120 mg/dl: Resting Blood Pressure = 0.5053(Age) + 103.0867

Fasted Blood Sugar Above 120 mg/dl: Resting Blood Pressure = 0.5053(Age) + 110.2649

# Interaction model: both the intercept and the age slope may differ by group.
multiBPmod1 <- lm(formula = trestbps ~ age * fbsfactor, data = heartdata1)
print(multiBPmod1)
## 
## Call:
## lm(formula = trestbps ~ age * fbsfactor, data = heartdata1)
## 
## Coefficients:
##      (Intercept)               age      fbsfactorYes  age:fbsfactorYes  
##         104.7495            0.4744          -12.3948            0.3451
# Coefficient table for the interaction model.
multiBPmod1 %>% summary()
## 
## Call:
## lm(formula = trestbps ~ age * fbsfactor, data = heartdata1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -40.540 -11.209  -1.608  10.866  61.753 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      104.7495     6.1135  17.134  < 2e-16 ***
## age                0.4744     0.1118   4.245 2.92e-05 ***
## fbsfactorYes     -12.3948    21.3758  -0.580    0.562    
## age:fbsfactorYes   0.3451     0.3738   0.923    0.357    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16.71 on 299 degrees of freedom
## Multiple R-squared:  0.1015, Adjusted R-squared:  0.09252 
## F-statistic: 11.26 on 3 and 299 DF,  p-value: 5.065e-07
# Scatterplot coloured by group, overlaid with the two fitted lines from the
# interaction model. coef(multiBPmod1) is ordered:
# (Intercept), age, fbsfactorYes, age:fbsfactorYes.
ggplot(
  data = heartdata1,
  mapping = aes(x = age, y = trestbps, color = fbsfactor)
) +
  geom_point() +
  scale_color_manual(values = c("blue", "red")) +
  # "No" group (reference level): baseline intercept and slope.
  geom_abline(
    intercept = coef(multiBPmod1)[1],
    slope = coef(multiBPmod1)[2],
    color = "blue", lwd = 1
  ) +
  # "Yes" group: baseline plus the group offsets for intercept and slope.
  geom_abline(
    intercept = coef(multiBPmod1)[1] + coef(multiBPmod1)[3],
    slope = coef(multiBPmod1)[2] + coef(multiBPmod1)[4],
    color = "red", lwd = 1
  ) +
  labs(
    # Use an explicit "\n" for the line break: a string literal continued onto
    # an indented source line would carry that indentation into the title.
    title = paste0(
      "The Interaction between Age and Fasted Blood Sugar,\n",
      "and its Impact on Resting Blood Pressure"
    ),
    x = "Age (years)",
    y = "Resting Blood Pressure (mmHg)",
    color = "Fasted Blood Sugar >120 mg/dl"
  ) +
  theme_classic()

Interaction model, Fasting Blood Sugar Below 120 mg/dl: Resting Blood Pressure = 0.4744(Age) + 104.7495

Interaction model, Fasting Blood Sugar Above 120 mg/dl: Resting Blood Pressure = 0.8195(Age) + 92.3547

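Should the non-significant terms of the model be kept? If they are dropped (that is, if the intercept difference and the interaction are both treated as zero), the line for Fasting Blood Sugar Above 120 mg/dl becomes identical to the line for the group below 120 mg/dl: Resting Blood Pressure = 0.4744(Age) + 104.7495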

# ANOVA table for the age-only model.
BPmod %>% anova()
## Analysis of Variance Table
## 
## Response: trestbps
##            Df Sum Sq Mean Sq F value    Pr(>F)    
## age         1   7249  7248.9  25.477 7.762e-07 ***
## Residuals 301  85642   284.5                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# ANOVA table for the fasting-blood-sugar-only model.
BPmod2 %>% anova()
## Analysis of Variance Table
## 
## Response: trestbps
##            Df Sum Sq Mean Sq F value   Pr(>F)   
## fbsfactor   1   2928 2927.66  9.7954 0.001921 **
## Residuals 301  89963  298.88                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# ANOVA table for the parallel-lines model.
multiBPmod %>% anova()
## Analysis of Variance Table
## 
## Response: trestbps
##            Df Sum Sq Mean Sq F value    Pr(>F)    
## age         1   7249  7248.9 25.9828 6.109e-07 ***
## fbsfactor   1   1945  1945.3  6.9726   0.00871 ** 
## Residuals 300  83697   279.0                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# ANOVA table for the interaction model.
multiBPmod1 %>% anova()
## Analysis of Variance Table
## 
## Response: trestbps
##                Df Sum Sq Mean Sq F value    Pr(>F)    
## age             1   7249  7248.9 25.9700 6.158e-07 ***
## fbsfactor       1   1945  1945.3  6.9692  0.008728 ** 
## age:fbsfactor   1    238   237.9  0.8522  0.356668    
## Residuals     299  83459   279.1                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The parallel-lines model has the lowest MSE (278.99) and both of its predictors are significant, which suggests that it fits the data best. The multiple-variable model that includes the interaction between age and fasted blood sugar has a similar MSE (279.1271), but the interaction is not significant (p = 0.3567); thus we believe that the multiple-variable model without the interaction is better at predicting resting blood pressure. The single-explanatory-variable models are both significant but have higher MSE values (284.5249 for age, 298.8804 for fasted blood sugar).

Conclusion

As stated above, both single-explanatory-variable models are significant, and in the multiple-variable model without interaction both age and fasting blood sugar are significant. In the multiple linear model with interaction included, the intercept and age slope of the reference group (Fasted Blood Sugar below 120 mg/dl) are significant, but the two terms that describe how the group above 120 mg/dl differs from it, namely the intercept difference (p = 0.562) and the slope difference, i.e. the interaction (p = 0.357), are not. Thus, we learned that when the interaction is not included, both age and fasting blood sugar are significantly related to resting blood pressure, whereas including the interaction does not significantly improve the model and raises the p-value of the fasting blood sugar term so that it is no longer significant. Perhaps the interaction between age and other categorical variables could have a more significant impact on resting blood pressure than the interaction with fasted blood sugar.