Loading the Dataset
library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.0.3 ✓ dplyr 1.0.3
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ───────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Load the heart data set; read_csv() comes from readr (attached by tidyverse).
heartdata1 <- read_csv(file = "workableheartdataset.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## X1 = col_double(),
## age = col_double(),
## sex = col_double(),
## cp = col_double(),
## trestbps = col_double(),
## chol = col_double(),
## fbs = col_double(),
## restecg = col_double(),
## thalach = col_double(),
## exang = col_double(),
## oldpeak = col_double(),
## slope = col_double(),
## ca = col_double(),
## thal = col_double(),
## target = col_double(),
## sexfactor = col_character(),
## riskfactor = col_character(),
## exangfactor = col_character(),
## fbsfactor = col_character()
## )
# Preview the first six rows to check the import.
head(x = heartdata1, n = 6L)
## # A tibble: 6 x 19
## X1 age sex cp trestbps chol fbs restecg thalach exang oldpeak
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 63 1 3 145 233 1 0 150 0 2.3
## 2 2 37 1 2 130 250 0 1 187 0 3.5
## 3 3 41 0 1 130 204 0 0 172 0 1.4
## 4 4 56 1 1 120 236 0 1 178 0 0.8
## 5 5 57 0 0 120 354 0 1 163 1 0.6
## 6 6 57 1 0 140 192 0 1 148 0 0.4
## # … with 8 more variables: slope <dbl>, ca <dbl>, thal <dbl>, target <dbl>,
## # sexfactor <chr>, riskfactor <chr>, exangfactor <chr>, fbsfactor <chr>
# Recode the 0/1 fasting-blood-sugar indicator as a labelled factor. This
# replaces the character column of the same name that was read from the CSV.
heartdata1[["fbsfactor"]] <- factor(
  heartdata1[["fbs"]],
  levels = c(0, 1),
  labels = c("No", "Yes")
)
Numeric Response Variable: Resting Blood Pressure (trestbps), measured in mmHg (millimeters of mercury)
Numeric Predictor Variable: Age (age), measured in years
Categorical Predictor Variable: Fasting Blood Sugar (fbs): 0 = fasting blood sugar ≤ 120 mg/dl ("No"), 1 = fasting blood sugar > 120 mg/dl ("Yes")
Simple Linear Model
# Simple linear regression of resting blood pressure on age.
BPmod <- lm(formula = trestbps ~ age, data = heartdata1)
summary(object = BPmod)
##
## Call:
## lm(formula = trestbps ~ age, data = heartdata1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -38.439 -11.499 -1.044 10.192 67.495
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 102.2961 5.8906 17.366 < 2e-16 ***
## age 0.5394 0.1069 5.048 7.76e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16.87 on 301 degrees of freedom
## Multiple R-squared: 0.07804, Adjusted R-squared: 0.07497
## F-statistic: 25.48 on 1 and 301 DF, p-value: 7.762e-07
# Scatter plot of resting blood pressure against age, overlaid with the
# fitted least-squares line (confidence band switched off).
ggplot(data = heartdata1, mapping = aes(x = age, y = trestbps)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  xlab("Age (years)") +
  ylab("Resting Blood Pressure (mmHg)") +
  ggtitle("Age and Blood Pressure") +
  theme_classic()
## `geom_smooth()` using formula 'y ~ x'

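The positive linear relationship between age and resting blood pressure is statistically significant (slope = 0.5394 mmHg per year of age, p = 7.76e-07), although age alone explains only about 7.8% of the variation in resting blood pressure (R² = 0.078).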
Model with Categorical Variable
# Regress resting blood pressure on the fasting-blood-sugar factor, then
# display the dummy coding R uses for that factor.
BPmod2 <- lm(formula = trestbps ~ fbsfactor, data = heartdata1)
contrasts(heartdata1[["fbsfactor"]])
## Yes
## No 0
## Yes 1
# Coefficient table and fit statistics for the fasting-blood-sugar model.
summary(object = BPmod2)
##
## Call:
## lm(formula = trestbps ~ fbsfactor, data = heartdata1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -38.067 -10.326 -0.326 9.674 61.674
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 130.326 1.076 121.08 < 2e-16 ***
## fbsfactorYes 8.741 2.793 3.13 0.00192 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.29 on 301 degrees of freedom
## Multiple R-squared: 0.03152, Adjusted R-squared: 0.0283
## F-statistic: 9.795 on 1 and 301 DF, p-value: 0.001921
# ANOVA table for the fasting-blood-sugar model.
anova(object = BPmod2)
## Analysis of Variance Table
##
## Response: trestbps
## Df Sum Sq Mean Sq F value Pr(>F)
## fbsfactor 1 2928 2927.66 9.7954 0.001921 **
## Residuals 301 89963 298.88
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Side-by-side boxplots of resting blood pressure for the two
# fasting-blood-sugar groups, with a manually chosen fill colour per group.
ggplot(
  data = heartdata1,
  mapping = aes(x = fbsfactor, y = trestbps, fill = fbsfactor)
) +
  geom_boxplot() +
  scale_fill_manual(
    breaks = c("No", "Yes"),
    values = c("#0099FF", "#FF0033")
  ) +
  labs(
    x = "Fasted Blood Sugar >120 mg/dl",
    y = "Resting Blood Pressure (mmHg)",
    title = "Fasted Blood Sugar and Blood Pressure"
  ) +
  theme_classic()

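The difference in mean resting blood pressure between the two fasting-blood-sugar groups is statistically significant: the group with fasting blood sugar above 120 mg/dl averages 8.741 mmHg higher than the group below (p = 0.001921).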
Multiple Linear Regression
# Additive (parallel-lines) model: age plus the fasting-blood-sugar factor.
multiBPmod <- lm(formula = trestbps ~ age + fbsfactor, data = heartdata1)
print(multiBPmod)
##
## Call:
## lm(formula = trestbps ~ age + fbsfactor, data = heartdata1)
##
## Coefficients:
## (Intercept) age fbsfactorYes
## 103.0867 0.5053 7.1782
# Parallel-lines plot: points coloured by fasting-blood-sugar group, with one
# fitted line per group drawn from the additive model. Both lines share the
# age slope; the "Yes" line's intercept is shifted by the fbsfactorYes
# coefficient.
ggplot(
  data = heartdata1,
  mapping = aes(x = age, y = trestbps, color = fbsfactor)
) +
  geom_point() +
  scale_color_manual(values = c("blue", "red")) +
  # Fitted line for the "No" (reference) group.
  geom_abline(
    intercept = coef(multiBPmod)["(Intercept)"],
    slope = coef(multiBPmod)["age"],
    color = "blue",
    lwd = 1
  ) +
  # Fitted line for the "Yes" group.
  geom_abline(
    intercept = coef(multiBPmod)["(Intercept)"] +
      coef(multiBPmod)["fbsfactorYes"],
    slope = coef(multiBPmod)["age"],
    color = "red",
    lwd = 1
  ) +
  labs(
    x = "Age (years)",
    y = "Resting Blood Pressure (mmHg)",
    color = "Fasted Blood Sugar >120 mg/dl",
    title = "Age, Fasted Blood Sugar, and Blood Pressure"
  ) +
  theme_classic()

Fasted Blood Sugar Below 120 mg/dl: Resting Blood Pressure = 0.5053(Age) + 103.0867
Fasted Blood Sugar Above 120 mg/dl: Resting Blood Pressure = 0.5053(Age) + 110.2649
# Interaction model: age, fasting-blood-sugar factor, and their interaction,
# so each group gets its own intercept and slope.
multiBPmod1 <- lm(formula = trestbps ~ age * fbsfactor, data = heartdata1)
print(multiBPmod1)
##
## Call:
## lm(formula = trestbps ~ age * fbsfactor, data = heartdata1)
##
## Coefficients:
## (Intercept) age fbsfactorYes age:fbsfactorYes
## 104.7495 0.4744 -12.3948 0.3451
# Coefficient table and fit statistics for the interaction model.
summary(object = multiBPmod1)
##
## Call:
## lm(formula = trestbps ~ age * fbsfactor, data = heartdata1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -40.540 -11.209 -1.608 10.866 61.753
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 104.7495 6.1135 17.134 < 2e-16 ***
## age 0.4744 0.1118 4.245 2.92e-05 ***
## fbsfactorYes -12.3948 21.3758 -0.580 0.562
## age:fbsfactorYes 0.3451 0.3738 0.923 0.357
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16.71 on 299 degrees of freedom
## Multiple R-squared: 0.1015, Adjusted R-squared: 0.09252
## F-statistic: 11.26 on 3 and 299 DF, p-value: 5.065e-07
# Interaction plot: points coloured by fasting-blood-sugar group, with one
# fitted line per group drawn from the interaction model. The "Yes" line adds
# the fbsfactorYes coefficient to the intercept and the age:fbsfactorYes
# coefficient to the slope.
ggplot(
  data = heartdata1,
  mapping = aes(x = age, y = trestbps, color = fbsfactor)
) +
  geom_point() +
  scale_color_manual(values = c("blue", "red")) +
  # Fitted line for the "No" (reference) group.
  geom_abline(
    intercept = coef(multiBPmod1)["(Intercept)"],
    slope = coef(multiBPmod1)["age"],
    color = "blue",
    lwd = 1
  ) +
  # Fitted line for the "Yes" group.
  geom_abline(
    intercept = coef(multiBPmod1)["(Intercept)"] +
      coef(multiBPmod1)["fbsfactorYes"],
    slope = coef(multiBPmod1)["age"] +
      coef(multiBPmod1)["age:fbsfactorYes"],
    color = "red",
    lwd = 1
  ) +
  labs(
    x = "Age (years)",
    y = "Resting Blood Pressure (mmHg)",
    color = "Fasted Blood Sugar >120 mg/dl",
    title = "The Interaction between Age and Fasted Blood Sugar,
and its Impact on Resting Blood Pressure"
  ) +
  theme_classic()

Interaction model, Fasting Blood Sugar Below 120 mg/dl: Resting Blood Pressure = 0.4744(Age) + 104.7495
Interaction model, Fasting Blood Sugar Above 120 mg/dl: Resting Blood Pressure = 0.8195(Age) + 92.3547
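Should we keep the non-significant terms of the model (fbsfactorYes, p = 0.562; age:fbsfactorYes, p = 0.357)? If they are treated as zero, the fitted line for Fasting Blood Sugar Above 120 mg/dl coincides with the line for the group below 120 mg/dl: Resting Blood Pressure = 0.4744(Age) + 104.7495. (Strictly, dropping terms requires refitting the reduced model rather than reusing these coefficients.)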
# ANOVA table for the age-only model.
anova(object = BPmod)
## Analysis of Variance Table
##
## Response: trestbps
## Df Sum Sq Mean Sq F value Pr(>F)
## age 1 7249 7248.9 25.477 7.762e-07 ***
## Residuals 301 85642 284.5
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# ANOVA table for the fasting-blood-sugar-only model.
anova(object = BPmod2)
## Analysis of Variance Table
##
## Response: trestbps
## Df Sum Sq Mean Sq F value Pr(>F)
## fbsfactor 1 2928 2927.66 9.7954 0.001921 **
## Residuals 301 89963 298.88
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# ANOVA table for the additive (parallel-lines) model.
anova(object = multiBPmod)
## Analysis of Variance Table
##
## Response: trestbps
## Df Sum Sq Mean Sq F value Pr(>F)
## age 1 7249 7248.9 25.9828 6.109e-07 ***
## fbsfactor 1 1945 1945.3 6.9726 0.00871 **
## Residuals 300 83697 279.0
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# ANOVA table for the interaction model.
anova(object = multiBPmod1)
## Analysis of Variance Table
##
## Response: trestbps
## Df Sum Sq Mean Sq F value Pr(>F)
## age 1 7249 7248.9 25.9700 6.158e-07 ***
## fbsfactor 1 1945 1945.3 6.9692 0.008728 **
## age:fbsfactor 1 238 237.9 0.8522 0.356668
## Residuals 299 83459 279.1
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The parallel-lines model (age and fasted blood sugar, no interaction) has the lowest residual mean square, or MSE (278.99), and all of its p-values are significant, which suggests that it fits the data best. The multiple variable model that includes the interaction between age and fasted blood sugar has a similar MSE (279.1271), but the interaction term is not significant (p = 0.3567); thus we believe that the multiple variable model without the interaction is better at predicting resting blood pressure. The single explanatory variable models are both significant but have higher MSE values (284.5249 for age, 298.8804 for fasted blood sugar).
Conclusion
As stated above, the single explanatory variable models are both significant, and in the multiple variable model without interaction all p-values were significant. In the multiple linear model with interaction included, the intercept and age slope for the reference group (Fasted Blood Sugar below 120 mg/dl) were significant, but the terms that measure how the intercept and slope differ for the group above 120 mg/dl (fbsfactorYes and age:fbsfactorYes) were not. Thus, we learned that when the interaction is not included, both age and fasting blood sugar are significantly related to resting blood pressure, but adding the interaction does not significantly improve the model and raises the p-values of the fasting blood sugar terms above the significance threshold. Perhaps the interaction between age and other categorical variables could have a more significant impact on resting blood pressure than the interaction with fasted blood sugar.