# Ask the user to pick the CSV interactively. The first row supplies the
# column names and character columns are converted to factors on import.
dat <- read.csv(
  file = file.choose(),
  header = TRUE,
  stringsAsFactors = TRUE
)
# dplyr supplies na_if(), which the cleaning statements that follow rely on.
library(dplyr)
# Columns in which an empty string stands for "no answer": replace those
# blanks with NA, one column at a time.
blank_cols <- c(
  "x6.regular.irregular.sleep",
  "smoking..Yes.No.Some.",
  "Degree.Seeking..MS.PhD.",
  "Dietary..Veg.NoVeg.",
  "Coffee..Yes.No.Some.",
  "HistHeartDisease..Yes.No.",
  "Alcohol..Yes.No.Some."
)
dat[blank_cols] <- lapply(dat[blank_cols], function(col) na_if(col, ""))
# Continent codes were entered with inconsistent capitalisation: the recorded
# fit reports "NAm" and "NAM" as two separate levels. Upper-case the codes
# before building the factor so that each continent is a single level.
# NOTE(review): assumes "NAm" and "NAM" denote the same continent - confirm
# against the raw data. The stored model summaries were produced with the
# split levels and need to be regenerated after this change.
dat$x1.Continent <- factor(toupper(as.character(dat$x1.Continent)))
# Linear model of the response y on continent alone, followed by its
# coefficient table and overall F-test.
model <- lm(formula = y ~ x1.Continent, data = dat)
summary(model)
##
## Call:
## lm(formula = y ~ x1.Continent, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22.556 -8.200 -0.556 5.444 43.444
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 72.200 3.568 20.238 <2e-16 ***
## x1.ContinentAS 4.356 3.884 1.121 0.265
## x1.ContinentEU -4.771 5.560 -0.858 0.393
## x1.ContinentNAm 6.050 5.351 1.131 0.262
## x1.ContinentNAM 1.800 7.426 0.242 0.809
## x1.ContinentSA -1.400 6.179 -0.227 0.821
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.28 on 81 degrees of freedom
## Multiple R-squared: 0.07082, Adjusted R-squared: 0.01347
## F-statistic: 1.235 on 5 and 81 DF, p-value: 0.3005
No.
Neither any continent level nor the overall regression is significant: the model has an R-squared of 0.07 and an overall p-value of 0.30.
# Store exercise frequency as an integer score so that the model below
# estimates a single slope for it.
dat[["x2...Exercize.Freq"]] <- as.integer(dat[["x2...Exercize.Freq"]])
model2 <- lm(formula = y ~ x2...Exercize.Freq, data = dat)
summary(model2)
##
## Call:
## lm(formula = y ~ x2...Exercize.Freq, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -23.419 -7.133 -2.847 6.724 44.867
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 74.2742 3.3715 22.030 <2e-16 ***
## x2...Exercize.Freq 0.2863 1.1483 0.249 0.804
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.42 on 85 degrees of freedom
## Multiple R-squared: 0.0007309, Adjusted R-squared: -0.01103
## F-statistic: 0.06217 on 1 and 85 DF, p-value: 0.8037
No.
Similarly, exercise frequency with a p-value of 0.80 and R-squared of 0.0007 is not significant.
# Re-code exercise frequency as an ordered factor with levels 1 to 5. With an
# ordered factor, lm() reports polynomial contrast terms (.L, .Q, .C, ^4)
# instead of a single slope.
dat[["x2...Exercize.Freq"]] <- factor(
  dat[["x2...Exercize.Freq"]],
  levels = c("1", "2", "3", "4", "5"),
  ordered = TRUE
)
model1 <- lm(formula = y ~ x2...Exercize.Freq, data = dat)
summary(model1)
##
## Call:
## lm(formula = y ~ x2...Exercize.Freq, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22.579 -6.900 -2.773 5.921 45.100
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 75.2555 1.6999 44.269 <2e-16 ***
## x2...Exercize.Freq.L 2.2415 4.8161 0.465 0.643
## x2...Exercize.Freq.Q -0.2941 4.2235 -0.070 0.945
## x2...Exercize.Freq.C 2.8551 3.2718 0.873 0.385
## x2...Exercize.Freq^4 -0.7149 2.4614 -0.290 0.772
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.55 on 82 degrees of freedom
## Multiple R-squared: 0.01425, Adjusted R-squared: -0.03384
## F-statistic: 0.2963 on 4 and 82 DF, p-value: 0.8796
No.
Treating exercise frequency as an ordered factor (a varied marginal response) does not fix the issue, and our model is still not significant. Our results show an R-squared of 0.01 and a p-value of 0.88.
# Convert exercise frequency back to a numeric score. as.integer() applied
# directly to a factor returns the internal level codes, which equal the
# labels here only because the levels are exactly "1".."5" in that order.
# Going through as.character() converts the labels themselves, so the result
# no longer depends on that coincidence.
dat$x2...Exercize.Freq <- as.integer(as.character(dat$x2...Exercize.Freq))

# Full model: starting point for the backward elimination that follows.
# Rows with a missing value in any listed variable are dropped by lm().
# NOTE(review): Dietary..Veg.NoVeg. is cleaned earlier in this file but is not
# part of this formula - confirm the omission is intended.
model3 <- lm(
  formula = y ~ x1.Continent + x2...Exercize.Freq + x3...Gender +
    x4...hrs.of.sleep + x5.early.night + x6.regular.irregular.sleep +
    smoking..Yes.No.Some. + Degree.Seeking..MS.PhD. + Coffee..Yes.No.Some. +
    HistHeartDisease..Yes.No. + Alcohol..Yes.No.Some.,
  data = dat
)
summary(model3)
##
## Call:
## lm(formula = y ~ x1.Continent + x2...Exercize.Freq + x3...Gender +
## x4...hrs.of.sleep + x5.early.night + x6.regular.irregular.sleep +
## smoking..Yes.No.Some. + Degree.Seeking..MS.PhD. + Coffee..Yes.No.Some. +
## HistHeartDisease..Yes.No. + Alcohol..Yes.No.Some., data = dat)
##
## Residuals:
## 68 69 70 71 72 73 74
## 3.192e-16 -3.257e+00 -1.276e+00 -5.068e+00 4.486e+00 -2.979e+00 -1.229e+00
## 75 76 77 78 79 80 81
## 1.229e+00 -2.297e+00 1.536e+00 -6.022e+00 1.981e+00 -1.113e+00 5.108e+00
## 82 83 84 85 86 87
## 5.017e+00 5.276e+00 4.047e+00 -2.557e+00 -3.249e+00 3.685e-01
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.019 23.360 0.215 0.8404
## x1.ContinentEU -11.183 7.864 -1.422 0.2281
## x1.ContinentNAM -12.903 8.000 -1.613 0.1821
## x1.ContinentSA -79.163 17.825 -4.441 0.0113 *
## x2...Exercize.Freq 14.108 3.217 4.386 0.0118 *
## x3...GenderM -3.320 5.038 -0.659 0.5459
## x4...hrs.of.sleep 3.867 2.494 1.550 0.1960
## x5.early.night -14.407 7.177 -2.007 0.1151
## x6.regular.irregular.sleepreg 9.535 6.481 1.471 0.2152
## smoking..Yes.No.Some.Yes -3.762 6.184 -0.608 0.5758
## Degree.Seeking..MS.PhD.PHD 22.831 8.440 2.705 0.0538 .
## Coffee..Yes.No.Some.Some 11.275 6.484 1.739 0.1570
## Coffee..Yes.No.Some.Yes -3.743 7.268 -0.515 0.6337
## HistHeartDisease..Yes.No.Yes 29.501 9.250 3.189 0.0332 *
## Alcohol..Yes.No.Some.Some 6.297 5.011 1.257 0.2773
## Alcohol..Yes.No.Some.Yes -23.372 7.548 -3.096 0.0364 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.611 on 4 degrees of freedom
## (67 observations deleted due to missingness)
## Multiple R-squared: 0.9464, Adjusted R-squared: 0.7453
## F-statistic: 4.707 on 15 and 4 DF, p-value: 0.07254
Since coffee has the highest p-value, we will remove it from the model.
# Elimination step: the full set of predictors without the coffee term.
model4 <- lm(
  formula = y ~ x1.Continent + x2...Exercize.Freq + x3...Gender +
    x4...hrs.of.sleep + x5.early.night + x6.regular.irregular.sleep +
    smoking..Yes.No.Some. + Degree.Seeking..MS.PhD. +
    HistHeartDisease..Yes.No. + Alcohol..Yes.No.Some.,
  data = dat
)
summary(model4)
##
## Call:
## lm(formula = y ~ x1.Continent + x2...Exercize.Freq + x3...Gender +
## x4...hrs.of.sleep + x5.early.night + x6.regular.irregular.sleep +
## smoking..Yes.No.Some. + Degree.Seeking..MS.PhD. + HistHeartDisease..Yes.No. +
## Alcohol..Yes.No.Some., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.6130 -3.9484 -0.3689 4.3660 9.2066
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 25.1710 22.9790 1.095 0.31536
## x1.ContinentEU -13.2748 8.6513 -1.534 0.17582
## x1.ContinentNAM -13.2314 8.0811 -1.637 0.15268
## x1.ContinentSA -59.0450 14.6480 -4.031 0.00687 **
## x2...Exercize.Freq 12.9035 3.4158 3.778 0.00921 **
## x3...GenderM -0.7075 5.4458 -0.130 0.90087
## x4...hrs.of.sleep 1.9789 2.4952 0.793 0.45794
## x5.early.night -10.8964 7.1173 -1.531 0.17666
## x6.regular.irregular.sleepreg 8.5075 6.8505 1.242 0.26062
## smoking..Yes.No.Some.Yes -6.2974 6.4702 -0.973 0.36800
## Degree.Seeking..MS.PhD.PHD 12.9007 6.6514 1.940 0.10050
## HistHeartDisease..Yes.No.Yes 32.9897 8.6270 3.824 0.00872 **
## Alcohol..Yes.No.Some.Some 2.5027 5.0868 0.492 0.64020
## Alcohol..Yes.No.Some.Yes -27.6017 8.0810 -3.416 0.01422 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.557 on 6 degrees of freedom
## (67 observations deleted due to missingness)
## Multiple R-squared: 0.8984, Adjusted R-squared: 0.6781
## F-statistic: 4.079 on 13 and 6 DF, p-value: 0.04716
Next, we will remove the gender factor, as it has the highest p-value.
# Elimination step: coffee and gender are no longer in the formula.
model5 <- lm(
  formula = y ~ x1.Continent + x2...Exercize.Freq + x4...hrs.of.sleep +
    x5.early.night + x6.regular.irregular.sleep + smoking..Yes.No.Some. +
    Degree.Seeking..MS.PhD. + HistHeartDisease..Yes.No. +
    Alcohol..Yes.No.Some.,
  data = dat
)
summary(model5)
##
## Call:
## lm(formula = y ~ x1.Continent + x2...Exercize.Freq + x4...hrs.of.sleep +
## x5.early.night + x6.regular.irregular.sleep + smoking..Yes.No.Some. +
## Degree.Seeking..MS.PhD. + HistHeartDisease..Yes.No. + Alcohol..Yes.No.Some.,
## data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.9877 -3.8623 -0.4474 4.1897 9.0894
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 24.164 20.057 1.205 0.26743
## x1.ContinentEU -13.075 7.894 -1.656 0.14160
## x1.ContinentNAM -13.440 7.343 -1.830 0.10988
## x1.ContinentSA -59.427 13.304 -4.467 0.00291 **
## x2...Exercize.Freq 13.099 2.845 4.605 0.00247 **
## x4...hrs.of.sleep 1.983 2.313 0.857 0.41978
## x5.early.night -11.225 6.168 -1.820 0.11160
## x6.regular.irregular.sleepreg 8.768 6.073 1.444 0.19198
## smoking..Yes.No.Some.Yes -6.560 5.698 -1.151 0.28738
## Degree.Seeking..MS.PhD.PHD 13.223 5.723 2.310 0.05415 .
## HistHeartDisease..Yes.No.Yes 33.512 7.078 4.735 0.00212 **
## Alcohol..Yes.No.Some.Some 2.562 4.697 0.546 0.60230
## Alcohol..Yes.No.Some.Yes -28.042 6.801 -4.123 0.00444 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.933 on 7 degrees of freedom
## (67 observations deleted due to missingness)
## Multiple R-squared: 0.8981, Adjusted R-squared: 0.7233
## F-statistic: 5.14 on 12 and 7 DF, p-value: 0.01919
By removing these two factors, our model's p-value has improved. Still, next we will remove hours of sleep.
# Elimination step: hours of sleep is dropped from the formula.
model6 <- lm(
  formula = y ~ x1.Continent + x2...Exercize.Freq + x5.early.night +
    x6.regular.irregular.sleep + smoking..Yes.No.Some. +
    Degree.Seeking..MS.PhD. + HistHeartDisease..Yes.No. +
    Alcohol..Yes.No.Some.,
  data = dat
)
summary(model6)
##
## Call:
## lm(formula = y ~ x1.Continent + x2...Exercize.Freq + x5.early.night +
## x6.regular.irregular.sleep + smoking..Yes.No.Some. + Degree.Seeking..MS.PhD. +
## HistHeartDisease..Yes.No. + Alcohol..Yes.No.Some., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.9932 -3.5178 -0.6934 4.4196 9.2893
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39.732 8.363 4.751 0.00144 **
## x1.ContinentEU -11.013 7.392 -1.490 0.17459
## x1.ContinentNAM -11.486 6.863 -1.674 0.13274
## x1.ContinentSA -58.367 13.025 -4.481 0.00205 **
## x2...Exercize.Freq 12.183 2.592 4.700 0.00154 **
## x5.early.night -10.562 6.017 -1.755 0.11729
## x6.regular.irregular.sleepreg 8.535 5.965 1.431 0.19036
## smoking..Yes.No.Some.Yes -7.195 5.555 -1.295 0.23138
## Degree.Seeking..MS.PhD.PHD 14.370 5.471 2.626 0.03035 *
## HistHeartDisease..Yes.No.Yes 31.644 6.621 4.779 0.00139 **
## Alcohol..Yes.No.Some.Some 3.219 4.556 0.706 0.49995
## Alcohol..Yes.No.Some.Yes -27.556 6.664 -4.135 0.00328 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.8 on 8 degrees of freedom
## (67 observations deleted due to missingness)
## Multiple R-squared: 0.8874, Adjusted R-squared: 0.7325
## F-statistic: 5.73 on 11 and 8 DF, p-value: 0.01002
Next, we will remove smoking from the model.
# Elimination step: smoking is dropped from the formula.
model7 <- lm(
  formula = y ~ x1.Continent + x2...Exercize.Freq + x5.early.night +
    x6.regular.irregular.sleep + Degree.Seeking..MS.PhD. +
    HistHeartDisease..Yes.No. + Alcohol..Yes.No.Some.,
  data = dat
)
summary(model7)
##
## Call:
## lm(formula = y ~ x1.Continent + x2...Exercize.Freq + x5.early.night +
## x6.regular.irregular.sleep + Degree.Seeking..MS.PhD. + HistHeartDisease..Yes.No. +
## Alcohol..Yes.No.Some., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.4084 -4.2455 0.4463 4.3035 8.0047
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 35.974 8.133 4.423 0.00166 **
## x1.ContinentEU -9.788 7.602 -1.288 0.23003
## x1.ContinentNAM -11.263 7.114 -1.583 0.14783
## x1.ContinentSA -57.860 13.500 -4.286 0.00203 **
## x2...Exercize.Freq 12.356 2.684 4.603 0.00128 **
## x5.early.night -9.512 6.183 -1.539 0.15829
## x6.regular.irregular.sleepreg 10.899 5.888 1.851 0.09721 .
## Degree.Seeking..MS.PhD.PHD 14.563 5.671 2.568 0.03030 *
## HistHeartDisease..Yes.No.Yes 30.886 6.839 4.516 0.00145 **
## Alcohol..Yes.No.Some.Some 4.349 4.637 0.938 0.37281
## Alcohol..Yes.No.Some.Yes -26.056 6.805 -3.829 0.00404 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.088 on 9 degrees of freedom
## (67 observations deleted due to missingness)
## Multiple R-squared: 0.8638, Adjusted R-squared: 0.7124
## F-statistic: 5.706 on 10 and 9 DF, p-value: 0.007552
Next, we will remove the early-night factor.
# Elimination step: the early-night term is dropped from the formula.
model8 <- lm(
  formula = y ~ x1.Continent + x2...Exercize.Freq +
    x6.regular.irregular.sleep + Degree.Seeking..MS.PhD. +
    HistHeartDisease..Yes.No. + Alcohol..Yes.No.Some.,
  data = dat
)
summary(model8)
##
## Call:
## lm(formula = y ~ x1.Continent + x2...Exercize.Freq + x6.regular.irregular.sleep +
## Degree.Seeking..MS.PhD. + HistHeartDisease..Yes.No. + Alcohol..Yes.No.Some.,
## data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.0091 -4.8454 0.2928 4.8773 12.4265
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36.224 8.669 4.178 0.00189 **
## x1.ContinentEU -10.116 8.102 -1.249 0.24026
## x1.ContinentNAM -11.623 7.581 -1.533 0.15625
## x1.ContinentSA -47.538 12.490 -3.806 0.00345 **
## x2...Exercize.Freq 10.698 2.621 4.081 0.00221 **
## x6.regular.irregular.sleepreg 11.656 6.256 1.863 0.09204 .
## Degree.Seeking..MS.PhD.PHD 9.867 5.096 1.936 0.08161 .
## HistHeartDisease..Yes.No.Yes 25.007 6.047 4.135 0.00203 **
## Alcohol..Yes.No.Some.Some 4.383 4.944 0.887 0.39617
## Alcohol..Yes.No.Some.Yes -22.180 6.740 -3.291 0.00814 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.624 on 10 degrees of freedom
## (67 observations deleted due to missingness)
## Multiple R-squared: 0.8279, Adjusted R-squared: 0.6731
## F-statistic: 5.346 on 9 and 10 DF, p-value: 0.007528
Thereafter, we will remove the regular/irregular sleep factor.
# Elimination step: the regular/irregular sleep term is dropped.
model9 <- lm(
  formula = y ~ x1.Continent + x2...Exercize.Freq +
    Degree.Seeking..MS.PhD. + HistHeartDisease..Yes.No. +
    Alcohol..Yes.No.Some.,
  data = dat
)
summary(model9)
##
## Call:
## lm(formula = y ~ x1.Continent + x2...Exercize.Freq + Degree.Seeking..MS.PhD. +
## HistHeartDisease..Yes.No. + Alcohol..Yes.No.Some., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.8412 -4.3430 -0.8448 4.6084 16.9641
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 47.591 6.816 6.982 2.32e-05 ***
## x1.ContinentEU -12.272 8.874 -1.383 0.19413
## x1.ContinentNAM -4.535 7.257 -0.625 0.54472
## x1.ContinentSA -34.498 11.448 -3.013 0.01179 *
## x2...Exercize.Freq 7.904 2.379 3.322 0.00680 **
## Degree.Seeking..MS.PhD.PHD 8.291 5.561 1.491 0.16414
## HistHeartDisease..Yes.No.Yes 19.759 5.922 3.337 0.00663 **
## Alcohol..Yes.No.Some.Some 3.683 5.456 0.675 0.51357
## Alcohol..Yes.No.Some.Yes -17.554 6.934 -2.531 0.02790 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.543 on 11 degrees of freedom
## (67 observations deleted due to missingness)
## Multiple R-squared: 0.7682, Adjusted R-squared: 0.5996
## F-statistic: 4.557 on 8 and 11 DF, p-value: 0.01161
In the next step we will remove the degree factor.
# Final elimination step: the degree term is dropped, leaving continent,
# exercise frequency, heart-disease history and alcohol consumption.
model10 <- lm(
  formula = y ~ x1.Continent + x2...Exercize.Freq +
    HistHeartDisease..Yes.No. + Alcohol..Yes.No.Some.,
  data = dat
)
summary(model10)
##
## Call:
## lm(formula = y ~ x1.Continent + x2...Exercize.Freq + HistHeartDisease..Yes.No. +
## Alcohol..Yes.No.Some., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.953 -6.364 1.348 3.418 22.075
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 50.694 6.813 7.441 7.83e-06 ***
## x1.ContinentEU -14.257 9.210 -1.548 0.14759
## x1.ContinentNAM -7.553 7.315 -1.033 0.32215
## x1.ContinentSA -29.581 11.507 -2.571 0.02452 *
## x2...Exercize.Freq 7.972 2.497 3.193 0.00774 **
## HistHeartDisease..Yes.No.Yes 21.538 6.089 3.537 0.00409 **
## Alcohol..Yes.No.Some.Some 1.777 5.567 0.319 0.75503
## Alcohol..Yes.No.Some.Yes -18.426 7.253 -2.540 0.02592 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.02 on 12 degrees of freedom
## (67 observations deleted due to missingness)
## Multiple R-squared: 0.7214, Adjusted R-squared: 0.5588
## F-statistic: 4.438 on 7 and 12 DF, p-value: 0.01184
At this point at least one level of every remaining factor is significant. Therefore, this model is the best fit model.
We treated x2 (exercise frequency) as having a fixed marginal response, i.e. as a numeric predictor with a single slope. The main reason was ease of interpretation.
The R-squared of 0.72 for our final model suggests that it explains 72% of the variation in the data. Our full model with all the variables had an R-squared of 0.95 but, as expected, the R-squared dropped as we removed factors.
Our results show that continent, exercise frequency, history of heart disease, and alcohol consumption have significant effects on heart rate. Among the continents, only South America was significantly different from Asia. This effect was surprising. It was expected that exercise frequency would affect heart rate. Similarly, the effect of heart disease history on heart rate was expected. Although "Some" alcohol consumption did not significantly affect heart rate, the "Yes" level of alcohol consumption was significantly different from the "No" level.