To get an overview of the characteristics of the data set, we use the glimpse() function.
glimpse(Health_data)
## Rows: 1,475
## Columns: 9
## $ systolic <dbl> 100, 112, 134, 108, 128, 102, 126, 124, 166, 138, 118, 124, 9…
## $ weight <dbl> 98.6, 96.9, 108.2, 84.8, 97.0, 102.4, 99.4, 53.6, 78.6, 135.5…
## $ height <dbl> 172.0, 186.0, 154.4, 168.9, 175.3, 150.5, 157.8, 162.4, 156.9…
## $ bmi <dbl> 33.3, 28.0, 45.4, 29.7, 31.6, 45.2, 39.9, 20.3, 31.9, 41.7, 2…
## $ waist <dbl> 120.4, 107.8, 120.3, 109.0, 111.1, 130.7, 113.2, 74.6, 102.8,…
## $ age <dbl> 43, 57, 38, 75, 42, 63, 58, 26, 51, 61, 47, 52, 64, 55, 72, 8…
## $ diabetes <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0…
## $ smoker <dbl> 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1…
## $ fastfood <dbl> 5, 0, 2, 1, 1, 3, 6, 5, 0, 1, 0, 3, 0, 1, 0, 5, 0, 2, 1, 3, 2…
Since diabetes and smoker are binary features, we convert them to factors.
Health_data$diabetes <- as.factor(Health_data$diabetes)
Health_data$smoker <- as.factor(Health_data$smoker)
glimpse(Health_data)
## Rows: 1,475
## Columns: 9
## $ systolic <dbl> 100, 112, 134, 108, 128, 102, 126, 124, 166, 138, 118, 124, 9…
## $ weight <dbl> 98.6, 96.9, 108.2, 84.8, 97.0, 102.4, 99.4, 53.6, 78.6, 135.5…
## $ height <dbl> 172.0, 186.0, 154.4, 168.9, 175.3, 150.5, 157.8, 162.4, 156.9…
## $ bmi <dbl> 33.3, 28.0, 45.4, 29.7, 31.6, 45.2, 39.9, 20.3, 31.9, 41.7, 2…
## $ waist <dbl> 120.4, 107.8, 120.3, 109.0, 111.1, 130.7, 113.2, 74.6, 102.8,…
## $ age <dbl> 43, 57, 38, 75, 42, 63, 58, 26, 51, 61, 47, 52, 64, 55, 72, 8…
## $ diabetes <fct> 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0…
## $ smoker <fct> 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1…
## $ fastfood <dbl> 5, 0, 2, 1, 1, 3, 6, 5, 0, 1, 0, 3, 0, 1, 0, 5, 0, 2, 1, 3, 2…
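The same conversion can be written more compactly with dplyr's across(); this is just an alternative sketch, assuming the tidyverse is loaded as the pipes used elsewhere suggest.
# Alternative: convert both binary columns to factors in a single mutate() call
Health_data <- Health_data %>% mutate(across(c(diabetes, smoker), as.factor))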
Exploring the data set using the summary() function.
summary(Health_data)
## systolic weight height bmi
## Min. : 80.0 Min. : 29.10 Min. :141.2 Min. :13.40
## 1st Qu.:114.0 1st Qu.: 69.15 1st Qu.:163.8 1st Qu.:24.10
## Median :122.0 Median : 81.00 Median :170.3 Median :27.90
## Mean :124.7 Mean : 83.56 Mean :170.2 Mean :28.79
## 3rd Qu.:134.0 3rd Qu.: 94.50 3rd Qu.:176.8 3rd Qu.:32.10
## Max. :224.0 Max. :203.50 Max. :200.4 Max. :62.00
## waist age diabetes smoker fastfood
## Min. : 56.2 Min. :20.00 0:1265 0:770 Min. : 0.00
## 1st Qu.: 88.4 1st Qu.:34.00 1: 210 1:705 1st Qu.: 0.00
## Median : 98.9 Median :49.00 Median : 1.00
## Mean :100.0 Mean :48.89 Mean : 2.14
## 3rd Qu.:109.5 3rd Qu.:62.00 3rd Qu.: 3.00
## Max. :176.0 Max. :80.00 Max. :22.00
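It is also worth confirming that no values are missing before modelling; a minimal sketch (the describe() output below reports n = 1475 for every variable, so all counts should be zero):
# Count missing values in each column
colSums(is.na(Health_data))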
We could also use the describe() function (from the psych package) to explore the data set.
describe(Health_data)
## vars n mean sd median trimmed mad min max range skew
## systolic 1 1475 124.73 17.62 122.0 123.40 14.83 80.0 224.0 144.0 0.92
## weight 2 1475 83.56 20.58 81.0 81.94 18.83 29.1 203.5 174.4 1.03
## height 3 1475 170.18 9.33 170.3 170.27 9.64 141.2 200.4 59.2 -0.08
## bmi 4 1475 28.79 6.47 27.9 28.19 5.93 13.4 62.0 48.6 1.04
## waist 5 1475 100.03 16.15 98.9 99.14 15.72 56.2 176.0 119.8 0.59
## age 6 1475 48.89 16.98 49.0 48.49 20.76 20.0 80.0 60.0 0.14
## diabetes* 7 1475 1.14 0.35 1.0 1.05 0.00 1.0 2.0 1.0 2.04
## smoker* 8 1475 1.48 0.50 1.0 1.47 0.00 1.0 2.0 1.0 0.09
## fastfood 9 1475 2.14 2.87 1.0 1.57 1.48 0.0 22.0 22.0 2.78
## kurtosis se
## systolic 1.79 0.46
## weight 2.20 0.54
## height -0.27 0.24
## bmi 1.67 0.17
## waist 0.58 0.42
## age -1.04 0.44
## diabetes* 2.18 0.01
## smoker* -1.99 0.01
## fastfood 10.71 0.07
To check whether the response variable (systolic) follows an approximately normal distribution, we plot histograms.
ggplot(Health_data, aes(systolic)) + geom_histogram(fill= "blue") + theme_classic()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(Health_data, aes(systolic, fill = diabetes)) + geom_histogram() + theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#ggplot(Health_data, aes(systolic, fill = diabetes)) + geom_density() + theme_classic()
ggplot(Health_data, aes(systolic, fill = smoker)) + geom_histogram( ) + theme_bw()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
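The density plot commented out above is a useful alternative view of the same comparison; adding transparency keeps the overlapping groups readable. A minimal sketch:
# Density of systolic by diabetes status; alpha keeps both groups visible
ggplot(Health_data, aes(systolic, fill = diabetes)) + geom_density(alpha = 0.4) + theme_classic()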

To show the distributions of the remaining predictors, we draw histograms for all the other variables. keep(is.numeric) retains only the numeric variables, gather() reshapes them into key and value columns, and facet_wrap() draws one panel per variable.
Health_data %>% select(-systolic) %>% keep(is.numeric) %>% gather() %>%
ggplot(aes(value, fill = key)) + geom_histogram(bins = 30) + facet_wrap(~ key, scales = "free")

Using the boxplot function to check for any outliers.
Health_data %>% select(-systolic) %>% keep(is.numeric) %>% gather() %>%
ggplot(aes(value, fill = key)) + geom_boxplot() + facet_wrap(~ key, scales = "free")
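If a numeric count of outliers is wanted alongside the plots, the boxplot rule can be applied per variable; a sketch using purrr, which is assumed loaded since keep() is used above:
# Number of boxplot-rule outliers in each numeric predictor
Health_data %>% select(-systolic) %>% keep(is.numeric) %>% map_int(~ length(boxplot.stats(.x)$out))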

To find the correlations between all numeric variables, we use the cor() function and visualise the correlation matrix with corrplot(), trying different methods for different display styles.
Health_data_cor <- cor(Health_data[, c(1:6,9)])
corrplot(Health_data_cor, method = "number")

corrplot(Health_data_cor, method = "pie")

corrplot(Health_data_cor, method = "circle")

From the corrplot output, age and waist appear to have the strongest relationships with the response variable,
so we start with a simple linear model using age and then add waist.
Model1 <- lm(data = Health_data, systolic~age)
summary(Model1)
##
## Call:
## lm(formula = systolic ~ age, data = Health_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -42.028 -10.109 -1.101 8.223 98.806
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 104.34474 1.28169 81.41 <0.0000000000000002 ***
## age 0.41698 0.02477 16.84 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16.14 on 1473 degrees of freedom
## Multiple R-squared: 0.1614, Adjusted R-squared: 0.1608
## F-statistic: 283.4 on 1 and 1473 DF, p-value: < 0.00000000000000022
### Using age and waist as predictors
modlew <- lm(data = Health_data, systolic ~ age + waist)
summary(modlew)
##
## Call:
## lm(formula = systolic ~ age + waist, data = Health_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -42.798 -10.070 -1.038 8.080 101.355
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 95.08630 2.72023 34.955 < 0.0000000000000002 ***
## age 0.39808 0.02514 15.838 < 0.0000000000000002 ***
## waist 0.10179 0.02641 3.854 0.000121 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16.07 on 1472 degrees of freedom
## Multiple R-squared: 0.1697, Adjusted R-squared: 0.1686
## F-statistic: 150.5 on 2 and 1472 DF, p-value: < 0.00000000000000022
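Because Model1 is nested in modlew (same data, one additional predictor), the two can also be compared with a partial F test; a minimal sketch:
# Does adding waist significantly improve on the age-only model?
anova(Model1, modlew)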
Next, we fit a multiple regression model with all the variables.
Model2 <- lm(data = Health_data, systolic~.)
summary(Model2)
##
## Call:
## lm(formula = systolic ~ ., data = Health_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -41.463 -10.105 -0.765 8.148 100.398
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 163.30026 33.52545 4.871 0.00000123 ***
## weight 0.55135 0.19835 2.780 0.00551 **
## height -0.39201 0.19553 -2.005 0.04516 *
## bmi -1.36839 0.57574 -2.377 0.01759 *
## waist -0.00955 0.08358 -0.114 0.90905
## age 0.43345 0.03199 13.549 < 0.0000000000000002 ***
## diabetes1 2.20636 1.26536 1.744 0.08143 .
## smoker1 1.13983 0.90964 1.253 0.21039
## fastfood 0.17638 0.15322 1.151 0.24985
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.99 on 1466 degrees of freedom
## Multiple R-squared: 0.1808, Adjusted R-squared: 0.1763
## F-statistic: 40.44 on 8 and 1466 DF, p-value: < 0.00000000000000022
Since the full model shows only a slightly better adjusted R-squared and a slightly lower residual standard error,
we run additional diagnostic tests.
### Checking that the mean of the residuals is close to zero
mean(Model2$residuals)
## [1] -0.0000000000000008500251
#### Histogram of the residuals as a normality check on the multiple regression model
ols_plot_resid_hist(Model2)

#### Checking for the presence of heteroscedasticity
ols_plot_resid_fit(Model1)
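The residual-versus-fitted plot is a visual check; a formal test such as Breusch-Pagan can complement it. A sketch on the full model, assuming olsrr is loaded for the other ols_* calls (the null hypothesis is constant error variance):
# Breusch-Pagan test for heteroscedasticity on the multiple regression model
ols_test_breusch_pagan(Model2)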

We run the Durbin-Watson test for residual autocorrelation.
durbinWatsonTest(Model2)
## lag Autocorrelation D-W Statistic p-value
## 1 -0.01985291 2.038055 0.464
## Alternative hypothesis: rho != 0
With a Durbin-Watson statistic close to 2 and a p-value greater than 0.05,
we cannot reject the null hypothesis that no first-order autocorrelation exists.
Next we check for influential outliers
ols_plot_cooksd_chart(Model2)

ols_plot_cooksd_bar(Model2)

We then use the cooks.distance() function to extract the influential points, defined here as
observations with a Cook's distance greater than four times the mean Cook's distance.
cooksD <- cooks.distance(Model2)
inflen_point <- cooksD[(cooksD > (4 * mean(cooksD, na.rm = TRUE)))]
inflen_point
## 6 9 31 67 77 86
## 0.007020304 0.005322880 0.004112379 0.008190460 0.002990138 0.005075734
## 93 112 122 164 205 299
## 0.005618945 0.015922334 0.006505294 0.006056846 0.002919255 0.003311288
## 308 315 316 325 338 360
## 0.005874396 0.007217741 0.003931164 0.007391959 0.008316128 0.007674077
## 370 400 427 432 465 486
## 0.003309850 0.007978015 0.024467760 0.004547622 0.006086379 0.007455618
## 503 514 560 570 573 576
## 0.002886103 0.002992637 0.006705003 0.004355419 0.004270226 0.010528662
## 617 632 659 667 703 714
## 0.003737142 0.007282459 0.003121849 0.003739032 0.004079417 0.003367488
## 752 805 859 867 887 900
## 0.002852804 0.011047520 0.014597132 0.005072518 0.004106747 0.007747307
## 904 910 977 1005 1080 1109
## 0.011780741 0.004910020 0.002934982 0.003119560 0.003091754 0.002928871
## 1116 1120 1158 1170 1223 1230
## 0.005056472 0.020477018 0.005621453 0.002828982 0.003501275 0.010028861
## 1293 1299 1313 1315 1330 1356
## 0.011743896 0.002988971 0.003212125 0.003467555 0.016062467 0.004303508
## 1358 1393 1398 1448
## 0.039067674 0.004977908 0.003542732 0.008216068
Getting the indices of the influential points.
inflen_point2 <- as.numeric(names(cooksD[(cooksD > (4 * mean(cooksD, na.rm = TRUE)))]))
inflen_point2
## [1] 6 9 31 67 77 86 93 112 122 164 205 299 308 315 316
## [16] 325 338 360 370 400 427 432 465 486 503 514 560 570 573 576
## [31] 617 632 659 667 703 714 752 805 859 867 887 900 904 910 977
## [46] 1005 1080 1109 1116 1120 1158 1170 1223 1230 1293 1299 1313 1315 1330 1356
## [61] 1358 1393 1398 1448
length(inflen_point2)
## [1] 64
Summary of only the influential points in the data set.
inflen_point2_summary <- Health_data[inflen_point2, ]
summary(inflen_point2_summary)
## systolic weight height bmi
## Min. : 86.0 Min. : 29.10 Min. :144.2 Min. :13.40
## 1st Qu.:110.0 1st Qu.: 66.45 1st Qu.:159.1 1st Qu.:23.57
## Median :164.0 Median : 80.40 Median :167.2 Median :32.00
## Mean :150.6 Mean : 91.23 Mean :166.9 Mean :32.15
## 3rd Qu.:174.5 3rd Qu.:109.03 3rd Qu.:174.2 3rd Qu.:38.42
## Max. :224.0 Max. :203.50 Max. :193.3 Max. :62.00
## waist age diabetes smoker fastfood
## Min. : 56.2 Min. :21.00 0:42 0:27 Min. : 0.000
## 1st Qu.: 91.8 1st Qu.:42.00 1:22 1:37 1st Qu.: 0.000
## Median :109.8 Median :56.00 Median : 1.000
## Mean :109.3 Mean :55.36 Mean : 2.969
## 3rd Qu.:124.9 3rd Qu.:66.50 3rd Qu.: 3.000
## Max. :172.2 Max. :80.00 Max. :18.000
Summary of the data set without the influential points.
inflen_point2_removed <- Health_data[-inflen_point2, ]
summary(inflen_point2_removed)
## systolic weight height bmi
## Min. : 80.0 Min. : 41.10 Min. :141.2 Min. :16.00
## 1st Qu.:114.0 1st Qu.: 69.20 1st Qu.:164.0 1st Qu.:24.10
## Median :122.0 Median : 81.10 Median :170.4 Median :27.80
## Mean :123.6 Mean : 83.22 Mean :170.3 Mean :28.64
## 3rd Qu.:134.0 3rd Qu.: 94.20 3rd Qu.:176.8 3rd Qu.:31.90
## Max. :182.0 Max. :180.20 Max. :200.4 Max. :59.00
## waist age diabetes smoker fastfood
## Min. : 65.60 Min. :20.00 0:1223 0:743 Min. : 0.000
## 1st Qu.: 88.25 1st Qu.:34.00 1: 188 1:668 1st Qu.: 0.000
## Median : 98.50 Median :48.00 Median : 1.000
## Mean : 99.61 Mean :48.59 Mean : 2.102
## 3rd Qu.:108.85 3rd Qu.:62.00 3rd Qu.: 3.000
## Max. :176.00 Max. :80.00 Max. :22.000
Creating a new data set without the influential
points.
Model3 <- Health_data[-inflen_point2, ]
Model3
## # A tibble: 1,411 × 9
## systolic weight height bmi waist age diabetes smoker fastfood
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <fct> <dbl>
## 1 100 98.6 172 33.3 120. 43 0 1 5
## 2 112 96.9 186 28 108. 57 0 0 0
## 3 134 108. 154. 45.4 120. 38 0 1 2
## 4 108 84.8 169. 29.7 109 75 0 0 1
## 5 128 97 175. 31.6 111. 42 0 1 1
## 6 126 99.4 158. 39.9 113. 58 0 0 6
## 7 124 53.6 162. 20.3 74.6 26 0 1 5
## 8 138 136. 180. 41.7 138. 61 1 0 1
## 9 118 72.3 159 28.6 98.9 47 0 0 0
## 10 124 99.3 178. 31.4 110 52 0 1 3
## # ℹ 1,401 more rows
Final diagnostic test for multicollinearity.
ols_vif_tol(Model2)
## Variables Tolerance VIF
## 1 weight 0.01041041 96.057711
## 2 height 0.05217187 19.167417
## 3 bmi 0.01250176 79.988712
## 4 waist 0.09518197 10.506192
## 5 age 0.58828067 1.699869
## 6 diabetes1 0.88685760 1.127577
## 7 smoker1 0.83978882 1.190776
## 8 fastfood 0.89558690 1.116586
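As a cross-check, the same VIFs can be computed with the car package, which is assumed loaded since durbinWatsonTest() was used earlier; a minimal sketch:
# VIFs from car, for comparison with ols_vif_tol()
vif(Model2)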
Since four variables have a VIF above 5, this confirms a multicollinearity problem among the body-size measures.
Weight has the lowest tolerance (and hence the highest VIF), so of the collinear variables we keep only weight,
drop height, bmi, and waist, and refit the model on the data set with the influential points removed.
Model4 <- lm(data = Model3, systolic ~ weight + age + diabetes)
summary(Model4)
##
## Call:
## lm(formula = systolic ~ weight + age + diabetes, data = Model3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -38.777 -9.028 -0.184 8.272 49.702
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 96.47479 1.94008 49.727 < 0.0000000000000002 ***
## weight 0.09785 0.01879 5.208 0.000000219 ***
## age 0.38256 0.02226 17.183 < 0.0000000000000002 ***
## diabetes1 2.60625 1.12055 2.326 0.0202 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.68 on 1407 degrees of freedom
## Multiple R-squared: 0.2108, Adjusted R-squared: 0.2091
## F-statistic: 125.3 on 3 and 1407 DF, p-value: < 0.00000000000000022
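It is worth confirming that dropping height, bmi, and waist resolved the collinearity; a quick sketch re-running the tolerance/VIF check on the reduced model (values near 1 would indicate the problem is gone):
# Multicollinearity check on the reduced model
ols_vif_tol(Model4)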
We then add two derived variables to the data set: age squared (age2) and the natural log of age (lage).
Model3 <- Model3 %>% mutate(age2 =age^2, lage = log(age))
Model3
## # A tibble: 1,411 × 11
## systolic weight height bmi waist age diabetes smoker fastfood age2 lage
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <fct> <dbl> <dbl> <dbl>
## 1 100 98.6 172 33.3 120. 43 0 1 5 1849 3.76
## 2 112 96.9 186 28 108. 57 0 0 0 3249 4.04
## 3 134 108. 154. 45.4 120. 38 0 1 2 1444 3.64
## 4 108 84.8 169. 29.7 109 75 0 0 1 5625 4.32
## 5 128 97 175. 31.6 111. 42 0 1 1 1764 3.74
## 6 126 99.4 158. 39.9 113. 58 0 0 6 3364 4.06
## 7 124 53.6 162. 20.3 74.6 26 0 1 5 676 3.26
## 8 138 136. 180. 41.7 138. 61 1 0 1 3721 4.11
## 9 118 72.3 159 28.6 98.9 47 0 0 0 2209 3.85
## 10 124 99.3 178. 31.4 110 52 0 1 3 2704 3.95
## # ℹ 1,401 more rows
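A quick plot suggests why a squared age term may help: the relationship between age and systolic pressure can be compared against a quadratic fit. A minimal sketch:
# Scatter of systolic against age with a quadratic least-squares curve
ggplot(Model3, aes(age, systolic)) + geom_point(alpha = 0.3) +
geom_smooth(method = "lm", formula = y ~ poly(x, 2)) + theme_minimal()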
Building the final model with stepwise selection, using the new variables and their interactions with diabetes.
ols_step_both_p(model = lm(data = Model3,
systolic ~ weight*diabetes + age*diabetes + age2*diabetes
+ lage*diabetes
),
pent = 0.2,
prem = 0.01,
details = FALSE
)
##
##
## Stepwise Summary
## --------------------------------------------------------------------------------
## Step Variable AIC SBC SBIC R2 Adj. R2
## --------------------------------------------------------------------------------
## 0 Base Model 11719.593 11730.098 7711.525 0.00000 0.00000
## 1 age2 (+) 11419.209 11434.965 7409.255 0.19290 0.19233
## 2 weight (+) 11387.498 11408.506 7375.626 0.21195 0.21083
## 3 diabetes (+) 11383.818 11410.078 7370.024 0.21512 0.21344
## --------------------------------------------------------------------------------
##
## Final Model Output
## ------------------
##
## Model Summary
## ------------------------------------------------------------------
## R 0.464 RMSE 13.619
## R-Squared 0.215 MSE 186.007
## Adj. R-Squared 0.213 Coef. Var 11.038
## Pred R-Squared 0.211 AIC 11383.818
## MAE 10.687 SBC 11410.078
## ------------------------------------------------------------------
## RMSE: Root Mean Square Error
## MSE: Mean Square Error
## MAE: Mean Absolute Error
## AIC: Akaike Information Criteria
## SBC: Schwarz Bayesian Criteria
##
## ANOVA
## -------------------------------------------------------------------------
## Sum of
## Squares DF Mean Square F Sig.
## -------------------------------------------------------------------------
## Regression 71729.168 3 23909.723 128.542 0.0000
## Residual 261711.326 1407 186.007
## Total 333440.493 1410
## -------------------------------------------------------------------------
##
## Parameter Estimates
## -------------------------------------------------------------------------------------------
## model Beta Std. Error Std. Beta t Sig lower upper
## -------------------------------------------------------------------------------------------
## (Intercept) 104.565 1.707 61.243 0.000 101.216 107.915
## age2 0.004 0.000 0.427 17.453 0.000 0.003 0.004
## weight 0.102 0.019 0.130 5.450 0.000 0.065 0.139
## diabetes1 2.658 1.116 0.059 2.382 0.017 0.469 4.847
## -------------------------------------------------------------------------------------------
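The stepwise procedure selects age2, weight, and diabetes. To use the chosen model directly for prediction or further diagnostics, it can be refit explicitly; a minimal sketch, where Model5 is just an illustrative name:
# Refit the model selected by ols_step_both_p() as an ordinary lm object
Model5 <- lm(systolic ~ age2 + weight + diabetes, data = Model3)
summary(Model5)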