To get a view of the characteristics of data set, we use the glimpse function

glimpse(Health_data)
## Rows: 1,475
## Columns: 9
## $ systolic <dbl> 100, 112, 134, 108, 128, 102, 126, 124, 166, 138, 118, 124, 9…
## $ weight   <dbl> 98.6, 96.9, 108.2, 84.8, 97.0, 102.4, 99.4, 53.6, 78.6, 135.5…
## $ height   <dbl> 172.0, 186.0, 154.4, 168.9, 175.3, 150.5, 157.8, 162.4, 156.9…
## $ bmi      <dbl> 33.3, 28.0, 45.4, 29.7, 31.6, 45.2, 39.9, 20.3, 31.9, 41.7, 2…
## $ waist    <dbl> 120.4, 107.8, 120.3, 109.0, 111.1, 130.7, 113.2, 74.6, 102.8,…
## $ age      <dbl> 43, 57, 38, 75, 42, 63, 58, 26, 51, 61, 47, 52, 64, 55, 72, 8…
## $ diabetes <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0…
## $ smoker   <dbl> 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1…
## $ fastfood <dbl> 5, 0, 2, 1, 1, 3, 6, 5, 0, 1, 0, 3, 0, 1, 0, 5, 0, 2, 1, 3, 2…

Since the features of diabetes and smoker are binary, we convert to factors

Health_data$diabetes <- as.factor(Health_data$diabetes)

Health_data$smoker <- as.factor(Health_data$smoker)

glimpse(Health_data)
## Rows: 1,475
## Columns: 9
## $ systolic <dbl> 100, 112, 134, 108, 128, 102, 126, 124, 166, 138, 118, 124, 9…
## $ weight   <dbl> 98.6, 96.9, 108.2, 84.8, 97.0, 102.4, 99.4, 53.6, 78.6, 135.5…
## $ height   <dbl> 172.0, 186.0, 154.4, 168.9, 175.3, 150.5, 157.8, 162.4, 156.9…
## $ bmi      <dbl> 33.3, 28.0, 45.4, 29.7, 31.6, 45.2, 39.9, 20.3, 31.9, 41.7, 2…
## $ waist    <dbl> 120.4, 107.8, 120.3, 109.0, 111.1, 130.7, 113.2, 74.6, 102.8,…
## $ age      <dbl> 43, 57, 38, 75, 42, 63, 58, 26, 51, 61, 47, 52, 64, 55, 72, 8…
## $ diabetes <fct> 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0…
## $ smoker   <fct> 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1…
## $ fastfood <dbl> 5, 0, 2, 1, 1, 3, 6, 5, 0, 1, 0, 3, 0, 1, 0, 5, 0, 2, 1, 3, 2…

Exploring the data set using the summary function

summary(Health_data)
##     systolic         weight           height           bmi       
##  Min.   : 80.0   Min.   : 29.10   Min.   :141.2   Min.   :13.40  
##  1st Qu.:114.0   1st Qu.: 69.15   1st Qu.:163.8   1st Qu.:24.10  
##  Median :122.0   Median : 81.00   Median :170.3   Median :27.90  
##  Mean   :124.7   Mean   : 83.56   Mean   :170.2   Mean   :28.79  
##  3rd Qu.:134.0   3rd Qu.: 94.50   3rd Qu.:176.8   3rd Qu.:32.10  
##  Max.   :224.0   Max.   :203.50   Max.   :200.4   Max.   :62.00  
##      waist            age        diabetes smoker     fastfood    
##  Min.   : 56.2   Min.   :20.00   0:1265   0:770   Min.   : 0.00  
##  1st Qu.: 88.4   1st Qu.:34.00   1: 210   1:705   1st Qu.: 0.00  
##  Median : 98.9   Median :49.00                    Median : 1.00  
##  Mean   :100.0   Mean   :48.89                    Mean   : 2.14  
##  3rd Qu.:109.5   3rd Qu.:62.00                    3rd Qu.: 3.00  
##  Max.   :176.0   Max.   :80.00                    Max.   :22.00

We could also use the describe function to explore the dataset

describe(Health_data)
##           vars    n   mean    sd median trimmed   mad   min   max range  skew
## systolic     1 1475 124.73 17.62  122.0  123.40 14.83  80.0 224.0 144.0  0.92
## weight       2 1475  83.56 20.58   81.0   81.94 18.83  29.1 203.5 174.4  1.03
## height       3 1475 170.18  9.33  170.3  170.27  9.64 141.2 200.4  59.2 -0.08
## bmi          4 1475  28.79  6.47   27.9   28.19  5.93  13.4  62.0  48.6  1.04
## waist        5 1475 100.03 16.15   98.9   99.14 15.72  56.2 176.0 119.8  0.59
## age          6 1475  48.89 16.98   49.0   48.49 20.76  20.0  80.0  60.0  0.14
## diabetes*    7 1475   1.14  0.35    1.0    1.05  0.00   1.0   2.0   1.0  2.04
## smoker*      8 1475   1.48  0.50    1.0    1.47  0.00   1.0   2.0   1.0  0.09
## fastfood     9 1475   2.14  2.87    1.0    1.57  1.48   0.0  22.0  22.0  2.78
##           kurtosis   se
## systolic      1.79 0.46
## weight        2.20 0.54
## height       -0.27 0.24
## bmi           1.67 0.17
## waist         0.58 0.42
## age          -1.04 0.44
## diabetes*     2.18 0.01
## smoker*      -1.99 0.01
## fastfood     10.71 0.07

Checking that the response is a normal distribution we use the histogram diagram

ggplot(Health_data, aes(systolic))  + geom_histogram(fill= "blue") + theme_classic()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(Health_data, aes(systolic, fill = diabetes))  + geom_histogram() + theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#ggplot(Health_data, aes(systolic, fill = diabetes))  + geom_density() + theme_classic()
 
 
 ggplot(Health_data, aes(systolic, fill = smoker))  + geom_histogram( ) + theme_bw()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

To show the distribution of all the variables, we draw histograms for all the other variables. The keep function keeps the variable as

numeric, the gather function puts them in values and key to make two columns, and the facet_wrap function #### uses all the variables.

 Health_data %>% select(-systolic) %>% keep(is.numeric) %>% gather() %>% 
   ggplot(aes(value, fill = key)) + geom_histogram(bin = 30) + facet_wrap(~ key, scales = "free")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Using boxplot function to view any outliers

Health_data %>% select(-systolic) %>% keep(is.numeric) %>% gather() %>% 
   ggplot(aes(value, fill = key)) + geom_boxplot() + facet_wrap(~ key, scales = "free")

Finding the correlation between all numeric variables, we use the cor function and the corrplot function,

with various methods to give different output.

Health_data_cor <- cor(Health_data[, c(1:6,9)])

corrplot(Health_data_cor, method = "number")

corrplot(Health_data_cor, method = "pie")

corrplot(Health_data_cor, method = "circle")

From the corrplot function, age and waist appears to be the biggest relationship with the response variable.

so we build a simple linear model with the two variables

Model1 <- lm(data = Health_data, systolic~age)
summary(Model1)
## 
## Call:
## lm(formula = systolic ~ age, data = Health_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -42.028 -10.109  -1.101   8.223  98.806 
## 
## Coefficients:
##              Estimate Std. Error t value            Pr(>|t|)    
## (Intercept) 104.34474    1.28169   81.41 <0.0000000000000002 ***
## age           0.41698    0.02477   16.84 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16.14 on 1473 degrees of freedom
## Multiple R-squared:  0.1614, Adjusted R-squared:  0.1608 
## F-statistic: 283.4 on 1 and 1473 DF,  p-value: < 0.00000000000000022
### using age and waist predictors


modlew <- lm(data = Health_data, systolic ~ age + waist)
summary(modlew)
## 
## Call:
## lm(formula = systolic ~ age + waist, data = Health_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -42.798 -10.070  -1.038   8.080 101.355 
## 
## Coefficients:
##             Estimate Std. Error t value             Pr(>|t|)    
## (Intercept) 95.08630    2.72023  34.955 < 0.0000000000000002 ***
## age          0.39808    0.02514  15.838 < 0.0000000000000002 ***
## waist        0.10179    0.02641   3.854             0.000121 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16.07 on 1472 degrees of freedom
## Multiple R-squared:  0.1697, Adjusted R-squared:  0.1686 
## F-statistic: 150.5 on 2 and 1472 DF,  p-value: < 0.00000000000000022

Next we do the multiple regression with all the variables.

Model2 <- lm(data = Health_data, systolic~.)
summary(Model2)
## 
## Call:
## lm(formula = systolic ~ ., data = Health_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -41.463 -10.105  -0.765   8.148 100.398 
## 
## Coefficients:
##              Estimate Std. Error t value             Pr(>|t|)    
## (Intercept) 163.30026   33.52545   4.871           0.00000123 ***
## weight        0.55135    0.19835   2.780              0.00551 ** 
## height       -0.39201    0.19553  -2.005              0.04516 *  
## bmi          -1.36839    0.57574  -2.377              0.01759 *  
## waist        -0.00955    0.08358  -0.114              0.90905    
## age           0.43345    0.03199  13.549 < 0.0000000000000002 ***
## diabetes1     2.20636    1.26536   1.744              0.08143 .  
## smoker1       1.13983    0.90964   1.253              0.21039    
## fastfood      0.17638    0.15322   1.151              0.24985    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.99 on 1466 degrees of freedom
## Multiple R-squared:  0.1808, Adjusted R-squared:  0.1763 
## F-statistic: 40.44 on 8 and 1466 DF,  p-value: < 0.00000000000000022

Since our model only shows a slightly better adjusted R square and lower standard residual error,

we do additional diagnostic tests

### mean of the residual is close to zero
mean(Model2$residuals)
## [1] -0.0000000000000008500251
#### histogram for normality on the simple regression model

ols_plot_resid_hist(Model2)

#### checking the presence of heteroscedasticity

ols_plot_resid_fit(Model1)

We run a test for a residual autocorrelation.

durbinWatsonTest(Model2)
##  lag Autocorrelation D-W Statistic p-value
##    1     -0.01985291      2.038055   0.464
##  Alternative hypothesis: rho != 0

with a high value for our Durbin-Watsion statistics and a p_value greater than 0.05,

we cannot reject the null hypothesis that “no first order aucorrelation exists.”

Next we check for influential outliers

ols_plot_cooksd_chart(Model2)

ols_plot_cooksd_bar(Model2)

We then use the cooks.distance function to get the influential points.

cooksD <- cooks.distance(Model2)

inflen_point <- cooksD[(cooksD > (4 * mean(cooksD, na.rm = TRUE)))]
inflen_point
##           6           9          31          67          77          86 
## 0.007020304 0.005322880 0.004112379 0.008190460 0.002990138 0.005075734 
##          93         112         122         164         205         299 
## 0.005618945 0.015922334 0.006505294 0.006056846 0.002919255 0.003311288 
##         308         315         316         325         338         360 
## 0.005874396 0.007217741 0.003931164 0.007391959 0.008316128 0.007674077 
##         370         400         427         432         465         486 
## 0.003309850 0.007978015 0.024467760 0.004547622 0.006086379 0.007455618 
##         503         514         560         570         573         576 
## 0.002886103 0.002992637 0.006705003 0.004355419 0.004270226 0.010528662 
##         617         632         659         667         703         714 
## 0.003737142 0.007282459 0.003121849 0.003739032 0.004079417 0.003367488 
##         752         805         859         867         887         900 
## 0.002852804 0.011047520 0.014597132 0.005072518 0.004106747 0.007747307 
##         904         910         977        1005        1080        1109 
## 0.011780741 0.004910020 0.002934982 0.003119560 0.003091754 0.002928871 
##        1116        1120        1158        1170        1223        1230 
## 0.005056472 0.020477018 0.005621453 0.002828982 0.003501275 0.010028861 
##        1293        1299        1313        1315        1330        1356 
## 0.011743896 0.002988971 0.003212125 0.003467555 0.016062467 0.004303508 
##        1358        1393        1398        1448 
## 0.039067674 0.004977908 0.003542732 0.008216068

Getting the index of the influential points.

inflen_point2 <- as.numeric(names(cooksD[(cooksD > (4 * mean(cooksD, na.rm = TRUE)))]))

inflen_point2
##  [1]    6    9   31   67   77   86   93  112  122  164  205  299  308  315  316
## [16]  325  338  360  370  400  427  432  465  486  503  514  560  570  573  576
## [31]  617  632  659  667  703  714  752  805  859  867  887  900  904  910  977
## [46] 1005 1080 1109 1116 1120 1158 1170 1223 1230 1293 1299 1313 1315 1330 1356
## [61] 1358 1393 1398 1448
length(inflen_point2)
## [1] 64

Finding the summary of only the influential points in the dataset

inflen_point2_summary <- Health_data[inflen_point2, ]

summary(inflen_point2_summary)
##     systolic         weight           height           bmi       
##  Min.   : 86.0   Min.   : 29.10   Min.   :144.2   Min.   :13.40  
##  1st Qu.:110.0   1st Qu.: 66.45   1st Qu.:159.1   1st Qu.:23.57  
##  Median :164.0   Median : 80.40   Median :167.2   Median :32.00  
##  Mean   :150.6   Mean   : 91.23   Mean   :166.9   Mean   :32.15  
##  3rd Qu.:174.5   3rd Qu.:109.03   3rd Qu.:174.2   3rd Qu.:38.42  
##  Max.   :224.0   Max.   :203.50   Max.   :193.3   Max.   :62.00  
##      waist            age        diabetes smoker    fastfood     
##  Min.   : 56.2   Min.   :21.00   0:42     0:27   Min.   : 0.000  
##  1st Qu.: 91.8   1st Qu.:42.00   1:22     1:37   1st Qu.: 0.000  
##  Median :109.8   Median :56.00                   Median : 1.000  
##  Mean   :109.3   Mean   :55.36                   Mean   : 2.969  
##  3rd Qu.:124.9   3rd Qu.:66.50                   3rd Qu.: 3.000  
##  Max.   :172.2   Max.   :80.00                   Max.   :18.000

Summary of the datset without the influential points.

inflen_point2_removed <- Health_data[-inflen_point2, ]

summary(inflen_point2_removed)
##     systolic         weight           height           bmi       
##  Min.   : 80.0   Min.   : 41.10   Min.   :141.2   Min.   :16.00  
##  1st Qu.:114.0   1st Qu.: 69.20   1st Qu.:164.0   1st Qu.:24.10  
##  Median :122.0   Median : 81.10   Median :170.4   Median :27.80  
##  Mean   :123.6   Mean   : 83.22   Mean   :170.3   Mean   :28.64  
##  3rd Qu.:134.0   3rd Qu.: 94.20   3rd Qu.:176.8   3rd Qu.:31.90  
##  Max.   :182.0   Max.   :180.20   Max.   :200.4   Max.   :59.00  
##      waist             age        diabetes smoker     fastfood     
##  Min.   : 65.60   Min.   :20.00   0:1223   0:743   Min.   : 0.000  
##  1st Qu.: 88.25   1st Qu.:34.00   1: 188   1:668   1st Qu.: 0.000  
##  Median : 98.50   Median :48.00                    Median : 1.000  
##  Mean   : 99.61   Mean   :48.59                    Mean   : 2.102  
##  3rd Qu.:108.85   3rd Qu.:62.00                    3rd Qu.: 3.000  
##  Max.   :176.00   Max.   :80.00                    Max.   :22.000

Creating a new data set without the influential points.

Model3 <- Health_data[-inflen_point2, ]

Model3
## # A tibble: 1,411 × 9
##    systolic weight height   bmi waist   age diabetes smoker fastfood
##       <dbl>  <dbl>  <dbl> <dbl> <dbl> <dbl> <fct>    <fct>     <dbl>
##  1      100   98.6   172   33.3 120.     43 0        1             5
##  2      112   96.9   186   28   108.     57 0        0             0
##  3      134  108.    154.  45.4 120.     38 0        1             2
##  4      108   84.8   169.  29.7 109      75 0        0             1
##  5      128   97     175.  31.6 111.     42 0        1             1
##  6      126   99.4   158.  39.9 113.     58 0        0             6
##  7      124   53.6   162.  20.3  74.6    26 0        1             5
##  8      138  136.    180.  41.7 138.     61 1        0             1
##  9      118   72.3   159   28.6  98.9    47 0        0             0
## 10      124   99.3   178.  31.4 110      52 0        1             3
## # ℹ 1,401 more rows

Final diagnostic test for multicollinearity.

ols_vif_tol(Model2)
##   Variables  Tolerance       VIF
## 1    weight 0.01041041 96.057711
## 2    height 0.05217187 19.167417
## 3       bmi 0.01250176 79.988712
## 4     waist 0.09518197 10.506192
## 5       age 0.58828067  1.699869
## 6 diabetes1 0.88685760  1.127577
## 7   smoker1 0.83978882  1.190776
## 8  fastfood 0.89558690  1.116586

since we have VIF above 5 in 4 variables, it confirms there is multicollinearity problem.

since weight has the lowest tolerance, we then drop the other variables and run the model again

Model4 <-  lm(data = Model3, systolic ~ weight + age + diabetes)
summary(Model4)
## 
## Call:
## lm(formula = systolic ~ weight + age + diabetes, data = Model3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -38.777  -9.028  -0.184   8.272  49.702 
## 
## Coefficients:
##             Estimate Std. Error t value             Pr(>|t|)    
## (Intercept) 96.47479    1.94008  49.727 < 0.0000000000000002 ***
## weight       0.09785    0.01879   5.208          0.000000219 ***
## age          0.38256    0.02226  17.183 < 0.0000000000000002 ***
## diabetes1    2.60625    1.12055   2.326               0.0202 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.68 on 1407 degrees of freedom
## Multiple R-squared:  0.2108, Adjusted R-squared:  0.2091 
## F-statistic: 125.3 on 3 and 1407 DF,  p-value: < 0.00000000000000022

We then add two more variables to the model

Model3 <- Model3 %>% mutate(age2 =age^2, lage = log(age))
Model3
## # A tibble: 1,411 × 11
##    systolic weight height   bmi waist   age diabetes smoker fastfood  age2  lage
##       <dbl>  <dbl>  <dbl> <dbl> <dbl> <dbl> <fct>    <fct>     <dbl> <dbl> <dbl>
##  1      100   98.6   172   33.3 120.     43 0        1             5  1849  3.76
##  2      112   96.9   186   28   108.     57 0        0             0  3249  4.04
##  3      134  108.    154.  45.4 120.     38 0        1             2  1444  3.64
##  4      108   84.8   169.  29.7 109      75 0        0             1  5625  4.32
##  5      128   97     175.  31.6 111.     42 0        1             1  1764  3.74
##  6      126   99.4   158.  39.9 113.     58 0        0             6  3364  4.06
##  7      124   53.6   162.  20.3  74.6    26 0        1             5   676  3.26
##  8      138  136.    180.  41.7 138.     61 1        0             1  3721  4.11
##  9      118   72.3   159   28.6  98.9    47 0        0             0  2209  3.85
## 10      124   99.3   178.  31.4 110      52 0        1             3  2704  3.95
## # ℹ 1,401 more rows

Building the last model with the new variables

ols_step_both_p(model = lm(data = Model3,
                           systolic ~ weight*diabetes + age*diabetes + age2*diabetes 
                           + lage*diabetes
                           ),
                pent = 0.2, 
                prem = 0.01, 
                details = FALSE
                )
## 
## 
##                                  Stepwise Summary                                 
## --------------------------------------------------------------------------------
## Step    Variable           AIC          SBC         SBIC        R2       Adj. R2 
## --------------------------------------------------------------------------------
##  0      Base Model      11719.593    11730.098    7711.525    0.00000    0.00000 
##  1      age2 (+)        11419.209    11434.965    7409.255    0.19290    0.19233 
##  2      weight (+)      11387.498    11408.506    7375.626    0.21195    0.21083 
##  3      diabetes (+)    11383.818    11410.078    7370.024    0.21512    0.21344 
## --------------------------------------------------------------------------------
## 
## Final Model Output 
## ------------------
## 
##                           Model Summary                            
## ------------------------------------------------------------------
## R                        0.464       RMSE                  13.619 
## R-Squared                0.215       MSE                  186.007 
## Adj. R-Squared           0.213       Coef. Var             11.038 
## Pred R-Squared           0.211       AIC                11383.818 
## MAE                     10.687       SBC                11410.078 
## ------------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
##  AIC: Akaike Information Criteria 
##  SBC: Schwarz Bayesian Criteria 
## 
##                                   ANOVA                                   
## -------------------------------------------------------------------------
##                   Sum of                                                 
##                  Squares          DF    Mean Square       F         Sig. 
## -------------------------------------------------------------------------
## Regression     71729.168           3      23909.723    128.542    0.0000 
## Residual      261711.326        1407        186.007                      
## Total         333440.493        1410                                     
## -------------------------------------------------------------------------
## 
##                                     Parameter Estimates                                     
## -------------------------------------------------------------------------------------------
##       model       Beta    Std. Error    Std. Beta      t        Sig       lower      upper 
## -------------------------------------------------------------------------------------------
## (Intercept)    104.565         1.707                 61.243    0.000    101.216    107.915 
##        age2      0.004         0.000        0.427    17.453    0.000      0.003      0.004 
##      weight      0.102         0.019        0.130     5.450    0.000      0.065      0.139 
##   diabetes1      2.658         1.116        0.059     2.382    0.017      0.469      4.847 
## -------------------------------------------------------------------------------------------