library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.7
## v tidyr 1.1.4 v stringr 1.4.0
## v readr 2.1.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(corrplot)
## corrplot 0.92 loaded
library(olsrr)
##
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
##
## rivers
health <- read.csv("C:/Users/User/Downloads/Full Book/Chapter 4 Code/Student/Data/health.csv")
glimpse(health)
## Rows: 1,475
## Columns: 9
## $ systolic <int> 100, 112, 134, 108, 128, 102, 126, 124, 166, 138, 118, 124, 9~
## $ weight <dbl> 98.6, 96.9, 108.2, 84.8, 97.0, 102.4, 99.4, 53.6, 78.6, 135.5~
## $ height <dbl> 172.0, 186.0, 154.4, 168.9, 175.3, 150.5, 157.8, 162.4, 156.9~
## $ bmi <dbl> 33.3, 28.0, 45.4, 29.7, 31.6, 45.2, 39.9, 20.3, 31.9, 41.7, 2~
## $ waist <dbl> 120.4, 107.8, 120.3, 109.0, 111.1, 130.7, 113.2, 74.6, 102.8,~
## $ age <int> 43, 57, 38, 75, 42, 63, 58, 26, 51, 61, 47, 52, 64, 55, 72, 8~
## $ diabetes <int> 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0~
## $ smoker <int> 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1~
## $ fastfood <int> 5, 0, 2, 1, 1, 3, 6, 5, 0, 1, 0, 3, 0, 1, 0, 5, 0, 2, 1, 3, 2~
health <- health %>%
mutate(diabetes=as.factor(diabetes)) %>%
mutate(smoker=as.factor(smoker))
summary(health)
## systolic weight height bmi
## Min. : 80.0 Min. : 29.10 Min. :141.2 Min. :13.40
## 1st Qu.:114.0 1st Qu.: 69.15 1st Qu.:163.8 1st Qu.:24.10
## Median :122.0 Median : 81.00 Median :170.3 Median :27.90
## Mean :124.7 Mean : 83.56 Mean :170.2 Mean :28.79
## 3rd Qu.:134.0 3rd Qu.: 94.50 3rd Qu.:176.8 3rd Qu.:32.10
## Max. :224.0 Max. :203.50 Max. :200.4 Max. :62.00
## waist age diabetes smoker fastfood
## Min. : 56.2 Min. :20.00 0:1265 0:770 Min. : 0.00
## 1st Qu.: 88.4 1st Qu.:34.00 1: 210 1:705 1st Qu.: 0.00
## Median : 98.9 Median :49.00 Median : 1.00
## Mean :100.0 Mean :48.89 Mean : 2.14
## 3rd Qu.:109.5 3rd Qu.:62.00 3rd Qu.: 3.00
## Max. :176.0 Max. :80.00 Max. :22.00
health %>%
ggplot() +
geom_histogram(mapping = aes(x=systolic), fill="lightblue", color="black") +
theme_classic()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
health %>%
select(-systolic) %>%
keep(is.numeric) %>%
gather() %>%
ggplot() +
geom_histogram(mapping = aes(x=value,fill=key), color="black") + facet_wrap(~key, scales = "free") +
theme_classic()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
corrplot(cor(health[, c("systolic","weight","height","bmi","waist","age","fastfood")]))
health_mod1 <- lm(data = health, systolic~age)
summary(health_mod1)
##
## Call:
## lm(formula = systolic ~ age, data = health)
##
## Residuals:
## Min 1Q Median 3Q Max
## -42.028 -10.109 -1.101 8.223 98.806
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 104.34474 1.28169 81.41 <2e-16 ***
## age 0.41698 0.02477 16.84 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16.14 on 1473 degrees of freedom
## Multiple R-squared: 0.1614, Adjusted R-squared: 0.1608
## F-statistic: 283.4 on 1 and 1473 DF, p-value: < 2.2e-16
health_mod2 <- lm(data = health, systolic ~ .)
summary(health_mod2)
##
## Call:
## lm(formula = systolic ~ ., data = health)
##
## Residuals:
## Min 1Q Median 3Q Max
## -41.463 -10.105 -0.765 8.148 100.398
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 163.30026 33.52545 4.871 1.23e-06 ***
## weight 0.55135 0.19835 2.780 0.00551 **
## height -0.39201 0.19553 -2.005 0.04516 *
## bmi -1.36839 0.57574 -2.377 0.01759 *
## waist -0.00955 0.08358 -0.114 0.90905
## age 0.43345 0.03199 13.549 < 2e-16 ***
## diabetes1 2.20636 1.26536 1.744 0.08143 .
## smoker1 1.13983 0.90964 1.253 0.21039
## fastfood 0.17638 0.15322 1.151 0.24985
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.99 on 1466 degrees of freedom
## Multiple R-squared: 0.1808, Adjusted R-squared: 0.1763
## F-statistic: 40.44 on 8 and 1466 DF, p-value: < 2.2e-16
mean(health_mod2$residuals)
## [1] -1.121831e-15
ols_plot_resid_hist(health_mod2)
##### The residual plot is almost normally distributed, so it satisfies our test.
ols_plot_resid_fit(health_mod2)