Data Analysis Course at USTH (Section: Statistics, Part 2)
Biostatistics
Nguyen Chi Dung
#=====================================================
# Simple Regression
#=====================================================
rm(list = ls())
path <- dir("F:/usth/data", full.names = TRUE)
pima_small <- read.csv(path[4])
# Perform OLS Regression:
library(magrittr)
library(tidyverse)
set.seed(1)
train <- pima_small %>% sample_n(300)
test <- dplyr::setdiff(pima_small, train)
ols1 <- train %>% lm(glucose ~ bmi, data = .)
#---------------------------------------
# 1. Extract insights from lm object
#---------------------------------------
# OLS results:
ols1 %>% summary()
##
## Call:
## lm(formula = glucose ~ bmi, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -57.69 -22.82 -6.34 21.81 77.87
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 96.4857 8.8426 10.911 < 2e-16 ***
## bmi 0.8226 0.2601 3.162 0.00173 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 31 on 298 degrees of freedom
## Multiple R-squared: 0.03247, Adjusted R-squared: 0.02922
## F-statistic: 10 on 1 and 298 DF, p-value: 0.001727
# Extract Coefficients:
ols1$coefficients
## (Intercept) bmi
## 96.4856536 0.8225668
# Extract Residuals:
res <- ols1$residuals
res %>% head()
## 105 146 224 354 79 348
## -16.933546 4.261545 -39.972305 23.690486 -25.085140 -57.686776
# Extract fitted values:
fit_val <- ols1$fitted.values
fit_val %>% head()
## 105 146 224 354 79 348
## 125.9335 121.7385 122.9723 116.3095 127.0851 125.6868
# Use for prediction:
pred <- predict(ols1, test)
pred %>% head() # Predicted values
## 1 2 3 4 5 6
## 119.5998 131.9383 121.9852 134.1592 126.5916 115.5692
test$glucose %>% head() # Actual values
## [1] 89 137 78 118 143 97
# Regression line:
theme_set(theme_minimal())
train %>%
ggplot(aes(bmi, glucose)) +
geom_point(alpha = 0.3) +
geom_smooth(method = "lm") +
labs(x = NULL, y = NULL,
title = "The Relationship between Glucose and BMI",
caption = "Data Source: The National Institute of Diabetes and Digestive and Kidney Diseases")

#---------------------------------------
# 2. Diagnostics
#---------------------------------------
# Use graphs:
library(ggfortify)
autoplot(ols1, which = 1:6, label.size = 2)

#----------------------------------------
# are error normally distributed?
#----------------------------------------
# Q-Q plot (method 1):
autoplot(ols1, which = 2, label.size = 2)

# Histogram (method 2):
train <- train %>% mutate(resd = ols1$residuals)
train %>%
ggplot(aes(resd)) +
geom_density(fill = "red", color = "red", alpha = 0.4) +
geom_histogram(aes(y = ..density..), fill = "blue", color = "blue", alpha = 0.4) +
geom_vline(xintercept = mean(train$resd), color = "yellow")

# Use an official test (https://en.wikipedia.org/wiki/Shapiro%E2%80%93Wilk_test):
shapiro.test(train$resd)
##
## Shapiro-Wilk normality test
##
## data: train$resd
## W = 0.96683, p-value = 2.218e-06
#-----------------------------------------
# Remedies for Assumption Violations
#-----------------------------------------
# Use Tukey’s method (SPSS uses this method for handling outliers):
ct <- mean(train$resd) + 2*sd(train$resd)
cd <- mean(train$resd) - 2*sd(train$resd)
train <- train %>% mutate(out = case_when(resd > ct ~ "Outlier",
resd < cd ~ "Outlier",
resd <= ct & resd >= cd ~ "Normal"))
ols_re1 <- train %>%
filter(out == "Normal") %>%
lm(glucose ~ bmi, data = .)
ols_re1 %>% summary()
##
## Call:
## lm(formula = glucose ~ bmi, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -55.116 -20.879 -5.213 21.821 63.745
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 92.5702 8.1657 11.336 < 2e-16 ***
## bmi 0.8604 0.2400 3.586 0.000395 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 28.34 on 287 degrees of freedom
## Multiple R-squared: 0.04287, Adjusted R-squared: 0.03954
## F-statistic: 12.86 on 1 and 287 DF, p-value: 0.0003952
ols1 %>% summary()
##
## Call:
## lm(formula = glucose ~ bmi, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -57.69 -22.82 -6.34 21.81 77.87
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 96.4857 8.8426 10.911 < 2e-16 ***
## bmi 0.8226 0.2601 3.162 0.00173 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 31 on 298 degrees of freedom
## Multiple R-squared: 0.03247, Adjusted R-squared: 0.02922
## F-statistic: 10 on 1 and 298 DF, p-value: 0.001727
u <- ols_re1 %>% summary()
u$r.squared
## [1] 0.04287402
v <- ols1 %>% summary()
u$r.squared / v$r.squared - 1
## [1] 0.3205028
train %>%
ggplot(aes(bmi, glucose)) +
geom_point(data = train %>% filter(out == "Outlier"),
color = "red", size = 2) +
geom_smooth(method = "lm", se = FALSE) +
geom_point(data = train %>% filter(out != "Outlier"),
color = "purple", alpha = 0.3) +
geom_smooth(data = train %>% filter(out != "Outlier"),
method = "lm", se = FALSE, color = "green")

# Use Cook's Distance (https://en.wikipedia.org/wiki/Cook%27s_distance):
cut_off <- 1 / nrow(train)
train %<>% mutate(d = cooks.distance(ols1),
out_d = case_when(d > cut_off ~ "Outlier",
d <= cut_off ~ "Normal"))
train %>%
filter(out_d == "Outlier") %>%
head()
## pregnant glucose diastolic triceps insulin bmi diabetes age test
## 1 10 68 106 23 49 35.5 0.285 47 0
## 2 7 195 70 33 145 25.1 0.163 55 1
## 3 8 176 90 34 300 33.7 0.467 58 1
## 4 0 181 88 44 510 43.3 0.222 26 1
## 5 0 139 62 17 210 22.1 0.207 21 0
## 6 1 87 68 34 77 37.6 0.401 24 0
## resd out d out_d
## 1 -57.68678 Normal 0.006414617 Outlier
## 2 77.86792 Outlier 0.025842812 Outlier
## 3 51.79384 Normal 0.004701277 Outlier
## 4 48.89720 Normal 0.013197734 Outlier
## 5 24.33562 Normal 0.003838344 Outlier
## 6 -40.41417 Normal 0.003981352 Outlier
ols3 <- train %>%
filter(out_d != "Outlier") %>%
lm(glucose ~ bmi, data = .)
ols3 %>% summary()
##
## Call:
## lm(formula = glucose ~ bmi, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -35.553 -14.525 -3.383 13.616 45.333
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 92.9354 7.4638 12.452 < 2e-16 ***
## bmi 0.7956 0.2281 3.488 0.000592 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19.54 on 213 degrees of freedom
## Multiple R-squared: 0.05402, Adjusted R-squared: 0.04958
## F-statistic: 12.16 on 1 and 213 DF, p-value: 0.0005916
autoplot(ols1, which = 4, ncol = 3, label.size = 1) # Method 1

train <- train %>%
mutate(N = 1:nrow(.))
train %>%
ggplot(aes(N, d)) +
geom_point(alpha = 0.2) +
geom_point(data = train %>% filter(d > cut_off), color = "red") +
labs(x = NULL, y = "Cook's Distance",
title = "Outliers based on Cook's Distance") # Method 2.

# Use data transformation:
train %<>% mutate(ln_bmi = log(bmi))
ols4 <- train %>% lm(glucose ~ ln_bmi, data = .)
ols4 %>% summary()
##
## Call:
## lm(formula = glucose ~ ln_bmi, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -58.301 -23.168 -6.282 21.465 78.576
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 24.603 30.254 0.813 0.41675
## ln_bmi 28.491 8.668 3.287 0.00113 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 30.96 on 298 degrees of freedom
## Multiple R-squared: 0.03499, Adjusted R-squared: 0.03175
## F-statistic: 10.8 on 1 and 298 DF, p-value: 0.001134
#---------------------------------------
# 3. Boostrap Method
#---------------------------------------
# Method 1:
pima_small <- read.csv(path[4])
train <- pima_small %>% sample_n(300)
test <- dplyr::setdiff(pima_small, train)
ols1 <- train %>% lm(glucose ~ bmi, data = .)
ols1 %>% summary()
##
## Call:
## lm(formula = glucose ~ bmi, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -58.831 -21.393 -6.712 21.766 76.963
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 94.8368 8.5097 11.145 <2e-16 ***
## bmi 0.8262 0.2507 3.296 0.0011 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 30.41 on 298 degrees of freedom
## Multiple R-squared: 0.03517, Adjusted R-squared: 0.03193
## F-statistic: 10.86 on 1 and 298 DF, p-value: 0.0011
coef <- ols1$coefficients
coef[1]
## (Intercept)
## 94.83678
coef[2]
## bmi
## 0.8262213
a <- c()
b <- c()
for (i in 1:1000) {
set.seed(i)
train <- pima_small %>% sample_n(nrow(.), replace = TRUE)
ols1 <- train %>% lm(glucose ~ bmi, data = .)
ols1 %>% summary()
coef <- ols1$coefficients
m <- coef[1]
n <- coef[2]
a <- c(a, m)
b <- c(b, n)
}
my_bo <- data.frame(Intercept = a, Coeff = b)
my_bo %>%
gather(kieu_hs, giatri) %>%
ggplot(aes(giatri)) +
geom_density(fill = "red", color = "red", alpha = 0.4) +
geom_histogram(aes(y = ..density..), fill = "blue", color = "blue", alpha = 0.4) +
facet_wrap(~ kieu_hs, scales = "free") +
labs(x = NULL, y = NULL,
title = "Coefficients from 1000 bootstrap sampling",
subtitle = "Note: Use our function (Method 1)",
caption = "Data Sourse: NIDDKD")

my_bo %>% summary()
## Intercept Coeff
## Min. : 68.91 Min. :0.2746
## 1st Qu.: 87.27 1st Qu.:0.7844
## Median : 91.90 Median :0.9172
## Mean : 92.01 Mean :0.9253
## 3rd Qu.: 96.76 3rd Qu.:1.0708
## Max. :114.56 Max. :1.6703
# Use Shapiro test for checking normality:
shapiro.test(my_bo$Coeff)
##
## Shapiro-Wilk normality test
##
## data: my_bo$Coeff
## W = 0.99901, p-value = 0.8778
# Calculate 95% Confidence Interval for Coeff
# (https://www.mathsisfun.com/data/confidence-interval.html):
tinh_95_in <- function(x) {
ct <- mean(x) + 1.96*sd(x) / sqrt(length(x))
cd <- mean(x) - 1.96*sd(x) / sqrt(length(x))
print(paste("Lower bound:", round(cd, 3)))
print(paste("Upper bound:", round(ct, 3)))
}
tinh_95_in(my_bo$Coeff)
## [1] "Lower bound: 0.912"
## [1] "Upper bound: 0.938"
# Method 2:
library(broom)
set.seed(1)
boot1000 <- pima_small %>%
bootstrap(1000) %>%
do(tidy(lm(.$glucose ~ .$bmi, trace = F)))
boot1000 %<>% ungroup()
boot1000 %>% head()
## # A tibble: 6 x 6
## replicate term estimate std.error statistic p.value
## <int> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 1 (Intercept) 94.2 7.21 13.1 1.23e-32
## 2 1 .$bmi 0.851 0.212 4.01 7.20e- 5
## 3 2 (Intercept) 101 7.90 12.8 1.02e-31
## 4 2 .$bmi 0.667 0.239 2.79 5.51e- 3
## 5 3 (Intercept) 89.5 7.43 12.0 1.39e-28
## 6 3 .$bmi 1.07 0.222 4.80 2.24e- 6
boot1000 %>%
ggplot(aes(estimate)) +
geom_density(fill = "red", color = "red", alpha = 0.4) +
geom_histogram(aes(y = ..density..), fill = "blue", color = "blue", alpha = 0.4) +
facet_wrap(~ term, scales = "free") +
labs(x = NULL, y = NULL,
title = "Coefficients from 1000 bootstrap sampling",
subtitle = "Note: Use broom package (Method 2)",
caption = "Data Sourse: NIDDKD")

# 95% Confidence Interval for Coeff :
boot1000 %>%
filter(term == ".$bmi") %>%
pull(estimate) %>%
tinh_95_in()
## [1] "Lower bound: 0.905"
## [1] "Upper bound: 0.931"
#--------------------------------------------
# 4. Missing Data and Imputation Method
#--------------------------------------------
#------------
# Method 1
#------------
# Load data:
pima <- read.csv(path[3])
# Convert zero to NA and preprocessing data:
convert_NA <- function(x) {case_when(x == 0 ~ NA_integer_,
x != 0 ~ x)}
pima_na <- pima %>%
mutate_at(.vars = c("glucose", "diastolic", "triceps", "insulin"),
.funs = convert_NA)
pima_na %>% head()
## pregnant glucose diastolic triceps insulin bmi diabetes age test
## 1 6 148 72 35 NA 33.6 0.627 50 1
## 2 1 85 66 29 NA 26.6 0.351 31 0
## 3 8 183 64 NA NA 23.3 0.672 32 1
## 4 1 89 66 23 94 28.1 0.167 21 0
## 5 0 137 40 35 168 43.1 2.288 33 1
## 6 5 116 74 NA NA 25.6 0.201 30 0
# A function for calculating NA rate:
na_rate <- function(x) {
return(100*sum(is.na(x)) / length(x))
}
# Use our function:
pima_na %>%
summarise_all(na_rate)
## pregnant glucose diastolic triceps insulin bmi diabetes age test
## 1 0 0.6510417 4.557292 29.55729 48.69792 0 0 0 0
# A function for imputing NA by mean:
imputing_by_mean <- function(x) {
tb <- mean(x, na.rm = TRUE)
x[is.na(x)] <- tb
return(x)
}
pima_imp <- pima_na %>% mutate_all(imputing_by_mean)
pima_imp %>% head()
## pregnant glucose diastolic triceps insulin bmi diabetes age test
## 1 6 148 72 35.00000 155.5482 33.6 0.627 50 1
## 2 1 85 66 29.00000 155.5482 26.6 0.351 31 0
## 3 8 183 64 29.15342 155.5482 23.3 0.672 32 1
## 4 1 89 66 23.00000 94.0000 28.1 0.167 21 0
## 5 0 137 40 35.00000 168.0000 43.1 2.288 33 1
## 6 5 116 74 29.15342 155.5482 25.6 0.201 30 0
pima_imp %>% summarise_all(na_rate)
## pregnant glucose diastolic triceps insulin bmi diabetes age test
## 1 0 0 0 0 0 0 0 0 0
pima_na %>%
lm(glucose ~ bmi, data = .) %>%
summary()
##
## Call:
## lm(formula = glucose ~ bmi, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -71.317 -21.277 -4.001 18.790 80.863
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 92.5246 4.4900 20.61 < 2e-16 ***
## bmi 0.9117 0.1363 6.69 4.34e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 29.7 on 761 degrees of freedom
## (5 observations deleted due to missingness)
## Multiple R-squared: 0.05554, Adjusted R-squared: 0.0543
## F-statistic: 44.75 on 1 and 761 DF, p-value: 4.338e-11
pima_imp %>%
lm(glucose ~ bmi, data = .) %>%
summary()
##
## Call:
## lm(formula = glucose ~ bmi, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -71.339 -21.178 -3.964 18.675 80.844
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 92.6428 4.4668 20.740 < 2e-16 ***
## bmi 0.9078 0.1356 6.696 4.13e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 29.6 on 766 degrees of freedom
## Multiple R-squared: 0.0553, Adjusted R-squared: 0.05407
## F-statistic: 44.84 on 1 and 766 DF, p-value: 4.132e-11
#-------------------
# Method 2:
#-------------------
library(VIM)
plot_na <- aggr(pima_na,
col = c("navyblue", "yellow"),
numbers = TRUE,
sortVars = TRUE,
labels = names(pima_na),
cex.axis = .7, gap = 3,
ylab = c("Missing Data Rate", ""))

##
## Variables sorted by number of missings:
## Variable Count
## insulin 0.486979167
## triceps 0.295572917
## diastolic 0.045572917
## glucose 0.006510417
## pregnant 0.000000000
## bmi 0.000000000
## diabetes 0.000000000
## age 0.000000000
## test 0.000000000
library(mice)
df_impu <- mice(pima_na,
m = 2,
method = "pmm",
seed = 100)
##
## iter imp variable
## 1 1 glucose diastolic triceps insulin
## 1 2 glucose diastolic triceps insulin
## 2 1 glucose diastolic triceps insulin
## 2 2 glucose diastolic triceps insulin
## 3 1 glucose diastolic triceps insulin
## 3 2 glucose diastolic triceps insulin
## 4 1 glucose diastolic triceps insulin
## 4 2 glucose diastolic triceps insulin
## 5 1 glucose diastolic triceps insulin
## 5 2 glucose diastolic triceps insulin
data1 <- complete(df_impu, 1)
data2 <- complete(df_impu, 2)
summary(data1)
## pregnant glucose diastolic triceps
## Min. : 0.000 Min. : 44.00 Min. : 24.00 Min. : 7.00
## 1st Qu.: 1.000 1st Qu.: 99.75 1st Qu.: 64.00 1st Qu.:21.00
## Median : 3.000 Median :117.00 Median : 72.00 Median :29.00
## Mean : 3.845 Mean :121.77 Mean : 72.32 Mean :28.88
## 3rd Qu.: 6.000 3rd Qu.:141.00 3rd Qu.: 80.00 3rd Qu.:36.00
## Max. :17.000 Max. :199.00 Max. :122.00 Max. :99.00
## insulin bmi diabetes age
## Min. : 14.0 Min. : 0.00 Min. :0.0780 Min. :21.00
## 1st Qu.: 77.0 1st Qu.:27.30 1st Qu.:0.2437 1st Qu.:24.00
## Median :125.0 Median :32.00 Median :0.3725 Median :29.00
## Mean :152.1 Mean :31.99 Mean :0.4719 Mean :33.24
## 3rd Qu.:188.5 3rd Qu.:36.60 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :846.0 Max. :67.10 Max. :2.4200 Max. :81.00
## test
## Min. :0.000
## 1st Qu.:0.000
## Median :0.000
## Mean :0.349
## 3rd Qu.:1.000
## Max. :1.000
summary(data2)
## pregnant glucose diastolic triceps
## Min. : 0.000 Min. : 44.0 Min. : 24.00 Min. : 7.00
## 1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 64.00 1st Qu.:21.00
## Median : 3.000 Median :117.0 Median : 72.00 Median :28.00
## Mean : 3.845 Mean :121.7 Mean : 72.29 Mean :28.74
## 3rd Qu.: 6.000 3rd Qu.:141.0 3rd Qu.: 80.00 3rd Qu.:36.00
## Max. :17.000 Max. :199.0 Max. :122.00 Max. :99.00
## insulin bmi diabetes age
## Min. : 14.0 Min. : 0.00 Min. :0.0780 Min. :21.00
## 1st Qu.: 76.0 1st Qu.:27.30 1st Qu.:0.2437 1st Qu.:24.00
## Median :123.5 Median :32.00 Median :0.3725 Median :29.00
## Mean :148.9 Mean :31.99 Mean :0.4719 Mean :33.24
## 3rd Qu.:182.0 3rd Qu.:36.60 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :846.0 Max. :67.10 Max. :2.4200 Max. :81.00
## test
## Min. :0.000
## 1st Qu.:0.000
## Median :0.000
## Mean :0.349
## 3rd Qu.:1.000
## Max. :1.000
ols1 <- lm(glucose ~ bmi, data = data1)
ols2 <- lm(glucose ~ bmi, data = data2)
library(stargazer)
stargazer(ols1, ols2, title = "So sánh OLS1 và OLS2", type = "text")
##
## So sánh OLS1 và OLS2
## ===========================================================
## Dependent variable:
## ----------------------------
## glucose
## (1) (2)
## -----------------------------------------------------------
## bmi 0.912*** 0.942***
## (0.136) (0.137)
##
## Constant 92.593*** 91.601***
## (4.479) (4.504)
##
## -----------------------------------------------------------
## Observations 768 768
## R2 0.056 0.058
## Adjusted R2 0.054 0.057
## Residual Std. Error (df = 766) 29.680 29.846
## F Statistic (df = 1; 766) 45.014*** 47.475***
## ===========================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
fit <- with(data = df_impu, exp = lm(glucose ~ bmi))
summary(fit)
##
## ## summary of imputation 1 :
##
## Call:
## lm(formula = glucose ~ bmi)
##
## Residuals:
## Min 1Q Median 3Q Max
## -71.392 -21.261 -4.055 18.847 80.787
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 92.5925 4.4787 20.674 < 2e-16 ***
## bmi 0.9120 0.1359 6.709 3.8e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 29.68 on 766 degrees of freedom
## Multiple R-squared: 0.0555, Adjusted R-squared: 0.05427
## F-statistic: 45.01 on 1 and 766 DF, p-value: 3.803e-11
##
##
## ## summary of imputation 2 :
##
## Call:
## lm(formula = glucose ~ bmi)
##
## Residuals:
## Min 1Q Median 3Q Max
## -71.146 -21.356 -4.308 18.891 81.006
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 91.6009 4.5037 20.34 < 2e-16 ***
## bmi 0.9418 0.1367 6.89 1.16e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 29.85 on 766 degrees of freedom
## Multiple R-squared: 0.05836, Adjusted R-squared: 0.05713
## F-statistic: 47.47 on 1 and 766 DF, p-value: 1.164e-11
tonghop <- pool(fit)
tonghop
## Call: pool(object = fit)
##
## Pooled coefficients:
## (Intercept) bmi
## 92.0967382 0.9268998
##
## Fraction of information about the coefficients missing due to nonresponse:
## (Intercept) bmi
## 0.04025164 0.03953616
(ols1$coefficients[2] + ols2$coefficients[2]) / 2
## bmi
## 0.9268998
# Write a function for visualizing missing data:
missing_vis <- function(x){
x %>%
is.na() %>%
melt() %>%
ggplot(aes(x = Var2, fill = value)) +
geom_bar(aes(y = ..count..), alpha = 0.5) +
coord_flip() + theme_bw() +
scale_fill_manual(values = c("blue", "red"),
name = NULL,
labels = c("Available", "Missing")) +
labs(x = NULL, y = NULL, title = "Missing Data Rate in our Data Set")
}
missing_vis(pima_na)
