Data Analysis Course at USTH (Section: Statistics, Part 2)

Biostatistics

Nguyen Chi Dung

#=====================================================
#                Simple Regression
#=====================================================

rm(list = ls())
path <- dir("F:/usth/data", full.names = TRUE)
pima_small <- read.csv(path[4])

# Perform OLS Regression: 
library(magrittr)
library(tidyverse)

set.seed(1)
train <- pima_small %>% sample_n(300)
test <- dplyr::setdiff(pima_small, train)

ols1 <- train %>% lm(glucose ~ bmi, data = .)

#---------------------------------------
#  1. Extract insights from lm object
#---------------------------------------

# OLS results: 
ols1 %>% summary()

## 
## Call:
## lm(formula = glucose ~ bmi, data = .)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -57.69 -22.82  -6.34  21.81  77.87 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  96.4857     8.8426  10.911  < 2e-16 ***
## bmi           0.8226     0.2601   3.162  0.00173 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 31 on 298 degrees of freedom
## Multiple R-squared:  0.03247,    Adjusted R-squared:  0.02922 
## F-statistic:    10 on 1 and 298 DF,  p-value: 0.001727

# Extract Coefficients:  
ols1$coefficients

## (Intercept)         bmi 
##  96.4856536   0.8225668

# Extract Residuals: 
res <- ols1$residuals
res %>% head()

##        105        146        224        354         79        348 
## -16.933546   4.261545 -39.972305  23.690486 -25.085140 -57.686776

# Extract fitted values: 
fit_val <- ols1$fitted.values
fit_val %>% head()

##      105      146      224      354       79      348 
## 125.9335 121.7385 122.9723 116.3095 127.0851 125.6868

# Use for prediction: 
pred <- predict(ols1, test)
pred %>% head() # Predicted values

##        1        2        3        4        5        6 
## 119.5998 131.9383 121.9852 134.1592 126.5916 115.5692

test$glucose %>% head() # Actual values

## [1]  89 137  78 118 143  97

# Regression line: 
theme_set(theme_minimal())

train %>% 
  ggplot(aes(bmi, glucose)) + 
  geom_point(alpha = 0.3) + 
  geom_smooth(method = "lm") + 
  labs(x = NULL, y = NULL, 
       title = "The Relationship between Glucose and BMI", 
       caption = "Data Source: The National Institute of Diabetes and Digestive and Kidney Diseases")

#---------------------------------------
#           2. Diagnostics
#---------------------------------------

# Use graphs: 
library(ggfortify)
autoplot(ols1, which = 1:6, label.size = 2)

#----------------------------------------
#     are error normally distributed?
#----------------------------------------
# Q-Q plot (method 1): 
autoplot(ols1, which = 2, label.size = 2)

# Histogram (method 2): 
train <- train %>% mutate(resd = ols1$residuals)

train %>% 
  ggplot(aes(resd)) + 
  geom_density(fill = "red", color = "red", alpha = 0.4) + 
  geom_histogram(aes(y = ..density..), fill = "blue", color = "blue", alpha = 0.4) + 
  geom_vline(xintercept = mean(train$resd), color = "yellow")

# Use an official test (https://en.wikipedia.org/wiki/Shapiro%E2%80%93Wilk_test): 
shapiro.test(train$resd)

## 
##  Shapiro-Wilk normality test
## 
## data:  train$resd
## W = 0.96683, p-value = 2.218e-06

#-----------------------------------------
#    Remedies for Assumption Violations 
#-----------------------------------------

# Use Tukey’s method (SPSS uses this method for handling outliers): 
ct <- mean(train$resd) + 2*sd(train$resd)
cd <- mean(train$resd) - 2*sd(train$resd)

train <- train %>% mutate(out = case_when(resd > ct ~ "Outlier",
                                          resd < cd ~ "Outlier", 
                                          resd <= ct & resd >= cd ~ "Normal"))

ols_re1 <- train %>% 
  filter(out == "Normal") %>% 
  lm(glucose ~ bmi, data = .)

ols_re1 %>% summary()

## 
## Call:
## lm(formula = glucose ~ bmi, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -55.116 -20.879  -5.213  21.821  63.745 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  92.5702     8.1657  11.336  < 2e-16 ***
## bmi           0.8604     0.2400   3.586 0.000395 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 28.34 on 287 degrees of freedom
## Multiple R-squared:  0.04287,    Adjusted R-squared:  0.03954 
## F-statistic: 12.86 on 1 and 287 DF,  p-value: 0.0003952

ols1 %>% summary()

## 
## Call:
## lm(formula = glucose ~ bmi, data = .)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -57.69 -22.82  -6.34  21.81  77.87 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  96.4857     8.8426  10.911  < 2e-16 ***
## bmi           0.8226     0.2601   3.162  0.00173 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 31 on 298 degrees of freedom
## Multiple R-squared:  0.03247,    Adjusted R-squared:  0.02922 
## F-statistic:    10 on 1 and 298 DF,  p-value: 0.001727

u <- ols_re1 %>% summary()
u$r.squared

## [1] 0.04287402

v <- ols1 %>% summary()
u$r.squared / v$r.squared - 1

## [1] 0.3205028

train %>% 
  ggplot(aes(bmi, glucose)) + 
  geom_point(data = train %>% filter(out == "Outlier"), 
             color = "red", size = 2) + 
  geom_smooth(method = "lm", se = FALSE) + 
  geom_point(data = train %>% filter(out != "Outlier"), 
             color = "purple", alpha = 0.3) + 
  geom_smooth(data = train %>% filter(out != "Outlier"), 
              method = "lm", se = FALSE, color = "green")

# Use Cook's Distance (https://en.wikipedia.org/wiki/Cook%27s_distance): 

cut_off <- 1 / nrow(train)

train %<>% mutate(d = cooks.distance(ols1),  
                  out_d = case_when(d > cut_off ~ "Outlier", 
                                    d <= cut_off ~ "Normal"))

train %>% 
  filter(out_d == "Outlier") %>% 
  head()

##   pregnant glucose diastolic triceps insulin  bmi diabetes age test
## 1       10      68       106      23      49 35.5    0.285  47    0
## 2        7     195        70      33     145 25.1    0.163  55    1
## 3        8     176        90      34     300 33.7    0.467  58    1
## 4        0     181        88      44     510 43.3    0.222  26    1
## 5        0     139        62      17     210 22.1    0.207  21    0
## 6        1      87        68      34      77 37.6    0.401  24    0
##        resd     out           d   out_d
## 1 -57.68678  Normal 0.006414617 Outlier
## 2  77.86792 Outlier 0.025842812 Outlier
## 3  51.79384  Normal 0.004701277 Outlier
## 4  48.89720  Normal 0.013197734 Outlier
## 5  24.33562  Normal 0.003838344 Outlier
## 6 -40.41417  Normal 0.003981352 Outlier

ols3 <- train %>% 
  filter(out_d != "Outlier") %>%
  lm(glucose ~ bmi, data = .)

ols3 %>% summary()

## 
## Call:
## lm(formula = glucose ~ bmi, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -35.553 -14.525  -3.383  13.616  45.333 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  92.9354     7.4638  12.452  < 2e-16 ***
## bmi           0.7956     0.2281   3.488 0.000592 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19.54 on 213 degrees of freedom
## Multiple R-squared:  0.05402,    Adjusted R-squared:  0.04958 
## F-statistic: 12.16 on 1 and 213 DF,  p-value: 0.0005916

autoplot(ols1, which = 4, ncol = 3, label.size = 1) # Method 1

train <- train  %>%  
  mutate(N = 1:nrow(.))

train %>% 
  ggplot(aes(N, d)) +
  geom_point(alpha = 0.2) + 
  geom_point(data = train %>% filter(d > cut_off), color = "red") + 
  labs(x = NULL, y = "Cook's  Distance", 
       title = "Outliers based on Cook's Distance") # Method 2.

# Use data transformation: 

train %<>% mutate(ln_bmi = log(bmi))
ols4 <- train %>% lm(glucose ~ ln_bmi, data = .)
ols4 %>% summary()

## 
## Call:
## lm(formula = glucose ~ ln_bmi, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -58.301 -23.168  -6.282  21.465  78.576 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)   24.603     30.254   0.813  0.41675   
## ln_bmi        28.491      8.668   3.287  0.00113 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 30.96 on 298 degrees of freedom
## Multiple R-squared:  0.03499,    Adjusted R-squared:  0.03175 
## F-statistic:  10.8 on 1 and 298 DF,  p-value: 0.001134

#---------------------------------------
#         3. Boostrap Method
#---------------------------------------

# Method 1: 

pima_small <- read.csv(path[4])
train <- pima_small %>% sample_n(300)
test <- dplyr::setdiff(pima_small, train)
ols1 <- train %>% lm(glucose ~ bmi, data = .)
ols1 %>% summary()

## 
## Call:
## lm(formula = glucose ~ bmi, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -58.831 -21.393  -6.712  21.766  76.963 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  94.8368     8.5097  11.145   <2e-16 ***
## bmi           0.8262     0.2507   3.296   0.0011 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 30.41 on 298 degrees of freedom
## Multiple R-squared:  0.03517,    Adjusted R-squared:  0.03193 
## F-statistic: 10.86 on 1 and 298 DF,  p-value: 0.0011

coef <- ols1$coefficients
coef[1]

## (Intercept) 
##    94.83678

coef[2]

##       bmi 
## 0.8262213

a <- c()
b <- c()
for (i in 1:1000) {
  set.seed(i)
  train <- pima_small %>% sample_n(nrow(.), replace = TRUE)
  ols1 <- train %>% lm(glucose ~ bmi, data = .)
  ols1 %>% summary()
  coef <- ols1$coefficients
  m <- coef[1]
  n <- coef[2]
  a <- c(a, m)
  b <- c(b, n)
}


my_bo <- data.frame(Intercept = a, Coeff = b)
my_bo %>% 
  gather(kieu_hs, giatri) %>% 
  ggplot(aes(giatri)) + 
  geom_density(fill = "red", color = "red", alpha = 0.4) + 
  geom_histogram(aes(y = ..density..), fill = "blue", color = "blue", alpha = 0.4) + 
  facet_wrap(~ kieu_hs, scales = "free") + 
  labs(x = NULL,  y = NULL, 
       title = "Coefficients from 1000 bootstrap sampling", 
       subtitle = "Note: Use our function (Method 1)", 
       caption = "Data Sourse: NIDDKD")

my_bo %>% summary()

##    Intercept          Coeff       
##  Min.   : 68.91   Min.   :0.2746  
##  1st Qu.: 87.27   1st Qu.:0.7844  
##  Median : 91.90   Median :0.9172  
##  Mean   : 92.01   Mean   :0.9253  
##  3rd Qu.: 96.76   3rd Qu.:1.0708  
##  Max.   :114.56   Max.   :1.6703

# Use Shapiro test for checking normality: 
shapiro.test(my_bo$Coeff)

## 
##  Shapiro-Wilk normality test
## 
## data:  my_bo$Coeff
## W = 0.99901, p-value = 0.8778

# Calculate 95% Confidence Interval for Coeff 
# (https://www.mathsisfun.com/data/confidence-interval.html): 

tinh_95_in <- function(x) {
  ct <- mean(x) + 1.96*sd(x) / sqrt(length(x))
  cd <- mean(x) - 1.96*sd(x) / sqrt(length(x))
  print(paste("Lower bound:", round(cd, 3)))
  print(paste("Upper bound:", round(ct, 3)))
}

tinh_95_in(my_bo$Coeff)

## [1] "Lower bound: 0.912"
## [1] "Upper bound: 0.938"

# Method 2: 
library(broom)

set.seed(1)
boot1000 <- pima_small %>% 
  bootstrap(1000) %>% 
  do(tidy(lm(.$glucose ~ .$bmi, trace = F)))

boot1000 %<>% ungroup()
boot1000 %>% head()

## # A tibble: 6 x 6
##   replicate term        estimate std.error statistic               p.value
##       <int> <chr>          <dbl>     <dbl>     <dbl>                 <dbl>
## 1         1 (Intercept)   94.2       7.21      13.1               1.23e-32
## 2         1 .$bmi          0.851     0.212      4.01              7.20e- 5
## 3         2 (Intercept)  101         7.90      12.8               1.02e-31
## 4         2 .$bmi          0.667     0.239      2.79              5.51e- 3
## 5         3 (Intercept)   89.5       7.43      12.0               1.39e-28
## 6         3 .$bmi          1.07      0.222      4.80              2.24e- 6

boot1000 %>% 
  ggplot(aes(estimate)) + 
  geom_density(fill = "red", color = "red", alpha = 0.4) + 
  geom_histogram(aes(y = ..density..), fill = "blue", color = "blue", alpha = 0.4) + 
  facet_wrap(~ term, scales = "free") + 
  labs(x = NULL,  y = NULL, 
       title = "Coefficients from 1000 bootstrap sampling", 
       subtitle = "Note: Use broom package (Method 2)", 
       caption = "Data Sourse: NIDDKD")

# 95% Confidence Interval for Coeff : 
boot1000 %>% 
  filter(term == ".$bmi") %>% 
  pull(estimate) %>% 
  tinh_95_in()

## [1] "Lower bound: 0.905"
## [1] "Upper bound: 0.931"

#--------------------------------------------
#  4. Missing Data and  Imputation Method 
#--------------------------------------------

#------------
#  Method 1
#------------

# Load data: 
pima <- read.csv(path[3])

# Convert zero to NA and preprocessing data: 
convert_NA <- function(x) {case_when(x == 0 ~ NA_integer_, 
                                     x != 0 ~ x)}

pima_na <- pima %>% 
  mutate_at(.vars = c("glucose", "diastolic", "triceps", "insulin"), 
            .funs = convert_NA)

pima_na %>% head()

##   pregnant glucose diastolic triceps insulin  bmi diabetes age test
## 1        6     148        72      35      NA 33.6    0.627  50    1
## 2        1      85        66      29      NA 26.6    0.351  31    0
## 3        8     183        64      NA      NA 23.3    0.672  32    1
## 4        1      89        66      23      94 28.1    0.167  21    0
## 5        0     137        40      35     168 43.1    2.288  33    1
## 6        5     116        74      NA      NA 25.6    0.201  30    0

# A function for calculating NA rate:  
na_rate <- function(x) {
  return(100*sum(is.na(x)) / length(x))
}


# Use our function: 
pima_na %>% 
  summarise_all(na_rate)

##   pregnant   glucose diastolic  triceps  insulin bmi diabetes age test
## 1        0 0.6510417  4.557292 29.55729 48.69792   0        0   0    0

# A function for imputing NA by mean: 

imputing_by_mean <- function(x) {
  tb <- mean(x, na.rm = TRUE)
  x[is.na(x)] <- tb
  return(x)
}

pima_imp <- pima_na %>% mutate_all(imputing_by_mean)
pima_imp %>% head()

##   pregnant glucose diastolic  triceps  insulin  bmi diabetes age test
## 1        6     148        72 35.00000 155.5482 33.6    0.627  50    1
## 2        1      85        66 29.00000 155.5482 26.6    0.351  31    0
## 3        8     183        64 29.15342 155.5482 23.3    0.672  32    1
## 4        1      89        66 23.00000  94.0000 28.1    0.167  21    0
## 5        0     137        40 35.00000 168.0000 43.1    2.288  33    1
## 6        5     116        74 29.15342 155.5482 25.6    0.201  30    0

pima_imp %>% summarise_all(na_rate)

##   pregnant glucose diastolic triceps insulin bmi diabetes age test
## 1        0       0         0       0       0   0        0   0    0

pima_na %>% 
  lm(glucose ~ bmi, data = .) %>% 
  summary()

## 
## Call:
## lm(formula = glucose ~ bmi, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -71.317 -21.277  -4.001  18.790  80.863 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  92.5246     4.4900   20.61  < 2e-16 ***
## bmi           0.9117     0.1363    6.69 4.34e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 29.7 on 761 degrees of freedom
##   (5 observations deleted due to missingness)
## Multiple R-squared:  0.05554,    Adjusted R-squared:  0.0543 
## F-statistic: 44.75 on 1 and 761 DF,  p-value: 4.338e-11

pima_imp %>% 
  lm(glucose ~ bmi, data = .) %>% 
  summary()

## 
## Call:
## lm(formula = glucose ~ bmi, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -71.339 -21.178  -3.964  18.675  80.844 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  92.6428     4.4668  20.740  < 2e-16 ***
## bmi           0.9078     0.1356   6.696 4.13e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 29.6 on 766 degrees of freedom
## Multiple R-squared:  0.0553, Adjusted R-squared:  0.05407 
## F-statistic: 44.84 on 1 and 766 DF,  p-value: 4.132e-11

#-------------------
#  Method 2: 
#-------------------

library(VIM)

plot_na <- aggr(pima_na, 
                col = c("navyblue", "yellow"),
                numbers = TRUE, 
                sortVars = TRUE, 
                labels = names(pima_na), 
                cex.axis = .7, gap = 3, 
                ylab = c("Missing Data Rate", ""))

## 
##  Variables sorted by number of missings: 
##   Variable       Count
##    insulin 0.486979167
##    triceps 0.295572917
##  diastolic 0.045572917
##    glucose 0.006510417
##   pregnant 0.000000000
##        bmi 0.000000000
##   diabetes 0.000000000
##        age 0.000000000
##       test 0.000000000

library(mice)
df_impu <- mice(pima_na,
                m = 2, 
                method = "pmm", 
                seed = 100)

## 
##  iter imp variable
##   1   1  glucose  diastolic  triceps  insulin
##   1   2  glucose  diastolic  triceps  insulin
##   2   1  glucose  diastolic  triceps  insulin
##   2   2  glucose  diastolic  triceps  insulin
##   3   1  glucose  diastolic  triceps  insulin
##   3   2  glucose  diastolic  triceps  insulin
##   4   1  glucose  diastolic  triceps  insulin
##   4   2  glucose  diastolic  triceps  insulin
##   5   1  glucose  diastolic  triceps  insulin
##   5   2  glucose  diastolic  triceps  insulin

data1 <- complete(df_impu, 1) 
data2 <- complete(df_impu, 2) 
summary(data1)

##     pregnant         glucose         diastolic         triceps     
##  Min.   : 0.000   Min.   : 44.00   Min.   : 24.00   Min.   : 7.00  
##  1st Qu.: 1.000   1st Qu.: 99.75   1st Qu.: 64.00   1st Qu.:21.00  
##  Median : 3.000   Median :117.00   Median : 72.00   Median :29.00  
##  Mean   : 3.845   Mean   :121.77   Mean   : 72.32   Mean   :28.88  
##  3rd Qu.: 6.000   3rd Qu.:141.00   3rd Qu.: 80.00   3rd Qu.:36.00  
##  Max.   :17.000   Max.   :199.00   Max.   :122.00   Max.   :99.00  
##     insulin           bmi           diabetes           age       
##  Min.   : 14.0   Min.   : 0.00   Min.   :0.0780   Min.   :21.00  
##  1st Qu.: 77.0   1st Qu.:27.30   1st Qu.:0.2437   1st Qu.:24.00  
##  Median :125.0   Median :32.00   Median :0.3725   Median :29.00  
##  Mean   :152.1   Mean   :31.99   Mean   :0.4719   Mean   :33.24  
##  3rd Qu.:188.5   3rd Qu.:36.60   3rd Qu.:0.6262   3rd Qu.:41.00  
##  Max.   :846.0   Max.   :67.10   Max.   :2.4200   Max.   :81.00  
##       test      
##  Min.   :0.000  
##  1st Qu.:0.000  
##  Median :0.000  
##  Mean   :0.349  
##  3rd Qu.:1.000  
##  Max.   :1.000

summary(data2)

##     pregnant         glucose        diastolic         triceps     
##  Min.   : 0.000   Min.   : 44.0   Min.   : 24.00   Min.   : 7.00  
##  1st Qu.: 1.000   1st Qu.: 99.0   1st Qu.: 64.00   1st Qu.:21.00  
##  Median : 3.000   Median :117.0   Median : 72.00   Median :28.00  
##  Mean   : 3.845   Mean   :121.7   Mean   : 72.29   Mean   :28.74  
##  3rd Qu.: 6.000   3rd Qu.:141.0   3rd Qu.: 80.00   3rd Qu.:36.00  
##  Max.   :17.000   Max.   :199.0   Max.   :122.00   Max.   :99.00  
##     insulin           bmi           diabetes           age       
##  Min.   : 14.0   Min.   : 0.00   Min.   :0.0780   Min.   :21.00  
##  1st Qu.: 76.0   1st Qu.:27.30   1st Qu.:0.2437   1st Qu.:24.00  
##  Median :123.5   Median :32.00   Median :0.3725   Median :29.00  
##  Mean   :148.9   Mean   :31.99   Mean   :0.4719   Mean   :33.24  
##  3rd Qu.:182.0   3rd Qu.:36.60   3rd Qu.:0.6262   3rd Qu.:41.00  
##  Max.   :846.0   Max.   :67.10   Max.   :2.4200   Max.   :81.00  
##       test      
##  Min.   :0.000  
##  1st Qu.:0.000  
##  Median :0.000  
##  Mean   :0.349  
##  3rd Qu.:1.000  
##  Max.   :1.000

ols1 <- lm(glucose ~ bmi, data = data1)
ols2 <- lm(glucose ~ bmi, data = data2)
library(stargazer)
stargazer(ols1, ols2, title = "So sánh OLS1 và OLS2", type = "text")

## 
## So sánh OLS1 và OLS2
## ===========================================================
##                                    Dependent variable:     
##                                ----------------------------
##                                          glucose           
##                                     (1)            (2)     
## -----------------------------------------------------------
## bmi                               0.912***      0.942***   
##                                   (0.136)        (0.137)   
##                                                            
## Constant                         92.593***      91.601***  
##                                   (4.479)        (4.504)   
##                                                            
## -----------------------------------------------------------
## Observations                        768            768     
## R2                                 0.056          0.058    
## Adjusted R2                        0.054          0.057    
## Residual Std. Error (df = 766)     29.680        29.846    
## F Statistic (df = 1; 766)        45.014***      47.475***  
## ===========================================================
## Note:                           *p<0.1; **p<0.05; ***p<0.01

fit <- with(data = df_impu, exp = lm(glucose ~ bmi))
summary(fit)

## 
##  ## summary of imputation 1 :
## 
## Call:
## lm(formula = glucose ~ bmi)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -71.392 -21.261  -4.055  18.847  80.787 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  92.5925     4.4787  20.674  < 2e-16 ***
## bmi           0.9120     0.1359   6.709  3.8e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 29.68 on 766 degrees of freedom
## Multiple R-squared:  0.0555, Adjusted R-squared:  0.05427 
## F-statistic: 45.01 on 1 and 766 DF,  p-value: 3.803e-11
## 
## 
##  ## summary of imputation 2 :
## 
## Call:
## lm(formula = glucose ~ bmi)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -71.146 -21.356  -4.308  18.891  81.006 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  91.6009     4.5037   20.34  < 2e-16 ***
## bmi           0.9418     0.1367    6.89 1.16e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 29.85 on 766 degrees of freedom
## Multiple R-squared:  0.05836,    Adjusted R-squared:  0.05713 
## F-statistic: 47.47 on 1 and 766 DF,  p-value: 1.164e-11

tonghop <- pool(fit)
tonghop

## Call: pool(object = fit)
## 
## Pooled coefficients:
## (Intercept)         bmi 
##  92.0967382   0.9268998 
## 
## Fraction of information about the coefficients missing due to nonresponse: 
## (Intercept)         bmi 
##  0.04025164  0.03953616

(ols1$coefficients[2] + ols2$coefficients[2]) / 2

##       bmi 
## 0.9268998

# Write a function for visualizing missing data: 

missing_vis <- function(x){
  x %>% 
  is.na() %>% 
  melt() %>% 
  ggplot(aes(x = Var2, fill = value)) +
  geom_bar(aes(y = ..count..), alpha = 0.5) + 
  coord_flip() + theme_bw() + 
  scale_fill_manual(values = c("blue", "red"), 
                    name = NULL, 
                    labels = c("Available", "Missing")) + 
  labs(x = NULL, y = NULL, title = "Missing Data Rate in our Data Set")
}


missing_vis(pima_na)