Prepare useful packages.

library(tidyverse)
library(googlesheets4)

Import familiar dataset into R.

With read_sheet() function of googlesheets4 package we can import dataset into R directly with URL link of google sheet.

url <-"https://docs.google.com/spreadsheets/d/1RSX6W96XNyzzYg3kIAiRnKZON0sa1Wr3ybnw1n1klpo/edit?usp=sharing"
eclair <- read_sheet(url)

glimpse(eclair)

## Rows: 1,094
## Columns: 21
## $ No        <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 1…
## $ INSTITUTE <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ HOSPITAL  <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, …
## $ SEX       <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ AGE       <dbl> 13, 14, 13, 15, 14, 11, 19, 17, 15, 20, 18, 17, 21, 16, 17, …
## $ OCC       <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ EXPTIME   <dttm> 1990-08-25 18:00:00, 1990-08-25 18:00:00, 1990-08-25 18:00:…
## $ BEEFCURRY <dbl> 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ SALTEGG   <dbl> 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ ECLAIR    <dbl> 1, 0, 0, 50, 0, 0, 0, 1, 50, 0, 0, 1, 0, 0, 3, 1, 2, 1, 2, 1…
## $ WATER     <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ OTHERS    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ ONSET     <dttm> 1990-08-25 22:00:00, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ NAUSEA    <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, …
## $ VOMITING  <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, …
## $ ABDPAIN   <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, …
## $ DIARRHEA  <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ OTHSYMP   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ INCPERIOD <dbl> 240, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 240, 0, 0, 180, 0, …
## $ CASECONT  <chr> "1", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", …
## $ BMI       <dbl> 16.47505, 18.47097, 23.71044, 20.10565, 16.99689, 21.83215, …

There are 21 variables and 1,094 observations in this dataset.
From the code book come with dataset in excel file tell us that 99 mean NA for AGE, 90 mean NA for ECLAIR, and 9 mean NA for other categorical variables.

Clean dataset.

eclair_clean <- eclair %>%
      #Identify NAs
      mutate(AGE = na_if(AGE, 99))%>%
      mutate_at(c("HOSPITAL","OCC", "BEEFCURRY", "SALTEGG", "WATER", "OTHERS", 
                    "NAUSEA", "VOMITING", "ABDPAIN", "DIARRHEA", "OTHSYMP"),
                na_if, 9) %>%
      mutate(ECLAIR = case_when(ECLAIR %in% c(1:80)~1,
                                ECLAIR == 90 ~ NA_real_,
                                TRUE ~ ECLAIR))%>%
      mutate(WATER = factor(WATER, labels = c("not drank","drank")))%>%
      mutate(CASECONT = factor(CASECONT, labels = c("non case","case")))%>%
      mutate_at(c("BEEFCURRY","SALTEGG","ECLAIR"),
                funs(factor(., labels = c("not ate","ate"))))%>%
      mutate_at(c("NAUSEA", "VOMITING", "ABDPAIN", "DIARRHEA"),
                funs(factor(.,labels = c("no","yes"))))%>%
      mutate(SEX = factor(SEX, labels = c("female","male")))%>%
      mutate(HOSPITAL = factor(HOSPITAL, labels = c("opd","admit")))%>%
      mutate_at(c("ONSET","EXPTIME"),as.Date)


colSums(is.na(eclair_clean)) ## Show how many missing data for each variables.

##        No INSTITUTE  HOSPITAL       SEX       AGE       OCC   EXPTIME BEEFCURRY 
##         0         0        54         0        60        28        39         5 
##   SALTEGG    ECLAIR     WATER    OTHERS     ONSET    NAUSEA  VOMITING   ABDPAIN 
##         5       117         6         6       629         0         0         0 
##  DIARRHEA   OTHSYMP INCPERIOD  CASECONT       BMI 
##         0         0         0         0         0

There are NAs in HOSPITAL, AGE, EXPTIME, BEEFCURRY, SALTEGG, WATER, OTHERS and ONSET.

Create complete cases data

We can create complete case dataset by subset original dataset with complete.cases() function.

complete_case_eclair <- eclair_clean[complete.cases(eclair_clean),]
nrow(complete_case_eclair)

## [1] 373

Now for complete case dataset we have only 441 records, around 700 records were deleted due to missing data. Most are missing in ONSET because of non cases. Difference in sample size will lead to difference analysis result.

Compare basic statistic between complete and available case analysis

We can define na.rm = TURE to perform available case analysis.

mean(complete_case_eclair$AGE)

## [1] 18.29223

mean(eclair_clean$AGE, na.rm = TRUE)

## [1] 19.32108

sd(complete_case_eclair$AGE)

## [1] 6.463119

sd(eclair_clean$AGE, na.rm = TRUE)

## [1] 7.810027

Note that there are difference in mean and SD between 2 methods.

Create cross table to examine the difference between 2 methods

table(complete_case_eclair$WATER,
      complete_case_eclair$CASECONT)

##            
##             non case case
##   not drank        0    5
##   drank            1  367

table(eclair_clean$WATER,
      eclair_clean$CASECONT)

##            
##             non case case
##   not drank       15   10
##   drank          604  459

This show how complete case analysis can alter the study result like odds ratio, especially when large number of NAs.

Mean imputation example

Now I will demonstrate mean imputation method for age variable.

sum(is.na(eclair_clean$AGE))

## [1] 60

There are 60 records that are missing in age variable. I will impute these with mean of age.

mean_age <- mean(eclair_clean$AGE, na.rm = TRUE)## obtained mean age by available case analysis first

mean_im_eclair <- eclair_clean %>%
                  mutate(AGE = replace_na(AGE,mean_age)) #Replace NAs with mean age

mean(mean_im_eclair$AGE)

## [1] 19.32108

sd(mean_im_eclair$AGE)

## [1] 7.592636

We can see that overall mean of age, after we imputed, dose not change from original dataset while SD is decrease due to more n.

Summary basic statistic differnce between these methods

result <-tibble(Method = c("complete case",
                           "available case",
                           "mean imputation"),
                  Mean = c(mean(complete_case_eclair$AGE), 
                           mean(eclair_clean$AGE, na.rm = TRUE), 
                           mean(mean_im_eclair$AGE)),
                    SD = c(sd(complete_case_eclair$AGE), 
                           sd(eclair_clean$AGE, na.rm = TRUE), 
                           sd(mean_im_eclair$AGE)))
result

## # A tibble: 3 × 3
##   Method           Mean    SD
##   <chr>           <dbl> <dbl>
## 1 complete case    18.3  6.46
## 2 available case   19.3  7.81
## 3 mean imputation  19.3  7.59

Note that both mean and SD are altered by complete case analysis method, while mean imputation method only decrease the SD, not alter the mean.

Difference in distribution for each method.

Histogram

ggplot()+ #Plot histograms
      geom_histogram(aes(mean_im_eclair$AGE, fill = "#ff9973"), color = "black")+
      geom_histogram(aes(eclair_clean$AGE,fill = "#ffa7bf"), color="black")+
      geom_histogram(aes(complete_case_eclair$AGE,fill = "#00ffcc"), color="black")+
      #label title and axises
      labs(title = "Distribution of participants age of each type of anlysis")+
      xlab("Age (years)")+
      ylab("Count")+
      #create plot legend
      scale_fill_identity( guide = "legend",
                           name ="",
                           labels = c("Complete cases analysis",
                                      "Mean imputation",
                                      "Available cases analysis"))+
      #adjust scale to start from (0,0)
      scale_y_continuous(expand = c(0,0))+
      scale_x_continuous(expand = c(0,0))+
      #adjust plot theme
      theme_bw()+
      theme(legend.position = "top",
            legend.text = element_text(size = 10),
            panel.background = element_rect(color = "white"))+
      #add annotation about mean imputation
      annotate(geom = "text",
               x = 40, y = 140,
               label = "Orange bar represent\n60 NAs that were imputed with age mean 19.3 years")

95% Confidence interval

#Calculate 95% confidence interval of mean age for each methods
ci_eclair_clean <- t.test(eclair_clean$AGE, conf.level = 0.95)$conf.int
ci_complete_case <- t.test(complete_case_eclair$AGE, conf.level = 0.95)$conf.int
ci_mean_imp <- t.test(mean_im_eclair$AGE, conf.level = 0.95)$conf.int
#Create dataframe
ci_table <- tibble( "methods" = c("available case", "complete case", "mean imputation"),
                    "lower_border" = c(ci_eclair_clean[1], ci_complete_case[1], ci_mean_imp[1]),
                    "upper_border" = c(ci_eclair_clean[2], ci_complete_case[2], ci_mean_imp[2]),
                    "mean" = c(mean(eclair_clean$AGE, na.rm = TRUE), mean(complete_case_eclair$AGE),
                               mean(mean_im_eclair$AGE)))

#Plot
ggplot(data = ci_table)+
   geom_point(aes(x=mean, y=methods, color = methods))+
   geom_point(aes(x=lower_border, y=methods, color = methods))+
   geom_point(aes(x=upper_border, y=methods, color = methods))+
   #Transform dataframe to long format before create line chart.
   geom_line(data = pivot_longer(ci_table, cols = c(2:4), 
                                 names_to = "point", values_to = "value"),
             aes(x= value, y= methods, color= methods))+
   #Label
   labs(title = "95% CI of mean age for each methods", x = "Mean age (years)" )+
   theme_bw()

As plots show, complete case analysis can affect the 95% confidence interval and may lead to problem in inferential analysis. While mean imputation method make 95% confidence interval slightly wider.

Basic methods for dealing with missing data with R

Jirapanakorn Sutham M.D.

2022-08-09