library(tidyverse)
library(googlesheets4)
With read_sheet() function of googlesheets4
package we can import dataset into R directly with URL link of google
sheet.
url <-"https://docs.google.com/spreadsheets/d/1RSX6W96XNyzzYg3kIAiRnKZON0sa1Wr3ybnw1n1klpo/edit?usp=sharing"
eclair <- read_sheet(url)
glimpse(eclair)
## Rows: 1,094
## Columns: 21
## $ No <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 1…
## $ INSTITUTE <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ HOSPITAL <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, …
## $ SEX <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ AGE <dbl> 13, 14, 13, 15, 14, 11, 19, 17, 15, 20, 18, 17, 21, 16, 17, …
## $ OCC <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ EXPTIME <dttm> 1990-08-25 18:00:00, 1990-08-25 18:00:00, 1990-08-25 18:00:…
## $ BEEFCURRY <dbl> 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ SALTEGG <dbl> 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ ECLAIR <dbl> 1, 0, 0, 50, 0, 0, 0, 1, 50, 0, 0, 1, 0, 0, 3, 1, 2, 1, 2, 1…
## $ WATER <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ OTHERS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ ONSET <dttm> 1990-08-25 22:00:00, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ NAUSEA <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, …
## $ VOMITING <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, …
## $ ABDPAIN <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, …
## $ DIARRHEA <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ OTHSYMP <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ INCPERIOD <dbl> 240, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 240, 0, 0, 180, 0, …
## $ CASECONT <chr> "1", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", …
## $ BMI <dbl> 16.47505, 18.47097, 23.71044, 20.10565, 16.99689, 21.83215, …
There are 21 variables and 1,094 observations in this dataset.
From the code book come with dataset in excel file tell us that 99 mean
NA for AGE, 90 mean NA for ECLAIR, and 9 mean NA for other categorical
variables.
eclair_clean <- eclair %>%
#Identify NAs
mutate(AGE = na_if(AGE, 99))%>%
mutate_at(c("HOSPITAL","OCC", "BEEFCURRY", "SALTEGG", "WATER", "OTHERS",
"NAUSEA", "VOMITING", "ABDPAIN", "DIARRHEA", "OTHSYMP"),
na_if, 9) %>%
mutate(ECLAIR = case_when(ECLAIR %in% c(1:80)~1,
ECLAIR == 90 ~ NA_real_,
TRUE ~ ECLAIR))%>%
mutate(WATER = factor(WATER, labels = c("not drank","drank")))%>%
mutate(CASECONT = factor(CASECONT, labels = c("non case","case")))%>%
mutate_at(c("BEEFCURRY","SALTEGG","ECLAIR"),
funs(factor(., labels = c("not ate","ate"))))%>%
mutate_at(c("NAUSEA", "VOMITING", "ABDPAIN", "DIARRHEA"),
funs(factor(.,labels = c("no","yes"))))%>%
mutate(SEX = factor(SEX, labels = c("female","male")))%>%
mutate(HOSPITAL = factor(HOSPITAL, labels = c("opd","admit")))%>%
mutate_at(c("ONSET","EXPTIME"),as.Date)
colSums(is.na(eclair_clean)) ## Show how many missing data for each variables.
## No INSTITUTE HOSPITAL SEX AGE OCC EXPTIME BEEFCURRY
## 0 0 54 0 60 28 39 5
## SALTEGG ECLAIR WATER OTHERS ONSET NAUSEA VOMITING ABDPAIN
## 5 117 6 6 629 0 0 0
## DIARRHEA OTHSYMP INCPERIOD CASECONT BMI
## 0 0 0 0 0
There are NAs in HOSPITAL, AGE, EXPTIME, BEEFCURRY, SALTEGG, WATER, OTHERS and ONSET.
We can create complete case dataset by subset original dataset with
complete.cases() function.
complete_case_eclair <- eclair_clean[complete.cases(eclair_clean),]
nrow(complete_case_eclair)
## [1] 373
Now for complete case dataset we have only 441 records, around 700 records were deleted due to missing data. Most are missing in ONSET because of non cases. Difference in sample size will lead to difference analysis result.
We can define na.rm = TURE to perform available case
analysis.
mean(complete_case_eclair$AGE)
## [1] 18.29223
mean(eclair_clean$AGE, na.rm = TRUE)
## [1] 19.32108
sd(complete_case_eclair$AGE)
## [1] 6.463119
sd(eclair_clean$AGE, na.rm = TRUE)
## [1] 7.810027
Note that there are difference in mean and SD between 2 methods.
table(complete_case_eclair$WATER,
complete_case_eclair$CASECONT)
##
## non case case
## not drank 0 5
## drank 1 367
table(eclair_clean$WATER,
eclair_clean$CASECONT)
##
## non case case
## not drank 15 10
## drank 604 459
This show how complete case analysis can alter the study result like odds ratio, especially when large number of NAs.
Now I will demonstrate mean imputation method for age variable.
sum(is.na(eclair_clean$AGE))
## [1] 60
There are 60 records that are missing in age variable. I will impute these with mean of age.
mean_age <- mean(eclair_clean$AGE, na.rm = TRUE)## obtained mean age by available case analysis first
mean_im_eclair <- eclair_clean %>%
mutate(AGE = replace_na(AGE,mean_age)) #Replace NAs with mean age
mean(mean_im_eclair$AGE)
## [1] 19.32108
sd(mean_im_eclair$AGE)
## [1] 7.592636
We can see that overall mean of age, after we imputed, dose not change from original dataset while SD is decrease due to more n.Â
result <-tibble(Method = c("complete case",
"available case",
"mean imputation"),
Mean = c(mean(complete_case_eclair$AGE),
mean(eclair_clean$AGE, na.rm = TRUE),
mean(mean_im_eclair$AGE)),
SD = c(sd(complete_case_eclair$AGE),
sd(eclair_clean$AGE, na.rm = TRUE),
sd(mean_im_eclair$AGE)))
result
## # A tibble: 3 × 3
## Method Mean SD
## <chr> <dbl> <dbl>
## 1 complete case 18.3 6.46
## 2 available case 19.3 7.81
## 3 mean imputation 19.3 7.59
Note that both mean and SD are altered by complete case analysis method, while mean imputation method only decrease the SD, not alter the mean.
ggplot()+ #Plot histograms
geom_histogram(aes(mean_im_eclair$AGE, fill = "#ff9973"), color = "black")+
geom_histogram(aes(eclair_clean$AGE,fill = "#ffa7bf"), color="black")+
geom_histogram(aes(complete_case_eclair$AGE,fill = "#00ffcc"), color="black")+
#label title and axises
labs(title = "Distribution of participants age of each type of anlysis")+
xlab("Age (years)")+
ylab("Count")+
#create plot legend
scale_fill_identity( guide = "legend",
name ="",
labels = c("Complete cases analysis",
"Mean imputation",
"Available cases analysis"))+
#adjust scale to start from (0,0)
scale_y_continuous(expand = c(0,0))+
scale_x_continuous(expand = c(0,0))+
#adjust plot theme
theme_bw()+
theme(legend.position = "top",
legend.text = element_text(size = 10),
panel.background = element_rect(color = "white"))+
#add annotation about mean imputation
annotate(geom = "text",
x = 40, y = 140,
label = "Orange bar represent\n60 NAs that were imputed with age mean 19.3 years")
#Calculate 95% confidence interval of mean age for each methods
ci_eclair_clean <- t.test(eclair_clean$AGE, conf.level = 0.95)$conf.int
ci_complete_case <- t.test(complete_case_eclair$AGE, conf.level = 0.95)$conf.int
ci_mean_imp <- t.test(mean_im_eclair$AGE, conf.level = 0.95)$conf.int
#Create dataframe
ci_table <- tibble( "methods" = c("available case", "complete case", "mean imputation"),
"lower_border" = c(ci_eclair_clean[1], ci_complete_case[1], ci_mean_imp[1]),
"upper_border" = c(ci_eclair_clean[2], ci_complete_case[2], ci_mean_imp[2]),
"mean" = c(mean(eclair_clean$AGE, na.rm = TRUE), mean(complete_case_eclair$AGE),
mean(mean_im_eclair$AGE)))
#Plot
ggplot(data = ci_table)+
geom_point(aes(x=mean, y=methods, color = methods))+
geom_point(aes(x=lower_border, y=methods, color = methods))+
geom_point(aes(x=upper_border, y=methods, color = methods))+
#Transform dataframe to long format before create line chart.
geom_line(data = pivot_longer(ci_table, cols = c(2:4),
names_to = "point", values_to = "value"),
aes(x= value, y= methods, color= methods))+
#Label
labs(title = "95% CI of mean age for each methods", x = "Mean age (years)" )+
theme_bw()
As plots show, complete case analysis can affect the 95% confidence interval and may lead to problem in inferential analysis. While mean imputation method make 95% confidence interval slightly wider.