Libraries

library(tidyverse)
library(pacman)
library(MASS)
library(psych)
p_load(grid, gridExtra, robustbase) 

Dataset birthwt is from the MASS library

str(birthwt)
## 'data.frame':    189 obs. of  10 variables:
##  $ low  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ age  : int  19 33 20 21 18 21 22 17 29 26 ...
##  $ lwt  : int  182 155 105 108 107 124 118 103 123 113 ...
##  $ race : int  2 3 1 1 1 3 1 3 1 1 ...
##  $ smoke: int  0 0 1 1 1 0 0 0 1 1 ...
##  $ ptl  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ ht   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ ui   : int  1 0 0 1 1 0 0 0 0 0 ...
##  $ ftv  : int  0 3 1 2 0 0 1 1 1 0 ...
##  $ bwt  : int  2523 2551 2557 2594 2600 2622 2637 2637 2663 2665 ...

Description of the variables

# variable name variable label coded levels
1 low indicator of birth weight less than 2.5 kg 0, 1
2 age mother’s age in years continuous variable
3 lwt mother’s weight in pounds at last menstrual period continuous variable
4 race mother’s race (1 = white, 2 = black, 3 = other) 1, 2, 3
5 smoke smoking status during pregnancy 0, 1
6 ptl number of previous premature labours 0, 1, 2, 3
7 ht history of hypertension 0, 1
8 ui presence of uterine irritability 0, 1
9 ftv number of physician visits during the first trimester 0, 1, 2, 3, 4, 6
10 bwt birth weight in grams continuous variable

Objective

To study the interrelationships of variables with respect to a continuous outcome variable bwt.

Save the dataset as a csv file and import as a tibble

write_csv(birthwt, "birthwt.csv")


tbwt <- read_csv("datalib/birthwt.csv")


str(tbwt)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 189 obs. of  10 variables:
##  $ low  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ age  : num  19 33 20 21 18 21 22 17 29 26 ...
##  $ lwt  : num  182 155 105 108 107 124 118 103 123 113 ...
##  $ race : num  2 3 1 1 1 3 1 3 1 1 ...
##  $ smoke: num  0 0 1 1 1 0 0 0 1 1 ...
##  $ ptl  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ht   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ui   : num  1 0 0 1 1 0 0 0 0 0 ...
##  $ ftv  : num  0 3 1 2 0 0 1 1 1 0 ...
##  $ bwt  : num  2523 2551 2557 2594 2600 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   low = col_double(),
##   ..   age = col_double(),
##   ..   lwt = col_double(),
##   ..   race = col_double(),
##   ..   smoke = col_double(),
##   ..   ptl = col_double(),
##   ..   ht = col_double(),
##   ..   ui = col_double(),
##   ..   ftv = col_double(),
##   ..   bwt = col_double()
##   .. )

Drop a variable (i.e., low) that won’t be needed for analysis

tbwt <- tbwt[-1]

str(tbwt)
## Classes 'tbl_df', 'tbl' and 'data.frame':    189 obs. of  9 variables:
##  $ age  : num  19 33 20 21 18 21 22 17 29 26 ...
##  $ lwt  : num  182 155 105 108 107 124 118 103 123 113 ...
##  $ race : num  2 3 1 1 1 3 1 3 1 1 ...
##  $ smoke: num  0 0 1 1 1 0 0 0 1 1 ...
##  $ ptl  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ht   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ui   : num  1 0 0 1 1 0 0 0 0 0 ...
##  $ ftv  : num  0 3 1 2 0 0 1 1 1 0 ...
##  $ bwt  : num  2523 2551 2557 2594 2600 ...

Recode some variables (i.e., low, race, smoke, ptl, ht, ui, ftv) as factors

tbwt <- tbwt %>% mutate(
  race=as_factor(race), 
  smoke=as_factor(smoke), 
  ptl=as_factor(ptl),
  ht=as_factor(ht), 
  ui=as_factor(ui),
  ftv=as_factor(ftv))

glimpse(tbwt)
## Observations: 189
## Variables: 9
## $ age   <dbl> 19, 33, 20, 21, 18, 21, 22, 17, 29, 26, 19, 19, 22, 30, ...
## $ lwt   <dbl> 182, 155, 105, 108, 107, 124, 118, 103, 123, 113, 95, 15...
## $ race  <fct> 2, 3, 1, 1, 1, 3, 1, 3, 1, 1, 3, 3, 3, 3, 1, 1, 2, 1, 3,...
## $ smoke <fct> 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0,...
## $ ptl   <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,...
## $ ht    <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,...
## $ ui    <fct> 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,...
## $ ftv   <fct> 0, 3, 1, 2, 0, 0, 1, 1, 1, 0, 0, 1, 0, 2, 0, 0, 0, 3, 0,...
## $ bwt   <dbl> 2523, 2551, 2557, 2594, 2600, 2622, 2637, 2637, 2663, 26...

Plot each continuous variable (i.e, age, lwt, bwt): univariate analysis

Summary statistics could be generated for the continuous variables.

Quick summary of the continuous variables (i.e., age, lwt, bwt)

t(describe(tbwt[c(1,2,9)])[, -1])
##                  age        lwt          bwt
## n        189.0000000 189.000000  189.0000000
## mean      23.2380952 129.814815 2944.5873016
## sd         5.2986779  30.579380  729.2142952
## median    23.0000000 121.000000 2977.0000000
## trimmed   22.8954248 126.071895 2961.7581699
## mad        5.9304000  20.756400  834.7038000
## min       14.0000000  80.000000  709.0000000
## max       45.0000000 250.000000 4990.0000000
## range     31.0000000 170.000000 4281.0000000
## skew       0.7107606   1.379831   -0.2053370
## kurtosis   0.5307806   2.253148   -0.1436834
## se         0.3854221   2.224323   53.0425350

Histograms

grid.arrange(
  
ggplot(data=tbwt, mapping=aes(x=age)) +
  geom_histogram(bins = 30) +
  coord_cartesian(ylim = c(0, 20)),

ggplot(data=tbwt, mapping=aes(x=lwt)) +
  geom_histogram(bins = 30) +
  coord_cartesian(ylim = c(0, 20)),

ggplot(data=tbwt, mapping=aes(x=bwt)) +
  geom_histogram(bins = 30) +
  coord_cartesian(ylim = c(0, 20)),

nrow=1,

top = "Histograms: Univariates")

Density plots

grid.arrange(

ggplot(data=tbwt) +
  geom_density(mapping=aes(x=age), color="blue"),

ggplot(data=tbwt) +
  geom_density(mapping=aes(x=lwt), color="green"),

ggplot(data=tbwt) +
  geom_density(mapping=aes(x=bwt), color="red"),

nrow=1,

top="Density plots: Univariates")

Standard normal density plots

grid.arrange(

ggplot(data.frame(age=tbwt$age), aes(age)) +
  stat_function(fun = dnorm, colour = "blue", geom = "point", args = list(mean = 23.24, sd = 5.3)),

ggplot(data.frame(lwt=tbwt$lwt), aes(lwt)) +
  stat_function(fun = dnorm, colour = "green", geom = "point", args = list(mean = 129.81, sd = 30.58)),

ggplot(data.frame(bwt=tbwt$bwt), aes(bwt)) +
  stat_function(fun = dnorm, colour = "red", geom = "point", args = list(mean = 2944.59, sd = 729.21)),

nrow=1,

top="Normal density plots: Univariates")

Box plots

grid.arrange(

ggplot(data=tbwt, mapping=aes(y=age, x=1)) +
  geom_boxplot(fill = "white", colour = "#3366FF", outlier.colour = "red") +
  geom_text(aes(label = ifelse(age %in% 
         boxplot.stats(age)$out,
         as.character(age), "")), hjust = 1.5),

ggplot(data=tbwt, mapping=aes(y=lwt, x=1)) +
  geom_boxplot(fill = "white", colour = "#3366FF", outlier.colour = "red") +
  geom_text(aes(label = ifelse(lwt %in% 
         boxplot.stats(lwt)$out,
         as.character(lwt), "")), hjust = 1.5),

ggplot(data=tbwt, mapping=aes(y=bwt, x=1)) +
  geom_boxplot(fill = "white", colour = "#3366FF", outlier.colour = "red") +
  geom_text(aes(label = ifelse(bwt %in% 
         boxplot.stats(bwt)$out,
         as.character(bwt), "")), hjust = 1.5),
nrow=1,

top="Box plots: Univariates")

Plot each categorical variable or factor (i.e., race, smoke, ptl, ht, ui, ftv): univariate analysis

Remember categorical variables have various levels.

Barplots with counts

grid.arrange(
  
ggplot(data = tbwt) + 
  geom_bar(mapping = aes(x = race, fill=race)) +
  coord_flip(),

ggplot(data = tbwt) + 
  geom_bar(mapping = aes(x = smoke, fill=smoke)) +
  coord_flip(),

ggplot(data = tbwt) + 
  geom_bar(mapping = aes(x = ptl, fill=ptl)) +
  coord_flip(),

ggplot(data = tbwt) + 
  geom_bar(mapping = aes(x = ht, fill=ht)) +
  coord_flip(),

ggplot(data = tbwt) + 
  geom_bar(mapping = aes(x = ui, fill=ui)) +
  coord_flip(),

ggplot(data = tbwt) + 
  geom_bar(mapping = aes(x = ftv, fill=ftv)) +
  coord_flip(),

nrow=2,

top="Barplot plots: Univariates")

Barplots with proportions

grid.arrange(
  
ggplot(data = tbwt) + 
  geom_bar(mapping = aes(x = race, y = ..prop.., group=1)) +
  coord_flip(),

ggplot(data = tbwt) + 
  geom_bar(mapping = aes(x = smoke, y = ..prop.., group=1)) +
  coord_flip(),

ggplot(data = tbwt) + 
  geom_bar(mapping = aes(x = ptl, y = ..prop.., group=1)) +
  coord_flip(),

ggplot(data = tbwt) + 
  geom_bar(mapping = aes(x = ht, y = ..prop.., group=1)) +
  coord_flip(),

ggplot(data = tbwt) + 
  geom_bar(mapping = aes(x = ui, y = ..prop.., group=1)) +
  coord_flip(),

ggplot(data = tbwt) + 
  geom_bar(mapping = aes(x = ftv, y = ..prop.., group=1)) +
  coord_flip(),

nrow=2,

top="Barplot plots: Univariates")

Plot two variables/ bivariate plots

Scatter plot for two continuous variables/bivariate scatter plot

grid.arrange(
  
  ggplot(data=tbwt, mapping=aes(x=age, y=lwt), position = "jitter" ) +
  geom_point() +
  stat_ellipse(level = .99, color = "red") +
  geom_text(aes(label = ifelse((age>35.4 | lwt> 190.1), as.character(lwt), "")), hjust = 1.5),
  
ggplot(data=tbwt, mapping=aes(x=age, y=bwt), position = "jitter" ) +
  geom_point() +
  stat_ellipse(level = .99, color = "red") +
  geom_text(aes(label = ifelse((age>36 | bwt<1200), as.character(bwt), "")), hjust = 1.5),

ggplot(data=tbwt, mapping=aes(x=lwt, y=bwt), position = "jitter" ) +
  geom_point() +
  stat_ellipse(level = .99, color = "red") +
  geom_text(aes(label = ifelse((lwt>190 | bwt<1200 | bwt> 4900), as.character(bwt), "")), hjust = 1.5),

nrow=1,

top="Scatter plots: Bivariates")

Scatter plots for two continuous variables with smoothed conditional means (method=loess, lm and splines)

grid.arrange(

ggplot(data=tbwt, mapping=aes(x=age, y=bwt)) +
  geom_point() +
  geom_smooth(),

ggplot(data=tbwt, mapping=aes(x=age, y=bwt)) +
  geom_point() +
  geom_smooth(method = lm, se = TRUE),


ggplot(data=tbwt, mapping=aes(x=age, y=bwt)) +
  geom_point() +
  geom_smooth(method = lm, formula = y ~ splines::bs(x, 3), se = TRUE),

nrow=1,

top="Scatter plots with smoothed conditional means: method=loess, lm and splines ")

Scatter plots for 2 continuous variables in relation to another categorical variable

Two continuous variables are analyzed wrt the levels of another categorical variable.

Scatter plots for 2 continuous variables in relation to another categorical variable

grid.arrange(

ggplot(data = tbwt) + 
  geom_point(mapping = aes(x = age, y = bwt, color=race)),

ggplot(data=tbwt) +
  geom_point(mapping=aes(x=age, y=bwt, color=race)) +
  
  geom_hline(yintercept=mean(tbwt$bwt), color="dark grey") +
  annotate("text", label="Mean bwt", x=40, y=mean(tbwt$bwt)-100) +
  
  geom_vline(xintercept=mean(tbwt$age), color="dark grey") +
  annotate("text", label="Mean age", x=mean(tbwt$age)+3, y=5000) +
  
  scale_color_discrete(name="race") +
  
  scale_x_continuous(name="Age") +
  scale_y_continuous(name="bwt"),


nrow=1,

top="Scatter plots wrt a categorical variable")

scatter plots for 2 continuous variables in relation to another categorical variable (method= lm, loess and spline)

grid.arrange(

ggplot(data=tbwt, mapping=aes(x=age, y=bwt)) +
  geom_point(mapping=aes(color=race)) +
  geom_smooth(),

ggplot(data=tbwt, mapping=aes(x=age, y=bwt)) +
  geom_point(mapping=aes(color=race)) +
  geom_smooth(method = "lm"),

ggplot(data=tbwt, mapping=aes(x=age, y=bwt)) +
  geom_point(mapping=aes(color=race)) +
  geom_smooth(method="lm", formula = y ~ splines::bs(x, 3)),

nrow=1,

top="Scatter plots in relation to a categorical variable: method=loess, lm and splines")

Scatter plots for two continuous variables for all the levels of a categorical variable (method= loess, lm and splines)

grid.arrange(
  
ggplot(data=tbwt, mapping=aes(x=age, y=bwt, color=race)) +
  geom_point() +
  geom_smooth(),

ggplot(data = tbwt, mapping = aes(x = age, y = bwt, color=race)) + 
  geom_point() +
  geom_smooth(method="lm"),

ggplot(data=tbwt, mapping=aes(x=age, y=bwt, color=race)) +
  geom_point() +
  geom_smooth(method="lm", formula = y ~ splines::bs(x, 3)),

nrow=1,

top="Scatter plots for the levels of a categorical variable: method=loess, lm and splines")

Scatter plots for two continuous variables for a particular level of a categorical variable (method= loess, lm and splines)

grid.arrange(

ggplot(data=tbwt, mapping=aes(x=age, y=bwt)) +
  geom_point(mapping=aes(color = race)) +
  geom_smooth(data=filter(tbwt, race=="3")),

ggplot(data=tbwt, mapping=aes(x=age, y=bwt)) +
  geom_point(mapping=aes(color = race)) +
  geom_smooth(data=filter(tbwt, race=="3"), method = "lm"),

ggplot(data=tbwt, mapping=aes(x=age, y=bwt)) +
  geom_point(mapping=aes(color = race)) +
  geom_smooth(data=filter(tbwt, race=="3"), method = "lm", formula = y ~ splines::bs(x, 3)),

nrow=1,

top="Scatter plots for a particular level of a categorical variable: method=loess, lm and splines")

Subplots of scatter plot

Subplots for a categorical variable

ggplot(data = tbwt) + 
  geom_point(mapping = aes(x = age, y = bwt, color=race)) + 
  facet_wrap(~ race, ncol = 3)

Subplots with loess line

ggplot(tbwt, aes(age, bwt, color=race)) + 
  geom_smooth() + 
  geom_point() +
  theme(text = element_text(size = 14)) +
  facet_wrap(~race, ncol = 3)

Subplots with linear regression line

ggplot(tbwt, aes(age, bwt, color=race)) + 
  geom_smooth(method="lm") + 
  geom_point() +
  theme(text = element_text(size = 14)) +
  facet_wrap(~race, ncol = 3)

Subplots with spline

ggplot(tbwt, aes(age, bwt, color=race)) + 
  geom_smooth(method="lm", formula = y ~ splines::bs(x, 3)) + 
  geom_point() +
  theme(text = element_text(size = 14)) +
  facet_wrap(~race, ncol = 3)

Plot continuous outcome variable in relation to categorical variables (i.e., race, smoke, ptl, ht, ui, ftv)

Dot plots

ggplot(data=tbwt) +
  geom_point(mapping=aes(x=race, y=bwt))

Jitter plots

ggplot(data=tbwt) +
  geom_jitter(mapping=aes(x=race, y=bwt))

Boxplots

Boxplots for the outcome variable (i.e., bwt) in relation to one categorical variable (e.g., race)

grid.arrange(

ggplot(data=tbwt, mapping=aes(x=race, y=bwt)) +
  geom_boxplot(fill = "white", colour = "#3366FF", outlier.colour = "red"),

ggplot(data=tbwt, mapping=aes(x=smoke, y=bwt)) +
  geom_boxplot(fill = "white", colour = "#3366FF", outlier.colour = "red"),

ggplot(data=tbwt, mapping=aes(x=ptl, y=bwt)) +
  geom_boxplot(fill = "white", colour = "#3366FF", outlier.colour = "red"),

ggplot(data=tbwt, mapping=aes(x=ht, y=bwt)) +
  geom_boxplot(fill = "white", colour = "#3366FF", outlier.colour = "red"),

ggplot(data=tbwt, mapping=aes(x=ui, y=bwt)) +
  geom_boxplot(fill = "white", colour = "#3366FF", outlier.colour = "red"),

ggplot(data=tbwt, mapping=aes(x=ftv, y=bwt)) +
  geom_boxplot(fill = "white", colour = "#3366FF", outlier.colour = "red"),

nrow=2, 

top="Box plots for the levels of a categorical variable")

Boxplots for the outcome variable (i.e., bwt) in relation to two categorical variables (e.g., race and smoke)

ggplot(data=tbwt, mapping=aes(x=race, y=bwt)) +
  geom_boxplot(aes(colour = smoke)) +
  coord_flip()

ggplot(data=tbwt, mapping=aes(x=race, y=bwt)) +
  geom_boxplot(aes(colour = ptl)) +
  coord_flip()

ggplot(data=tbwt, mapping=aes(x=race, y=bwt)) +
  geom_boxplot(aes(colour = ht)) +
  coord_flip()

ggplot(data=tbwt, mapping=aes(x=race, y=bwt)) +
  geom_boxplot(aes(colour = ui)) +
  coord_flip()

ggplot(data=tbwt, mapping=aes(x=race, y=bwt)) +
  geom_boxplot(aes(colour = ftv)) +
  coord_flip()

Reordering boxplots by the median values of the outcome variable

ggplot(data=tbwt, mapping=aes(x=reorder(race, bwt, FUN = median), y=bwt) ) +
  geom_boxplot(fill = "white", colour = "#3366FF", outlier.colour = "red")

ggplot(data=tbwt, mapping=aes(x=reorder(ftv, bwt, FUN = median), y=bwt) ) +
  geom_boxplot(fill = "white", colour = "#3366FF", outlier.colour = "red")

Barplots

Use geom_col not geom_bar.

grid.arrange(

tbwt %>%
  group_by(race) %>%
  summarize(average_bwt=mean(bwt)) %>%
  ggplot(mapping=aes(x=race, y=average_bwt)) +
  geom_col(aes(fill=race)) +
  coord_flip(),

tbwt %>%
  group_by(smoke) %>%
  summarize(average_bwt=mean(bwt)) %>%
  ggplot(mapping=aes(x=smoke, y=average_bwt)) +
  geom_col(aes(fill=smoke)) +
  coord_flip(),

tbwt %>%
  group_by(ptl) %>%
  summarize(average_bwt=mean(bwt)) %>%
  ggplot(mapping=aes(x=ptl, y=average_bwt)) +
  geom_col(aes(fill=ptl)) +
  coord_flip(),

tbwt %>%
  group_by(ht) %>%
  summarize(average_bwt=mean(bwt)) %>%
  ggplot(mapping=aes(x=ht, y=average_bwt)) +
  geom_col(aes(fill=ht)) +
  coord_flip(),

tbwt %>%
  group_by(ui) %>%
  summarize(average_bwt=mean(bwt)) %>%
  ggplot(mapping=aes(x=ui, y=average_bwt)) +
  geom_col(aes(fill=ui)) +
  coord_flip(),

tbwt %>%
  group_by(ftv) %>%
  summarize(average_bwt=mean(bwt)) %>%
  ggplot(mapping=aes(x=ftv, y=average_bwt)) +
  geom_col(aes(fill=ftv)) +
  coord_flip(),

nrow=3,

top="Barplots for the continuous outcome variable in relation to various categorical variables")

Freqpoly plots with counts

ggplot(data = tbwt, mapping = aes(x = bwt)) + 
  geom_freqpoly(mapping = aes(colour = race), binwidth = 500)

Freqpoly plots with density

ggplot(data = tbwt, mapping = aes(x = bwt, y=..density..)) + 
  geom_freqpoly(mapping = aes(colour = race), binwidth = 500)

Multivariate plotting: Measure overall distance of cases from others using both Mahalanobis distance and a robust measures of distance

Create dataset with only quantitative variables

mcd = covMcd(tbwt[c(1, 2, 9)])

mcd
## Minimum Covariance Determinant (MCD) estimator approximation.
## Method: Fast MCD(alpha=0.5 ==> h=96); nsamp = 500; (n,k)mini = (300,5)
## Call:
## covMcd(x = tbwt[c(1, 2, 9)])
## Log(Det.):  19.65 
## 
## Robust Estimate of Location:
##     age      lwt      bwt  
##   22.82   119.32  2922.31  
## Robust Estimate of Covariance:
##         age      lwt       bwt
## age   33.00    20.64     184.5
## lwt   20.64   411.01    4577.6
## bwt  184.51  4577.64  688285.4

Mahalanobis vs. robust distance

plot(mcd, 
     which = "dd", 
     labels.id = as.character(tbwt$race))

QQ plot for robust distance

plot(mcd,
     which = "qqchi2",
     labels.id = as.character(tbwt$race))