library(tidyverse)
library(MASS)
library(grid)
library(gridExtra)

Obtain birthwt dataset from the MASS package for analysis

birthwt is a data.frame with 189 observations and 10 variables.

Variables are described below.

# Load the birthwt data frame that ships with MASS into the workspace.
data("birthwt", package = "MASS")

# Show the dimensions, column names and column types of the raw data.
str(object = birthwt)
## 'data.frame':    189 obs. of  10 variables:
##  $ low  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ age  : int  19 33 20 21 18 21 22 17 29 26 ...
##  $ lwt  : int  182 155 105 108 107 124 118 103 123 113 ...
##  $ race : int  2 3 1 1 1 3 1 3 1 1 ...
##  $ smoke: int  0 0 1 1 1 0 0 0 1 1 ...
##  $ ptl  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ ht   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ ui   : int  1 0 0 1 1 0 0 0 0 0 ...
##  $ ftv  : int  0 3 1 2 0 0 1 1 1 0 ...
##  $ bwt  : int  2523 2551 2557 2594 2600 2622 2637 2637 2663 2665 ...

Description of the variables

| #  | variable name | variable label                                        | coded levels        |
|----|---------------|-------------------------------------------------------|---------------------|
| 1  | low           | indicator of birth weight less than 2.5 kg            | 0, 1                |
| 2  | age           | mother’s age in years                                 | continuous variable |
| 3  | lwt           | mother’s weight in pounds at last menstrual period    | continuous variable |
| 4  | race          | mother’s race (1 = white, 2 = black, 3 = other)       | 1, 2, 3             |
| 5  | smoke         | smoking status during pregnancy                       | 0, 1                |
| 6  | ptl           | number of previous premature labours                  | 0, 1, 2, 3          |
| 7  | ht            | history of hypertension                               | 0, 1                |
| 8  | ui            | presence of uterine irritability                      | 0, 1                |
| 9  | ftv           | number of physician visits during the first trimester | 0, 1, 2, 3, 4, 6    |
| 10 | bwt           | birth weight in grams                                 | continuous variable |

Objective

Analyze the relationship between the outcome variable (i.e., low) and the other (categorical and continuous) variables.

Convert the dataframe to a tibble

# Convert the data frame to a tibble directly. as_tibble() keeps the original
# column types (all integer here) and drops the row names, so there is no need
# to round-trip through a CSV file, which would leave "birthwt.csv" behind in
# the working directory and turn every integer column into a double.
tbwt <- as_tibble(birthwt)

# Confirm the class and column types of the converted object.
str(tbwt)
## tibble [189 x 10] (S3: tbl_df/tbl/data.frame)
##  $ low  : int [1:189] 0 0 0 0 0 0 0 0 0 0 ...
##  $ age  : int [1:189] 19 33 20 21 18 21 22 17 29 26 ...
##  $ lwt  : int [1:189] 182 155 105 108 107 124 118 103 123 113 ...
##  $ race : int [1:189] 2 3 1 1 1 3 1 3 1 1 ...
##  $ smoke: int [1:189] 0 0 1 1 1 0 0 0 1 1 ...
##  $ ptl  : int [1:189] 0 0 0 0 0 0 0 0 0 0 ...
##  $ ht   : int [1:189] 0 0 0 0 0 0 0 0 0 0 ...
##  $ ui   : int [1:189] 1 0 0 1 1 0 0 0 0 0 ...
##  $ ftv  : int [1:189] 0 3 1 2 0 0 1 1 1 0 ...
##  $ bwt  : int [1:189] 2523 2551 2557 2594 2600 2622 2637 2637 2663 2665 ...

Convert some numeric variables to factors

# Recode every coded (categorical) column as a factor in a single pass;
# the remaining columns are left as they are.
tbwt <- mutate(
  tbwt,
  across(c(low, race, smoke, ptl, ht, ui, ftv), as_factor)
)


# Preview the first rows after the conversion.
head(tbwt)
low age lwt race smoke ptl ht ui ftv bwt
0 19 182 2 0 0 0 1 0 2523
0 33 155 3 0 0 0 0 3 2551
0 20 105 1 1 0 0 0 1 2557
0 21 108 1 1 0 0 1 2 2594
0 18 107 1 1 0 0 1 0 2600
0 21 124 3 0 0 0 0 0 2622

Drop the variable bwt (birth weight in grams), which won’t be used for analysis

# Drop the raw birth weight by name instead of by column position, so the
# intended column is removed even if the column order ever changes.
# dplyr:: is spelled out because MASS is attached after tidyverse and its
# own select() masks the dplyr verb.
tbwt <- tbwt %>% dplyr::select(-bwt)

# Preview the remaining nine columns.
head(tbwt)
low age lwt race smoke ptl ht ui ftv
0 19 182 2 0 0 0 1 0
0 33 155 3 0 0 0 0 3
0 20 105 1 1 0 0 0 1
0 21 108 1 1 0 0 1 2
0 18 107 1 1 0 0 1 0
0 21 124 3 0 0 0 0 0

Plot each categorical variable (e.g., low, race, smoke)

# Bar charts for low, race and smoke, one variable per row:
# counts on the left, proportions on the right.
#
# after_stat(prop) replaces the `..prop..` notation, which is deprecated
# since ggplot2 3.4.0. `group = 1` makes the proportions be computed over the
# whole data set; without it every bar would be its own group with prop = 1.
grid.arrange(
  ggplot(data = tbwt, mapping = aes(x = low, fill = low)) +
    geom_bar(),
  ggplot(data = tbwt, mapping = aes(x = low, y = after_stat(prop), group = 1)) +
    geom_bar(),
  ggplot(data = tbwt, mapping = aes(x = race, fill = race)) +
    geom_bar(),
  ggplot(data = tbwt, mapping = aes(x = race, y = after_stat(prop), group = 1)) +
    geom_bar(),
  ggplot(data = tbwt, mapping = aes(x = smoke, fill = smoke)) +
    geom_bar(),
  ggplot(data = tbwt, mapping = aes(x = smoke, y = after_stat(prop), group = 1)) +
    geom_bar(),
  nrow = 3,
  top = "Bar plots for categorical variables"
)

Plot categorical outcome variable (i.e., low) in relation to other categorical variables (e.g., race)

Bars are positioned using the "identity", "fill" and "dodge" methods

# Compare the three position adjustments for low filled by race.
grid.arrange(
  # "identity" draws every race's bar from zero at the same x position, so
  # opaque bars hide the shorter ones behind them; the transparency keeps the
  # overlap visible.
  ggplot(data = tbwt, mapping = aes(x = low, fill = race)) +
    geom_bar(position = "identity", alpha = 0.5),
  # "fill" stacks the bars and rescales each stack to height 1, so the y-axis
  # shows proportions within each level of low.
  ggplot(data = tbwt, mapping = aes(x = low, fill = race)) +
    geom_bar(position = "fill") +
    scale_y_continuous(name = "Prop"),
  # "dodge" places the bars side by side.
  ggplot(data = tbwt, mapping = aes(x = low, fill = race)) +
    geom_bar(position = "dodge"),
  nrow = 1,
  top = 'Bar plots with "identity", "fill" and "dodge" position '
)

Bars are positioned using the "identity", "fill" and "dodge" methods, but with the coordinates flipped

# Same three position adjustments, this time for low filled by ftv, drawn
# horizontally with the legend placed above each panel.
grid.arrange(
  # "identity" overlays the bars of all ftv levels; the transparency keeps the
  # shorter bars from being completely hidden behind the longer ones.
  ggplot(data = tbwt, mapping = aes(x = low, fill = ftv)) +
    geom_bar(position = "identity", alpha = 0.5) +
    coord_flip() +
    theme(legend.position = "top"),
  # "fill" rescales each stack to length 1, so the axis shows proportions.
  ggplot(data = tbwt, mapping = aes(x = low, fill = ftv)) +
    geom_bar(position = "fill") +
    coord_flip() +
    theme(legend.position = "top") +
    scale_y_continuous(name = "Prop"),
  # "dodge" places the bars side by side.
  ggplot(data = tbwt, mapping = aes(x = low, fill = ftv)) +
    geom_bar(position = "dodge") +
    coord_flip() +
    theme(legend.position = "top"),
  nrow = 1,
  top = "Bar plots with flipped coordinates"
)

Weighted measurement of the outcome variable (in relation to continuous variables)

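Keep an eye on the y-axis: in the weighted plots each bar’s height is the sum of the weighting variable (age or lwt) within that level of low, not the number of observations.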

# Each row pairs the plain count plot of low (left) with a weighted version
# (right); the weighted bars add up the weight variable within each level.
grid.arrange(
  # Row 1, left: number of observations per level of low.
  ggplot(tbwt, aes(x = low, fill = low)) +
    geom_bar(),
  # Row 1, right: bars weighted by the mother's age.
  ggplot(tbwt, aes(x = low, fill = low)) +
    geom_bar(mapping = aes(weight = age)) +
    scale_y_continuous(name = "age"),
  # Row 2, left: the same count plot again, as the reference for lwt.
  ggplot(tbwt, aes(x = low, fill = low)) +
    geom_bar(mapping = aes(fill = low)),
  # Row 2, right: bars weighted by the mother's weight.
  ggplot(tbwt, aes(x = low, fill = low)) +
    geom_bar(mapping = aes(weight = lwt)) +
    scale_y_continuous(name = "lwt"),
  nrow = 2,
  top = "Weighted vs non-weighted bar plots"
)