Set up

To illustrate some examples, we will first create a random dataset from a hypothetical prospective cohort study.

#load the tidyverse library to make life easy!
library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag():    dplyr, stats
#load the knitr and datatable libraries to make nice html tables
library(knitr)
library(DT)

#set a seed to allow reproductibility.
set.seed(20171031)

#create tibble of participants
df <- tibble(id = 1:1500,
             group = sample(c("Exposed", "Unexposed"), 1500, replace = TRUE),
             sex = sample(c("Male",
                            "Female"), 1500, replace=TRUE),
             age = round(runif(1500, min = 18, max = 60)),
             bmi = round(rnorm(1500, mean = 20, sd=4), digits = 1),
             marital_status = sample(c("Married", "Cohabiting",
                                       "Never married",
                                       "Widowed", "Separated", "Divorced"), 1500, replace = TRUE),
             educ_level = sample(c("No schooling",
                                   "Primary",
                                   "Secondary, never graduated",
                                   "Secondary, graduated",
                                   "Higher"), 1500, replace = TRUE),
             cough = sample(c("Yes", "No"), 1500, replace = TRUE, prob = c(0.85,0.15)),
             nightswt = sample(c("Yes", "No"), 1500, replace = TRUE, prob = c(0.85,0.15)),
             weightloss = sample(c("Yes", "No"), 1500, replace = TRUE, prob = c(0.85,0.15)),
             fever = sample(c("Yes", "No"), 1500, replace = TRUE, prob = c(0.85,0.15)),
             prev_tb = sample(c("Yes", "No"), 1500, replace = TRUE, prob = c(0.85,0.15)),
             hiv = sample(c("HIV-positive", "HIV-negative", "Unknown"), 1500, replace = TRUE, prob = c(0.18,0.80,0.02))
)

#randomly introduce some missing values to make more realistic (5% of observations are missing)
df <- map_df(df, function(x) {x[sample(c(TRUE, NA), prob = c(0.95, 0.05), size = length(x), replace = TRUE)]})

datatable(df)


Recoding data

Categorical data

The forcats library (part of the tidyverse, but needs to be loaded separately) is extremely useful here.

What if we want to change the levels of education level to No education and Any education?

First, what is the distribution of this variable before recoding?

library(forcats)

df %>%
  group_by(educ_level) %>%
  summarise(n= n()) %>%
  kable()
educ_level n
Higher 257
No schooling 285
Primary 277
Secondary, graduated 302
Secondary, never graduated 289
NA 90

Now recode and see what has changed.

df <- df %>%
  mutate(educ_level2 = fct_recode(educ_level,
                                  "No education" = "No schooling",
                                  "Any education" = "Primary",
                                  "Any education" = "Secondary, never graduated",
                                  "Any education" = "Secondary, graduated",
                                  "Any education" = "Higher")
         )

df %>%
  group_by(educ_level2) %>%
  summarise(n= n()) %>%
  kable()
educ_level2 n
Any education 1125
No education 285
NA 90


Suppose we want to collapse marital status from 6 levels to 3 levels…

First, what does this variable look like

df$marital_status <- factor(df$marital_status)

df %>%
  group_by(marital_status) %>%
  summarise(n= n()) %>%
  kable()
marital_status n
Cohabiting 224
Divorced 222
Married 243
Never married 244
Separated 250
Widowed 235
NA 82

Now fct_lump, and look at distribution.

df <- df %>%
  mutate(marital_status2 =  fct_lump(marital_status, n=3))

df %>%
  group_by(marital_status2) %>%
  summarise(n= n()) %>%
  kable()
marital_status2 n
Married 243
Never married 244
Separated 250
Other 681
NA 82


Continuous data

To recode continous variables as categorical, use the case_when function within the dplyr package (part of the tidyverse).

For example, suppose we wanted to create age groups…

First summarise the variable,

summary(df$age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   18.00   28.00   39.00   39.13   50.00   60.00      77

Now recode

(Note that the new variable levels are on the right hand side of the ~ symbol)

df <- df %>%
  mutate(age_group = case_when(age < 25 ~ "<25 years",
                               age >=25 & age <45 ~ ">=25 to <45 years",
                               age >=45 ~ ">=45 years"))
df$age_group <- factor(df$age_group)

df %>%
  group_by(age_group) %>%
  summarise(n= n()) %>%
  kable()
age_group n
<25 years 233
>=25 to <45 years 654
>=45 years 536
NA 77


Converting many variables at once

Suppose we want to convert all character variables to factors (e.g. for modelling). We can use the mutate_if command in dplyr.

df <- df %>%
  mutate_if(is.character, as.factor)

str(df)
## Classes 'tbl_df', 'tbl' and 'data.frame':    1500 obs. of  16 variables:
##  $ id             : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ group          : Factor w/ 2 levels "Exposed","Unexposed": 2 2 2 1 1 1 1 2 1 1 ...
##  $ sex            : Factor w/ 2 levels "Female","Male": 1 1 1 NA 2 2 1 2 1 2 ...
##  $ age            : num  31 36 24 NA 22 34 23 40 41 NA ...
##  $ bmi            : num  16.9 14.2 17.7 16.9 26.9 22 18.6 15.6 NA 31.4 ...
##  $ marital_status : Factor w/ 6 levels "Cohabiting","Divorced",..: 3 1 2 1 2 4 4 NA 3 5 ...
##  $ educ_level     : Factor w/ 5 levels "Higher","No schooling",..: 4 1 5 2 1 1 1 5 5 3 ...
##  $ cough          : Factor w/ 2 levels "No","Yes": 2 NA 2 2 2 2 NA 1 2 2 ...
##  $ nightswt       : Factor w/ 2 levels "No","Yes": 2 1 2 2 1 2 2 2 2 2 ...
##  $ weightloss     : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 1 ...
##  $ fever          : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
##  $ prev_tb        : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 1 2 2 2 ...
##  $ hiv            : Factor w/ 3 levels "HIV-negative",..: NA 1 1 1 1 1 1 1 NA 1 ...
##  $ educ_level2    : Factor w/ 2 levels "Any education",..: 1 1 1 2 1 1 1 1 1 1 ...
##  $ marital_status2: Factor w/ 4 levels "Married","Never married",..: 1 4 4 4 4 2 2 NA 1 3 ...
##  $ age_group      : Factor w/ 3 levels "<25 years",">=25 to <45 years",..: 2 2 1 NA 1 2 1 2 2 NA ...


Summarising variables

A common task is to sumnmarise variables by some group (e.g. by exposure group here).

To do one-at-a time, can use the group_by and summarise functions in the dplyr package, e.g.:

df %>% 
  group_by(group, hiv) %>%
  summarise(n=n()) %>%
  mutate(freq = n / sum(n)) %>%
  kable()
group hiv n freq
Exposed HIV-negative 566 0.7536618
Exposed HIV-positive 125 0.1664447
Exposed Unknown 17 0.0226365
Exposed NA 43 0.0572570
Unexposed HIV-negative 491 0.7383459
Unexposed HIV-positive 122 0.1834586
Unexposed Unknown 15 0.0225564
Unexposed NA 37 0.0556391
NA HIV-negative 72 0.8571429
NA HIV-positive 10 0.1190476
NA NA 2 0.0238095

However, this quickly becomes quite tedious if we are doing a lot of variables. Instead, we could use the tableby function in the arsenal package to summarise everything at once.

#load the arsenal package
library(arsenal)
## 
## Attaching package: 'arsenal'
## The following object is masked from 'package:dplyr':
## 
##     count
#create a vector of variables to be summarised

sum_vars <- tableby(group ~ sex + age + age_group + bmi + marital_status + educ_level + cough + nightswt + weightloss + fever + prev_tb + hiv, data = df)

summary(sum_vars)
Exposed (N=751) Unexposed (N=665) Total (N=1416) p value
sex 0.110
    N-Miss 39 28 67
    Female 339 (47.6%) 332 (52.1%) 671 (49.7%)
    Male 373 (52.4%) 305 (47.9%) 678 (50.3%)
age 0.728
    N-Miss 39 33 72
    Mean (SD) 39.2 (12.5) 39 (12.4) 39.1 (12.5)
    Q1, Q3 28, 50 28, 50 28, 50
    Range 18 - 60 18 - 60 18 - 60
age_group 0.494
    N-Miss 39 33 72
    <25 years 123 (17.3%) 102 (16.1%) 225 (16.7%)
    >=25 to <45 years 315 (44.2%) 300 (47.5%) 615 (45.8%)
    >=45 years 274 (38.5%) 230 (36.4%) 504 (37.5%)
bmi 0.808
    N-Miss 45 25 70
    Mean (SD) 20.1 (4.1) 20.1 (4.03) 20.1 (4.06)
    Q1, Q3 17.6, 23 17.2, 22.8 17.4, 22.9
    Range 6.4 - 34.8 9.5 - 34 6.4 - 34.8
marital_status 0.410
    N-Miss 41 36 77
    Cohabiting 119 (16.8%) 99 (15.7%) 218 (16.3%)
    Divorced 118 (16.6%) 90 (14.3%) 208 (15.5%)
    Married 116 (16.3%) 110 (17.5%) 226 (16.9%)
    Never married 118 (16.6%) 114 (18.1%) 232 (17.3%)
    Separated 131 (18.5%) 101 (16.1%) 232 (17.3%)
    Widowed 108 (15.2%) 115 (18.3%) 223 (16.7%)
educ_level 0.766
    N-Miss 44 39 83
    Higher 135 (19.1%) 108 (17.3%) 243 (18.2%)
    No schooling 140 (19.8%) 121 (19.3%) 261 (19.6%)
    Primary 140 (19.8%) 122 (19.5%) 262 (19.7%)
    Secondary, graduated 153 (21.6%) 135 (21.6%) 288 (21.6%)
    Secondary, never graduated 139 (19.7%) 140 (22.4%) 279 (20.9%)
cough 0.011
    N-Miss 43 43 86
    No 83 (11.7%) 104 (16.7%) 187 (14.1%)
    Yes 625 (88.3%) 518 (83.3%) 1143 (85.9%)
nightswt 0.090
    N-Miss 38 34 72
    No 91 (12.8%) 102 (16.2%) 193 (14.4%)
    Yes 622 (87.2%) 529 (83.8%) 1151 (85.6%)
weightloss 0.342
    N-Miss 36 36 72
    No 106 (14.8%) 81 (12.9%) 187 (13.9%)
    Yes 609 (85.2%) 548 (87.1%) 1157 (86.1%)
fever 0.458
    N-Miss 29 29 58
    No 116 (16.1%) 92 (14.5%) 208 (15.3%)
    Yes 606 (83.9%) 544 (85.5%) 1150 (84.7%)
prev_tb 0.681
    N-Miss 36 34 70
    No 118 (16.5%) 98 (15.5%) 216 (16%)
    Yes 597 (83.5%) 533 (84.5%) 1130 (84%)
hiv 0.706
    N-Miss 43 37 80
    HIV-negative 566 (79.9%) 491 (78.2%) 1057 (79.1%)
    HIV-positive 125 (17.7%) 122 (19.4%) 247 (18.5%)
    Unknown 17 (2.4%) 15 (2.39%) 32 (2.4%)


Merging

Say we wanted to merge two dataframes together…

We can use the join functions from dplyr.

First, we create another data frame with outcome (died) data:

deaths <- tibble(id = 1:1500,
                 died = sample(c("Yes", "No"), 1500, replace = TRUE, prob = c(0.03, 0.97))
)


Now we merge the data frames together using id as the key.

merged <- left_join(df, deaths, by = "id")

datatable(merged)


Reshaping data from wide to long

It is often efficient to reshape the data frame from wide to long to facilitate visualisation.

Here we use the tidyr package (part of the tidyverse) to make a long data frame comprised of key-value pairs.

(This example also shows how to select columns, and filter rows using the dplyr package).

df2 <- df %>%
  #select some variables
  select(id, group, sex, age, hiv) %>%
  #remove participants with missing exposure status
  filter(!is.na(group)) %>%
  #gather from wide to long
  gather(variable, value, -id) %>%
  #sort by id
  arrange(id)
## Warning: attributes are not identical across measure variables;
## they will be dropped
datatable(df2)


Tidying up regression modelling

Writing code for regression modelling is often time-consuming and error-prone in R.

Here we output the results of several bivariate logistic regression models using the broom and purrr packages.

library(broom)

merged$died <- factor(merged$died)

output <- merged %>%
  select(-died, -id) %>%
  names() %>%
  paste('died~',.) %>%
  map_df(~tidy(glm(as.formula(.x), 
               data= merged, 
               family = "binomial"), 
               conf.int=TRUE, 
               exponentiate=TRUE)) %>%
  filter(term !="(Intercept)") %>%
  select(term, `Odds ratio` = estimate, everything())

kable(output, digits = 3, caption = "Bivariate associations with odds of death")
Bivariate associations with odds of death
term Odds ratio std.error statistic p.value conf.low conf.high
groupUnexposed 1.503 0.328 1.244 0.213 0.794 2.896
sexMale 1.037 0.325 0.111 0.911 0.546 1.974
age 0.985 0.013 -1.141 0.254 0.960 1.011
bmi 1.009 0.038 0.227 0.820 0.935 1.087
marital_statusDivorced 3.182 0.586 1.976 0.048 1.088 11.530
marital_statusMarried 1.358 0.652 0.469 0.639 0.383 5.372
marital_statusNever married 2.333 0.599 1.414 0.157 0.768 8.610
marital_statusSeparated 0.682 0.770 -0.498 0.619 0.133 3.127
marital_statusWidowed 1.479 0.653 0.599 0.549 0.417 5.854
educ_levelNo schooling 1.069 0.563 0.119 0.905 0.351 3.364
educ_levelPrimary 1.883 0.508 1.246 0.213 0.719 5.482
educ_levelSecondary, graduated 0.733 0.612 -0.507 0.612 0.209 2.464
educ_levelSecondary, never graduated 1.511 0.524 0.787 0.431 0.553 4.498
coughYes 0.635 0.404 -1.121 0.262 0.302 1.504
nightswtYes 0.672 0.402 -0.988 0.323 0.321 1.585
weightlossYes 1.839 0.607 1.004 0.315 0.654 7.686
feverYes 0.673 0.404 -0.982 0.326 0.319 1.590
prev_tbYes 0.914 0.423 -0.213 0.831 0.423 2.276
hivHIV-positive 1.188 0.404 0.426 0.670 0.502 2.504
hivUnknown 1.203 1.034 0.179 0.858 0.066 5.931
educ_level2No education 0.836 0.422 -0.423 0.672 0.336 1.803
marital_status2Never married 1.719 0.525 1.032 0.302 0.628 5.124
marital_status2Separated 0.502 0.713 -0.966 0.334 0.105 1.927
marital_status2Other 1.370 0.467 0.674 0.500 0.583 3.758
age_group>=25 to <45 years 1.004 0.446 0.009 0.992 0.438 2.590
age_group>=45 years 0.729 0.483 -0.655 0.512 0.289 1.982