R examples for Michael Marks

Set up

To illustrate some examples, we will first create a random dataset from a hypothetical prospective cohort study.

#load the tidyverse library to make life easy!
library(tidyverse)

## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr

## Conflicts with tidy packages ----------------------------------------------

## filter(): dplyr, stats
## lag():    dplyr, stats

#load the knitr and datatable libraries to make nice html tables
library(knitr)
library(DT)

#set a seed to allow reproductibility.
set.seed(20171031)

#create tibble of participants
df <- tibble(id = 1:1500,
             group = sample(c("Exposed", "Unexposed"), 1500, replace = TRUE),
             sex = sample(c("Male",
                            "Female"), 1500, replace=TRUE),
             age = round(runif(1500, min = 18, max = 60)),
             bmi = round(rnorm(1500, mean = 20, sd=4), digits = 1),
             marital_status = sample(c("Married", "Cohabiting",
                                       "Never married",
                                       "Widowed", "Separated", "Divorced"), 1500, replace = TRUE),
             educ_level = sample(c("No schooling",
                                   "Primary",
                                   "Secondary, never graduated",
                                   "Secondary, graduated",
                                   "Higher"), 1500, replace = TRUE),
             cough = sample(c("Yes", "No"), 1500, replace = TRUE, prob = c(0.85,0.15)),
             nightswt = sample(c("Yes", "No"), 1500, replace = TRUE, prob = c(0.85,0.15)),
             weightloss = sample(c("Yes", "No"), 1500, replace = TRUE, prob = c(0.85,0.15)),
             fever = sample(c("Yes", "No"), 1500, replace = TRUE, prob = c(0.85,0.15)),
             prev_tb = sample(c("Yes", "No"), 1500, replace = TRUE, prob = c(0.85,0.15)),
             hiv = sample(c("HIV-positive", "HIV-negative", "Unknown"), 1500, replace = TRUE, prob = c(0.18,0.80,0.02))
)

#randomly introduce some missing values to make more realistic (5% of observations are missing)
df <- map_df(df, function(x) {x[sample(c(TRUE, NA), prob = c(0.95, 0.05), size = length(x), replace = TRUE)]})

datatable(df)

Recoding data

Categorical data

The forcats library (part of the tidyverse, but needs to be loaded separately) is extremely useful here.

What if we want to change the levels of education level to No education and Any education?

First, what is the distribution of this variable before recoding?

library(forcats)

df %>%
  group_by(educ_level) %>%
  summarise(n= n()) %>%
  kable()

educ_level	n
Higher	257
No schooling	285
Primary	277
Secondary, graduated	302
Secondary, never graduated	289
NA	90

Now recode and see what has changed.

df <- df %>%
  mutate(educ_level2 = fct_recode(educ_level,
                                  "No education" = "No schooling",
                                  "Any education" = "Primary",
                                  "Any education" = "Secondary, never graduated",
                                  "Any education" = "Secondary, graduated",
                                  "Any education" = "Higher")
         )

df %>%
  group_by(educ_level2) %>%
  summarise(n= n()) %>%
  kable()

educ_level2	n
Any education	1125
No education	285
NA	90

Suppose we want to collapse marital status from 6 levels to 3 levels…

First, what does this variable look like

df$marital_status <- factor(df$marital_status)

df %>%
  group_by(marital_status) %>%
  summarise(n= n()) %>%
  kable()

marital_status	n
Cohabiting	224
Divorced	222
Married	243
Never married	244
Separated	250
Widowed	235
NA	82

Now fct_lump, and look at distribution.

df <- df %>%
  mutate(marital_status2 =  fct_lump(marital_status, n=3))

df %>%
  group_by(marital_status2) %>%
  summarise(n= n()) %>%
  kable()

marital_status2	n
Married	243
Never married	244
Separated	250
Other	681
NA	82

Continuous data

To recode continous variables as categorical, use the case_when function within the dplyr package (part of the tidyverse).

For example, suppose we wanted to create age groups…

First summarise the variable,

summary(df$age)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   18.00   28.00   39.00   39.13   50.00   60.00      77

Now recode

(Note that the new variable levels are on the right hand side of the ~ symbol)

df <- df %>%
  mutate(age_group = case_when(age < 25 ~ "<25 years",
                               age >=25 & age <45 ~ ">=25 to <45 years",
                               age >=45 ~ ">=45 years"))
df$age_group <- factor(df$age_group)

df %>%
  group_by(age_group) %>%
  summarise(n= n()) %>%
  kable()

age_group	n
<25 years	233
>=25 to <45 years	654
>=45 years	536
NA	77

Converting many variables at once

Suppose we want to convert all character variables to factors (e.g. for modelling). We can use the mutate_if command in dplyr.

df <- df %>%
  mutate_if(is.character, as.factor)

str(df)

## Classes 'tbl_df', 'tbl' and 'data.frame':    1500 obs. of  16 variables:
##  $ id             : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ group          : Factor w/ 2 levels "Exposed","Unexposed": 2 2 2 1 1 1 1 2 1 1 ...
##  $ sex            : Factor w/ 2 levels "Female","Male": 1 1 1 NA 2 2 1 2 1 2 ...
##  $ age            : num  31 36 24 NA 22 34 23 40 41 NA ...
##  $ bmi            : num  16.9 14.2 17.7 16.9 26.9 22 18.6 15.6 NA 31.4 ...
##  $ marital_status : Factor w/ 6 levels "Cohabiting","Divorced",..: 3 1 2 1 2 4 4 NA 3 5 ...
##  $ educ_level     : Factor w/ 5 levels "Higher","No schooling",..: 4 1 5 2 1 1 1 5 5 3 ...
##  $ cough          : Factor w/ 2 levels "No","Yes": 2 NA 2 2 2 2 NA 1 2 2 ...
##  $ nightswt       : Factor w/ 2 levels "No","Yes": 2 1 2 2 1 2 2 2 2 2 ...
##  $ weightloss     : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 1 ...
##  $ fever          : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
##  $ prev_tb        : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 1 2 2 2 ...
##  $ hiv            : Factor w/ 3 levels "HIV-negative",..: NA 1 1 1 1 1 1 1 NA 1 ...
##  $ educ_level2    : Factor w/ 2 levels "Any education",..: 1 1 1 2 1 1 1 1 1 1 ...
##  $ marital_status2: Factor w/ 4 levels "Married","Never married",..: 1 4 4 4 4 2 2 NA 1 3 ...
##  $ age_group      : Factor w/ 3 levels "<25 years",">=25 to <45 years",..: 2 2 1 NA 1 2 1 2 2 NA ...

Summarising variables

A common task is to sumnmarise variables by some group (e.g. by exposure group here).

To do one-at-a time, can use the group_by and summarise functions in the dplyr package, e.g.:

df %>% 
  group_by(group, hiv) %>%
  summarise(n=n()) %>%
  mutate(freq = n / sum(n)) %>%
  kable()

group	hiv	n	freq
Exposed	HIV-negative	566	0.7536618
Exposed	HIV-positive	125	0.1664447
Exposed	Unknown	17	0.0226365
Exposed	NA	43	0.0572570
Unexposed	HIV-negative	491	0.7383459
Unexposed	HIV-positive	122	0.1834586
Unexposed	Unknown	15	0.0225564
Unexposed	NA	37	0.0556391
NA	HIV-negative	72	0.8571429
NA	HIV-positive	10	0.1190476
NA	NA	2	0.0238095

However, this quickly becomes quite tedious if we are doing a lot of variables. Instead, we could use the tableby function in the arsenal package to summarise everything at once.

#load the arsenal package
library(arsenal)

## 
## Attaching package: 'arsenal'

## The following object is masked from 'package:dplyr':
## 
##     count

#create a vector of variables to be summarised

sum_vars <- tableby(group ~ sex + age + age_group + bmi + marital_status + educ_level + cough + nightswt + weightloss + fever + prev_tb + hiv, data = df)

summary(sum_vars)

	Exposed (N=751)	Unexposed (N=665)	Total (N=1416)	p value
sex				0.110
N-Miss	39	28	67
Female	339 (47.6%)	332 (52.1%)	671 (49.7%)
Male	373 (52.4%)	305 (47.9%)	678 (50.3%)
age				0.728
N-Miss	39	33	72
Mean (SD)	39.2 (12.5)	39 (12.4)	39.1 (12.5)
Q1, Q3	28, 50	28, 50	28, 50
Range	18 - 60	18 - 60	18 - 60
age_group				0.494
N-Miss	39	33	72
<25 years	123 (17.3%)	102 (16.1%)	225 (16.7%)
>=25 to <45 years	315 (44.2%)	300 (47.5%)	615 (45.8%)
>=45 years	274 (38.5%)	230 (36.4%)	504 (37.5%)
bmi				0.808
N-Miss	45	25	70
Mean (SD)	20.1 (4.1)	20.1 (4.03)	20.1 (4.06)
Q1, Q3	17.6, 23	17.2, 22.8	17.4, 22.9
Range	6.4 - 34.8	9.5 - 34	6.4 - 34.8
marital_status				0.410
N-Miss	41	36	77
Cohabiting	119 (16.8%)	99 (15.7%)	218 (16.3%)
Divorced	118 (16.6%)	90 (14.3%)	208 (15.5%)
Married	116 (16.3%)	110 (17.5%)	226 (16.9%)
Never married	118 (16.6%)	114 (18.1%)	232 (17.3%)
Separated	131 (18.5%)	101 (16.1%)	232 (17.3%)
Widowed	108 (15.2%)	115 (18.3%)	223 (16.7%)
educ_level				0.766
N-Miss	44	39	83
Higher	135 (19.1%)	108 (17.3%)	243 (18.2%)
No schooling	140 (19.8%)	121 (19.3%)	261 (19.6%)
Primary	140 (19.8%)	122 (19.5%)	262 (19.7%)
Secondary, graduated	153 (21.6%)	135 (21.6%)	288 (21.6%)
Secondary, never graduated	139 (19.7%)	140 (22.4%)	279 (20.9%)
cough				0.011
N-Miss	43	43	86
No	83 (11.7%)	104 (16.7%)	187 (14.1%)
Yes	625 (88.3%)	518 (83.3%)	1143 (85.9%)
nightswt				0.090
N-Miss	38	34	72
No	91 (12.8%)	102 (16.2%)	193 (14.4%)
Yes	622 (87.2%)	529 (83.8%)	1151 (85.6%)
weightloss				0.342
N-Miss	36	36	72
No	106 (14.8%)	81 (12.9%)	187 (13.9%)
Yes	609 (85.2%)	548 (87.1%)	1157 (86.1%)
fever				0.458
N-Miss	29	29	58
No	116 (16.1%)	92 (14.5%)	208 (15.3%)
Yes	606 (83.9%)	544 (85.5%)	1150 (84.7%)
prev_tb				0.681
N-Miss	36	34	70
No	118 (16.5%)	98 (15.5%)	216 (16%)
Yes	597 (83.5%)	533 (84.5%)	1130 (84%)
hiv				0.706
N-Miss	43	37	80
HIV-negative	566 (79.9%)	491 (78.2%)	1057 (79.1%)
HIV-positive	125 (17.7%)	122 (19.4%)	247 (18.5%)
Unknown	17 (2.4%)	15 (2.39%)	32 (2.4%)

Merging

Say we wanted to merge two dataframes together…

We can use the join functions from dplyr.

First, we create another data frame with outcome (died) data:

deaths <- tibble(id = 1:1500,
                 died = sample(c("Yes", "No"), 1500, replace = TRUE, prob = c(0.03, 0.97))
)

Now we merge the data frames together using id as the key.

merged <- left_join(df, deaths, by = "id")

datatable(merged)

Reshaping data from wide to long

It is often efficient to reshape the data frame from wide to long to facilitate visualisation.

Here we use the tidyr package (part of the tidyverse) to make a long data frame comprised of key-value pairs.

(This example also shows how to select columns, and filter rows using the dplyr package).

df2 <- df %>%
  #select some variables
  select(id, group, sex, age, hiv) %>%
  #remove participants with missing exposure status
  filter(!is.na(group)) %>%
  #gather from wide to long
  gather(variable, value, -id) %>%
  #sort by id
  arrange(id)

## Warning: attributes are not identical across measure variables;
## they will be dropped

datatable(df2)

Tidying up regression modelling

Writing code for regression modelling is often time-consuming and error-prone in R.

Here we output the results of several bivariate logistic regression models using the broom and purrr packages.

library(broom)

merged$died <- factor(merged$died)

output <- merged %>%
  select(-died, -id) %>%
  names() %>%
  paste('died~',.) %>%
  map_df(~tidy(glm(as.formula(.x), 
               data= merged, 
               family = "binomial"), 
               conf.int=TRUE, 
               exponentiate=TRUE)) %>%
  filter(term !="(Intercept)") %>%
  select(term, `Odds ratio` = estimate, everything())

kable(output, digits = 3, caption = "Bivariate associations with odds of death")

Bivariate associations with odds of death
term	Odds ratio	std.error	statistic	p.value	conf.low	conf.high
groupUnexposed	1.503	0.328	1.244	0.213	0.794	2.896
sexMale	1.037	0.325	0.111	0.911	0.546	1.974
age	0.985	0.013	-1.141	0.254	0.960	1.011
bmi	1.009	0.038	0.227	0.820	0.935	1.087
marital_statusDivorced	3.182	0.586	1.976	0.048	1.088	11.530
marital_statusMarried	1.358	0.652	0.469	0.639	0.383	5.372
marital_statusNever married	2.333	0.599	1.414	0.157	0.768	8.610
marital_statusSeparated	0.682	0.770	-0.498	0.619	0.133	3.127
marital_statusWidowed	1.479	0.653	0.599	0.549	0.417	5.854
educ_levelNo schooling	1.069	0.563	0.119	0.905	0.351	3.364
educ_levelPrimary	1.883	0.508	1.246	0.213	0.719	5.482
educ_levelSecondary, graduated	0.733	0.612	-0.507	0.612	0.209	2.464
educ_levelSecondary, never graduated	1.511	0.524	0.787	0.431	0.553	4.498
coughYes	0.635	0.404	-1.121	0.262	0.302	1.504
nightswtYes	0.672	0.402	-0.988	0.323	0.321	1.585
weightlossYes	1.839	0.607	1.004	0.315	0.654	7.686
feverYes	0.673	0.404	-0.982	0.326	0.319	1.590
prev_tbYes	0.914	0.423	-0.213	0.831	0.423	2.276
hivHIV-positive	1.188	0.404	0.426	0.670	0.502	2.504
hivUnknown	1.203	1.034	0.179	0.858	0.066	5.931
educ_level2No education	0.836	0.422	-0.423	0.672	0.336	1.803
marital_status2Never married	1.719	0.525	1.032	0.302	0.628	5.124
marital_status2Separated	0.502	0.713	-0.966	0.334	0.105	1.927
marital_status2Other	1.370	0.467	0.674	0.500	0.583	3.758
age_group>=25 to <45 years	1.004	0.446	0.009	0.992	0.438	2.590
age_group>=45 years	0.729	0.483	-0.655	0.512	0.289	1.982