The aim of this document is to give a short introduction to the R software. During this tutorial I will briefly cover how to:
The topics will be covered using the collection of packages in tidyverse. The real data from a cohort of marathon runners will serve as working example.
Descriptive abstract
Hyponatremia has emerged as an important cause of race-related death and life-threatening illness among marathon runners. We studied a cohort of marathon runners to estimate the incidence of hyponatremia and to identify the principal risk factors.
The following packages are used in the tutorial. If you want to reproduce the code chunks below, be sure to install them (e.g. install.packages("tidyverse")).
library(tidyverse)
library(Epi)
library(knitr)
library(gridExtra)
The marathon data are available in a csv format at http://alecri.github.io/downloads/data/marathon.csv .
marathon <- read_csv("http://alecri.github.io/downloads/data/marathon.csv")
marathon
## # A tibble: 488 x 18
## id na nas135 female age urinat3p prewt postwt wtdiff height
## <dbl> <dbl> <chr> <chr> <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 1 138 na > … female 24.2 >=3 NA NA NA 1.73
## 2 2 142 na > … male 44.3 <3 NA NA NA NA
## 3 3 151 na > … male 42.0 <3 NA NA NA NA
## 4 4 139 na > … male 28.2 >=3 NA NA NA 1.73
## 5 5 145 na > … female 30.2 <3 NA 50.7 NA NA
## 6 6 140 na > … female 28.3 <3 NA 55.7 NA 1.61
## 7 7 142 na > … male 34.4 <3 NA 59.3 NA NA
## 8 8 140 na > … male 26.2 <3 NA 59.8 NA NA
## 9 9 141 na > … male 50.4 <3 NA 64.8 NA NA
## 10 10 138 na > … male 49.4 <3 NA 68.2 NA NA
## # … with 478 more rows, and 8 more variables: bmi <dbl>, runtime <dbl>,
## # trainpace <dbl>, prevmara <dbl>, fluidint <chr>, waterload <chr>,
## # nsaid <chr>, wtdiffc <chr>
Many other options exist, such as:
read.table, read.csv, read.delim)readxl::read_excelhaven::read_savhaven::read_sashaven::read_dtaA comprehensive tutorial can be found here.
# what is the dimension of the data
dim(marathon)
## [1] 488 18
# i.e. how many obs/rows
nrow(marathon)
## [1] 488
# how many cols/variables?
ncol(marathon)
## [1] 18
# what's the name of the variable
colnames(marathon)
## [1] "id" "na" "nas135" "female" "age"
## [6] "urinat3p" "prewt" "postwt" "wtdiff" "height"
## [11] "bmi" "runtime" "trainpace" "prevmara" "fluidint"
## [16] "waterload" "nsaid" "wtdiffc"
# what is the structure of the data
glimpse(marathon)
## Observations: 488
## Variables: 18
## $ id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,…
## $ na <dbl> 138, 142, 151, 139, 145, 140, 142, 140, 141, 138, 141,…
## $ nas135 <chr> "na > 135", "na > 135", "na > 135", "na > 135", "na > …
## $ female <chr> "female", "male", "male", "male", "female", "female", …
## $ age <dbl> 24.20534, 44.28200, 41.96304, 28.19713, 30.18207, 28.2…
## $ urinat3p <chr> ">=3", "<3", "<3", ">=3", "<3", "<3", "<3", "<3", "<3"…
## $ prewt <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ postwt <dbl> NA, NA, NA, NA, 50.68182, 55.68182, 59.31818, 59.77273…
## $ wtdiff <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ height <dbl> 1.72720, NA, NA, 1.72720, NA, 1.60655, NA, NA, NA, NA,…
## $ bmi <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ runtime <dbl> NA, 161, 156, 346, 185, 233, 183, 162, 182, 190, 169, …
## $ trainpace <dbl> 480, 430, 360, 630, NA, NA, 435, 450, 435, 440, 410, 4…
## $ prevmara <dbl> 3, 40, 40, 1, 3, 25, 19, 2, 80, 10, 16, 3, 2, 8, 4, 3,…
## $ fluidint <chr> "Every mile", "Every mile", "Every other mile", "Every…
## $ waterload <chr> "Yes", "Yes", NA, "Yes", "Yes", "Yes", "Yes", "No", "Y…
## $ nsaid <chr> "Yes", "Yes", NA, "No", "Yes", "No", "Yes", "No", "Yes…
## $ wtdiffc <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
# show the first rows
head(marathon)
## # A tibble: 6 x 18
## id na nas135 female age urinat3p prewt postwt wtdiff height bmi
## <dbl> <dbl> <chr> <chr> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 138 na > … female 24.2 >=3 NA NA NA 1.73 NA
## 2 2 142 na > … male 44.3 <3 NA NA NA NA NA
## 3 3 151 na > … male 42.0 <3 NA NA NA NA NA
## 4 4 139 na > … male 28.2 >=3 NA NA NA 1.73 NA
## 5 5 145 na > … female 30.2 <3 NA 50.7 NA NA NA
## 6 6 140 na > … female 28.3 <3 NA 55.7 NA 1.61 NA
## # … with 7 more variables: runtime <dbl>, trainpace <dbl>, prevmara <dbl>,
## # fluidint <chr>, waterload <chr>, nsaid <chr>, wtdiffc <chr>
# quick and dirty summary
summary(marathon)
## id na nas135 female
## Min. : 1.0 Min. :114.0 Length:488 Length:488
## 1st Qu.:122.8 1st Qu.:138.0 Class :character Class :character
## Median :244.5 Median :141.0 Mode :character Mode :character
## Mean :244.5 Mean :140.4
## 3rd Qu.:366.2 3rd Qu.:143.0
## Max. :488.0 Max. :158.0
##
## age urinat3p prewt postwt
## Min. :19.44 Length:488 Min. : 42.05 Min. : 42.73
## 1st Qu.:31.40 Class :character 1st Qu.: 60.00 1st Qu.: 60.17
## Median :38.80 Mode :character Median : 68.64 Median : 67.95
## Mean :38.85 Mean : 69.26 Mean : 68.62
## 3rd Qu.:45.71 3rd Qu.: 75.91 3rd Qu.: 75.23
## Max. :73.08 Max. :101.82 Max. :100.00
## NA's :2 NA's :19 NA's :20
## wtdiff height bmi runtime
## Min. :-7.0454 Min. :1.511 Min. :16.77 Min. :142.0
## 1st Qu.:-1.7756 1st Qu.:1.676 1st Qu.:21.16 1st Qu.:195.0
## Median :-0.6818 Median :1.727 Median :22.54 Median :220.0
## Mean :-0.6895 Mean :1.734 Mean :22.94 Mean :225.5
## 3rd Qu.: 0.4546 3rd Qu.:1.800 3rd Qu.:24.25 3rd Qu.:248.0
## Max. : 4.0909 Max. :2.108 Max. :32.21 Max. :400.0
## NA's :33 NA's :18 NA's :23 NA's :11
## trainpace prevmara fluidint waterload
## Min. :330.0 Min. : 0.000 Length:488 Length:488
## 1st Qu.:450.0 1st Qu.: 2.000 Class :character Class :character
## Median :480.0 Median : 5.000 Mode :character Mode :character
## Mean :488.8 Mean : 8.839
## 3rd Qu.:525.0 3rd Qu.: 10.000
## Max. :780.0 Max. :120.000
## NA's :59 NA's :3
## nsaid wtdiffc
## Length:488 Length:488
## Class :character Class :character
## Mode :character Mode :character
##
##
##
##
Data manipulations consist of a multiplicity and combinations of steps. The dplyr package in tidyverse aids this task with a collections of verbs.
I will show how to:
mutate verb.# run time in hour
marathon <- mutate(marathon, runtime_h = runtime/60)
# alternative code
# marathon$runtime_h <- marathon$runtime/60
# modify a character variable into a factor variable
str(marathon$nas135)
## chr [1:488] "na > 135" "na > 135" "na > 135" "na > 135" "na > 135" ...
marathon <- mutate(marathon, nas135 = factor(nas135, levels = c("na > 135", "na <= 135")))
str(marathon$nas135)
## Factor w/ 2 levels "na > 135","na <= 135": 1 1 1 1 1 1 1 1 1 1 ...
# categorize a continuous variable
summary(marathon$age)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 19.44 31.40 38.80 38.85 45.71 73.08 2
marathon <- mutate(marathon, age_cat = cut(age, breaks = c(19, 31, 38, 45, 75)))
table(marathon$age_cat)
##
## (19,31] (31,38] (38,45] (45,75]
## 115 114 117 140
select verb.# some variables + variables containing wt (shortage for weight)
marathon_sub <- select(marathon, id, na, female, bmi, runtime_h, contains("wt"))
head(marathon_sub)
## # A tibble: 6 x 9
## id na female bmi runtime_h prewt postwt wtdiff wtdiffc
## <dbl> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 1 138 female NA NA NA NA NA <NA>
## 2 2 142 male NA 2.68 NA NA NA <NA>
## 3 3 151 male NA 2.6 NA NA NA <NA>
## 4 4 139 male NA 5.77 NA NA NA <NA>
## 5 5 145 female NA 3.08 NA 50.7 NA <NA>
## 6 6 140 female NA 3.88 NA 55.7 NA <NA>
filter verb.female_27 <- filter(marathon_sub, female == "female", bmi > 27)
female_27
## # A tibble: 3 x 9
## id na female bmi runtime_h prewt postwt wtdiff wtdiffc
## <dbl> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 254 138 female 27.8 5.68 71.3 69.5 -1.73 -2.0 to -1.1
## 2 355 158 female 29.3 5.5 80 79.3 -0.682 -1.0 to -0.1
## 3 470 135 female 31.7 6.67 73.6 73.6 0 0.0 to 0.9
arrange verb.arrange(female_27, desc(na), bmi)
## # A tibble: 3 x 9
## id na female bmi runtime_h prewt postwt wtdiff wtdiffc
## <dbl> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 355 158 female 29.3 5.5 80 79.3 -0.682 -1.0 to -0.1
## 2 254 138 female 27.8 5.68 71.3 69.5 -1.73 -2.0 to -1.1
## 3 470 135 female 31.7 6.67 73.6 73.6 0 0.0 to 0.9
Chaining: wrap different functions inside each other
arrange(
filter(
select(
mutate(marathon,
runtime_h = runtime/60),
id, na, female, bmi, runtime_h, contains("wt")),
female == "female", bmi > 27), desc(na), bmi)
## # A tibble: 3 x 9
## id na female bmi runtime_h prewt postwt wtdiff wtdiffc
## <dbl> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 355 158 female 29.3 5.5 80 79.3 -0.682 -1.0 to -0.1
## 2 254 138 female 27.8 5.68 71.3 69.5 -1.73 -2.0 to -1.1
## 3 470 135 female 31.7 6.67 73.6 73.6 0 0.0 to 0.9
This operator allows you to pipe the output from one function to the input of another function.
marathon %>%
mutate(runtime_h = runtime/60) %>%
select(id, na, female, bmi, runtime_h, contains("wt")) %>%
filter(female == "female", bmi > 27) %>%
arrange(desc(na), bmi)
## # A tibble: 3 x 9
## id na female bmi runtime_h prewt postwt wtdiff wtdiffc
## <dbl> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 355 158 female 29.3 5.5 80 79.3 -0.682 -1.0 to -0.1
## 2 254 138 female 27.8 5.68 71.3 69.5 -1.73 -2.0 to -1.1
## 3 470 135 female 31.7 6.67 73.6 73.6 0 0.0 to 0.9
ggplot(marathon, aes(wtdiff, na, col = bmi)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
facet_wrap(~ female) +
labs(x = "Weight change, kg", y = "Sodium concentration, mmol per liter") +
theme_bw()
# change theme for next graphs
theme_set(theme_bw())
# for a continuous variable
summary(marathon$na)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 114.0 138.0 141.0 140.4 143.0 158.0
c(mean = mean(marathon$na), sd = sd(marathon$na))
## mean sd
## 140.405738 4.752102
quantile(marathon$na)
## 0% 25% 50% 75% 100%
## 114 138 141 143 158
summarise(marathon, mean = mean(na), sd = sd(na), median = median(na), iqr = IQR(na))
## # A tibble: 1 x 4
## mean sd median iqr
## <dbl> <dbl> <dbl> <dbl>
## 1 140. 4.75 141 5
# more concise
summarise_each(marathon, funs(mean, sd, median, IQR), na)
## # A tibble: 1 x 4
## mean sd median IQR
## <dbl> <dbl> <dbl> <dbl>
## 1 140. 4.75 141 5
# histogram (counts)
ggplot(marathon, aes(na)) +
geom_histogram() +
labs(x = "Sodium concentration, mmol per liter")
# histogram (density)
ggplot(marathon, aes(na)) +
geom_histogram(stat = "density", n = 2^5) +
geom_line(stat = "density") +
labs(x = "Sodium concentration, mmol per liter")
# boxplot
ggplot(marathon, aes(x = "", y = na)) +
geom_boxplot() +
labs(x = "", y = "Sodium concentration, mmol per liter")
# for a categorical variable
table(marathon$nas135)
##
## na > 135 na <= 135
## 426 62
prop.table(table(marathon$nas135))
##
## na > 135 na <= 135
## 0.8729508 0.1270492
# barplot (counts)
ggplot(marathon, aes(nas135)) +
geom_bar()
# barplot (proportion)
ggplot(marathon, aes(nas135)) +
geom_bar(aes(y = ..count../sum(..count..))) +
labs(x = "", y = "Prevalence of hyponatremia")
# Continuous + categorical
marathon %>%
group_by(female) %>%
summarise_each(funs(mean, sd, median, IQR), na)
## # A tibble: 2 x 5
## female mean sd median IQR
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 female 139. 5.55 139 6.75
## 2 male 141. 4.15 141 5
ggplot(marathon, aes(female, na)) +
geom_boxplot() +
labs(x = "", y = "Sodium concentration, mmol per liter")
ggplot(marathon, aes(female, na, fill = female)) +
geom_violin() +
guides(fill = "none") +
labs(x = "", y = "Sodium concentration, mmol per liter")
ggplot(marathon, aes(na, col = female)) +
geom_line(stat = "density") +
labs(x = "Sodium concentration, mmol per liter", col = "Sex")
# statistical tests
t.test(na ~ female, data = marathon, var.equal = FALSE)
##
## Welch Two Sample t-test
##
## data: na by female
## t = -3.8169, df = 262.95, p-value = 0.0001686
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -2.8281299 -0.9032178
## sample estimates:
## mean in group female mean in group male
## 139.1747 141.0404
wilcox.test(na ~ female, data = marathon)
##
## Wilcoxon rank sum test with continuity correction
##
## data: na by female
## W = 20450, p-value = 1.996e-05
## alternative hypothesis: true location shift is not equal to 0
# 2-by-2 table
tab1 <- with(marathon, table(nas135, female))
tab1
## female
## nas135 female male
## na > 135 129 297
## na <= 135 37 25
prop.table(tab1, margin = 2)
## female
## nas135 female male
## na > 135 0.77710843 0.92236025
## na <= 135 0.22289157 0.07763975
ggplot(marathon, aes(nas135, fill = female)) +
geom_bar(position = "dodge")
ggplot(marathon, aes(nas135, group = female, fill = female)) +
geom_bar(aes(y = ..prop..), position = "dodge")
# statistical tests
chisq.test(tab1)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: tab1
## X-squared = 19.547, df = 1, p-value = 9.813e-06
# 2 continuous
cor_nawtd <- with(marathon, cor(na, wtdiff, method = "pearson", use = "complete.obs"))
cor_nawtd
## [1] -0.4106029
p1 <- ggplot(marathon, aes(wtdiff, na)) +
geom_point() +
geom_smooth(method = "lm") +
labs(x = "Weight change, kg", y = "Sodium concentration, mmol per liter")
library(ggExtra)
ggMarginal(p1, type = "histogram")
library(table1)
table1(~ age + female + wtdiff + bmi + runtime | nas135, data = marathon)
| na > 135 (n=426) |
na <= 135 (n=62) |
Overall (n=488) |
|
|---|---|---|---|
| age | |||
| Mean (SD) | 39.0 (9.40) | 38.1 (9.47) | 38.8 (9.41) |
| Median [Min, Max] | 39.1 [19.4, 73.1] | 38.1 [23.2, 60.2] | 38.8 [19.4, 73.1] |
| Missing | 2 (0.5%) | 0 (0%) | 2 (0.4%) |
| female | |||
| female | 129 (30.3%) | 37 (59.7%) | 166 (34.0%) |
| male | 297 (69.7%) | 25 (40.3%) | 322 (66.0%) |
| wtdiff | |||
| Mean (SD) | -0.892 (1.54) | 0.725 (1.44) | -0.689 (1.62) |
| Median [Min, Max] | -0.909 [-7.05, 4.09] | 0.682 [-2.50, 3.41] | -0.682 [-7.05, 4.09] |
| Missing | 28 (6.6%) | 5 (8.1%) | 33 (6.8%) |
| bmi | |||
| Mean (SD) | 23.0 (2.52) | 22.8 (3.71) | 22.9 (2.70) |
| Median [Min, Max] | 22.6 [17.0, 32.2] | 22.3 [16.8, 31.7] | 22.5 [16.8, 32.2] |
| Missing | 21 (4.9%) | 2 (3.2%) | 23 (4.7%) |
| runtime | |||
| Mean (SD) | 222 (39.4) | 252 (47.1) | 226 (41.6) |
| Median [Min, Max] | 216 [142, 360] | 242 [159, 400] | 220 [142, 400] |
| Missing | 9 (2.1%) | 2 (3.2%) | 11 (2.3%) |
\[E[Y|X] = \beta_0 + \beta_1X_1 + \dots + \beta_kX_k\]
# univariate linear regression
fit1 <- lm(na ~ female, data = marathon)
summary(fit1)
##
## Call:
## lm(formula = na ~ female, data = marathon)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25.1747 -2.1747 -0.0404 2.9596 18.8253
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 139.1747 0.3628 383.657 < 2e-16 ***
## femalemale 1.8657 0.4466 4.178 3.49e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.674 on 486 degrees of freedom
## Multiple R-squared: 0.03467, Adjusted R-squared: 0.03268
## F-statistic: 17.45 on 1 and 486 DF, p-value: 3.492e-05
ci.lin(fit1) %>% kable()
| Estimate | StdErr | z | P | 2.5% | 97.5% | |
|---|---|---|---|---|---|---|
| (Intercept) | 139.174699 | 0.3627577 | 383.657463 | 0.00e+00 | 138.4637068 | 139.885691 |
| femalemale | 1.865674 | 0.4465793 | 4.177699 | 2.94e-05 | 0.9903945 | 2.740953 |
# multivariable linear regression
fit2 <- lm(na ~ wtdiff + female, data = marathon)
ci.lin(fit2) %>% kable()
| Estimate | StdErr | z | P | 2.5% | 97.5% | |
|---|---|---|---|---|---|---|
| (Intercept) | 139.0629967 | 0.3483380 | 399.218548 | 0.0000000 | 138.3802667 | 139.7457266 |
| wtdiff | -1.1572580 | 0.1310845 | -8.828337 | 0.0000000 | -1.4141789 | -0.9003371 |
| femalemale | 0.8158008 | 0.4452803 | 1.832106 | 0.0669356 | -0.0569325 | 1.6885341 |
marathon <- mutate(marathon,
pred_fit2 = predict(fit2, newdata = marathon))
# multivariable linear regression with interaction
fit3 <- lm(na ~ wtdiff*female, data = marathon)
ci.lin(fit3, ctr.mat = rbind(c(0, 1, 0, 0),
c(0, 1, 0, 1)))
## Estimate StdErr z P 2.5% 97.5%
## [1,] -1.366193 0.2484292 -5.499325 3.812468e-08 -1.853105 -0.8792807
## [2,] -1.076637 0.1543198 -6.976661 3.022779e-12 -1.379098 -0.7741756
marathon <- mutate(marathon,
pred_fit3 = predict(fit3, newdata = marathon))
# graphical comparison
grid.arrange(
ggplot(marathon, aes(wtdiff, pred_fit2, col = female)) +
geom_line() +
labs(x = "Weight change, kg", y = "Predicted mean na"),
ggplot(marathon, aes(wtdiff, pred_fit3, col = female)) +
geom_line() +
labs(x = "Weight change, kg", y = "Predicted mean na"),
ncol = 2
)
\[\log(\textrm{odds}(Y|X)) = \beta_0 + \beta_1X_1 + \dots + \beta_kX_k\]
# univariate logistic regression
fit4 <- glm(nas135 ~ wtdiff, data = marathon, family = binomial)
ci.exp(fit4)
## exp(Est.) 2.5% 97.5%
## (Intercept) 0.1518239 0.1112437 0.2072074
## wtdiff 2.0717812 1.6689660 2.5718185
marathon <- marathon %>%
mutate(pred_p = predict(fit4, newdata = marathon, type = "response"),
pred_odds = predict(fit4, newdata = marathon, type = "link"))
grid.arrange(
ggplot(marathon, aes(wtdiff, pred_p)) +
geom_line() +
labs(x = "Weight change, kg", y = "Predicted probability"),
ggplot(marathon, aes(wtdiff, pred_odds)) +
geom_line() +
labs(x = "Weight change, kg", y = "Predicted odds"),
ncol = 2
)
# multivariable logistic regression
fit5 <- glm(nas135 ~ wtdiff + female, data = marathon, family = binomial)
ci.exp(fit5)
## exp(Est.) 2.5% 97.5%
## (Intercept) 0.2389491 0.1572127 0.3631810
## wtdiff 2.0189387 1.6152028 2.5235924
## femalemale 0.4191424 0.2278755 0.7709486
mutate(marathon, pred = predict(fit5, newdata = marathon, type = "response")) %>%
ggplot(aes(wtdiff, pred, col = female)) +
geom_line() +
labs(x = "Weight change, kg", y = "Predicted probability")