Agenda

We will learn about the following topics in this section.

Installing and loading tidyverse

Firstly, we will learn how to load a real-life dataset into R. These days, using a group of packages called “tidyverse” (see its GitHub page and blog) is the standard. Install tidyverse if you have never used it.

install.packages("tidyverse", dependencies = TRUE)

A package (and its related packages) is loaded as follows.

library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag():    dplyr, stats

Loading and checking a dataset

Load a dataset from the current working directory. You need to explicitly assign the result to a name.

## CSV file
framingham <- read_csv("./framingham.csv")
## Parsed with column specification:
## cols(
##   .default = col_integer(),
##   SYSBP = col_double(),
##   DIABP = col_double(),
##   BMI = col_double(),
##   TIMEAP = col_double(),
##   TIMEMI = col_double(),
##   TIMEMIFC = col_double(),
##   TIMECHD = col_double(),
##   TIMESTRK = col_double(),
##   TIMECVD = col_double(),
##   TIMEDTH = col_double(),
##   TIMEHYP = col_double()
## )
## See spec(...) for full column specifications.
## Excel xlsx file
framinghamxl <- readxl::read_xlsx("./framingham.xlsx")

Evaluating the dataset name will give you a snapshot.

framingham
## # A tibble: 4,434 x 37
##      SEX TOTCHOL   AGE SYSBP DIABP CURSMOKE CIGPDAY   BMI DIABETES BPMEDS
##    <int>   <int> <int> <dbl> <dbl>    <int>   <int> <dbl>    <int>  <int>
##  1     1     195    39 106.0    70        0       0 26.97        0      0
##  2     0     250    46 121.0    81        0       0 28.73        0      0
##  3     1     245    48 127.5    80        1      20 25.34        0      0
##  4     0     225    61 150.0    95        1      30 28.58        0      0
##  5     0     285    46 130.0    84        1      23 23.10        0      0
##  6     0     228    43 180.0   110        0       0 30.30        0      0
##  7     0     205    63 138.0    71        0       0 33.11        0      0
##  8     0     313    45 100.0    71        1      20 21.68        0      0
##  9     1     260    52 141.5    89        0       0 26.36        0      0
## 10     1     225    43 162.0   107        1      30 23.61        0      0
## # ... with 4,424 more rows, and 27 more variables: HEARTRTE <int>,
## #   GLUCOSE <int>, PREVCHD <int>, PREVAP <int>, PREVMI <int>,
## #   PREVSTRK <int>, PREVHYP <int>, DEATH <int>, ANGINA <int>,
## #   HOSPMI <int>, MI_FCHD <int>, ANYCHD <int>, STROKE <int>, CVD <int>,
## #   HYPERTEN <int>, TIMEAP <dbl>, TIMEMI <dbl>, TIMEMIFC <dbl>,
## #   TIMECHD <dbl>, TIMESTRK <dbl>, TIMECVD <dbl>, TIMEDTH <dbl>,
## #   TIMEHYP <dbl>, bmicat <int>, agecat <int>, highbp <int>, packs <int>

If you want to see all the columns, print the data frame with an unlimited width.

print(framingham, width = Inf)
## # A tibble: 4,434 x 37
##      SEX TOTCHOL   AGE SYSBP DIABP CURSMOKE CIGPDAY   BMI DIABETES BPMEDS HEARTRTE GLUCOSE PREVCHD PREVAP PREVMI PREVSTRK PREVHYP DEATH ANGINA HOSPMI MI_FCHD ANYCHD STROKE   CVD HYPERTEN    TIMEAP    TIMEMI  TIMEMIFC   TIMECHD TIMESTRK  TIMECVD   TIMEDTH   TIMEHYP bmicat agecat highbp packs
##    <int>   <int> <int> <dbl> <dbl>    <int>   <int> <dbl>    <int>  <int>    <int>   <int>   <int>  <int>  <int>    <int>   <int> <int>  <int>  <int>   <int>  <int>  <int> <int>    <int>     <dbl>     <dbl>     <dbl>     <dbl>    <dbl>    <dbl>     <dbl>     <dbl>  <int>  <int>  <int> <int>
##  1     1     195    39 106.0    70        0       0 26.97        0      0       80      77       0      0      0        0       0     0      0      1       1      1      0     1        0 24.000000 17.626283 17.626283 17.626283 24.00000 17.62628 24.000000 24.000000      3      1      0     0
##  2     0     250    46 121.0    81        0       0 28.73        0      0       95      76       0      0      0        0       0     0      0      0       0      0      0     0        0 24.000000 24.000000 24.000000 24.000000 24.00000 24.00000 24.000000 24.000000      3      2      0     0
##  3     1     245    48 127.5    80        1      20 25.34        0      0       75      70       0      0      0        0       0     0      0      0       0      0      0     0        0 24.000000 24.000000 24.000000 24.000000 24.00000 24.00000 24.000000 24.000000      3      2      0     1
##  4     0     225    61 150.0    95        1      30 28.58        0      0       65     103       0      0      0        0       1     1      0      0       0      0      1     1        1  8.093087  8.093087  8.093087  8.093087  5.71937  5.71937  8.093087  0.000000      3      4      1     2
##  5     0     285    46 130.0    84        1      23 23.10        0      0       85      85       0      0      0        0       0     0      0      0       0      0      0     0        1 24.000000 24.000000 24.000000 24.000000 24.00000 24.00000 24.000000 11.731691      2      2      0     2
##  6     0     228    43 180.0   110        0       0 30.30        0      0       77      99       0      0      0        0       1     0      0      0       1      1      0     1        1 24.000000 24.000000 15.657769 15.657769 24.00000 15.65777 24.000000  0.000000      4      2      1     0
##  7     0     205    63 138.0    71        0       0 33.11        0      0       60      85       0      0      0        0       0     0      1      0       0      1      0     0        1  1.021218 24.000000 24.000000  1.021218 24.00000 24.00000 24.000000  6.056126      4      4      0     0
##  8     0     313    45 100.0    71        1      20 21.68        0      0       79      78       0      0      0        0       0     0      0      0       0      0      0     0        1 24.000000 24.000000 24.000000 24.000000 24.00000 24.00000 24.000000 23.761807      2      2      0     1
##  9     1     260    52 141.5    89        0       0 26.36        0      0       76      79       0      0      0        0       1     0      0      0       0      0      0     0        1 24.000000 24.000000 24.000000 24.000000 24.00000 24.00000 24.000000  0.000000      3      3      1     0
## 10     1     225    43 162.0   107        1      30 23.61        0      0       93      88       0      0      0        0       1     0      0      0       0      0      0     0        1 24.000000 24.000000 24.000000 24.000000 24.00000 24.00000 24.000000  0.000000      2      2      1     2
## # ... with 4,424 more rows

A “tidy” dataset (data frame) should have rows as observations and columns as variables, like this dataset. glimpse() may be more useful for providing an overview of all variables.

glimpse(framingham)
## Observations: 4,434
## Variables: 37
## $ SEX      <int> 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0,...
## $ TOTCHOL  <int> 195, 250, 245, 225, 285, 228, 205, 313, 260, 225, 254...
## $ AGE      <int> 39, 46, 48, 61, 46, 43, 63, 45, 52, 43, 50, 43, 46, 4...
## $ SYSBP    <dbl> 106.0, 121.0, 127.5, 150.0, 130.0, 180.0, 138.0, 100....
## $ DIABP    <dbl> 70.0, 81.0, 80.0, 95.0, 84.0, 110.0, 71.0, 71.0, 89.0...
## $ CURSMOKE <int> 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1,...
## $ CIGPDAY  <int> 0, 0, 20, 30, 23, 0, 0, 20, 0, 30, 0, 0, 15, 0, 9, 20...
## $ BMI      <dbl> 26.97, 28.73, 25.34, 28.58, 23.10, 30.30, 33.11, 21.6...
## $ DIABETES <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ BPMEDS   <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...
## $ HEARTRTE <int> 80, 95, 75, 65, 85, 77, 60, 79, 76, 93, 75, 72, 98, 6...
## $ GLUCOSE  <int> 77, 76, 70, 103, 85, 99, 85, 78, 79, 88, 76, 61, 64, ...
## $ PREVCHD  <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ PREVAP   <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ PREVMI   <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ PREVSTRK <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ PREVHYP  <int> 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0,...
## $ DEATH    <int> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1,...
## $ ANGINA   <int> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ HOSPMI   <int> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ MI_FCHD  <int> 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ ANYCHD   <int> 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ STROKE   <int> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ CVD      <int> 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ HYPERTEN <int> 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0,...
## $ TIMEAP   <dbl> 24.0000000, 24.0000000, 24.0000000, 8.0930869, 24.000...
## $ TIMEMI   <dbl> 17.6262834, 24.0000000, 24.0000000, 8.0930869, 24.000...
## $ TIMEMIFC <dbl> 17.6262834, 24.0000000, 24.0000000, 8.0930869, 24.000...
## $ TIMECHD  <dbl> 17.6262834, 24.0000000, 24.0000000, 8.0930869, 24.000...
## $ TIMESTRK <dbl> 24.0000000, 24.0000000, 24.0000000, 5.7193703, 24.000...
## $ TIMECVD  <dbl> 17.6262834, 24.0000000, 24.0000000, 5.7193703, 24.000...
## $ TIMEDTH  <dbl> 24.0000000, 24.0000000, 24.0000000, 8.0930869, 24.000...
## $ TIMEHYP  <dbl> 24.000000, 24.000000, 24.000000, 0.000000, 11.731691,...
## $ bmicat   <int> 3, 3, 3, 3, 2, 4, 4, 2, 3, 2, 2, 3, 3, 4, 2, 2, 2, 2,...
## $ agecat   <int> 1, 2, 2, 4, 2, 2, 4, 2, 3, 2, 2, 2, 2, 2, 1, 1, 2, 2,...
## $ highbp   <int> 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0,...
## $ packs    <int> 0, 0, 1, 2, 2, 0, 0, 1, 0, 2, 0, 0, 1, 0, 1, 1, 1, 1,...

Accessing elements in a data frame

Various ways exist to access elements in a data frame. To access a single variable, the variable name or a position can be used. Indexing starts at 1 like in math, unlike in many programming languages.

## Single variable access. This forms a vector.
framingham$SEX
## This is the same.
framingham[["SEX"]]
## This is the same.
framingham[[1]]

## Using dimension. This forms a smaller data frame in tidyverse.
framingham[1:2, c("SEX","AGE")]

Basic data structures in R

We saw a vector coming out of a data frame in the previous part. An “atomic vector” is the smallest data object in R that holds values of the same type. A “list” is a heterogeneous data object that can contain different types of objects including vectors or lists. This tweet may give a better idea.

A data frame is in fact a heterogeneous list of vectors of different types (each holding a single type of data) that are all of the same length (sample size n).

Atomic vector types

An atomic vector can be one of several types, including the following.

  • logical c(TRUE, FALSE)
  • numeric
    • integer c(-1L, 0L, 1L, 2L)
    • double c(-1.0, 0.0, 1.0, 2.0)
  • character c("this", "is", "a", "character", "vector")

Missing value handling

Missing values are very common in real-life data analysis. R has a built-in way to handle them: “NA”. NA is a value that is not known. It acts like a placeholder, but gives NA for most operations.

NA + 1
## [1] NA
NA * 1
## [1] NA
NA + NA
## [1] NA
NA == 1
## [1] NA
## Cannot detect NA by equality!
NA == NA
## [1] NA
## Use is.na()
is.na(NA)
## [1] TRUE
## %in% (membership test) never returns NA: it is built on match(), so an NA that matches nothing gives FALSE.
NA %in% c(1)
## [1] FALSE

Factors

Categorical variables are handled as “factors” in R. Factors are really integers with a finite number of levels that are labeled. Factors can be handy when plotting data or conducting regression analyses. In our dataset, bmicat has the following coding.

  • 1 = Underweight (BMI < 18.5)
  • 2 = Normal Weight (18.5 ≤ BMI < 25)
  • 3 = Overweight (25 ≤ BMI < 30)
  • 4 = Obese (BMI ≥ 30)
## BMI category is coded 1,2,3,4
table(framingham$bmicat)
## 
##    1    2    3    4 
##   57 1936 1845  577
## factor() can be used to associate levels to labels.
framingham$bmicat2 <- factor(framingham$bmicat,
                             levels = c(1,2,3,4),
                             labels = c("underweight","normal","overweight","obese"))
table(framingham$bmicat2)
## 
## underweight      normal  overweight       obese 
##          57        1936        1845         577
## Change reference level to normal.
framingham$bmicat3 <- forcats::fct_relevel(framingham$bmicat2, "normal")
table(framingham$bmicat3)
## 
##      normal underweight  overweight       obese 
##        1936          57        1845         577
## Recode by collapsing normal and underweight together with a new name.
framingham$bmicat4 <- forcats::fct_recode(framingham$bmicat2,
                                          "normal or underweight" = "normal",
                                          "normal or underweight" = "underweight")
table(framingham$bmicat4)
## 
## normal or underweight            overweight                 obese 
##                  1993                  1845                   577

Subsetting a data frame: select()ing columns and filter()ing rows

Columns can be select()ed. Note the change in dimension.

## Select a few variables and drop others
select(framingham, SEX, AGE, TOTCHOL)
## # A tibble: 4,434 x 3
##      SEX   AGE TOTCHOL
##    <int> <int>   <int>
##  1     1    39     195
##  2     0    46     250
##  3     1    48     245
##  4     0    61     225
##  5     0    46     285
##  6     0    43     228
##  7     0    63     205
##  8     0    45     313
##  9     1    52     260
## 10     1    43     225
## # ... with 4,424 more rows
## Move some of the variables to the left.
select(framingham, SYSBP, DIABP, everything())
## # A tibble: 4,434 x 40
##    SYSBP DIABP   SEX TOTCHOL   AGE CURSMOKE CIGPDAY   BMI DIABETES BPMEDS
##    <dbl> <dbl> <int>   <int> <int>    <int>   <int> <dbl>    <int>  <int>
##  1 106.0    70     1     195    39        0       0 26.97        0      0
##  2 121.0    81     0     250    46        0       0 28.73        0      0
##  3 127.5    80     1     245    48        1      20 25.34        0      0
##  4 150.0    95     0     225    61        1      30 28.58        0      0
##  5 130.0    84     0     285    46        1      23 23.10        0      0
##  6 180.0   110     0     228    43        0       0 30.30        0      0
##  7 138.0    71     0     205    63        0       0 33.11        0      0
##  8 100.0    71     0     313    45        1      20 21.68        0      0
##  9 141.5    89     1     260    52        0       0 26.36        0      0
## 10 162.0   107     1     225    43        1      30 23.61        0      0
## # ... with 4,424 more rows, and 30 more variables: HEARTRTE <int>,
## #   GLUCOSE <int>, PREVCHD <int>, PREVAP <int>, PREVMI <int>,
## #   PREVSTRK <int>, PREVHYP <int>, DEATH <int>, ANGINA <int>,
## #   HOSPMI <int>, MI_FCHD <int>, ANYCHD <int>, STROKE <int>, CVD <int>,
## #   HYPERTEN <int>, TIMEAP <dbl>, TIMEMI <dbl>, TIMEMIFC <dbl>,
## #   TIMECHD <dbl>, TIMESTRK <dbl>, TIMECVD <dbl>, TIMEDTH <dbl>,
## #   TIMEHYP <dbl>, bmicat <int>, agecat <int>, highbp <int>, packs <int>,
## #   bmicat2 <fctr>, bmicat3 <fctr>, bmicat4 <fctr>

Rows can be filter()ed based on conditions on variables. Note the change in dimension. R does not modify your data unless you explicitly assign it back.

## Data frame name is followed by conditions that return TRUE/FALSE
filter(framingham, AGE >= 65)
## # A tibble: 203 x 40
##      SEX TOTCHOL   AGE SYSBP DIABP CURSMOKE CIGPDAY   BMI DIABETES BPMEDS
##    <int>   <int> <int> <dbl> <dbl>    <int>   <int> <dbl>    <int>  <int>
##  1     0     252    65 179.5 114.0        0       0 30.47        0      0
##  2     0     254    67 157.0  89.0        0       0 24.25        0      0
##  3     0     311    66 154.0  80.0        0       0 28.55        0      0
##  4     0     193    65 123.0  76.5        0       0 29.33        0      0
##  5     0     278    66 187.0  88.0        0       0 40.52        0      0
##  6     0     264    67 139.0  80.0        0       0 25.75        0      0
##  7     1     288    66 109.0  71.0        0       0 29.29        0      0
##  8     0     214    66 212.0 104.0        0       0 25.32        0      0
##  9     0     259    67 151.0 101.0        0       0 21.67        0      0
## 10     1     257    67 125.0  67.5        0       0 25.95        0      0
## # ... with 193 more rows, and 30 more variables: HEARTRTE <int>,
## #   GLUCOSE <int>, PREVCHD <int>, PREVAP <int>, PREVMI <int>,
## #   PREVSTRK <int>, PREVHYP <int>, DEATH <int>, ANGINA <int>,
## #   HOSPMI <int>, MI_FCHD <int>, ANYCHD <int>, STROKE <int>, CVD <int>,
## #   HYPERTEN <int>, TIMEAP <dbl>, TIMEMI <dbl>, TIMEMIFC <dbl>,
## #   TIMECHD <dbl>, TIMESTRK <dbl>, TIMECVD <dbl>, TIMEDTH <dbl>,
## #   TIMEHYP <dbl>, bmicat <int>, agecat <int>, highbp <int>, packs <int>,
## #   bmicat2 <fctr>, bmicat3 <fctr>, bmicat4 <fctr>
## Multiple conditions are intersected (AND)
filter(framingham, AGE >= 65, AGE < 70, SEX == 1)
## # A tibble: 92 x 40
##      SEX TOTCHOL   AGE SYSBP DIABP CURSMOKE CIGPDAY   BMI DIABETES BPMEDS
##    <int>   <int> <int> <dbl> <dbl>    <int>   <int> <dbl>    <int>  <int>
##  1     1     288    66 109.0  71.0        0       0 29.29        0      0
##  2     1     257    67 125.0  67.5        0       0 25.95        0      0
##  3     1     207    65 139.0  88.0        0       0 24.04        0      0
##  4     1     219    65 148.0  90.0        1      15 29.35        0      0
##  5     1     164    68 142.0  85.0        0       0 30.28        1      0
##  6     1     203    67 122.0  74.0        0       0 15.54        0      0
##  7     1     250    68 109.0  73.0        1      10 24.68        0      0
##  8     1     276    66 159.0  82.0        0       0 31.42        0      0
##  9     1      NA    65 152.5  97.5        0       0 28.35        0      0
## 10     1     214    67 127.5  80.0        0       0 22.11        0      0
## # ... with 82 more rows, and 30 more variables: HEARTRTE <int>,
## #   GLUCOSE <int>, PREVCHD <int>, PREVAP <int>, PREVMI <int>,
## #   PREVSTRK <int>, PREVHYP <int>, DEATH <int>, ANGINA <int>,
## #   HOSPMI <int>, MI_FCHD <int>, ANYCHD <int>, STROKE <int>, CVD <int>,
## #   HYPERTEN <int>, TIMEAP <dbl>, TIMEMI <dbl>, TIMEMIFC <dbl>,
## #   TIMECHD <dbl>, TIMESTRK <dbl>, TIMECVD <dbl>, TIMEDTH <dbl>,
## #   TIMEHYP <dbl>, bmicat <int>, agecat <int>, highbp <int>, packs <int>,
## #   bmicat2 <fctr>, bmicat3 <fctr>, bmicat4 <fctr>
## To actually keep the new dataset, you need to assign it.
framingham_sub <- filter(framingham, AGE >= 65, AGE < 70, SEX == 1)
framingham_sub
## # A tibble: 92 x 40
##      SEX TOTCHOL   AGE SYSBP DIABP CURSMOKE CIGPDAY   BMI DIABETES BPMEDS
##    <int>   <int> <int> <dbl> <dbl>    <int>   <int> <dbl>    <int>  <int>
##  1     1     288    66 109.0  71.0        0       0 29.29        0      0
##  2     1     257    67 125.0  67.5        0       0 25.95        0      0
##  3     1     207    65 139.0  88.0        0       0 24.04        0      0
##  4     1     219    65 148.0  90.0        1      15 29.35        0      0
##  5     1     164    68 142.0  85.0        0       0 30.28        1      0
##  6     1     203    67 122.0  74.0        0       0 15.54        0      0
##  7     1     250    68 109.0  73.0        1      10 24.68        0      0
##  8     1     276    66 159.0  82.0        0       0 31.42        0      0
##  9     1      NA    65 152.5  97.5        0       0 28.35        0      0
## 10     1     214    67 127.5  80.0        0       0 22.11        0      0
## # ... with 82 more rows, and 30 more variables: HEARTRTE <int>,
## #   GLUCOSE <int>, PREVCHD <int>, PREVAP <int>, PREVMI <int>,
## #   PREVSTRK <int>, PREVHYP <int>, DEATH <int>, ANGINA <int>,
## #   HOSPMI <int>, MI_FCHD <int>, ANYCHD <int>, STROKE <int>, CVD <int>,
## #   HYPERTEN <int>, TIMEAP <dbl>, TIMEMI <dbl>, TIMEMIFC <dbl>,
## #   TIMECHD <dbl>, TIMESTRK <dbl>, TIMECVD <dbl>, TIMEDTH <dbl>,
## #   TIMEHYP <dbl>, bmicat <int>, agecat <int>, highbp <int>, packs <int>,
## #   bmicat2 <fctr>, bmicat3 <fctr>, bmicat4 <fctr>

Manipulating variables: rename() and mutate()

Use rename() to change variable names. If you don’t assign the result back explicitly, the original data frame does not change.

## rename() can be used to rename specific variables.
## Again assignment is necessary to keep it.
rename(framingham,
       total_cholesterol = TOTCHOL,
       cigarretes_per_day = CIGPDAY)
## # A tibble: 4,434 x 40
##      SEX total_cholesterol   AGE SYSBP DIABP CURSMOKE cigarretes_per_day
##    <int>             <int> <int> <dbl> <dbl>    <int>              <int>
##  1     1               195    39 106.0    70        0                  0
##  2     0               250    46 121.0    81        0                  0
##  3     1               245    48 127.5    80        1                 20
##  4     0               225    61 150.0    95        1                 30
##  5     0               285    46 130.0    84        1                 23
##  6     0               228    43 180.0   110        0                  0
##  7     0               205    63 138.0    71        0                  0
##  8     0               313    45 100.0    71        1                 20
##  9     1               260    52 141.5    89        0                  0
## 10     1               225    43 162.0   107        1                 30
## # ... with 4,424 more rows, and 33 more variables: BMI <dbl>,
## #   DIABETES <int>, BPMEDS <int>, HEARTRTE <int>, GLUCOSE <int>,
## #   PREVCHD <int>, PREVAP <int>, PREVMI <int>, PREVSTRK <int>,
## #   PREVHYP <int>, DEATH <int>, ANGINA <int>, HOSPMI <int>, MI_FCHD <int>,
## #   ANYCHD <int>, STROKE <int>, CVD <int>, HYPERTEN <int>, TIMEAP <dbl>,
## #   TIMEMI <dbl>, TIMEMIFC <dbl>, TIMECHD <dbl>, TIMESTRK <dbl>,
## #   TIMECVD <dbl>, TIMEDTH <dbl>, TIMEHYP <dbl>, bmicat <int>,
## #   agecat <int>, highbp <int>, packs <int>, bmicat2 <fctr>,
## #   bmicat3 <fctr>, bmicat4 <fctr>
## Give a variable a less confusing name and assign back to the same name.
framingham <- rename(framingham,
                     MALE = SEX)
framingham
## # A tibble: 4,434 x 40
##     MALE TOTCHOL   AGE SYSBP DIABP CURSMOKE CIGPDAY   BMI DIABETES BPMEDS
##    <int>   <int> <int> <dbl> <dbl>    <int>   <int> <dbl>    <int>  <int>
##  1     1     195    39 106.0    70        0       0 26.97        0      0
##  2     0     250    46 121.0    81        0       0 28.73        0      0
##  3     1     245    48 127.5    80        1      20 25.34        0      0
##  4     0     225    61 150.0    95        1      30 28.58        0      0
##  5     0     285    46 130.0    84        1      23 23.10        0      0
##  6     0     228    43 180.0   110        0       0 30.30        0      0
##  7     0     205    63 138.0    71        0       0 33.11        0      0
##  8     0     313    45 100.0    71        1      20 21.68        0      0
##  9     1     260    52 141.5    89        0       0 26.36        0      0
## 10     1     225    43 162.0   107        1      30 23.61        0      0
## # ... with 4,424 more rows, and 30 more variables: HEARTRTE <int>,
## #   GLUCOSE <int>, PREVCHD <int>, PREVAP <int>, PREVMI <int>,
## #   PREVSTRK <int>, PREVHYP <int>, DEATH <int>, ANGINA <int>,
## #   HOSPMI <int>, MI_FCHD <int>, ANYCHD <int>, STROKE <int>, CVD <int>,
## #   HYPERTEN <int>, TIMEAP <dbl>, TIMEMI <dbl>, TIMEMIFC <dbl>,
## #   TIMECHD <dbl>, TIMESTRK <dbl>, TIMECVD <dbl>, TIMEDTH <dbl>,
## #   TIMEHYP <dbl>, bmicat <int>, agecat <int>, highbp <int>, packs <int>,
## #   bmicat2 <fctr>, bmicat3 <fctr>, bmicat4 <fctr>

To create transformed variables, mutate() is used. Again, if you don’t assign the result explicitly, the original data frame does not change despite the name mutate().

framingham <- mutate(framingham,
                     htn = as.numeric(SYSBP > 140 | DIABP > 90))

Pipes to chain functions together: data %>% f1 %>% f2

We often have to apply these functions sequentially. This can be done by nesting functions or chaining them with the pipe operator %>%. Here is an example where I change a variable name, then filter() based on age and sex (male only), then select() several variables, and finally sort by AGE. See the similarities and differences. Again, the original framingham dataset is not modified unless you explicitly assign the result back.

## Nesting can be ugly.
arrange(select(filter(rename(framingham,
                             total_cholesterol = TOTCHOL),
                      AGE >= 65, AGE < 70, MALE == 1),
               MALE, AGE, total_cholesterol),
        AGE)
## # A tibble: 92 x 3
##     MALE   AGE total_cholesterol
##    <int> <int>             <int>
##  1     1    65               207
##  2     1    65               219
##  3     1    65                NA
##  4     1    65               251
##  5     1    65               285
##  6     1    65               171
##  7     1    65               225
##  8     1    65               201
##  9     1    65               167
## 10     1    65               193
## # ... with 82 more rows
## Pipes to the rescue.
framingham %>%
    rename(total_cholesterol = TOTCHOL) %>%
    filter(AGE >= 65, AGE < 70, MALE == 1) %>%
    select(MALE, AGE, total_cholesterol) %>%
    arrange(AGE)
## # A tibble: 92 x 3
##     MALE   AGE total_cholesterol
##    <int> <int>             <int>
##  1     1    65               207
##  2     1    65               219
##  3     1    65                NA
##  4     1    65               251
##  5     1    65               285
##  6     1    65               171
##  7     1    65               225
##  8     1    65               201
##  9     1    65               167
## 10     1    65               193
## # ... with 82 more rows

Summarizing variables: summarize() and group_by()

We sometimes want to examine summary statistics, often within some subgroups. Here we construct subgroups by sex and previous history of coronary heart disease. na.rm = TRUE means drop the missing values NA when calculating the mean, sd or variance (complete case analysis).

framingham %>%
    group_by(MALE, PREVCHD) %>%
    summarize(mean_total_chol = mean(TOTCHOL, na.rm = TRUE),
              sd_total_chol = sd(TOTCHOL, na.rm = TRUE),
              var_total_chol = var(TOTCHOL, na.rm = TRUE))
## # A tibble: 4 x 5
## # Groups:   MALE [?]
##    MALE PREVCHD mean_total_chol sd_total_chol var_total_chol
##   <int>   <int>           <dbl>         <dbl>          <dbl>
## 1     0       0        239.4413      46.19686       2134.150
## 2     0       1        248.0735      46.57663       2169.383
## 3     1       0        233.1048      42.13729       1775.551
## 4     1       1        240.5242      45.04297       2028.869

For the purpose of “Table 1” (baseline patient characteristics), a specialized package may be more useful.

library(tableone)
mytab1 <- CreateTableOne(vars = c("TOTCHOL","AGE","BMI","DIABETES","BPMEDS"), # Variables to include
                         strata = c("MALE", "PREVCHD"),                       # Classification variable
                         data = framingham,                                   # dataset
                         factorVars = c("DIABETES","BPMEDS"))                 # variables to handle as categoricals.
print(mytab1)
##                      Stratified by MALE:PREVCHD
##                       0:0            1:0            0:1           
##   n                     2420           1820             70        
##   TOTCHOL (mean (sd)) 239.44 (46.20) 233.10 (42.14) 248.07 (46.58)
##   AGE (mean (sd))      49.80 (8.60)   49.29 (8.53)   58.29 (5.68) 
##   BMI (mean (sd))      25.51 (4.50)   26.19 (3.42)   28.42 (5.69) 
##   DIABETES = 1 (%)        57 (2.4)       52 (2.9)        5 ( 7.1) 
##   BPMEDS = 1 (%)          89 (3.7)       35 (1.9)       13 (18.8) 
##                      Stratified by MALE:PREVCHD
##                       1:1            p      test
##   n                      124                    
##   TOTCHOL (mean (sd)) 240.52 (45.04) <0.001     
##   AGE (mean (sd))      57.02 (8.23)  <0.001     
##   BMI (mean (sd))      25.94 (3.27)  <0.001     
##   DIABETES = 1 (%)         7 (5.6)    0.015     
##   BPMEDS = 1 (%)           7 (6.0)   <0.001