This tutorial is the first in the dplyr training series.

Why dplyr

dplyr is a great tool to use in R. Its commands may look long and overwhelming to someone who has not used dplyr, but they are not as hard as they look. Once you learn the basics, it is very intuitive.

Writing dplyr code is just like making a long sentence out of the words of a language: each command is one word, and you chain the words together.

Audience

If you are a beginner in R, or if you have experience in R but have never used dplyr or want to learn something new about dplyr, then go ahead and watch this tutorial on YouTube.

DPLYR : SELECT

We will be covering all practical aspects of the dplyr::select command in this tutorial. This tutorial is part of a series of tutorials on all practical aspects of dplyr.

Create sample dataset

Run the following commands to create our sample dataset. This is fictitious data about hospital patients and their clinical information, such as diagnostic codes and demographic information.

library(dplyr)  # we will be using this package
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
#install.packages("dplyr") # If you do not have this package, run this line to install it. Remove the leading # before running it.



# Simulate 100 fictitious admissions, one vector per column.
t1 <- sample(paste0("Hospital ", toupper(letters)), size = 100, replace = TRUE)
t2 <- sample(x = c("Male", "Female"), size = 100, replace = TRUE)
t3 <- floor(runif(100, min = 0, max = 110))
t4 <- sample(x = c("Survived", "Died"), size = 100, replace = TRUE)
t5 <- sample(paste0("Facility ", toupper(letters)), size = 100, replace = TRUE)

# Build the data frame column by column rather than through cbind().
# cbind() on mixed vectors produces a character matrix, so the ages would be
# stored as text; when strings are converted to factors they would become
# factor levels, and as.integer() on a factor returns the level code rather
# than the age. stringsAsFactors = FALSE keeps the text columns as character
# regardless of the session default.
d <- data.frame(
  AdmittingHospital = t1,
  Gender = t2,
  AgeYears = t3,
  Outcome = t4,
  Dischargeto = t5,
  stringsAsFactors = FALSE
)



# Fix the classes of the columns: the two categorical fields become factors
# and the age becomes an integer.
d[c("Gender", "Outcome")] <- lapply(d[c("Gender", "Outcome")], as.factor)
d[["AgeYears"]] <- as.integer(d[["AgeYears"]])

# Bin the ages into 5-year bands. right = FALSE makes each interval closed on
# the left and open on the right, so age 5 falls in "5-9 years" and every age
# from 85 upwards falls in the open-ended last band.
d$AgeGroup <- cut(
  d$AgeYears,
  breaks = c(-Inf, seq(5, 85, by = 5), Inf),
  labels = c(
    paste0(seq(0, 80, by = 5), "-", seq(4, 84, by = 5), " years"),
    "85+ years"
  ),
  right = FALSE
)




# Add seven diagnostic-code columns, each drawn at random (with replacement)
# from a small set of codes. The columns are created in the order
# Diag1, Diag3, Diag4, Diag2, ... so Diag2 does not sit next to Diag1.
d[["Diag1"]] <- sample(
  c("A00.0", "E00.0", "F01.50", "G00.0", "H00.011"), 100, replace = TRUE
)
d[["Diag3"]] <- sample(
  c("Y70", "Y71", "Y72", "Y73", "Y74"), 100, replace = TRUE
)
d[["Diag4"]] <- sample(
  c("G00", "G01", "G02", "G03", "G04", "G05"), 100, replace = TRUE
)
d[["Diag2"]] <- sample(
  c("H00", "H10", "H15", "H16", "H28"), 100, replace = TRUE
)
d[["Diag5"]] <- sample(
  c("E00", "E01", "E02", "E03", "E04", "E05"), 100, replace = TRUE
)
d[["Diag6"]] <- sample(
  c("E08", "E09", "E10", "E11", "E12", "E13"), 100, replace = TRUE
)
d[["Diag7"]] <- sample(
  c("E40", "E41", "E42", "E43", "E44"), 100, replace = TRUE
)

Have a look at the sample dataset

# Print the sample data frame to inspect it.
d

Dplyr in action

A demo of the power and versatility of dplyr

We are applying several transformations to our dataset and also plotting the data, all in a single statement. If you are a beginner, don’t worry about the complexity of the statement.

library(ggplot2)

# One chained statement: pick columns, derive an age band, add the mean age
# of each band, reorder and rename columns, then draw a box plot of age per
# hospital.
d %>%
  dplyr::select(AdmittingHospital, AgeYears, starts_with("Diag")) %>%
  dplyr::mutate(
    # case_when() uses the first condition that matches, so age 0 is labelled
    # "Newborn" even though it also satisfies AgeYears <= 16.
    AdultOrKid = dplyr::case_when(
      AgeYears == 0 ~ "Newborn",
      AgeYears <= 16 ~ "Paediatric",
      AgeYears > 16 ~ "Adult"
    )
  ) %>%
  dplyr::group_by(AdultOrKid) %>%
  dplyr::mutate(MeanAge = mean(AgeYears)) %>%
  # Drop the grouping once the per-group mean has been computed, so the
  # remaining verbs and the plot work on an ungrouped table.
  dplyr::ungroup() %>%
  dplyr::relocate(Diag2, .after = Diag1) %>%
  dplyr::arrange(AdmittingHospital, AgeYears) %>%
  dplyr::rename(`First Facility` = AdmittingHospital) %>%
  ggplot(aes(x = `First Facility`, y = AgeYears, fill = `First Facility`)) +
  geom_boxplot() +
  theme_bw()

SELECT

Selecting the data columns

# Keep only the two named columns; d itself is left unchanged.
d1 <- d %>%
  dplyr::select(AdmittingHospital, Outcome)

d1

Select columns by their position

# Columns can be picked by position: here the first and the second column.
d2 <- d %>%
  dplyr::select(1, 2)
d2

Selecting columns by a start and end position

# A position range start:end keeps every column from the first to the fourth.
d3 <- d %>%
  dplyr::select(1:4)
d3

Another variation of the selection of columns

# Single positions and ranges can be mixed: column 1 plus columns 3 to 5.
d4 <- d %>%
  dplyr::select(1, 3:5)

d4

Selecting all columns in between by defining the start and ending field names

# A name range keeps the two named columns and everything between them.
d5 <- d %>%
  dplyr::select(AdmittingHospital:Dischargeto)

d5

Selecting the data fields by defining the starting text

Using a single character

# Keep every column whose name begins with "D".
d6 <- d %>%
  dplyr::select(starts_with("D"))

d6

Using a longer prefix to match

# A longer prefix narrows the match to the diagnostic-code columns.
d7 <- d %>%
  dplyr::select(starts_with("Diag"))

d7

Using ends_with

# Keep the columns whose names end in "1".
d8 <- d %>%
  dplyr::select(ends_with("1"))
d8

Multiple criteria in ends with

# Passing a vector of suffixes keeps columns ending in any one of them.
d9 <- d %>%
  dplyr::select(ends_with(c("1", "3")))
d9

Using contains command to search for a pattern anywhere in the field name.

# contains() matches the text anywhere in the column name.
d10 <- d %>%
  dplyr::select(contains("Age"))
d10

Removing data columns

Deselecting/removing a single data column (data field)

Note that the original dataset is always intact as we are creating a new dataset in this example.

# A minus sign in front of a column name drops that column and keeps the rest.
d11 <- d %>%
  dplyr::select(-Gender)
d11

Deselecting multiple columns

# Wrap several column names in c() and negate the whole set to drop them all.
d12 <- d %>%
  dplyr::select(-c(Gender, AgeYears, AgeGroup))
d12

Renaming

Renaming the columns while selecting them

dplyr also has a separate rename command that can be used instead. In the example below, note that if the new data field name contains a space then you have to wrap it in backticks (`). These are not your normal ’’ quotes.

# select() can rename while it selects, written as new_name = old_name.
# The new name containing a space is wrapped in backticks.
d13 <- d %>%
  dplyr::select(
    `First Hospital` = AdmittingHospital,
    Diag_primary = Diag1
  )
d13

Renaming using the rename command

# rename() is written as new_name = old_name.

d14 <- d %>%
  dplyr::select(AdmittingHospital, Diag1) %>%
  dplyr::rename(Diagnostic_01 = Diag1)
d14

Relocating

Relocate the column to the beginning

# With no .before/.after argument, relocate() moves the column to the front.

d15 <- d %>%
  dplyr::relocate(Outcome)
d15

Relocate the column before a certain column

# .before places the moved column immediately ahead of the named column.
d16 <- d %>%
  dplyr::relocate(AgeYears, .before = AgeGroup)
d16

Relocate the column after a certain column

# .after places the moved column immediately behind the named column.
d17 <- d %>%
  dplyr::relocate(AgeYears, .after = AgeGroup)
d17

Relocate columns based on a class

Example: character variables

# where() selects columns by a test on their contents; here every character
# column is moved to the front.
d18 <- d %>%
  dplyr::relocate(where(is.character))
d18

Factor variables

# Move every factor column to the front.
d19 <- d %>%
  dplyr::relocate(where(is.factor))
d19

Numeric variables

# Move every numeric column to the front; is.numeric() is TRUE for integer
# columns as well as doubles.
d20 <- d %>%
  dplyr::relocate(where(is.numeric))
d20

Using multiple criteria

# Two relocate() steps: character columns go to the front first, then the
# factor columns are placed after the character columns.
d21 <- d %>%
  dplyr::relocate(where(is.character)) %>%
  dplyr::relocate(where(is.factor), .after = where(is.character))
d21