This tutorial is the first in the dplyr training series.
dplyr is a great tool to use in R. The commands may look long and overwhelming to someone who has not used dplyr, but that is not the case. Once you learn the basics, it is very intuitive.
It is just like building a long sentence out of the words of a language.
If you are a beginner in R, or if you have experience in R but have never used dplyr or want to learn something new about dplyr, then go ahead and watch this tutorial on YouTube.
We will be covering all practical aspects of the dplyr::select command in this tutorial. This tutorial is part of a series of tutorials on all practical aspects of dplyr.
Run the following commands to create our sample dataset. This is fictitious data about hospital patients and their clinical information, such as diagnostic codes and other demographic information.
library(dplyr) # we will be using this packages
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#install.packages("dplyr") # If you do not have this package, run this line to install it. Remove the leading # before running it.
# Build the fictitious hospital-patient dataset `d` used by every example
# below: 100 rows of demographics, outcome and diagnostic codes.

# Fix the random seed so each run produces the same sample data.
set.seed(2021)

n_patients <- 100

# Raw columns. t1..t5 remain available as standalone vectors.
t1 <- sample(paste0("Hospital ", LETTERS), size = n_patients, replace = TRUE)
t2 <- sample(c("Male", "Female"), size = n_patients, replace = TRUE)
t3 <- floor(runif(n_patients, min = 0, max = 110))
t4 <- sample(c("Survived", "Died"), size = n_patients, replace = TRUE)
t5 <- sample(paste0("Facility ", LETTERS), size = n_patients, replace = TRUE)

# Give every column its intended type directly. Going through cbind() would
# first coerce everything to a character matrix that then has to be converted
# back; on R < 4.0 those strings become factors and as.integer() would return
# level codes instead of ages.
d <- data.frame(
  AdmittingHospital = t1,
  Gender = factor(t2),
  AgeYears = as.integer(t3),
  Outcome = factor(t4),
  Dischargeto = t5,
  stringsAsFactors = FALSE
)

# Five-year age bands: [0, 5), [5, 10), ..., [80, 85), then 85 and over.
# right = FALSE makes each band include its lower bound, so age 5 falls in
# "5-9 years".
band_start <- seq(0, 80, by = 5)
d$AgeGroup <- cut(
  d$AgeYears,
  breaks = c(-Inf, seq(5, 85, by = 5), Inf),
  labels = c(paste0(band_start, "-", band_start + 4, " years"), "85+ years"),
  right = FALSE
)

# Diagnostic code columns. Diag2 is created after Diag4, so it is out of
# numeric order in the data frame; a later relocate() example moves it next
# to Diag1.
d$Diag1 <- sample(
  c("A00.0", "E00.0", "F01.50", "G00.0", "H00.011"),
  size = n_patients, replace = TRUE
)
d$Diag3 <- sample(
  c("Y70", "Y71", "Y72", "Y73", "Y74"),
  size = n_patients, replace = TRUE
)
d$Diag4 <- sample(
  c("G00", "G01", "G02", "G03", "G04", "G05"),
  size = n_patients, replace = TRUE
)
d$Diag2 <- sample(
  c("H00", "H10", "H15", "H16", "H28"),
  size = n_patients, replace = TRUE
)
d$Diag5 <- sample(
  c("E00", "E01", "E02", "E03", "E04", "E05"),
  size = n_patients, replace = TRUE
)
d$Diag6 <- sample(
  c("E08", "E09", "E10", "E11", "E12", "E13"),
  size = n_patients, replace = TRUE
)
d$Diag7 <- sample(
  c("E40", "E41", "E42", "E43", "E44"),
  size = n_patients, replace = TRUE
)

# Print the finished dataset.
d
We are doing lots of transformations to our dataset and also plotting the data in a single statement. If you are a beginner, don't be put off by the complexity of the statement.
library(ggplot2)
# One pipeline that selects, derives, reorders and renames columns and then
# draws a box plot of age per admitting hospital.
d %>%
  # Keep the hospital, the age and every diagnostic column.
  dplyr::select(AdmittingHospital, AgeYears, starts_with("Diag")) %>%
  # Classify each patient by age; the first matching condition wins.
  dplyr::mutate(
    AdultOrKid = dplyr::case_when(
      AgeYears == 0 ~ "Newborn",
      AgeYears <= 16 ~ "Paediatric",
      AgeYears > 16 ~ "Adult"
    )
  ) %>%
  # Mean age within each AdultOrKid class, repeated on every row of the class.
  dplyr::group_by(AdultOrKid) %>%
  dplyr::mutate(MeanAge = mean(AgeYears)) %>%
  # Drop the grouping so the remaining steps work on a plain data frame.
  dplyr::ungroup() %>%
  dplyr::relocate(Diag2, .after = Diag1) %>%
  dplyr::arrange(AdmittingHospital, AgeYears) %>%
  # A column name containing a space must be wrapped in backticks.
  dplyr::rename(`First Facility` = AdmittingHospital) %>%
  ggplot(aes(x = `First Facility`, y = AgeYears, fill = `First Facility`)) +
  geom_boxplot() +
  theme_bw()
# Select columns by name.
d1 <- d %>%
  dplyr::select(AdmittingHospital, Outcome)
d1

# Select columns by position: the first and the second column.
d2 <- d %>%
  dplyr::select(1, 2)
d2

# Select a range of positions: columns 1 through 4.
d3 <- d %>%
  dplyr::select(1:4)
d3

# Combine a single position with a range: column 1 plus columns 3 to 5.
d4 <- d %>%
  dplyr::select(1, 3:5)
d4

# Select a range by name: every column from AdmittingHospital through
# Dischargeto.
d5 <- d %>%
  dplyr::select(AdmittingHospital:Dischargeto)
d5
Using a single character
# Keep every column whose name begins with "D".
d6 <- d %>%
  dplyr::select(starts_with("D"))
d6
Using a pattern to match
# Keep every column whose name begins with the prefix "Diag".
d7 <- d %>%
  dplyr::select(starts_with("Diag"))
d7
Using ends_with
# Keep every column whose name ends in "1".
d8 <- d %>%
  dplyr::select(ends_with("1"))
d8
Multiple criteria in ends_with
# Pass several suffixes at once: keep columns whose name ends in "1" or "3".
d9 <- d %>%
  dplyr::select(ends_with(c("1", "3")))
d9
Using the contains command to search for a pattern anywhere in the column name.
# Keep every column with "Age" somewhere in its name.
d10 <- d %>%
  dplyr::select(contains("Age"))
d10
Note that the original dataset is always intact as we are creating a new dataset in this example.
# A leading minus drops a column and keeps all the others.
d11 <- d %>%
  dplyr::select(-Gender)
d11

# Drop several columns at once by wrapping them in -c().
d12 <- d %>%
  dplyr::select(-c(Gender, AgeYears, AgeGroup))
d12
There is also a separate rename command in dplyr
which can be used. In the example below, if you use a space in the
column name then you have to wrap the name in backticks (` `).
These are not your normal '' quotes.
# Rename while selecting. The syntax is New Name = Old Name, and a new name
# containing a space must be wrapped in backticks.
d13 <- d %>%
  dplyr::select(
    `First Hospital` = AdmittingHospital,
    Diag_primary = Diag1
  )
d13

# Select first, then rename in a separate step. rename() also takes
# New Name = Old Name.
d14 <- d %>%
  dplyr::select(AdmittingHospital, Diag1) %>%
  dplyr::rename(Diagnostic_01 = Diag1)
d14
# Examples of relocate(), which reorders columns without dropping any.

# With no .before/.after, the named column moves to the front.
d15 <- d %>%
  dplyr::relocate(Outcome)
d15

# Place AgeYears immediately before AgeGroup.
d16 <- d %>%
  dplyr::relocate(AgeYears, .before = AgeGroup)
d16

# Place AgeYears immediately after AgeGroup.
d17 <- d %>%
  dplyr::relocate(AgeYears, .after = AgeGroup)
d17

# Move columns to the front by type: all character columns.
d18 <- d %>%
  dplyr::relocate(where(is.character))
d18

# All factor columns to the front.
d19 <- d %>%
  dplyr::relocate(where(is.factor))
d19

# All numeric columns to the front (is.numeric is TRUE for integers too).
d20 <- d %>%
  dplyr::relocate(where(is.numeric))
d20

# Chain two moves: character columns first, then the factor columns placed
# after the character columns.
d21 <- d %>%
  dplyr::relocate(where(is.character)) %>%
  dplyr::relocate(where(is.factor), .after = where(is.character))
d21