This tuturial is the part of the dplyr training series. Here is the YouTube Video link

https://youtu.be/CxNfHbUX1Ng

Why dplyr

dplyr is a great tool to use in R. The commands may look long and overwhelming to someone not using dplyr but that is not the case. Once you learn the basics then it is very intuitive. It is just making a long sentence by using different words of any language.

Audience

For beginners or experienced R users wanting to learn various commands of dplyr.

DPLYR : Filter

We will be covering all practical aspects of dplyr::filter command in this. This tutorial is part of a series of tutorials on all practical aspects of dplyr All youtube videos are available in a single playlist on YouTube.

https://www.youtube.com/playlist?list=PLkHcMTpvAaXVJzyRSytUn3nSK92TJphxR

Create sample dataset

Run the following command to create our sample data set This is a fictitious data about hospital patients and their clinical information like diagnostic codes and other demographic information.

library(dplyr)  
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)  
#install.packages("dplyr") 
#install.packages("stringr") 
# If you do not have this packages then run this code to install the package. Remove the # from the front before running it.

t1 <- sample(paste0("Hospital ", toupper(letters)), size = 100, replace=TRUE)
t2 <- sample(x = c("Male", "Female")   , size = 100, replace=TRUE)
t3 <- floor(runif(100, min = 0, max = 110))
t4 <- sample(x = c("Survived", "Died") , size = 100, replace=TRUE)
t5  <- sample(paste0("Facility ", toupper(letters)), size = 100, replace=TRUE)

d <- data.frame(cbind(t1,t2,t3,t4, t5))
names(d) <- c('AdmittingHospital', 'Gender', 'AgeYears', 'Outcome', 'Dischargeto')

# Fix classses of columns
#d$Gender <- as.factor(d$Gender)
#d$Outcome <- as.factor(d$Outcome)
d$AgeYears <- as.integer(d$AgeYears)

d$AgeGroup <- cut(d$AgeYears, 
                  breaks = c(-Inf
                             ,5 ,10 ,15,20,25,30,35,40,45,50,55,60 ,65,70,75,80,85
                             , Inf), 
                  
                  labels = c("0-4 years"
                             ,"5-9 years","10-14 years","15-19 years","20-24 years"
                             ,"25-29 years","30-34 years","35-39 years","40-44 years"
                             ,"45-49 years","50-54 years","55-59 years","60-64 years"
                             ,"65-69 years","70-74 years","75-79 years","80-84 years"
                             ,"85+ years"),
                  right = FALSE)




d$Diag1 <- sample(x= c("A00.0","E0.0","F01.50","G00.0","H00.01"), size = 100, replace = TRUE)
d$Diag3 <- sample(x= c("Y70","Y71","Y72","Y73","Y74"), size = 100, replace = TRUE)
d$Diag4 <- sample(x= c("G00","G01","G02","G03","G04", "G05"), size = 100, replace = TRUE)
d$Diag2 <- sample(x= c("H00","H10","H15","H16","H28"), size = 100, replace = TRUE)
d$Diag5 <- sample(x= c("E00","E01","E02","E03","E04","E05"), size = 100, replace = TRUE)
d$Diag6 <- sample(x= c("E08","E09","E10","E11","E12", "E13"), size = 100, replace = TRUE)
d$Diag7 <- sample(x= c("E40","E41","E42","E43","E44"), size = 100, replace = TRUE)

Have a look at the sample dataset

d
d0 <- dplyr::filter(d,AdmittingHospital == 'Hospital A')

d0
d1 <- d%>%
      dplyr::filter(AdmittingHospital == 'Hospital A')

d1

Effect of upper case and lower case in search criteria

d2 <- d%>%
      dplyr::filter(AdmittingHospital == 'hospital a')

d2

Search for a numeric criteria

d5 <- d%>%
      dplyr::filter(AgeYears > 20)

d5

Search for a numeric criteria with more transformations

d6 <- d%>%
      dplyr::filter(AgeYears > 20)%>%
      dplyr::arrange(AgeYears)%>%
      dplyr::rename(`Age in Years` = AgeYears)
d6

Quick explanation of AND, OR and XOR operations in filtering multiple fields

df <- data.frame(name = c('Harry', 'Jon', 'Ram','Diane', 'Shayla','Deb')
                 , BlueDose  = c(1,1,1,0,1,0)
                 , GreenDose = c(1,0,0,1,1,0)
                 )




df
df%>%
  dplyr::filter( BlueDose ==1  & GreenDose == 1)
df%>%
  dplyr::filter( BlueDose ==1 | GreenDose == 1)
df%>%
  dplyr::filter(xor(BlueDose ==1, GreenDose == 1))

More than one criteria using AND &

d7 <- d%>%
      dplyr::filter(AdmittingHospital == 'Hospital L' & AgeYears > 20)
d7

Another variation of using AND &

If you simply keep on writing your criteria separated by a Comma then it also works as and AND

d8 <- d%>%
      dplyr::filter(AdmittingHospital == 'Hospital L' , AgeYears > 20)
d8

Another variation of using AND &

If you keep on writing separate filter statements they also work as AND

In this example we defined the first filter as

AdmittingHospital == ‘Hospital L’

and then piped its output to another filter command

dplyr::filter(AgeYears > 20)

d9 <- d%>%
      dplyr::filter(AdmittingHospital == 'Hospital L')%>%
      dplyr::filter(AgeYears > 20)
d9

More than one criteria using OR |

Note that in this example we have used the vertical line | instead of the comma. So in this case we want to search if the Admitting Hospital was ‘Hospital F’ or the Outcome was ‘Died’

Using OR in filter criteria

d10 <- d%>%
      dplyr::filter(AdmittingHospital == 'Hospital F' | Outcome == 'Died')%>%
      dplyr::arrange(desc(Outcome))
d10

Using XOR criteria in filter

It works when exactly one of the criteria is TRUE , where as the OR just looks for any condition being met. Note that XOR works on two fields at a time.

d11 <- d%>%
      dplyr::filter(xor(AdmittingHospital == 'Hospital F' , Outcome == 'Died'))%>%
      dplyr::arrange(desc(Outcome))
d11

More than one criteria using OR |

If the hospital is Hospital A or Hospital B or the outcome is Died

d12 <- d%>%
      dplyr::filter(AdmittingHospital %in%  c('Hospital A', 'Hospital B') | Outcome == 'Died')%>%
      dplyr::arrange( AgeYears)
d12

If the hospital is Hospital A or Hospital B and the outcome is Died

d13 <- d%>%
      dplyr::filter(AdmittingHospital %in%  c('Hospital A', 'Hospital B')  &  Outcome == 'Died')%>%
      dplyr::arrange( AgeYears)
d13

Filtering for numeric values between two values

Age more than 10 and Age less than 15 In other words we want to see the records where the AgeYears is 11,12,13,14

d14 <- d%>%
      dplyr::filter(AgeYears > 10, AgeYears < 15)%>%
      dplyr::arrange(AgeYears)
d14

Age greater than or equal (gte) than 10 and Age less than or equal (lte) than 15 In other words we want to see the records where the AgeYears is 10,11,12,13,14,15

d15 <- d%>%
      dplyr::filter(AgeYears >= 10, AgeYears <= 15)%>%
      dplyr::arrange(AgeYears)
d15
d16<- d%>%
      dplyr::filter(AgeYears >= 10, AgeYears <= 15)%>%
      dplyr::filter(Outcome == 'Died')%>%
      dplyr::arrange(AgeYears)
d16
d17<- d%>%
      dplyr::filter(between(AgeYears ,10,15))%>%
      dplyr::filter(Outcome == 'Died')%>%
      dplyr::arrange(AgeYears)
d17

Searching for pattern

In this example we want to search the Diag1 field and see if the Diag1 contains 00 anywhere in the value of Diag1 field

d17 <- d%>%
      dplyr::filter( grepl('00', Diag1 ))  %>%
      dplyr::arrange(Diag1)
d17

Searching for pattern at the start

In this example we want to search the Diag1 field and see if the Diag1 starts with the character A. We use ‘^A’ to say that we want to search where the field value starts with the letter A.

d18 <- d%>%
      dplyr::filter( grepl('^A', Diag1 ))  %>%
      dplyr::arrange(Diag1)
d18

Multiple columns matching with

With one Exact Criteria

d19 <- d%>%
        dplyr::filter_at(vars(starts_with("Diag")), any_vars( .==  "05")  )
d19

Multiple columns matching with

With multiple but Exact Criterion

d20 <- d%>%
        dplyr::filter_at(vars(starts_with("Diag")), any_vars( . %in%  c("G05", "Y71"))  )
d20

Multiple data columns matched using partial pattern matching

Consider that you have multiple columns which you want to search. For example we have multiple columns in our data set which have the diagnostic codes. Now you want to search all these columns of a row at once and see if a pattern matching occurs. Eg. We want to search Diag1 to Diag7 columns to see if any of them contain the text “01”.

Using str_detect

library(stringr)
d21 <- d%>%
        dplyr::filter_at(vars(starts_with("Diag")), any_vars(stringr::str_detect(., pattern =  "05"))  )
d21

Using %like% from data.table package

library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
d22 <- d%>%
        dplyr::filter_at(vars(starts_with("Diag")), any_vars( . %like% "05")  )
d22