Video link https://youtu.be/BjxqClq3f04 # How to validate the date fields in your data in R

The following packages are used in this example:

library(validate)
library(dplyr)

Now create a dataset to use in our example

# Example patient records used throughout this walkthrough.
# The data contains missing values (NA), a repeated PatientID ("P008"),
# non-standard Outcome codes ("?", "Unknown") and extreme measurements
# (e.g. Age 245, SBP 350, DBP 210).
myData <- data.frame(
  PatientID = c(
    "P001", "P002", "P003", NA, "P005", "P006",
    "P007", "P008", "P009", "P010", "P008", "P11"
  ),
  Age = c(23, 12, 5, 8, 245, NA, 23, 45, 87, 121, 56, 130),
  Outcome = c(
    "Died", "Died", NA, "Survived", "Survived", "Survived",
    "Survived", "Survived", "Survived", "?", "Survived", "Unknown"
  ),
  SBP = c(0, 0, 120, 80, 45, 67, 100, 130, 350, 120, 46, 120),
  DBP = c(0, 0, 80, 70, 30, 40, 80, 210, NA, 80, 0, 80)
)

# Attach a row-number based `id` column to serve as the record key.
# If the data already carries a unique key, use that column instead of
# generating a new one.
myData <- dplyr::mutate(myData, id = dplyr::row_number())

View the data

# Evaluating the object name at top level prints the data frame.
myData

Create the data validation rules using the validator

Note that we have given a name to each of the rules; this helps when we plot the results.

# Validation rules for the patient data. Each rule states a condition that a
# VALID record must satisfy: TRUE means the record passes, FALSE means it
# fails. The rule names are used as labels in the summary, plot and tables.
myrules <- validator(
  # Each PatientID may occur only once.
  "Patient Id Unique"   = is_unique(PatientID),
  # PatientID must be present. (Previously `is.na(PatientID)`, which passed
  # only the record with a missing ID and failed every valid one.)
  "Patient ID"          = !is.na(PatientID),
  "Age in range"        = Age >= 0 & Age <= 120,
  "Outcome validity "   = Outcome %in% c("Survived", "Died"),
  "SBP in range"        = SBP >= 0 & SBP <= 300,
  "DBP in range"        = in_range(DBP, min = 0, max = 200),
  # A patient recorded as "Survived" must not have a zero blood pressure
  # reading. The error condition is negated so that offending records fail
  # the rule instead of being the only ones that pass it.
  "Survived with No BP" = !((DBP == 0 | SBP == 0) & Outcome == "Survived")
)

Now create the validation results by using the confront function.

# Evaluate every rule against every record; `key = "id"` labels each result
# with the record's id so failures can be traced back to a row.
output <- confront(myData, myrules, key = "id")

# Per-rule summary of the confrontation.
summary(output)

# Base-graphics overview of the results per rule.
plot(output)

If you wish to enhance your base plot

You can convert it into a ggplot object and add more information to the chart. In our example we add a title and a subtitle.

library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:validate':
## 
##     expr
library(ggplotify)
# Wrap the base-graphics validation plot as a ggplot object so that ggplot2
# layers can be added, then attach a title and a subtitle in one labs() call.
pl <- as.ggplot(~plot(output)) +
  labs(
    title    = "Result of the validation",
    subtitle = "Patients dataset"
  )
pl

How do I show the errors in a table?

I also want to know the record ID of each record, so that I can see which errors apply to each record.

library(flextable)
# Flatten the confrontation result into a data frame of rule outcomes.
dout <- as.data.frame(output)

# Failed checks, ordered by record id.
# Note: dplyr::filter() drops rows whose condition is NA, so checks that
# evaluated to NA are not listed here.
dErrors <- dout %>%
  dplyr::filter(!value) %>%
  dplyr::arrange(id) %>%
  dplyr::select(id, name, expression)

Display the validation results in a flextable

I want to group my data by record ID, so that I can see the errors for each record.

# Build the table step by step: boxed theme, vertically merge repeated
# record ids so each record's errors read as one group, then rename headers.
ft <- flextable(dErrors)
ft <- theme_box(ft)
ft <- merge_v(ft, j = ~id)
ft <- set_header_labels(
  ft,
  id         = "Record ID",
  name       = "Error",
  expression = "Validation expression"
)
ft

Show the results in a different way

Show how many records have a particular data issue.

# Failed checks, ordered by rule name so that records sharing the same
# issue are listed together.
# Note: dplyr::filter() drops rows whose condition is NA, so checks that
# evaluated to NA are not listed here.
dErrors <- dout %>%
  dplyr::filter(!value) %>%
  dplyr::arrange(name) %>%
  dplyr::select(name, id, expression)


# Same table layout as before, but repeated rule names are merged
# vertically so each rule's failing records form one group.
ft <- flextable(dErrors)
ft <- theme_box(ft)
ft <- merge_v(ft, j = ~name)
ft <- set_header_labels(
  ft,
  name       = "Error",
  id         = "Record ID",
  expression = "Validation expression"
)
ft