Video link https://youtu.be/BjxqClq3f04 # How to validate the date fields in your data in R
library(validate)
library(dplyr)
myData <- data.frame(PatientID = c('P001', 'P002', 'P003' , NA, 'P005', 'P006', 'P007', 'P008', 'P009', 'P010', 'P008', 'P11')
, Age = c(23 , 12, 5 , 8 , 245 ,NA ,23 ,45 , 87 , 121, 56, 130)
, Outcome = c('Died', 'Died', NA, 'Survived','Survived' ,'Survived', 'Survived', 'Survived', 'Survived', '?', 'Survived', 'Unknown')
, SBP = c(0, 0, 120, 80, 45, 67, 100, 130, 350, 120, 46, 120)
, DBP = c(0, 0, 80, 70, 30, 40, 80, 210, NA, 80, 0, 80)
)
# Add an id field in the data. If your data already has any unique key then use that instead of creating a new one
myData <- myData%>%
dplyr::mutate(id = row_number())
myData
Note that we have given a name to each of the rules, this helps when we plot the data.
myrules <- validator( "Patient Id Unique" = is_unique(PatientID)
, "Patient ID" = is.na(PatientID)
, "Age in range" = Age >= 0 & Age <= 120
, "Outcome validity " = Outcome %in% c('Survived', 'Died')
, "SBP in range" = SBP >= 0 & SBP <= 300
, "DBP in range" = in_range (DBP, min = 0 , max = 200)
, "Survived with No BP" = (DBP == 0 | SBP ==0) & Outcome == 'Survived' )
output <- confront(myData, myrules, key ="id")
summary(output)
plot(output)
you can convert it into a ggplot object and can add more information in the chart. In our example we added a title, subtitle
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:validate':
##
## expr
library(ggplotify)
pl <- as.ggplot(~plot(output))
pl <- pl + labs(title ="Result of the validation")
pl <- pl + labs(subtitle ="Patients dataset")
pl
I also want to know the record id of each of my records so that I know what errors are there for each of my record.
library(flextable)
# Convert our output into a dataframe first
dout <- as.data.frame(output)
# Errors for each record id
dErrors <- dout%>%
dplyr::filter(! value == TRUE)%>%
dplyr::select(id, name, expression)%>%
dplyr::arrange(id)
I want to group my data for each record id, so that I know the errors for each of my record.
ft <- flextable(dErrors)%>%
theme_box()%>%
merge_v(j = ~id)%>%
set_header_labels(name = "Error"
,id = "Record ID"
, expression = "Validation expression")
ft
Record ID | Error | Validation expression |
1 | Patient.ID | is.na(PatientID) |
Survived.with.No.BP | (abs(DBP - 0) < 1e-08 | abs(SBP - 0) < 1e-08) & Outcome == "Survived" | |
2 | Patient.ID | is.na(PatientID) |
Survived.with.No.BP | (abs(DBP - 0) < 1e-08 | abs(SBP - 0) < 1e-08) & Outcome == "Survived" | |
3 | Patient.ID | is.na(PatientID) |
Survived.with.No.BP | (abs(DBP - 0) < 1e-08 | abs(SBP - 0) < 1e-08) & Outcome == "Survived" | |
4 | Survived.with.No.BP | (abs(DBP - 0) < 1e-08 | abs(SBP - 0) < 1e-08) & Outcome == "Survived" |
5 | Patient.ID | is.na(PatientID) |
Age.in.range | Age >= 0 & Age <= 120 | |
Survived.with.No.BP | (abs(DBP - 0) < 1e-08 | abs(SBP - 0) < 1e-08) & Outcome == "Survived" | |
6 | Patient.ID | is.na(PatientID) |
Survived.with.No.BP | (abs(DBP - 0) < 1e-08 | abs(SBP - 0) < 1e-08) & Outcome == "Survived" | |
7 | Patient.ID | is.na(PatientID) |
Survived.with.No.BP | (abs(DBP - 0) < 1e-08 | abs(SBP - 0) < 1e-08) & Outcome == "Survived" | |
8 | Patient.Id.Unique | is_unique(PatientID) |
Patient.ID | is.na(PatientID) | |
DBP.in.range | in_range(DBP, min = 0, max = 200) | |
Survived.with.No.BP | (abs(DBP - 0) < 1e-08 | abs(SBP - 0) < 1e-08) & Outcome == "Survived" | |
9 | Patient.ID | is.na(PatientID) |
SBP.in.range | SBP >= 0 & SBP <= 300 | |
10 | Patient.ID | is.na(PatientID) |
Age.in.range | Age >= 0 & Age <= 120 | |
Outcome.validity. | Outcome %vin% c("Survived", "Died") | |
Survived.with.No.BP | (abs(DBP - 0) < 1e-08 | abs(SBP - 0) < 1e-08) & Outcome == "Survived" | |
11 | Patient.Id.Unique | is_unique(PatientID) |
Patient.ID | is.na(PatientID) | |
12 | Patient.ID | is.na(PatientID) |
Age.in.range | Age >= 0 & Age <= 120 | |
Outcome.validity. | Outcome %vin% c("Survived", "Died") | |
Survived.with.No.BP | (abs(DBP - 0) < 1e-08 | abs(SBP - 0) < 1e-08) & Outcome == "Survived" |
Show how many records have a particular data issue.
# Errors by error type
dErrors <- dout%>%
dplyr::filter(! value == TRUE)%>%
dplyr::select( name,id, expression)%>%
dplyr::arrange(name)
ft <- flextable(dErrors)%>%
theme_box()%>%
merge_v(j = ~name)%>%
set_header_labels(name = "Error"
,id = "Record ID"
, expression = "Validation expression")
ft
Error | Record ID | Validation expression |
Age.in.range | 5 | Age >= 0 & Age <= 120 |
10 | Age >= 0 & Age <= 120 | |
12 | Age >= 0 & Age <= 120 | |
DBP.in.range | 8 | in_range(DBP, min = 0, max = 200) |
Outcome.validity. | 10 | Outcome %vin% c("Survived", "Died") |
12 | Outcome %vin% c("Survived", "Died") | |
Patient.ID | 1 | is.na(PatientID) |
2 | is.na(PatientID) | |
3 | is.na(PatientID) | |
5 | is.na(PatientID) | |
6 | is.na(PatientID) | |
7 | is.na(PatientID) | |
8 | is.na(PatientID) | |
9 | is.na(PatientID) | |
10 | is.na(PatientID) | |
11 | is.na(PatientID) | |
12 | is.na(PatientID) | |
Patient.Id.Unique | 8 | is_unique(PatientID) |
11 | is_unique(PatientID) | |
SBP.in.range | 9 | SBP >= 0 & SBP <= 300 |
Survived.with.No.BP | 1 | (abs(DBP - 0) < 1e-08 | abs(SBP - 0) < 1e-08) & Outcome == "Survived" |
2 | (abs(DBP - 0) < 1e-08 | abs(SBP - 0) < 1e-08) & Outcome == "Survived" | |
3 | (abs(DBP - 0) < 1e-08 | abs(SBP - 0) < 1e-08) & Outcome == "Survived" | |
4 | (abs(DBP - 0) < 1e-08 | abs(SBP - 0) < 1e-08) & Outcome == "Survived" | |
5 | (abs(DBP - 0) < 1e-08 | abs(SBP - 0) < 1e-08) & Outcome == "Survived" | |
6 | (abs(DBP - 0) < 1e-08 | abs(SBP - 0) < 1e-08) & Outcome == "Survived" | |
7 | (abs(DBP - 0) < 1e-08 | abs(SBP - 0) < 1e-08) & Outcome == "Survived" | |
8 | (abs(DBP - 0) < 1e-08 | abs(SBP - 0) < 1e-08) & Outcome == "Survived" | |
10 | (abs(DBP - 0) < 1e-08 | abs(SBP - 0) < 1e-08) & Outcome == "Survived" | |
12 | (abs(DBP - 0) < 1e-08 | abs(SBP - 0) < 1e-08) & Outcome == "Survived" |