Data Validation
Youtube video link https://youtu.be/2i9Pwov1E2g
# How to validate data and display the results in a ggplot chart using
# a nice fully customised bar chart with data labels showing numbers and percentage
# You would need these packages, install them first if needed.
library(validate) # package for validation
library(dplyr) # data manipulation
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:validate':
##
## expr
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2) # for plotting
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:validate':
##
## expr
library(tidyr) # For data manipulation
library(gt) # for displaying data
library(scales) # for formatting numbers
##
## Attaching package: 'scales'
## The following object is masked from 'package:validate':
##
## number_format
# Create our sample data
data(retailers) #This retailers dataset is from the validate package
# Display the data
gt(retailers)
size |
incl.prob |
staff |
turnover |
other.rev |
total.rev |
staff.costs |
total.costs |
profit |
vat |
sc0 |
0.02 |
75 |
NA |
NA |
1130 |
NA |
18915 |
20045 |
NA |
sc3 |
0.14 |
9 |
1607 |
NA |
1607 |
131 |
1544 |
63 |
NA |
sc3 |
0.14 |
NA |
6886 |
-33 |
6919 |
324 |
6493 |
426 |
NA |
sc3 |
0.14 |
NA |
3861 |
13 |
3874 |
290 |
3600 |
274 |
NA |
sc3 |
0.14 |
NA |
NA |
37 |
5602 |
314 |
5530 |
72 |
NA |
sc0 |
0.02 |
1 |
25 |
NA |
25 |
NA |
22 |
3 |
NA |
sc3 |
0.14 |
5 |
NA |
NA |
1335 |
135 |
136 |
1 |
1346 |
sc1 |
0.02 |
3 |
404 |
13 |
417 |
NA |
342 |
75 |
NA |
sc3 |
0.14 |
6 |
2596 |
NA |
2596 |
147 |
2486 |
110 |
NA |
sc2 |
0.05 |
5 |
NA |
NA |
NA |
NA |
NA |
NA |
NA |
sc2 |
0.05 |
5 |
645 |
NA |
645 |
130 |
636 |
9 |
NA |
sc2 |
0.05 |
5 |
2872 |
NA |
2872 |
182 |
2652 |
220 |
NA |
sc3 |
0.14 |
13 |
5678 |
12 |
5690 |
326 |
5656 |
34 |
NA |
sc1 |
0.02 |
NA |
931397 |
NA |
931397 |
36872 |
841489 |
89908 |
863 |
sc1 |
0.02 |
3 |
80000 |
NA |
NA |
40000 |
NA |
NA |
813 |
sc0 |
0.02 |
52 |
9067 |
622 |
9689 |
1125 |
9911 |
-222 |
964 |
sc3 |
0.14 |
10 |
1500 |
20 |
1520 |
195 |
1384 |
136 |
733 |
sc1 |
0.02 |
4 |
440 |
NA |
440 |
16 |
379 |
60 |
296 |
sc2 |
0.05 |
3 |
690 |
NA |
690 |
19000 |
464507 |
225493 |
486 |
sc3 |
0.14 |
8 |
1852 |
NA |
1852 |
120 |
1812 |
40 |
1312 |
sc0 |
0.02 |
2 |
359 |
9 |
368 |
NA |
339 |
29 |
257 |
sc0 |
0.02 |
3 |
839 |
NA |
839 |
2 |
717 |
122 |
654 |
sc1 |
0.02 |
2 |
471 |
NA |
471 |
34 |
411 |
60 |
377 |
sc1 |
0.02 |
4 |
933 |
2 |
935 |
31 |
814 |
121 |
811 |
sc2 |
0.05 |
3 |
1665 |
NA |
1665 |
70 |
186 |
1478 |
1472 |
sc3 |
0.14 |
6 |
2318 |
NA |
2318 |
184 |
390 |
86 |
2082 |
sc2 |
0.05 |
2 |
1175 |
12 |
1187 |
114 |
NA |
17 |
1058 |
sc3 |
0.14 |
16 |
2946 |
7 |
2953 |
245 |
2870 |
83 |
2670 |
sc0 |
0.02 |
1 |
492 |
NA |
492 |
NA |
470 |
22 |
449 |
sc2 |
0.05 |
6 |
1831 |
1831 |
1831 |
53 |
1443 |
388 |
1695 |
sc3 |
0.14 |
29 |
7271 |
30 |
7301 |
451 |
7242 |
59 |
6754 |
sc2 |
0.05 |
8 |
971 |
NA |
107 |
28 |
95 |
100 |
905 |
sc3 |
0.14 |
13 |
4118 |
11 |
4129 |
57 |
3601 |
528 |
3841 |
sc3 |
0.14 |
9 |
2803 |
NA |
2803 |
106 |
2643 |
160 |
2668 |
sc3 |
0.14 |
15 |
2876 |
33 |
2909 |
539 |
2627 |
282 |
2758 |
sc3 |
0.14 |
14 |
2649 |
98350 |
2747 |
221302 |
2725410 |
22457 |
2548 |
sc2 |
0.05 |
6 |
1024 |
4 |
206 |
64 |
170 |
37 |
995 |
sc2 |
0.05 |
53 |
9842 |
NA |
9842 |
837 |
10000 |
-160 |
9655 |
sc2 |
0.05 |
7 |
2463 |
38 |
2501 |
87 |
2347 |
154 |
2441 |
sc3 |
0.14 |
NA |
4445 |
98 |
4543 |
369 |
4266 |
277 |
4412 |
sc3 |
0.14 |
20 |
3284 |
11 |
3295 |
181 |
3168 |
127 |
3263 |
sc2 |
0.05 |
2 |
814 |
NA |
814 |
107 |
175 |
NA |
810 |
sc1 |
0.02 |
NA |
1210 |
NA |
1210 |
52 |
1124 |
86 |
1205 |
sc0 |
0.02 |
1 |
343 |
NA |
343 |
NA |
NA |
NA |
343 |
sc2 |
0.05 |
3 |
952 |
NA |
952 |
79 |
NA |
149 |
952 |
sc0 |
0.02 |
1 |
41 |
NA |
41 |
NA |
32 |
9 |
41 |
sc3 |
0.14 |
60 |
3633 |
NA |
3633 |
257 |
3626 |
7 |
3634 |
sc3 |
0.14 |
8 |
2906 |
NA |
2906 |
144 |
453 |
53 |
2907 |
sc3 |
0.14 |
10 |
2333 |
6 |
2339 |
193 |
2353 |
-14 |
2335 |
sc3 |
0.14 |
12 |
2275 |
5 |
2280 |
222 |
2302 |
-22 |
2277 |
sc2 |
0.05 |
7 |
1728 |
NA |
1728 |
153 |
1681 |
47 |
1742 |
sc3 |
0.14 |
24 |
6872 |
32 |
6904 |
485 |
6729 |
174 |
6959 |
sc3 |
0.14 |
29 |
3571 |
76 |
3647 |
311 |
3554 |
93 |
3700 |
sc3 |
0.14 |
11 |
1021 |
NA |
1021 |
235 |
472 |
549 |
1067 |
sc0 |
0.02 |
1 |
197 |
NA |
197 |
NA |
168 |
30 |
221 |
sc2 |
0.05 |
7 |
917 |
NA |
917 |
30 |
781 |
136 |
1030 |
sc2 |
0.05 |
8 |
2000 |
NA |
2000 |
NA |
1700 |
NA |
2271 |
sc3 |
0.14 |
3 |
200 |
NA |
200 |
49 |
177 |
222 |
251 |
sc2 |
0.05 |
4 |
342 |
NA |
342 |
30 |
299 |
43 |
1068 |
sc2 |
0.05 |
6 |
1 |
NA |
1411 |
179 |
1215 |
196 |
1389 |
# Create some rules for our validation
rules <- validator(
Costs = staff.costs < total.costs #Staff costs should be less than total costs
, Revenue = turnover + other.rev == total.rev # Total Revenue
, Staffing = !is.na(staff)) # Staffing should be a value
# Now compare your data with your rules
# This is called confront(yourdata, yourrules)
cf <- confront(retailers, rules)
# You can plot this data using the plot command
# The chart is very basic
plot(cf)

# Lets create a new chart using GGPLOT
dVal <- tibble(summary(cf)) # Change the summary output to a tibble
# View this data
gt(dVal)
name |
items |
passes |
fails |
nNA |
error |
warning |
expression |
Costs |
60 |
47 |
0 |
13 |
FALSE |
FALSE |
staff.costs < total.costs |
Revenue |
60 |
19 |
4 |
37 |
FALSE |
FALSE |
abs(turnover + other.rev - total.rev) <= 1e-08 |
Staffing |
60 |
54 |
6 |
0 |
FALSE |
FALSE |
!is.na(staff) |
# create a 100% stacked bar chart,convert data to long format
dVal.long <- dVal%>%
dplyr::select(name, passes,fails,nNA)%>%
tidyr::pivot_longer(names_to = 'names', cols = c('passes', 'fails', 'nNA' ))
# View the long format dataset
gt(dVal.long)
name |
names |
value |
Costs |
passes |
47 |
Costs |
fails |
0 |
Costs |
nNA |
13 |
Revenue |
passes |
19 |
Revenue |
fails |
4 |
Revenue |
nNA |
37 |
Staffing |
passes |
54 |
Staffing |
fails |
6 |
Staffing |
nNA |
0 |
# Now lets group the data by the name field
# and calculate the percentages
# for fail,passs, nNA for each name)
dVal.long <- dVal.long%>%
dplyr::group_by(name)%>%
dplyr::mutate(pct = value/sum(value))
# View the dataset
gt(dVal.long)
names |
value |
pct |
Costs |
passes |
47 |
0.78333333 |
fails |
0 |
0.00000000 |
nNA |
13 |
0.21666667 |
Revenue |
passes |
19 |
0.31666667 |
fails |
4 |
0.06666667 |
nNA |
37 |
0.61666667 |
Staffing |
passes |
54 |
0.90000000 |
fails |
6 |
0.10000000 |
nNA |
0 |
0.00000000 |
# Now we need to create a formatted label
# which shows percentage and numbers.
# We are using the paste0 command to combine the pct and the values
# we used scales::percent command to format percentages properly
dVal.long <- dVal.long%>%
dplyr::group_by(name)%>%
dplyr::mutate(FormattedLabel= paste0(value,' (',scales::percent(pct,accuracy = 0.1),')'))
# View the final dataset
gt(dVal.long)
names |
value |
pct |
FormattedLabel |
Costs |
passes |
47 |
0.78333333 |
47 (78.3%) |
fails |
0 |
0.00000000 |
0 (0.0%) |
nNA |
13 |
0.21666667 |
13 (21.7%) |
Revenue |
passes |
19 |
0.31666667 |
19 (31.7%) |
fails |
4 |
0.06666667 |
4 (6.7%) |
nNA |
37 |
0.61666667 |
37 (61.7%) |
Staffing |
passes |
54 |
0.90000000 |
54 (90.0%) |
fails |
6 |
0.10000000 |
6 (10.0%) |
nNA |
0 |
0.00000000 |
0 (0.0%) |
# Ready to plot ?
pl <- ggplot(data = dVal.long, aes(x = name, y = value, fill = names ))
pl <- pl + geom_bar(stat ="identity", position ="fill")
# If the value is 0 then we don't want the label to be shown
pl <- pl + geom_text(aes(label= ifelse(value == 0,'', FormattedLabel))
, position = position_fill(vjust = 0.5))
pl <- pl + labs(title ="Result of the validation")
pl <- pl + labs(subtitle ="Retailers dataset")
pl <- pl + theme_bw()
pl <- pl + scale_fill_manual(values = c("red","orange","green"))
pl <- pl + labs(title ="Result of the validation")
pl <- pl + labs(subtitle ="Retailers dataset")
pl <- pl + labs(x='Data Field Name', y ="No. of records")
pl <- pl + labs(caption = '@techanswers88')
pl <- pl + scale_y_continuous(labels = scales::percent) #Y scale percentage formatting
pl # Lets plot the chart now

# This chart shows the numbers and percentages
# As this is a ggplot chart you can change all aspects of it
# like fill colours, titles, legends, themes etc
# Thank you for watching this video,
# If it was of value to you then please like and subscribe