Data Validation

Youtube video link https://youtu.be/2i9Pwov1E2g

# How to validate data and display the results in a ggplot chart using
# a nice fully customised bar chart with data labels showing numbers and percentage
# You would need these packages, install them first if needed.
library(validate) # package for validation
library(dplyr) # data manipulation
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:validate':
## 
##     expr
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2) # for plotting
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:validate':
## 
##     expr
library(tidyr) # For data manipulation
library(gt) # for displaying data
library(scales) # for formatting numbers
## 
## Attaching package: 'scales'
## The following object is masked from 'package:validate':
## 
##     number_format
# Create our sample data
data(retailers) #This retailers dataset is from the validate package

# Display the data
gt(retailers)
size incl.prob staff turnover other.rev total.rev staff.costs total.costs profit vat
sc0 0.02 75 NA NA 1130 NA 18915 20045 NA
sc3 0.14 9 1607 NA 1607 131 1544 63 NA
sc3 0.14 NA 6886 -33 6919 324 6493 426 NA
sc3 0.14 NA 3861 13 3874 290 3600 274 NA
sc3 0.14 NA NA 37 5602 314 5530 72 NA
sc0 0.02 1 25 NA 25 NA 22 3 NA
sc3 0.14 5 NA NA 1335 135 136 1 1346
sc1 0.02 3 404 13 417 NA 342 75 NA
sc3 0.14 6 2596 NA 2596 147 2486 110 NA
sc2 0.05 5 NA NA NA NA NA NA NA
sc2 0.05 5 645 NA 645 130 636 9 NA
sc2 0.05 5 2872 NA 2872 182 2652 220 NA
sc3 0.14 13 5678 12 5690 326 5656 34 NA
sc1 0.02 NA 931397 NA 931397 36872 841489 89908 863
sc1 0.02 3 80000 NA NA 40000 NA NA 813
sc0 0.02 52 9067 622 9689 1125 9911 -222 964
sc3 0.14 10 1500 20 1520 195 1384 136 733
sc1 0.02 4 440 NA 440 16 379 60 296
sc2 0.05 3 690 NA 690 19000 464507 225493 486
sc3 0.14 8 1852 NA 1852 120 1812 40 1312
sc0 0.02 2 359 9 368 NA 339 29 257
sc0 0.02 3 839 NA 839 2 717 122 654
sc1 0.02 2 471 NA 471 34 411 60 377
sc1 0.02 4 933 2 935 31 814 121 811
sc2 0.05 3 1665 NA 1665 70 186 1478 1472
sc3 0.14 6 2318 NA 2318 184 390 86 2082
sc2 0.05 2 1175 12 1187 114 NA 17 1058
sc3 0.14 16 2946 7 2953 245 2870 83 2670
sc0 0.02 1 492 NA 492 NA 470 22 449
sc2 0.05 6 1831 1831 1831 53 1443 388 1695
sc3 0.14 29 7271 30 7301 451 7242 59 6754
sc2 0.05 8 971 NA 107 28 95 100 905
sc3 0.14 13 4118 11 4129 57 3601 528 3841
sc3 0.14 9 2803 NA 2803 106 2643 160 2668
sc3 0.14 15 2876 33 2909 539 2627 282 2758
sc3 0.14 14 2649 98350 2747 221302 2725410 22457 2548
sc2 0.05 6 1024 4 206 64 170 37 995
sc2 0.05 53 9842 NA 9842 837 10000 -160 9655
sc2 0.05 7 2463 38 2501 87 2347 154 2441
sc3 0.14 NA 4445 98 4543 369 4266 277 4412
sc3 0.14 20 3284 11 3295 181 3168 127 3263
sc2 0.05 2 814 NA 814 107 175 NA 810
sc1 0.02 NA 1210 NA 1210 52 1124 86 1205
sc0 0.02 1 343 NA 343 NA NA NA 343
sc2 0.05 3 952 NA 952 79 NA 149 952
sc0 0.02 1 41 NA 41 NA 32 9 41
sc3 0.14 60 3633 NA 3633 257 3626 7 3634
sc3 0.14 8 2906 NA 2906 144 453 53 2907
sc3 0.14 10 2333 6 2339 193 2353 -14 2335
sc3 0.14 12 2275 5 2280 222 2302 -22 2277
sc2 0.05 7 1728 NA 1728 153 1681 47 1742
sc3 0.14 24 6872 32 6904 485 6729 174 6959
sc3 0.14 29 3571 76 3647 311 3554 93 3700
sc3 0.14 11 1021 NA 1021 235 472 549 1067
sc0 0.02 1 197 NA 197 NA 168 30 221
sc2 0.05 7 917 NA 917 30 781 136 1030
sc2 0.05 8 2000 NA 2000 NA 1700 NA 2271
sc3 0.14 3 200 NA 200 49 177 222 251
sc2 0.05 4 342 NA 342 30 299 43 1068
sc2 0.05 6 1 NA 1411 179 1215 196 1389
# Create some rules for our validation
rules <- validator( 
  Costs = staff.costs < total.costs #Staff costs should be less than total costs
  , Revenue = turnover + other.rev == total.rev # Total Revenue 
  , Staffing =  !is.na(staff)) # Staffing should be a value

# Now compare your data with your rules
# This is called confront(yourdata, yourrules)
cf <- confront(retailers, rules)
 
# You can plot this data using the plot command
# The chart is very basic
plot(cf)

# Lets create a new chart using GGPLOT
dVal <- tibble(summary(cf)) # Change the summary output to a tibble

# View this data
gt(dVal)
name items passes fails nNA error warning expression
Costs 60 47 0 13 FALSE FALSE staff.costs < total.costs
Revenue 60 19 4 37 FALSE FALSE abs(turnover + other.rev - total.rev) <= 1e-08
Staffing 60 54 6 0 FALSE FALSE !is.na(staff)
# create a 100% stacked bar chart,convert data to long format
dVal.long <- dVal%>%
  dplyr::select(name, passes,fails,nNA)%>%
  tidyr::pivot_longer(names_to = 'names', cols = c('passes', 'fails', 'nNA' ))

# View the long format dataset 
gt(dVal.long)
name names value
Costs passes 47
Costs fails 0
Costs nNA 13
Revenue passes 19
Revenue fails 4
Revenue nNA 37
Staffing passes 54
Staffing fails 6
Staffing nNA 0
# Now lets group the data by the name field
# and calculate the percentages
# for fail,passs, nNA for each name)
dVal.long <- dVal.long%>%
  dplyr::group_by(name)%>%
  dplyr::mutate(pct = value/sum(value))

# View the dataset 
gt(dVal.long)
names value pct
Costs
passes 47 0.78333333
fails 0 0.00000000
nNA 13 0.21666667
Revenue
passes 19 0.31666667
fails 4 0.06666667
nNA 37 0.61666667
Staffing
passes 54 0.90000000
fails 6 0.10000000
nNA 0 0.00000000
# Now we need to create a formatted label
# which shows percentage and numbers.
# We are using the paste0 command to combine the pct and the values
# we used scales::percent command to format percentages properly

dVal.long <- dVal.long%>%
  dplyr::group_by(name)%>%
  dplyr::mutate(FormattedLabel= paste0(value,' (',scales::percent(pct,accuracy = 0.1),')'))

# View the final dataset 
gt(dVal.long)
names value pct FormattedLabel
Costs
passes 47 0.78333333 47 (78.3%)
fails 0 0.00000000 0 (0.0%)
nNA 13 0.21666667 13 (21.7%)
Revenue
passes 19 0.31666667 19 (31.7%)
fails 4 0.06666667 4 (6.7%)
nNA 37 0.61666667 37 (61.7%)
Staffing
passes 54 0.90000000 54 (90.0%)
fails 6 0.10000000 6 (10.0%)
nNA 0 0.00000000 0 (0.0%)
# Ready to plot ?
pl <- ggplot(data = dVal.long, aes(x  = name, y = value, fill = names ))
pl <- pl + geom_bar(stat ="identity", position ="fill")
# If the value is 0 then we don't want the label to be shown
pl <- pl + geom_text(aes(label= ifelse(value == 0,'', FormattedLabel))
                     , position = position_fill(vjust = 0.5))
pl <- pl + labs(title ="Result of the validation")
pl <- pl + labs(subtitle ="Retailers dataset")
pl <- pl + theme_bw()
pl <- pl + scale_fill_manual(values = c("red","orange","green"))
pl <- pl + labs(title ="Result of the validation")
pl <- pl + labs(subtitle ="Retailers dataset")
pl <- pl + labs(x='Data Field Name', y ="No. of records")
pl <- pl + labs(caption = '@techanswers88')
pl <- pl + scale_y_continuous(labels = scales::percent) #Y scale percentage formatting
pl # Lets plot the chart now

# This chart shows the numbers and percentages
# As this is a ggplot chart you can change all aspects of it
# like fill colours, titles, legends, themes etc
# Thank you for watching this video,
# If it was of value to you then please like and subscribe