This study includes the generation of synthetic data
using the synthpop package, the application of
privacy mechanisms via DPpack, and a
comparative evaluation of real and synthetic datasets
through machine learning (SVM) models.
# Packages used by this analysis. e1071 (SVM) and caret (partitioning,
# confusion matrices) are listed here as well so that they are installed on a
# fresh machine in the same way as the other dependencies.
packages <- c(
  "synthpop", "DPpack", "dplyr", "forcats", "ggplot2", "readr",
  "e1071", "caret"
)

# Install only the packages that are not already present, and skip the call
# entirely when nothing is missing.
missing_packages <- setdiff(packages, rownames(installed.packages()))
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}

# Attach every package. invisible() stops the list returned by lapply()
# (one vector of attached packages per call) from being printed.
invisible(lapply(packages, library, character.only = TRUE))
## Warning: package 'synthpop' was built under R version 4.4.3
## Warning: package 'DPpack' was built under R version 4.4.3
## Warning: package 'dplyr' was built under R version 4.4.3
## Warning: package 'forcats' was built under R version 4.4.3
## Warning: package 'ggplot2' was built under R version 4.4.3
## Warning: package 'readr' was built under R version 4.4.3
## [[1]]
## [1] "synthpop" "stats" "graphics" "grDevices" "utils" "datasets"
## [7] "methods" "base"
##
## [[2]]
## [1] "DPpack" "synthpop" "stats" "graphics" "grDevices" "utils"
## [7] "datasets" "methods" "base"
##
## [[3]]
## [1] "dplyr" "DPpack" "synthpop" "stats" "graphics" "grDevices"
## [7] "utils" "datasets" "methods" "base"
##
## [[4]]
## [1] "forcats" "dplyr" "DPpack" "synthpop" "stats" "graphics"
## [7] "grDevices" "utils" "datasets" "methods" "base"
##
## [[5]]
## [1] "ggplot2" "forcats" "dplyr" "DPpack" "synthpop" "stats"
## [7] "graphics" "grDevices" "utils" "datasets" "methods" "base"
##
## [[6]]
## [1] "readr" "ggplot2" "forcats" "dplyr" "DPpack" "synthpop"
## [7] "stats" "graphics" "grDevices" "utils" "datasets" "methods"
## [13] "base"
# Load the cleaned survey subset from the working directory.
df <- readr::read_csv(file = "aze_cleaned_subset.csv")
## Rows: 213 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): a4a, a6a, a2, a4b, a6b, a7
## dbl (7): a14d, a14m, a14h, a14min, a20y, a20m, a20d
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Columns that hold categorical answers and must be stored as factors.
cat_vars <- c("a4a", "a6a", "a2", "a4b", "a6b", "a7")

# Convert each listed column to a factor, keeping the column names.
df[cat_vars] <- lapply(
  setNames(nm = cat_vars),
  function(col_name) as.factor(df[[col_name]])
)
Categorical variables are converted to factors for proper modeling.
# Generate a single synthetic copy of df (m = 1). A fixed seed makes the
# synthesis, and therefore every downstream comparison, reproducible between
# runs; without it a new seed is drawn each time.
syn_result <- syn(df, m = 1, minnumlevels = 6, seed = 123)
## CAUTION: Your data set has fewer observations (213) than we advise.
## We suggest that there should be at least 230 observations
## (100 + 10 * no. of variables used in modelling the data).
## Please check your synthetic data carefully with functions
## compare(), utility.tab(), and utility.gen().
##
##
## Variable(s): a20y, a20m, a20d numeric but with only 6 or fewer distinct values turned into factor(s) for synthesis.
##
##
## Synthesis
## -----------
## a4a a6a a2 a4b a6b a14d a14m a14h a14min a7
## a20y a20m a20d
# Show the class of the object returned by syn().
syn_result |>
  class() |>
  print()
## [1] "synds"
`m = 1` creates a single synthetic dataset.
`minnumlevels = 6` makes numeric variables with six or fewer distinct
values be treated as factors during synthesis.
# Compare the distribution of every variable in the synthetic data against
# the observed data.
compare(object = syn_result, data = df)
## Calculations done for a4a
## Calculations done for a6a
## Calculations done for a2
## Calculations done for a4b
## Calculations done for a6b
## Calculations done for a14d
## Calculations done for a14m
## Calculations done for a14h
## Calculations done for a14min
## Calculations done for a7
## Only 2 groups produced for a20y even after changing method.
## Calculations done for a20y
## Calculations done for a20m
## Calculations done for a20d
## Warning in (function (..., deparse.level = 1) : number of columns of result is
## not a multiple of vector length (arg 2)
## Warning in (function (..., deparse.level = 1) : number of columns of result is
## not a multiple of vector length (arg 2)
##
## Comparing percentages observed with synthetic
## Press return for next variable(s):
## Press return for next variable(s):
## Press return for next variable(s):
##
## Selected utility measures:
## pMSE S_pMSE df
## a4a 0.000031 0.052668 2
## a6a 0.000250 0.284533 3
## a2 0.001419 2.418796 2
## a4b 0.009074 1.288550 24
## a6b 0.000158 0.269744 2
## a14d 0.000187 0.159331 4
## a14m 0.000563 0.479268 4
## a14h 0.000902 0.768597 4
## a14min 0.001068 0.909681 4
## a7 0.000145 0.492486 1
## a20y 0.000010 0.034588 1
## a20m 0.000996 0.848482 4
## a20d 0.000704 1.200000 2
# Cross-tabulate three categorical variables in the observed and synthetic
# data and report utility measures for the resulting table.
utility.tab(
  object = syn_result,
  data = df,
  vars = c("a4a", "a6a", "a2")
)
##
## Observed:
## ($tab.obs)
## , , a2 = Baku & Absheron
##
## a6a
## a4a Large Medium Small Unknown
## Manufacturing 5 18 9 8
## Other Services 9 18 40 5
## Retail 6 19 28 5
##
## , , a2 = Center
##
## a6a
## a4a Large Medium Small Unknown
## Manufacturing 2 3 3 0
## Other Services 0 5 4 0
## Retail 1 1 3 0
##
## , , a2 = West
##
## a6a
## a4a Large Medium Small Unknown
## Manufacturing 2 5 4 0
## Other Services 0 1 4 1
## Retail 0 0 4 0
##
##
## Synthesised:
## ($tab.syn)
## , , a2 = Baku & Absheron
##
## a6a
## a4a Large Medium Small Unknown
## Manufacturing 8 23 8 10
## Other Services 9 17 38 2
## Retail 3 16 36 3
##
## , , a2 = Center
##
## a6a
## a4a Large Medium Small Unknown
## Manufacturing 2 1 1 2
## Other Services 2 2 11 0
## Retail 2 0 4 0
##
## , , a2 = West
##
## a6a
## a4a Large Medium Small Unknown
## Manufacturing 2 2 0 0
## Other Services 0 2 2 0
## Retail 0 2 2 1
##
##
## Selected utility measures:
## pMSE S_pMSE df
## 1 0.0168 1.9125 30
Visual and tabular comparisons are done to evaluate similarity and utility.
library(e1071)
## Warning: package 'e1071' was built under R version 4.4.3
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Zorunlu paket yükleniyor: lattice
# Response and predictor columns used by the downstream SVM models.
target <- "a6a"
predictors <- c("a4a", "a2", "a14h")
model_vars <- c(predictors, target)

# Keep only the modelling columns and drop incomplete rows in each dataset.
real_data <- df[, model_vars] |> na.omit()
syn_data <- syn_result$syn[, model_vars] |> na.omit()

# Give every factor column the same level set in both datasets.
# factor(x, levels = ...) extends the level set while keeping each value's
# label. Assigning with `levels(x) <- ...` would instead rename the existing
# levels by position and silently recode the data whenever the two level
# sets differ.
for (col_name in model_vars) {
  # Only columns that are factors in both datasets carry levels to align;
  # numeric predictors are left untouched.
  if (is.factor(real_data[[col_name]]) && is.factor(syn_data[[col_name]])) {
    all_levels <- union(
      levels(real_data[[col_name]]),
      levels(syn_data[[col_name]])
    )
    real_data[[col_name]] <- factor(real_data[[col_name]], levels = all_levels)
    syn_data[[col_name]] <- factor(syn_data[[col_name]], levels = all_levels)
  }
}
Ensures factor levels are consistent across both datasets.
# Fix the RNG state so the partition below is reproducible.
set.seed(123)

# Stratified 70% sample of row positions, drawn on the real target column.
train_index <- createDataPartition(
  y = real_data[[target]],
  p = 0.7,
  list = FALSE
)

# Training sets: the sampled row positions in each dataset.
# NOTE(review): the indices are drawn from real_data and reused for syn_data;
# this assumes both have the same number of rows after na.omit() - confirm.
train_real <- real_data[train_index, ]
train_syn <- syn_data[train_index, ]

# Test sets: every remaining row position.
test_real <- real_data[-train_index, ]
test_syn <- syn_data[-train_index, ]
A 70/30 train/test split is applied to both the real and synthetic data using the same row indices.
# Build the model formula from `target` so the response column is defined in
# one place instead of being hard-coded here; "." uses all other columns as
# predictors.
svm_formula <- reformulate(".", response = target)

# Fit one linear-kernel SVM per dataset with identical settings.
svm_real <- svm(
  svm_formula,
  data = train_real,
  kernel = "linear",
  probability = TRUE
)
svm_syn <- svm(
  svm_formula,
  data = train_syn,
  kernel = "linear",
  probability = TRUE
)
SVM models trained separately on real and synthetic datasets.
# Score each model on the held-out rows of its own dataset. The reference
# labels are looked up through `target` rather than a hard-coded column name.
# NOTE(review): the synthetic-data model is tested on synthetic rows, not on
# the real hold-out set - confirm this is the intended comparison.
pred_real <- predict(svm_real, newdata = test_real)
conf_matrix_real <- confusionMatrix(
  data = pred_real,
  reference = test_real[[target]]
)

pred_syn <- predict(svm_syn, newdata = test_syn)
conf_matrix_syn <- confusionMatrix(
  data = pred_syn,
  reference = test_syn[[target]]
)

# Report the overall accuracy of the real-data model, rounded to 3 decimals.
cat(
  "📌 SVM Accuracy (REAL data):",
  round(conf_matrix_real$overall[["Accuracy"]], digits = 3),
  "\n"
)
## 📌 SVM Accuracy (REAL data): 0.516
# Confusion table (predictions by reference) for the real-data model.
print(conf_matrix_real[["table"]])
## Reference
## Prediction Large Medium Small Unknown
## Large 0 0 0 0
## Medium 2 7 4 3
## Small 5 14 25 2
## Unknown 0 0 0 0
# Report the overall accuracy of the synthetic-data model, rounded to 3
# decimals.
cat(
  "\n📌 SVM Accuracy (SYNTHETIC data):",
  round(conf_matrix_syn[["overall"]][["Accuracy"]], digits = 3),
  "\n"
)
##
## 📌 SVM Accuracy (SYNTHETIC data): 0.516
# Confusion table (predictions by reference) for the synthetic-data model.
print(conf_matrix_syn[["table"]])
## Reference
## Prediction Large Medium Small Unknown
## Large 0 0 1 1
## Medium 4 12 4 2
## Small 4 13 20 1
## Unknown 0 0 0 0
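This comparison indicates how well the synthetic dataset mimics the real one in a downstream task. Note that each model is evaluated on the held-out 30% of its own dataset (the real-data model on real rows, the synthetic-data model on synthetic rows).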
This R Markdown document demonstrates how to:

- Generate synthetic data using synthpop
- Evaluate the similarity and utility of synthetic data
- Compare performance using SVM models

Future work may include Random Forests, AUC metrics, or advanced DP techniques such as the Laplace mechanism or DPglm.