library(dplyr)
library(readr)
library(knitr)
dat = read_csv("./Csv/B.csv", col_names=FALSE)
まずは、k=1な行をいくつか削除して、一意率を0.5以下にする。
一意性は、X1とX6の1の位を丸めて、全ての列を準識別子とみなした時にk=1かどうか、を見るとのこと。
とりあえず不要列の削除と、数値列の丸めをやっておく。
dat =
dat %>%
mutate(X2_ = round(X2, -1)) %>%
mutate(X6_ = round(X6, -1)) %>%
mutate(tmp = paste(X1,X2_,X3,X4,X5,X6_,X7,X10,X11,X12, sep="_"))
dat %>% head
## # A tibble: 6 × 15
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X2_
## <chr> <dbl> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <dbl> <dbl>
## 1 Male 62 White Grad… Marr… 27.8 0 0 0 0 Q2 1 60
## 2 Male 53 White High… Divo… 30.8 0 1 0 0 Q1 0 50
## 3 Male 78 White High… Marr… 28.8 0 0 0 0 Q3 1 80
## 4 Female 56 White Grad… Part… 42.4 1 0 0 0 Q3 0 60
## 5 Female 42 Black Coll… Divo… 20.3 1 0 0 0 Q4 0 40
## 6 Female 72 Mexi… 11th Sepa… 28.6 0 0 0 0 Q1 0 70
## # … with 2 more variables: X6_ <dbl>, tmp <chr>
```r
dat_kanon =
dat %>%
group_by(tmp) %>%
count() %>%
ungroup()
dat_kanon %>% select(n) %>% table()
## .
## 1 2 3 4 5 6 7 8
## 2939 333 90 40 13 10 2 2
予備選ルールだと、4190行 x 0.5 = 2095行までが一意になっても許される。
現状、2939行が一意になっているので、750行ほどを削除する。
k=1な行をランダムに30%ほど削除すればいい
## 適当に決める
threshold = 0.35
dat_delete_flg =
dat %>%
left_join(dat_kanon, by = "tmp") %>%
mutate(random = runif(nrow(dat))) %>%
mutate(random = if_else(n>1, 1, random)) %>%
mutate(delete_flg = if_else(random < threshold, 1, 0)) %>%
mutate(rownum = 0:(nrow(dat)-1))
dat_delete_flg %>% head()
## # A tibble: 6 × 19
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X2_
## <chr> <dbl> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <dbl> <dbl>
## 1 Male 62 White Grad… Marr… 27.8 0 0 0 0 Q2 1 60
## 2 Male 53 White High… Divo… 30.8 0 1 0 0 Q1 0 50
## 3 Male 78 White High… Marr… 28.8 0 0 0 0 Q3 1 80
## 4 Female 56 White Grad… Part… 42.4 1 0 0 0 Q3 0 60
## 5 Female 42 Black Coll… Divo… 20.3 1 0 0 0 Q4 0 40
## 6 Female 72 Mexi… 11th Sepa… 28.6 0 0 0 0 Q1 0 70
## # … with 6 more variables: X6_ <dbl>, tmp <chr>, n <int>, random <dbl>,
## # delete_flg <dbl>, rownum <int>
dat_deleted =
dat_delete_flg %>%
filter(delete_flg == 0)
dat_deleted %>% head
## # A tibble: 6 × 19
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X2_
## <chr> <dbl> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <dbl> <dbl>
## 1 Male 62 White Grad… Marr… 27.8 0 0 0 0 Q2 1 60
## 2 Male 53 White High… Divo… 30.8 0 1 0 0 Q1 0 50
## 3 Male 78 White High… Marr… 28.8 0 0 0 0 Q3 1 80
## 4 Female 42 Black Coll… Divo… 20.3 1 0 0 0 Q4 0 40
## 5 Male 45 Other 11th Never 24.1 0 0 0 0 Q3 0 40
## 6 Female 30 Hisp… Coll… Part… 26.6 0 0 0 0 Q4 0 30
## # … with 6 more variables: X6_ <dbl>, tmp <chr>, n <int>, random <dbl>,
## # delete_flg <dbl>, rownum <int>
dat_deleted %>%
group_by(tmp) %>%
count() %>%
ungroup %>%
select(n) %>%
table
## .
## 1 2 3 4 5 6 7 8
## 1921 333 90 40 13 10 2 2
k=1な行が十分に減っているので、OKっぽい
dat_deletedはC。削除する行の番号を記録しておく。
dat_delete_flg %>%
filter(delete_flg == 1) %>%
select(rownum) %>%
write.table("./Csv/pre_anony_03_e.csv", quote=FALSE, row.names=FALSE, col.names=FALSE, sep=",")
dat_delete_flg %>%
filter(delete_flg == 1) %>%
dim
## [1] 1018 19
900行近くを消している。
dat_deleted %>%
select(X1:X12) %>%
write.table("./Csv/C.csv", quote=FALSE, row.names=FALSE, col.names=FALSE, sep=",")
dat_deleted %>% head()
## # A tibble: 6 × 19
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X2_
## <chr> <dbl> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <dbl> <dbl>
## 1 Male 62 White Grad… Marr… 27.8 0 0 0 0 Q2 1 60
## 2 Male 53 White High… Divo… 30.8 0 1 0 0 Q1 0 50
## 3 Male 78 White High… Marr… 28.8 0 0 0 0 Q3 1 80
## 4 Female 42 Black Coll… Divo… 20.3 1 0 0 0 Q4 0 40
## 5 Male 45 Other 11th Never 24.1 0 0 0 0 Q3 0 40
## 6 Female 30 Hisp… Coll… Part… 26.6 0 0 0 0 Q4 0 30
## # … with 6 more variables: X6_ <dbl>, tmp <chr>, n <int>, random <dbl>,
## # delete_flg <dbl>, rownum <int>
第二匿名化はめんどくさいので、D=Cとして提出してみる。
提出したいので、フォーマットチェックしてみる…と、X6がinvalidらしい。
ルールスライドのUniqrtの節によると、
でないといけないらしいので、確認してみる
dat_delete_flg %>% select(X2,X6) %>% summary
## X2 X6
## Min. :20.00 Min. :14.50
## 1st Qu.:35.00 1st Qu.:24.40
## Median :50.00 Median :28.10
## Mean :50.46 Mean :29.18
## 3rd Qu.:65.00 3rd Qu.:32.70
## Max. :80.00 Max. :67.30
ので、top/bottom-codingする。
X2は大丈夫そう。X6は対応する。
dat_delete_flg =
dat_delete_flg %>%
mutate(X6 = if_else(X6 > 75, 70, X6)) %>%
mutate(X6 = if_else(X6 < 20, 20, X6))
dat_delete_flg %>% select(X2,X6) %>% summary
## X2 X6
## Min. :20.00 Min. :20.00
## 1st Qu.:35.00 1st Qu.:24.40
## Median :50.00 Median :28.10
## Mean :50.46 Mean :29.24
## 3rd Qu.:65.00 3rd Qu.:32.70
## Max. :80.00 Max. :67.30
dat_delete_flg %>% head
## # A tibble: 6 × 19
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X2_
## <chr> <dbl> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <dbl> <dbl>
## 1 Male 62 White Grad… Marr… 27.8 0 0 0 0 Q2 1 60
## 2 Male 53 White High… Divo… 30.8 0 1 0 0 Q1 0 50
## 3 Male 78 White High… Marr… 28.8 0 0 0 0 Q3 1 80
## 4 Female 56 White Grad… Part… 42.4 1 0 0 0 Q3 0 60
## 5 Female 42 Black Coll… Divo… 20.3 1 0 0 0 Q4 0 40
## 6 Female 72 Mexi… 11th Sepa… 28.6 0 0 0 0 Q1 0 70
## # … with 6 more variables: X6_ <dbl>, tmp <chr>, n <int>, random <dbl>,
## # delete_flg <dbl>, rownum <int>
scale = 0.05
dat_delete_flg_noized =
dat_delete_flg %>%
mutate(random = runif(nrow(.)) - 0.5) %>%
mutate(X2 = X2 + X2 * random * scale) %>%
mutate(random = runif(nrow(.)) - 0.5) %>%
mutate(X6 = X6 + X6 * random * scale) %>%
select(-random)
dat_delete_flg_noized %>%
filter(delete_flg == 0) %>%
select(X1:X12) %>%
write.table("./Csv/C.csv", col.names=FALSE, row.names=FALSE, quote=FALSE, sep = ",")
dat_delete_flg_noized %>%
filter(delete_flg == 0) %>%
select(X1:X12) %>%
write.table("./Csv/pre_anony_03_d.csv", col.names=FALSE, row.names=FALSE, quote=FALSE, sep = ",")
res = system("bash ./mytest_evaluate.sh", intern = TRUE)
print(res)
## [1] "umark.py Csv/B.csv Csv/C.csv"
## [2] " cnt rate Coef OR pvalue cor"
## [3] "max 578.000000 0.034836 0.267387 0.399606 0.447176 0.038724"
## [4] "mean 138.787879 0.007435 0.090484 0.083812 0.086871 0.008623"
## [5] "uniqrt.py Csv/C.csv"
## [6] "2031 0.6402900378310215 0.4847255369928401"
## [7] "umark.py Csv/B.csv Csv/pre_anony_03_d.csv"
## [8] " cnt rate Coef OR pvalue cor"
## [9] "max 578.000000 0.034836 0.267387 0.399606 0.447176 0.038724"
## [10] "mean 138.787879 0.007435 0.090484 0.083812 0.086871 0.008623"
## [11] "iloss.py Csv/C.csv Csv/pre_anony_03_d.csv"
## [12] " 1 5 cat max"
## [13] "mean 0.0 0.0 0.0 0.0"
## [14] "max 0.0 0.0 0.0 0.0"
rate < 0.05, cor < 0.1, uniqrt < 0.5は大丈夫そう
OR < 0.1が多分ダメ
res = system("bash ./mytest_check_anon.sh", intern = TRUE)
print(res)
## [1] "./checkDX.py ./Csv/B.csv ./Csv/pre_anony_03_d.csv ./Csv/pre_anony_03_e.csv"
## [2] "D: num OK"
## [3] "D: obj OK"
## [4] "1 OK"
## [5] "5 OK"
## [6] "6 OK"
## [7] "7 OK"
## [8] "11 OK"
## [9] "0 OK"
## [10] "2 OK"
## [11] "3 OK"
## [12] "4 OK"
## [13] "10 OK"
## [14] "(3172, 12) OK"
## [15] "X: int OK"
## [16] "X: unique OK"
## [17] "(1018, 1) OK"