This is all for setup. Optionally (based on the knit parameters) we retrieve the demo (or full) corpus from PrivaSeer and ingest it into a persistent database. (More details in the git repository: <https://github.com/MIT-Informatics/privacy-policies>)
The ingest process (the text-processing steps are sketched below):

- unpacks the tar file
- iterates through each HTML policy file
- parses the HTML into chunks (based on paragraphs and similar elements)
- cleans the text (removing odd characters, etc.)
- splits the text into words (tokens)
- removes stop words
- stems the terms
- computes n-grams over the stemmed tokens
- stores and indexes everything in three tables describing, respectively, the files, the paragraphs, and the n-grams
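To make the middle steps concrete, here is a minimal, self-contained sketch of the cleaning, tokenizing, stop-word removal, stemming, and bigram steps. The real ingest code lives in `build_db.R`; the helper `tokenize_paragraph` below is hypothetical, and the use of tidytext is an assumption (the document itself only shows SnowballC).

```r
library(dplyr)
library(stringr)
library(tidytext)    # assumed here for unnest_tokens() and the stop_words table
library(SnowballC)   # for wordStem(), as used later in this document

# Hypothetical helper: clean one paragraph, then emit stemmed unigrams and bigrams.
tokenize_paragraph <- function(text, file, par) {
  cleaned <- text %>%
    str_replace_all("[^[:alnum:][:space:]]", " ") %>%  # drop odd characters
    str_squish() %>%
    str_to_lower()
  toks <- tibble(file = file, par = par, text = cleaned) %>%
    unnest_tokens(word, text) %>%                      # split into word tokens
    anti_join(tidytext::stop_words, by = "word") %>%   # remove stop words
    mutate(stem = SnowballC::wordStem(word))           # stem the terms
  # Bigrams over the stemmed tokens, stored as space-separated pairs
  # (matching entries like "statist purpos" in the token table below).
  bigrams <- toks %>%
    group_by(file, par) %>%
    transmute(token = paste(stem, lead(stem))) %>%
    filter(!str_detect(token, " NA$")) %>%             # drop the trailing pair
    ungroup()
  bind_rows(toks %>% transmute(file, par, token = stem), bigrams)
}

# Example:
tokenize_paragraph("We aggregate data for statistical purposes.", "policy1.html", 1)
```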
### Refresh plan set

```r
## Note: run the sources in the data directory
if (GLOBALS$refresh_data) {
source(fs::path(GLOBALS$src_dir,"fetch_data.R"))
wd <- getwd()
setwd(GLOBALS$data_dir )
fetch_privaseer()
setwd(wd)
}
```

```r
if (TRUE) {  # always run this block: set up the database connection and table handles
wd <- getwd()
source(fs::path(GLOBALS$src_dir,"build_db.R"))
setwd(GLOBALS$data_dir )
if (GLOBALS$refresh_data) {
ingest_privaseer_data()
}
maindb.con <- setup_db()
ptbls <- setup_privaseer_tables(maindb.con)
tokens.tbl <- ptbls$tok
para.tbl <- ptbls$par
GLOBALS$con <- maindb.con
setwd(wd)
}
rm(setup_db, setup_privaseer_tables, wd, ptbls, maindb.con)
```

### How long are these policies?

```r
fcnts.tib <-
para.tbl %>%
count(file,name="npar") %>%
collect()
(
fcnts.tib %>%
gf_ash(~npar) +
labs(x="Number of paragraphs in policy")
) %>%
ggplotly()
```

### How readable are they?

```r
library(quanteda.textstats)
# TODO: refactor with subqueries on the file table instead of caching the
# entire list of file ids in memory (a sketch follows the timing output below)
files.ls <- para.tbl %>%
group_by(file) %>%
summarize(dummy=1) %>%
select(-dummy) %>%
collect() %>% pull()
readability_sq<-function(x) {
para.tbl %>%
filter(`file`==x) %>%
select(`text`) %>%
collect() %>%
pull %>%
str_flatten() %>%
quanteda.textstats::textstat_readability() %>%
as.data.frame() %>%
select(-`document`) %>%
mutate(file=x)
}
# use a sample since this is costly
system.time ({
sum.tib <- purrr::map_dfr(sample(files.ls,200), readability_sq)
sum.tib %<>% left_join(fcnts.tib,by="file")
})
```

    ##    user  system elapsed 
    ##  28.459   0.562  29.006
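The TODO above notes that caching the full list of file ids in memory is wasteful. One possible refactor, sketched here, is to let the database draw the sample itself; this assumes the backend supports dbplyr's `slice_sample()` translation (typically `ORDER BY random() LIMIT n`), and `sample_files_sq` is a hypothetical helper, not part of the current code:

```r
# Hypothetical refactor: sample file ids inside the database rather than
# collect()ing every id into R first.
sample_files_sq <- function(n) {
  para.tbl %>%
    distinct(file) %>%        # one row per policy file
    slice_sample(n = n) %>%   # sampling pushed down to the backend
    collect() %>%
    pull(file)
}

# The readability pass would then become:
# sum.tib <- purrr::map_dfr(sample_files_sq(200), readability_sq)
```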
```r
(sum.tib %>%
gf_boxplot(~Flesch)) /
( sum.tib %>%
gf_point(Flesch ~ npar))
```

### What are common words and phrases?

```r
term.tib <- tokens.tbl %>%
count(token, name="freq") %>%
collect()
doc.tib <- tokens.tbl %>%
count(token,file) %>% count(token, name="docfreq") %>%
collect()
term.tib %<>% full_join(doc.tib, by="token")
rm(doc.tib)
term.tib %>% slice_max(freq, n = 1000) %>% DT::datatable()
```

```r
library(wordcloud2, quietly = TRUE)
term.tib %>%
select(token,freq) %>%
slice_max(freq,n=250) %>%
rename(word=token) %>%
wordcloud2::wordcloud2()
```

```r
library(SnowballC, quietly = TRUE)
statTerms.ls <-
c("statistical",
"statistics",
"aggregate",
"aggregated",
"analytical",
"analytic",
"demographic")
purposeTerms.ls <-
c("purpose","justification","basis","use")
statTerms.ls %<>% SnowballC::wordStem() %>% unique()
purposeTerms.ls %<>% SnowballC::wordStem() %>% unique()
combinedTerms.ls <- expand_grid(x=statTerms.ls,y=purposeTerms.ls) %>%
transmute(combined=paste(x,y)) %>% pull
combinedTerms2.ls <- expand_grid(y=statTerms.ls,x=purposeTerms.ls) %>%
transmute(combined=paste(x,y)) %>% pull
statTerms.tib <-
tibble(token=c(statTerms.ls,
purposeTerms.ls,
combinedTerms.ls,
combinedTerms2.ls)) %>%
left_join(term.tib,by="token") %>%
mutate(across(c(freq, docfreq), ~ replace_na(.x, 0)))  # terms absent from the corpus get 0 counts
rm(statTerms.ls,purposeTerms.ls,combinedTerms.ls, combinedTerms2.ls)
statTerms.tib %>% filter(freq > 0) %>% arrange(desc(docfreq)) %>% gt()
```

| token | freq | docfreq |
|---|---|---|
| us | 62894 | 3686 |
| purpos | 13177 | 2836 |
| analyt | 4431 | 1241 |
| statist | 1807 | 1214 |
| aggreg | 2223 | 1157 |
| basi | 2447 | 949 |
| demograph | 1253 | 792 |
| us analyt | 810 | 615 |
| statist purpos | 359 | 328 |
| analyt us | 410 | 320 |
| us statist | 259 | 247 |
| us aggreg | 220 | 192 |
| statist us | 155 | 149 |
| analyt purpos | 112 | 105 |
| aggreg basi | 105 | 94 |
| aggreg us | 76 | 74 |
| purpos statist | 65 | 61 |
| purpos analyt | 52 | 52 |
| purpos aggreg | 34 | 34 |
| demograph purpos | 30 | 29 |
| justif | 36 | 23 |
| us demograph | 23 | 21 |
| aggreg purpos | 12 | 12 |
| demograph us | 13 | 12 |
| basi aggreg | 8 | 8 |
| basi statist | 2 | 2 |
| purpos demograph | 1 | 1 |

```r
selectTerm <- "statist purpos"
termoccur.tib <- tokens.tbl %>%
filter(token==selectTerm) %>% select(file,par) %>% collect()
samplestxt.tib <- para.tbl %>% right_join( (termoccur.tib %>% slice_sample(n=200)),
by=c("file"="file","par"="par"),
copy=TRUE) %>% collect()
samplestxt.tib %>% select(text) %>% DT::datatable()
```