This is all for setup. Optionally (based on the knit parameters) we retrieve the demo (or full) corpus from PrivaSeer and ingest it into a persistent database. (More details in the git repository: <https://github.com/MIT-Informatics/privacy-policies>)
The ingest process (the text-processing steps are sketched below):

- unpacks the tar file
- iterates through each HTML policy file
- parses the HTML into chunks (based on paragraphs and similar elements)
- cleans the text (removing odd characters, etc.)
- splits the text into words (tokens)
- removes stop words
- stems the terms
- computes n-grams over the stemmed tokens
- stores and indexes everything in three tables describing, respectively, the files, the paragraphs, and the n-grams
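To make the middle steps concrete, here is a minimal, self-contained sketch of the cleaning, tokenizing, stop-word removal, stemming, and bigram steps. The real ingest code lives in `build_db.R`; the helper `tokenize_paragraph` below is hypothetical, and the use of tidytext is an assumption (the document itself only shows SnowballC).

```r
library(dplyr)
library(stringr)
library(tidytext)    # assumed here for unnest_tokens() and the stop_words table
library(SnowballC)   # for wordStem(), as used later in this document

# Hypothetical helper: clean one paragraph, then emit stemmed unigrams and bigrams.
tokenize_paragraph <- function(text, file, par) {
  cleaned <- text %>%
    str_replace_all("[^[:alnum:][:space:]]", " ") %>%  # drop odd characters
    str_squish() %>%
    str_to_lower()
  toks <- tibble(file = file, par = par, text = cleaned) %>%
    unnest_tokens(word, text) %>%                      # split into word tokens
    anti_join(tidytext::stop_words, by = "word") %>%   # remove stop words
    mutate(stem = SnowballC::wordStem(word))           # stem the terms
  # Bigrams over the stemmed tokens, stored as space-separated pairs
  # (matching entries like "statist purpos" in the token table below).
  bigrams <- toks %>%
    group_by(file, par) %>%
    transmute(token = paste(stem, lead(stem))) %>%
    filter(!str_detect(token, " NA$")) %>%             # drop the trailing pair
    ungroup()
  bind_rows(toks %>% transmute(file, par, token = stem), bigrams)
}

# Example:
tokenize_paragraph("We aggregate data for statistical purposes.", "policy1.html", 1)
```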
### Refresh plan set

```r
## Note: run the sources in the data directory
if (GLOBALS$refresh_data) {
source(fs::path(GLOBALS$src_dir,"fetch_data.R"))
wd <- getwd()
setwd(GLOBALS$data_dir )
fetch_privaseer()
setwd(wd)
}
```

```r
if (TRUE) {  # always run this block: set up the database connection and table handles
wd <- getwd()
source(fs::path(GLOBALS$src_dir,"build_db.R"))
setwd(GLOBALS$data_dir )
if (GLOBALS$refresh_data) {
ingest_privaseer_data()
}
maindb.con <- setup_db()
ptbls <- setup_privaseer_tables(maindb.con)
tokens.tbl <- ptbls$tok
para.tbl <- ptbls$par
GLOBALS$con <- maindb.con
setwd(wd)
}
rm(setup_db, setup_privaseer_tables, wd, ptbls, maindb.con)
```

### How long are these policies?

```r
fcnts.tib <-
para.tbl %>%
count(file,name="npar") %>%
collect()
(
fcnts.tib %>%
gf_ash(~npar) +
labs(x="Number of paragraphs in policy")
) %>%
ggplotly()
```

### How readable are they?

```r
library(quanteda.textstats)
# TODO: refactor with subqueries on the file table instead of caching the
# entire list of file ids in memory (a sketch follows the timing output below)
files.ls <- para.tbl %>%
group_by(file) %>%
summarize(dummy=1) %>%
select(-dummy) %>%
collect() %>% pull()
readability_sq<-function(x) {
para.tbl %>%
filter(`file`==x) %>%
select(`text`) %>%
collect() %>%
pull %>%
str_flatten() %>%
quanteda.textstats::textstat_readability() %>%
as.data.frame() %>%
select(-`document`) %>%
mutate(file=x)
}
# use a sample since this is costly
system.time ({
sum.tib <- purrr::map_dfr(sample(files.ls,200), readability_sq)
sum.tib %<>% left_join(fcnts.tib,by="file")
})
```

    ##    user  system elapsed 
    ##  28.459   0.562  29.006
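The TODO above notes that caching the full list of file ids in memory is wasteful. One possible refactor, sketched here, is to let the database draw the sample itself; this assumes the backend supports dbplyr's `slice_sample()` translation (typically `ORDER BY random() LIMIT n`), and `sample_files_sq` is a hypothetical helper, not part of the current code:

```r
# Hypothetical refactor: sample file ids inside the database rather than
# collect()ing every id into R first.
sample_files_sq <- function(n) {
  para.tbl %>%
    distinct(file) %>%        # one row per policy file
    slice_sample(n = n) %>%   # sampling pushed down to the backend
    collect() %>%
    pull(file)
}

# The readability pass would then become:
# sum.tib <- purrr::map_dfr(sample_files_sq(200), readability_sq)
```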
```r
(sum.tib %>%
gf_boxplot(~Flesch)) /
( sum.tib %>%
gf_point(Flesch ~ npar))
```

### What are common words and phrases?

```r
term.tib <- tokens.tbl %>%
count(token, name="freq") %>%
collect()
doc.tib <- tokens.tbl %>%
count(token,file) %>% count(token, name="docfreq") %>%
collect()
term.tib %<>% full_join(doc.tib, by="token")
rm(doc.tib)
term.tib %>% slice_max(freq, n = 1000) %>% DT::datatable()
```

```r
library(wordcloud2, quietly = TRUE)
term.tib %>%
select(token,freq) %>%
slice_max(freq,n=250) %>%
rename(word=token) %>%
wordcloud2::wordcloud2()
```

```r
library(SnowballC, quietly = TRUE)
statTerms.ls <-
c("statistical",
"statistics",
"aggregate",
"aggregated",
"analytical",
"analytic",
"demographic")
purposeTerms.ls <-
c("purpose","justification","basis","use")
statTerms.ls %<>% SnowballC::wordStem() %>% unique()
purposeTerms.ls %<>% SnowballC::wordStem() %>% unique()
combinedTerms.ls <- expand_grid(x=statTerms.ls,y=purposeTerms.ls) %>%
transmute(combined=paste(x,y)) %>% pull
combinedTerms2.ls <- expand_grid(y=statTerms.ls,x=purposeTerms.ls) %>%
transmute(combined=paste(x,y)) %>% pull
statTerms.tib <-
tibble(token=c(statTerms.ls,
purposeTerms.ls,
combinedTerms.ls,
combinedTerms2.ls)) %>%
left_join(term.tib,by="token") %>%
mutate(across(c(freq, docfreq), ~ replace_na(.x, 0)))  # terms absent from the corpus get 0 counts
rm(statTerms.ls,purposeTerms.ls,combinedTerms.ls, combinedTerms2.ls)
statTerms.tib %>% filter(freq > 0) %>% arrange(desc(docfreq)) %>% gt()
```

| token | freq | docfreq |
|---|---|---|
| us | 62894 | 3686 |
| purpos | 13177 | 2836 |
| analyt | 4431 | 1241 |
| statist | 1807 | 1214 |
| aggreg | 2223 | 1157 |
| basi | 2447 | 949 |
| demograph | 1253 | 792 |
| us analyt | 810 | 615 |
| statist purpos | 359 | 328 |
| analyt us | 410 | 320 |
| us statist | 259 | 247 |
| us aggreg | 220 | 192 |
| statist us | 155 | 149 |
| analyt purpos | 112 | 105 |
| aggreg basi | 105 | 94 |
| aggreg us | 76 | 74 |
| purpos statist | 65 | 61 |
| purpos analyt | 52 | 52 |
| purpos aggreg | 34 | 34 |
| demograph purpos | 30 | 29 |
| justif | 36 | 23 |
| us demograph | 23 | 21 |
| aggreg purpos | 12 | 12 |
| demograph us | 13 | 12 |
| basi aggreg | 8 | 8 |
| basi statist | 2 | 2 |
| purpos demograph | 1 | 1 |

```r
selectTerm <- "statist purpos"
termoccur.tib <- tokens.tbl %>%
filter(token==selectTerm) %>% select(file,par) %>% collect()
samplestxt.tib <- para.tbl %>% right_join( (termoccur.tib %>% slice_sample(n=200)),
by=c("file"="file","par"="par"),
copy=TRUE) %>% collect()
samplestxt.tib %>% select(text) %>% DT::datatable()
```