1 Setup

This section handles setup. Optionally (based on the knit parameters), we retrieve the demo (or full) corpus from PrivaSeer and ingest it into a persistent database. (More details in git: <https://github.com/MIT-Informatics/privacy-policies>)

The ingest process:

  • unpacks the tar file

  • iterates through each html policy file

    • parses the html into chunks (based on paragraphs and similar elements)

    • cleans the text (for odd characters, etc.)

    • splits it into words (tokens)

    • removes stop words

    • stems the terms

    • computes n-grams (of lengths k through n)

  • stores and indexes the results in three tables describing the files, paragraphs, and n-grams (a sketch of the token processing follows this list)
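
For concreteness, here is a minimal sketch of the per-paragraph token processing, mirroring the steps above. This is illustrative only: the actual implementation lives in build_db.R, and the helper name, packages, and stop-word list below are assumptions.

library(tokenizers)
library(stopwords)
library(SnowballC)

## hypothetical helper, not the repository's actual code
process_paragraph <- function(text) {
  toks <- tokenize_words(text, simplify = TRUE)        # clean and split into word tokens
  toks <- toks[!toks %in% stopwords::stopwords("en")]  # remove stop words
  stems <- wordStem(toks)                              # stem the terms
  ## bigrams over adjacent stems, so phrases like "statist purpos" become tokens
  bigrams <- if (length(stems) > 1) paste(head(stems, -1), tail(stems, -1)) else character(0)
  c(stems, bigrams)
}

## e.g. "We aggregate data for statistical purposes." yields stems such as
## "aggreg" "statist" "purpos" and bigrams such as "statist purpos"
process_paragraph("We aggregate data for statistical purposes.")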

## Refresh plan set
## Note: run the sources in the data directory

if (GLOBALS$refresh_data) {
  source(fs::path(GLOBALS$src_dir, "fetch_data.R"))
  ## fetch the corpus from within the data directory, then restore the working directory
  wd <- getwd()
  setwd(GLOBALS$data_dir)
  fetch_privaseer()
  setwd(wd)
}

if (TRUE) {  # always run: set up the database connection and lazy table references
  wd <- getwd()
  source(fs::path(GLOBALS$src_dir, "build_db.R"))
  setwd(GLOBALS$data_dir)
  if (GLOBALS$refresh_data) {
    ingest_privaseer_data()  # unpack, parse, and load the corpus into the database
  }
  maindb.con <- setup_db()
  ptbls <- setup_privaseer_tables(maindb.con)
  tokens.tbl <- ptbls$tok   # lazy reference to the token table
  para.tbl <- ptbls$par     # lazy reference to the paragraph table
  GLOBALS$con <- maindb.con
  setwd(wd)
}


rm(setup_db, setup_privaseer_tables, wd, ptbls, maindb.con)  # drop setup-only objects
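
At this point para.tbl and tokens.tbl are lazy dbplyr references into the persistent database; nothing is pulled into memory until collect(). A quick sanity check (the column names here are inferred from the queries later in this document):

## peek at a few rows; expect (file, par, text) in para.tbl and
## (file, par, token) in tokens.tbl, per the queries below
para.tbl %>% head(3) %>% collect()
tokens.tbl %>% head(3) %>% collect()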

2 Overview

  • How long are these policies?

    fcnts.tib <- 
      para.tbl %>% 
      count(file, name="npar") %>%   # number of paragraphs per policy
      collect() 
    
    (  
      fcnts.tib %>%
      gf_ash(~npar) +                # averaged shifted histogram
      labs(x="Number of paragraphs in policy")
    ) %>%
      ggplotly()
  • How readable are they?

    library(quanteda.textstats)
    
    #TODO: refactor with subqueries on the file table instead of caching the entire list of file ids in memory (see the sketch after this code)
    files.ls <- para.tbl %>% 
       distinct(file) %>%
       collect() %>% 
       pull()
    
    readability_sq <- function(x) {
       para.tbl %>%
        filter(`file` == x) %>% 
        select(`text`) %>%
        collect() %>%
        pull() %>% 
        str_flatten() %>%                                  # one long string per policy
        quanteda.textstats::textstat_readability() %>%
        as.data.frame() %>%
        select(-`document`) %>%
        mutate(file = x) 
    }
    
    # use a sample since this is costly
    system.time({
      sum.tib <- purrr::map_dfr(sample(files.ls, 200), readability_sq)
      sum.tib %<>% left_join(fcnts.tib, by="file") 
    })
    ##    user  system elapsed 
    ##  28.459   0.562  29.006
    (sum.tib %>% 
      gf_boxplot(~Flesch)) /
    (sum.tib %>% 
      gf_point(Flesch~npar))   # stacked via patchwork's `/` operator
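
    A possible shape for the TODO above: sample the file ids inside the database and only materialize the sample. A sketch, assuming dbplyr's slice_sample() translation (ORDER BY random()) is supported by the backend:

    ## draw the 200-file sample in the database rather than caching all ids
    files.sample <- para.tbl %>%
      distinct(file) %>%
      slice_sample(n = 200) %>%   # translated to ORDER BY random() LIMIT 200
      collect() %>%
      pull(file)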

  • What are common words and phrases?

    term.tib <- tokens.tbl %>%
      count(token, name="freq") %>%       # total occurrences of each token
      collect()
    
    doc.tib <- tokens.tbl %>%
      count(token, file) %>%              # one row per (token, file) pair ...
      count(token, name="docfreq") %>%    # ... then count the files per token
      collect()
    
    term.tib %<>% full_join(doc.tib, by="token") 
    rm(doc.tib)
    
    term.tib %>% slice_max(freq, n=1000) %>% DT::datatable()
    
    library(wordcloud2, quietly=TRUE)
    
    term.tib %>% 
      select(token, freq) %>%
      slice_max(freq, n=250) %>% 
      rename(word=token) %>%
      wordcloud2::wordcloud2()

3 Statistical Purpose Terms

3.1 Frequency

library(SnowballC, quietly=TRUE)

statTerms.ls <- 
  c("statistical",
    "statistics",
    "aggregate",
    "aggregated",
    "analytical",
    "analytic",
    "demographic")

purposeTerms.ls <-
  c("purpose","justification","basis","use")

statTerms.ls %<>% SnowballC::wordStem() %>% unique()
purposeTerms.ls %<>% SnowballC::wordStem() %>% unique()
combinedTerms.ls <- expand_grid(x=statTerms.ls, y=purposeTerms.ls) %>% 
  transmute(combined=paste(x, y)) %>% pull()   # stat term first, e.g. "statist purpos"
combinedTerms2.ls <- expand_grid(y=statTerms.ls, x=purposeTerms.ls) %>% 
  transmute(combined=paste(x, y)) %>% pull()   # purpose term first, e.g. "purpos statist"

statTerms.tib <- 
  tibble(token=c(statTerms.ls,
                 purposeTerms.ls,
                 combinedTerms.ls,
                 combinedTerms2.ls)) %>% 
  left_join(term.tib,by="token") %>%
  mutate(across(everything(),~replace_na(.x,0)))

rm(statTerms.ls,purposeTerms.ls,combinedTerms.ls, combinedTerms2.ls)
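
For reference, the default Porter stemmer collapses the term variants onto the short forms that appear in the table below; for example:

## "statistical" and "statistics" share a stem, and "use" stems to "us",
## which is why "us" dominates the counts
SnowballC::wordStem(c("statistical", "statistics", "purpose", "use"))
## [1] "statist" "statist" "purpos"  "us"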

statTerms.tib %>% filter(freq>0) %>% arrange(desc(docfreq)) %>% gt()
token              freq  docfreq
us                62894     3686
purpos            13177     2836
analyt             4431     1241
statist            1807     1214
aggreg             2223     1157
basi               2447      949
demograph          1253      792
us analyt           810      615
statist purpos      359      328
analyt us           410      320
us statist          259      247
us aggreg           220      192
statist us          155      149
analyt purpos       112      105
aggreg basi         105       94
aggreg us            76       74
purpos statist       65       61
purpos analyt        52       52
purpos aggreg        34       34
demograph purpos     30       29
justif               36       23
us demograph         23       21
aggreg purpos        12       12
demograph us         13       12
basi aggreg           8        8
basi statist          2        2
purpos demograph      1        1

3.2 Context of “Statistical Purpose” - Random Sample of 200

selectTerm <- "statist purpos"

termoccur.tib <- tokens.tbl %>%
  filter(token == selectTerm) %>% 
  select(file, par) %>% 
  collect()

## copy=TRUE uploads the sampled occurrences as a temporary table so the
## join runs in the database
samplestxt.tib <- para.tbl %>% 
  right_join(termoccur.tib %>% slice_sample(n=200),
             by=c("file"="file", "par"="par"), 
             copy=TRUE) %>% 
  collect()

samplestxt.tib %>% select(text) %>% DT::datatable()
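
To make the contexts easier to scan, one could highlight the phrase before display. A minimal sketch, assuming the unstemmed surface forms are "statistical purpose(s)":

## uppercase the matched phrase so it stands out in the table (illustrative only)
samplestxt.tib %>%
  mutate(text = str_replace_all(text,
                                regex("statistical purposes?", ignore_case = TRUE),
                                toupper)) %>%
  select(text) %>%
  DT::datatable()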