This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.

Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Ctrl+Shift+Enter.

# Smoke test: print a greeting to confirm that code chunks execute.
print("hello!")
## [1] "hello!"

GutenbergR and Quanteda

Get Started

Load libraries.

library(gutenbergr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.1     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(quanteda)
## Package version: 3.3.0
## Unicode version: 14.0
## ICU version: 70.1
## Parallel computing: 12 of 12 threads used.
## See https://quanteda.io for tutorials and examples.
library(quanteda.textplots)
library(quanteda.textstats)
library(openxlsx)
library(tictoc)

Loading constants and functions.

Dictionary path

# Location of the Excel workbook holding the dictionaries; it is handed to
# read_dict() further down in the notebook.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
dict_path <- '/media/catana/Data/projects/teaching/nlp/dict/dicts.xlsx'

Functions

gutenfiles() to get text

# Download a random sample of texts from Project Gutenberg (gutenbergr package).
#
# Arguments:
#   n                 number of works to sample.
#   lang              language code(s) matched against the `language` column
#                     of gutenberg_metadata; 'all' disables the filter.
#   bshelf            bookshelf name(s) matched exactly against the
#                     `gutenberg_bookshelf` column; 'all' disables the filter.
#   author_birthdate  keep only authors whose `birthdate` is greater than this
#                     value; 'all' disables the filter.
#
# Returns whatever gutenberg_download() returns for the sampled IDs, with the
# 'title' and 'author' metadata fields attached.
#
# Sampling is random; uncomment the set.seed() line for a reproducible draw.
gutenfiles <-
  function(n = 20,
           lang = 'en',
           bshelf = 'Science Fiction',
           author_birthdate = 1850) {
    #set.seed(55)

    metadata_raw <- gutenberg_metadata

    # identical() keeps each sentinel test scalar, so a vector of languages or
    # bookshelves (which %in% supports) no longer breaks the `if` condition.
    if (!identical(lang, 'all')) {
      metadata_raw <- filter(metadata_raw, language %in% lang)
    }
    if (!identical(bshelf, 'all')) {
      metadata_raw <- filter(metadata_raw, gutenberg_bookshelf %in% bshelf)
    }
    if (!identical(author_birthdate, 'all')) {
      new_authors <- filter(gutenberg_authors, birthdate > author_birthdate)
      metadata_raw <- filter(
        metadata_raw,
        gutenberg_author_id %in% new_authors$gutenberg_author_id
      )
    }

    gb_ids <- metadata_raw$gutenberg_id
    n_available <- length(gb_ids)

    if (n_available == 0) {
      stop('No Project Gutenberg works match the requested filters.',
           call. = FALSE)
    }
    if (n > n_available) {
      warning('Only ', n_available, ' works match the filters; ',
              'returning all of them instead of ', n, '.',
              call. = FALSE)
      n <- n_available
    }

    # Sample positions rather than values: sample(x, n) treats a single
    # number x as the range 1:x, which would pick unrelated IDs when exactly
    # one work matches the filters.
    gb_id <- gb_ids[sample.int(n_available, n)]

    gutenberg_download(gb_id, meta_fields = c('title', 'author'))
  }

read_dict() to read dictionary from .xlsx file

# Read a quanteda dictionary from an .xlsx workbook.
#
# Every sheet becomes a top-level key, and every column of a sheet becomes a
# sub-key whose values are the cells of that column.
#
# Arguments:
#   dict_path  path to the .xlsx file.
#
# Returns the object created by dictionary() from the nested list.
read_dict <- function(dict_path = "dicts.xlsx") {
  # load the workbook once; all sheets are read from this object
  wb <- loadWorkbook(dict_path)

  # get the names of all sheets in the workbook
  sheet_names <- names(wb)

  # Convert one column to dictionary values. Coercing to character first
  # covers columns that hold only numbers (they would otherwise stay numeric);
  # NA cells are then replaced with empty strings, as before.
  column_values <- function(values) {
    values <- as.character(values)
    values[is.na(values)] <- ""
    values
  }

  # Read one sheet into a named list with one element per column.
  read_sheet <- function(sheet_name) {
    df <- read.xlsx(wb, sheet = sheet_name)

    # when a column name is repeated, the first such column is used
    col_names <- unique(colnames(df))
    sheet_list <- lapply(col_names, function(col_name) {
      column_values(df[[col_name]])
    })
    names(sheet_list) <- col_names
    sheet_list
  }

  # one list element per sheet, named after the sheet
  my_list <- lapply(sheet_names, read_sheet)
  names(my_list) <- sheet_names

  dictionary(my_list)
}
Loading books from Gutenberg database
# get a bunch of books
# gutenfiles() picks the works with a random draw and its set.seed() line is
# commented out, so every run can return a different set of 10 works.
books <- gutenfiles(n = 10)
## Determining mirror for Project Gutenberg from https://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
Reshaping dataframe with loaded books
# Report the size of the raw download before it is aggregated.
message('Number of rows before reshaping: ', nrow(books))
## Number of rows before reshaping: 14243
Reshape to a more readable format: one text per row
# Collapse the download to one row per gutenberg_id: keep the first title and
# author of each group and join the group's text rows with single spaces.
# `.groups = "drop"` returns an ungrouped result.
books_aggregated <- summarise(
  group_by(books, gutenberg_id),
  title = first(title),
  author = first(author),
  text = paste(text, collapse = " "),
  .groups = "drop"
)
message('Number of rows after reshaping: ', nrow(books_aggregated))
## Number of rows after reshaping: 10
Creating and reshaping corpus
# Construct a corpus from the aggregated data frame; the `title` column
# supplies the document names.
books_corpus <- corpus(books_aggregated, docid_field = 'title')
# Document names of the corpus.
books_docnames <- docnames(books_corpus)
# Document variables (metadata) attached to the corpus.
books_docvars <- docvars(books_corpus)
# To extract a single document variable, pass its name via `field`;
# here only the `author` variable is extracted.
books_docvars_authors <- docvars(books_corpus, field = 'author')

# Reshape the corpus so that the units are sentences instead of whole books.
corp_sent <- corpus_reshape(books_corpus, to = "sentences")
# Number of documents (now sentences) in the reshaped corpus.
message('Number of documents in the corpus of sentences: ', ndoc(corp_sent))
## Number of documents in the corpus of sentences: 9321
# Reshape back: turn the sentence-level corpus into document-level units again.
corp_doc <- corpus_reshape(corp_sent, to = "documents")
Making summary of the corpus
# Summarise the corpus: the printed table lists types, tokens and sentences
# per text, followed by the document variables.
books_summary <- summary(books_corpus)
print(books_summary)
## Corpus consisting of 10 documents, showing 10 documents:
## 
##                              Text Types Tokens Sentences gutenberg_id
##         Return to Pleasure Island  2730  13380       826        17027
##                      Star Surgeon  5868  59810      3518        18492
##                          Divinity  1692   7709       453        22623
##  The Machine That Saved The World  3150  16578      1201        26174
##                         The Skull  1806  10106      1113        30255
##                Where There's Hope   862   2682       215        30715
##               The Dwindling Years  1656   6621       458        50103
##          The Luckiest Man in Denv  1408   5555       410        50835
##                           Pen Pal  1523   6266       408        51286
##               If You Was a Moklin  1672   9879       719        51752
##                       author
##               Doctorow, Cory
##          Nourse, Alan Edward
##            Samachson, Joseph
##             Leinster, Murray
##              Dick, Philip K.
##                Bixby, Jerome
##              Del Rey, Lester
##  Kornbluth, C. M. (Cyril M.)
##             Marlowe, Stephen
##             Leinster, Murray
Making tokens
# tokens() segments texts in a corpus
# into tokens (words or sentences) by word boundaries.
# Punctuation, symbols, numbers, URLs and separators are removed here.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned, whereas TRUE is a reserved word.
books_tokens <- tokens(
  books_corpus,
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_numbers = TRUE,
  remove_url = TRUE,
  remove_separators = TRUE
)
Looking up keywords in context

Looking for words that begin with “blast” or “start”

# Keywords in context: every token matching the wildcard patterns "blast*"
# or "start*", together with the words around it.
kw_blast_start <- kwic(books_tokens, pattern = c("blast*", "start*"))
# Keep only the context columns and show the first 10 hits.
kw_blast_start_sample <- head(
  select(as.data.frame(kw_blast_start), pre, keyword, post),
  10
)
print(kw_blast_start_sample)
##                                pre  keyword                              post
## 1             have just sat up and  started talking Orville should know about
## 2       they stopped Every time he  started                  a new grade or a
## 3           long in the tooth He'd    start     pushing fudge-nut dips and by
## 4                in the eye at the    start       of the speech then switched
## 5      about the line The youngest  started vibrating with excitement and the
## 6            his grip He looked up startled   into Joe's grinning face Unlike
## 7  We're gonna flatten that sucker    start              fresh and build us a
## 8          look and then his chest  started       ringing He extracted a slim
## 9    then Good night George George  started       cooking dinner for two More
## 10       him search After that Joe  started      sending out a runner usually

Looking for the multi-word pattern _“being d*”_

# Multi-word expressions: separate the words with white space and wrap the
# character vector in phrase() so that it is matched as a sequence of tokens.
kw_being_do <- kwic(books_tokens, pattern = phrase("being d*"))
# Keep only the context columns and show at most 10 hits.
kw_being_do_sample <- head(
  select(as.data.frame(kw_being_do), pre, keyword, post),
  10
)
print(kw_being_do_sample)
##                             pre         keyword
## 1 him--one of the special jeeps being developed
## 2 than anything else he’d loved    being driven
##                                       post
## 1 at this particular installation--and its
## 2             Even after chauffeurs were a
Selecting tokens
# Select tokens: remove English stopwords from the tokens object.
# Variant 1: tokens_select() with selection = "remove".
toks_nostop <-
  tokens_select(books_tokens, pattern = stopwords("en"), selection = "remove")
# Variant 2: tokens_remove() with the same pattern; the result is stored
# separately (toks_nostop2) so both variants can be compared.
toks_nostop2 <-
  tokens_remove(books_tokens, pattern = stopwords("en"))
# See also the `padding` and `window` parameters of these functions.

Tokens with stopwords removed (sample):

# Random draw (no seed is set) of 30 tokens from the first document.
print(sample(toks_nostop[[1]], 30))
##  [1] "budget"       "civilised"    "points"       "fretted"      "tell"        
##  [6] "shape"        "Alarm-bells"  "strong"       "necessary"    "inside"      
## [11] "boy"          "third"        "flattened"    "refill"       "freezer"     
## [16] "Licensor"     "huh"          "building"     "ground"       "smile"       
## [21] "top"          "laughing"     "September"    "happy"        "may"         
## [26] "constituting" "merch"        "explosion"    "perform"      "slapped"
Reading dictionary from .xlsx file
# Build the dictionary from the workbook located at dict_path.
dict <- read_dict(dict_path = dict_path)
Looking up tokens from dictionary in the books
# Look up the dictionary entries in the tokens; the result holds the matching
# dictionary keys (printed further down as e.g. "BODY.UPPER").
toks_dict <- tokens_lookup(books_tokens, dictionary = dict)
# names(toks_dict)[1] is the name of the first document.
message('Sample of dictionary entries found in the book - ', names(toks_dict)[1])
## Sample of dictionary entries found in the book - Return to Pleasure Island
# From the first book, keep the dictionary keys whose name contains "body",
# "animals" or "numerals" (case-insensitive) and show the first 100 of them.
# `value = TRUE` returns the matching elements directly; TRUE is spelled out
# because `T` is a reassignable variable.
toks_dict_sample <- grep(
  'body|animals|numerals',
  toks_dict[[1]],
  ignore.case = TRUE,
  value = TRUE
)
toks_dict_sample <- head(toks_dict_sample, 100)
print(toks_dict_sample)
##   [1] "NUMERALS.CARDINAL" "NUMERALS.CARDINAL" "NUMERALS.CARDINAL"
##   [4] "BODY.UPPER"        "NUMERALS.CARDINAL" "NUMERALS.ORDINAL" 
##   [7] "BODY.SENSES"       "BODY.INTERNAL"     "NUMERALS.CARDINAL"
##  [10] "BODY.UPPER"        "NUMERALS.CARDINAL" "NUMERALS.CARDINAL"
##  [13] "NUMERALS.CARDINAL" "NUMERALS.CARDINAL" "BODY.UPPER"       
##  [16] "NUMERALS.ORDINAL"  "NUMERALS.CARDINAL" "NUMERALS.ORDINAL" 
##  [19] "NUMERALS.ORDINAL"  "NUMERALS.ORDINAL"  "BODY.UPPER"       
##  [22] "BODY.UPPER"        "NUMERALS.CARDINAL" "BODY.UPPER"       
##  [25] "BODY.UPPER"        "BODY.UPPER"        "BODY.UPPER"       
##  [28] "NUMERALS.CARDINAL" "BODY.UPPER"        "BODY.UPPER"       
##  [31] "NUMERALS.CARDINAL" "BODY.UPPER"        "BODY.UPPER"       
##  [34] "NUMERALS.CARDINAL" "NUMERALS.CARDINAL" "BODY.HEAD"        
##  [37] "BODY.HEAD"         "BODY.UPPER"        "BODY.HEAD"        
##  [40] "BODY.UPPER"        "BODY.HEAD"         "BODY.UPPER"       
##  [43] "BODY.HEAD"         "BODY.HEAD"         "NUMERALS.CARDINAL"
##  [46] "NUMERALS.CARDINAL" "NUMERALS.CARDINAL" "BODY.UPPER"       
##  [49] "BODY.UPPER"        "BODY.UPPER"        "NUMERALS.CARDINAL"
##  [52] "NUMERALS.CARDINAL" "BODY.UPPER"        "NUMERALS.CARDINAL"
##  [55] "NUMERALS.CARDINAL" "BODY.UPPER"        "BODY.UPPER"       
##  [58] "BODY.HEAD"         "BODY.EYE"          "BODY.FLUIDS"      
##  [61] "BODY.UPPER"        "NUMERALS.CARDINAL" "BODY.SENSES"      
##  [64] "BODY.HEAD"         "BODY.EYE"          "BODY.HEAD"        
##  [67] "BODY.HEAD"         "BODY.INTERNAL"     "BODY.UPPER"       
##  [70] "BODY.LOWER"        "NUMERALS.CARDINAL" "NUMERALS.CARDINAL"
##  [73] "BODY.LOWER"        "NUMERALS.ORDINAL"  "NUMERALS.CARDINAL"
##  [76] "BODY.FLUIDS"       "NUMERALS.CARDINAL" "NUMERALS.CARDINAL"
##  [79] "BODY.HEAD"         "BODY.LOWER"        "BODY.UPPER"       
##  [82] "BODY.UPPER"        "NUMERALS.CARDINAL" "NUMERALS.CARDINAL"
##  [85] "NUMERALS.CARDINAL" "BODY.UPPER"        "NUMERALS.CARDINAL"
##  [88] "NUMERALS.CARDINAL" "NUMERALS.CARDINAL" "BODY.UPPER"       
##  [91] "NUMERALS.CARDINAL" "ANIMALS.MAMMAL"    "ANIMALS.MAMMAL"   
##  [94] "ANIMALS.MAMMAL"    "NUMERALS.CARDINAL" "BODY.HEAD"        
##  [97] "NUMERALS.CARDINAL" "BODY.UPPER"        "NUMERALS.CARDINAL"
## [100] "BODY.EYE"
Document Feature Matrix

Create DFM from tokens

# Build the document-feature matrix in three steps: create it from the tokens,
# remove English stopwords, then trim with a minimum term frequency of 20.
books_dfm <- dfm_trim(
  dfm_remove(dfm(books_tokens), stopwords('en')),
  min_termfreq = 20
)

Document Feature matrix:

# Print a preview of the document-feature matrix.
print(books_dfm)
## Document-feature matrix of: 10 documents, 541 features (35.49% sparse) and 2 docvars.
##                                   features
## docs                               place four see something exactly know great
##   Return to Pleasure Island            3    3  15         9       1   20     2
##   Star Surgeon                        24    5  99        67      12  103    38
##   Divinity                             3    0   8         6       2    4     3
##   The Machine That Saved The World     6    2  14        19       2   20     2
##   The Skull                            9    2  16        13       0   33     2
##   Where There's Hope                   6    0   5         2       0    5     0
##                                   features
## docs                               though take old
##   Return to Pleasure Island             9   15  13
##   Star Surgeon                         35   33  19
##   Divinity                              1   10   8
##   The Machine That Saved The World      4    7   5
##   The Skull                             1    9   9
##   Where There's Hope                    0    1   1
## [ reached max_ndoc ... 4 more documents, reached max_nfeat ... 531 more features ]

Plot DFM

# Word cloud of the document-feature matrix:
#   max_words = 150 limits the plot to 150 words,
#   color changes the colors used (blue and red here).
textplot_wordcloud(books_dfm, max_words = 150, color = c('blue','red')) 
Frequencies
# Five most frequent features per group, with documents grouped by their
# document name (`docname_`), i.e. a top-5 list for every book.
books_freqs <- textstat_frequency(books_dfm, n = 5, groups = docname_)
print(books_freqs)
##      feature frequency rank docfreq                            group
## 1        god        34    1       1                         Divinity
## 2        one        30    2       1                         Divinity
## 3     helmet        25    3       1                         Divinity
## 4    bradley        25    3       1                         Divinity
## 5      first        22    5       1                         Divinity
## 6     moklin        78    1       1              If You Was a Moklin
## 7       says        73    2       1              If You Was a Moklin
## 8    moklins        62    3       1              If You Was a Moklin
## 9       like        60    4       1              If You Was a Moklin
## 10    brooks        56    5       1              If You Was a Moklin
## 11   matilda        82    1       1                          Pen Pal
## 12     haron        42    2       1                          Pen Pal
## 13     gorka        39    3       1                          Pen Pal
## 14       now        20    4       1                          Pen Pal
## 15      said        20    4       1                          Pen Pal
## 16    george       185    1       1        Return to Pleasure Island
## 17      bill        96    2       1        Return to Pleasure Island
## 18      said        83    3       1        Return to Pleasure Island
## 19       joe        77    4       1        Return to Pleasure Island
## 20      work        70    5       1        Return to Pleasure Island
## 21       dal       507    1       1                     Star Surgeon
## 22      said       390    2       1                     Star Surgeon
## 23     tiger       260    3       1                     Star Surgeon
## 24    doctor       248    4       1                     Star Surgeon
## 25      jack       227    5       1                     Star Surgeon
## 26     giles        52    1       1              The Dwindling Years
## 27      he’d        36    2       1              The Dwindling Years
## 28     years        26    3       1              The Dwindling Years
## 29      even        23    4       1              The Dwindling Years
## 30       now        20    5       1              The Dwindling Years
## 31      said        43    1       1         The Luckiest Man in Denv
## 32    reuben        42    2       1         The Luckiest Man in Denv
## 33       may        27    3       1         The Luckiest Man in Denv
## 34       man        25    4       1         The Luckiest Man in Denv
## 35       one        24    5       1         The Luckiest Man in Denv
## 36  sergeant       138    1       1 The Machine That Saved The World
## 37      said       127    2       1 The Machine That Saved The World
## 38   bellews        94    3       1 The Machine That Saved The World
## 39  machines        76    4       1 The Machine That Saved The World
## 40 broadcast        55    5       1 The Machine That Saved The World
## 41    conger        95    1       1                        The Skull
## 42      said        65    2       1                        The Skull
## 43      know        33    3       1                        The Skull
## 44       man        32    4       1                        The Skull
## 45   speaker        31    5       1                        The Skull
## 46    farrel        26    1       1               Where There's Hope
## 47      said        22    2       1               Where There's Hope
## 48       got        14    3       1               Where There's Hope
## 49       one        10    4       1               Where There's Hope
## 50      back         8    5       1               Where There's Hope

Number of features:

# Report how many features (columns) the trimmed matrix contains.
message('Number of features in the document-feature matrix: ', nfeat(books_dfm))
## Number of features in the document-feature matrix: 541
DFM statistics - keyness of the first book against all other texts
# Create a logical vector that is TRUE for the first book and FALSE for the
# others. docnames() is the public accessor for document names; reading the
# `@docvars` slot directly relies on quanteda's internal object layout.
b_logic <- docnames(books_dfm) == docnames(books_dfm)[1]
# Keyness of every feature: the target (first book) versus the other books.
books_stat_keyness <- textstat_keyness(books_dfm, target = b_logic)
print(books_stat_keyness)
##            feature          chi2            p n_target n_reference
## 1           george  1.887205e+03 0.000000e+00      185           2
## 2             bill  7.945362e+02 0.000000e+00       96          19
## 3              joe  6.746958e+02 0.000000e+00       77          11
## 4          orville  6.379538e+02 0.000000e+00       62           0
## 5          license  5.236493e+02 0.000000e+00       52           0
## 6               --  3.691952e+02 0.000000e+00       37           0
## 7              tom  2.766100e+02 0.000000e+00       28           0
## 8             work  2.715298e+02 0.000000e+00       70          78
## 9              son  2.418272e+02 0.000000e+00       32           8
## 10           cabin  1.934705e+02 0.000000e+00       22           2
## 11            soft  1.680370e+02 0.000000e+00       25           9
## 12          rights  1.654450e+02 0.000000e+00       21           4
## 13            boys  1.359679e+02 0.000000e+00       18           4
## 14           terms  1.248107e+02 0.000000e+00       19           7
## 15          strong  9.408956e+01 0.000000e+00       15           6
## 16          father  5.851406e+01 2.020606e-14       12           8
## 17           works  4.394992e+01 3.368861e-11       11          10
## 18            left  4.192005e+01 9.508261e-11       21          46
## 19            care  2.842322e+01 9.748910e-08       13          24
## 20            ones  2.539908e+01 4.661392e-07       16          39
## 21            home  2.372082e+01 1.113711e-06       12          24
## 22             may  1.590030e+01 6.676830e-05       16          56
## 23             big  1.572995e+01 7.305803e-05       11          28
## 24        thinking  1.266626e+01 3.723126e-04        7          14
## 25           night  1.223409e+01 4.692423e-04       10          28
## 26           means  1.164410e+01 6.440656e-04        9          24
## 27            hand  1.073902e+01 1.048996e-03       14          56
## 28         morning  9.655889e+00 1.887455e-03        8          22
## 29          waited  9.655889e+00 1.887455e-03        8          22
## 30             say  9.006435e+00 2.690307e-03       17          80
## 31           hands  8.586596e+00 3.386465e-03       14          62
## 32            free  8.585092e+00 3.389262e-03        6          14
## 33           mouth  7.720172e+00 5.460718e-03        7          20
## 34            held  7.585335e+00 5.884504e-03        9          31
## 35            took  6.081947e+00 1.365701e-02       14          71
## 36            away  5.872355e+00 1.538046e-02       16          86
## 37         watched  5.551234e+00 1.846767e-02        8          30
## 38             put  5.480657e+00 1.922803e-02       11          53
## 39         already  5.254307e+00 2.189252e-02        9          37
## 40           right  4.825830e+00 2.803636e-02       25         161
## 41          smiled  4.805552e+00 2.836817e-02       10          45
## 42             ran  4.594436e+00 3.207590e-02        5          15
## 43          inside  4.416169e+00 3.559989e-02        7          27
## 44           whole  4.074276e+00 4.354091e-02        7          28
## 45          always  3.835594e+00 5.017522e-02        9          42
## 46            said  3.689751e+00 5.474778e-02       83         684
## 47            line  3.650637e+00 5.604779e-02        5          17
## 48            made  3.623977e+00 5.695256e-02       16          99
## 49             old  3.463035e+00 6.275502e-02       13          77
## 50             day  3.376667e+00 6.612565e-02        9          48
## 51            take  3.370037e+00 6.639228e-02       15          93
## 52            eyes  3.304895e+00 6.907374e-02       13          78
## 53            tell  2.953729e+00 8.567964e-02       11          65
## 54            back  2.910289e+00 8.801611e-02       32         239
## 55             led  2.896561e+00 8.876874e-02        5          19
## 56            kind  2.789388e+00 9.489047e-02        9          51
## 57            door  2.757063e+00 9.682585e-02        9          47
## 58           still  2.594032e+00 1.072670e-01       13          83
## 59          stared  2.510449e+00 1.130937e-01        8          41
## 60           along  2.472795e+00 1.158317e-01        7          34
## 61            talk  2.472795e+00 1.158317e-01        7          34
## 62             use  2.148104e+00 1.427461e-01        8          43
## 63        anything  2.095767e+00 1.477076e-01       11          71
## 64             arm  2.028967e+00 1.543255e-01        5          22
## 65            told  1.974049e+00 1.600188e-01       10          64
## 66         clearly  1.841162e+00 1.748148e-01        4          16
## 67            best  1.827389e+00 1.764362e-01        6          30
## 68          needed  1.827389e+00 1.764362e-01        6          30
## 69            hard  1.794985e+00 1.803200e-01        5          23
## 70          though  1.725279e+00 1.890149e-01        9          58
## 71             end  1.643788e+00 1.998065e-01        6          31
## 72         started  1.604330e+00 2.052907e-01        8          51
## 73            high  1.583852e+00 2.082067e-01        5          24
## 74         brought  1.573711e+00 2.096689e-01        7          39
## 75            keep  1.573711e+00 2.096689e-01        7          39
## 76            good  1.537096e+00 2.150506e-01       17         127
## 77           every  1.532758e+00 2.156990e-01       11          76
## 78           since  1.474973e+00 2.245626e-01        6          32
## 79           stood  1.474973e+00 2.245626e-01        6          32
## 80             new  1.424182e+00 2.327162e-01       13          94
## 81       important  1.393284e+00 2.378512e-01        5          25
## 82            stop  1.393284e+00 2.378512e-01        5          25
## 83         working  1.221292e+00 2.691072e-01        5          26
## 84          hardly  1.145080e+00 2.845814e-01        4          19
## 85            name  1.066134e+00 3.018202e-01        5          27
## 86          around  1.057137e+00 3.038691e-01       12          90
## 87            gave  1.046435e+00 3.063295e-01        6          35
## 88           meant  9.679492e-01 3.251922e-01        4          20
## 89         getting  9.262814e-01 3.358311e-01        5          28
## 90          picked  9.262814e-01 3.358311e-01        5          28
## 91             sat  9.262814e-01 3.358311e-01        5          28
## 92            want  9.050042e-01 3.414434e-01       13         101
## 93         without  8.618851e-01 3.532117e-01       11          84
## 94            sure  8.192838e-01 3.653896e-01        7          50
## 95      everything  8.166105e-01 3.661730e-01        6          37
## 96           angry  8.127779e-01 3.673002e-01        4          21
## 97          called  8.003833e-01 3.709788e-01        5          29
## 98           three  7.950273e-01 3.725843e-01       11          85
## 99          taking  6.770333e-01 4.106102e-01        4          22
## 100        stopped  5.903774e-01 4.422732e-01        7          48
## 101        thought  5.344642e-01 4.647360e-01       14         117
## 102          asked  5.112691e-01 4.745900e-01        7          54
## 103           like  5.104019e-01 4.749649e-01       27         240
## 104           kept  4.555015e-01 4.997339e-01        4          24
## 105          start  4.143168e-01 5.197865e-01        5          33
## 106         except  3.662702e-01 5.450444e-01        4          25
## 107         walked  3.662702e-01 5.450444e-01        4          25
## 108        arrived  3.252705e-01 5.684573e-01        3          17
## 109           long  2.924049e-01 5.886835e-01       13         114
## 110           give  2.797644e-01 5.968557e-01        6          44
## 111            set  2.797644e-01 5.968557e-01        6          44
## 112         looked  2.766563e-01 5.989012e-01       14         124
## 113           four  2.385988e-01 6.252200e-01        3          18
## 114          stand  2.385988e-01 6.252200e-01        3          18
## 115       supposed  2.385988e-01 6.252200e-01        3          18
## 116           need  2.054994e-01 6.503180e-01        7          60
## 117         others  1.926587e-01 6.607130e-01        6          51
## 118           gone  1.866626e-01 6.657090e-01        6          46
## 119          bring  1.686618e-01 6.813038e-01        3          19
## 120           grew  1.686618e-01 6.813038e-01        3          19
## 121            guy  1.686618e-01 6.813038e-01        3          19
## 122         opened  1.686618e-01 6.813038e-01        3          19
## 123         school  1.686618e-01 6.813038e-01        3          19
## 124           upon  1.685714e-01 6.813846e-01        4          28
## 125            one  1.584716e-01 6.905676e-01       35         335
## 126           knew  1.476716e-01 7.007705e-01       11         100
## 127           came  1.434428e-01 7.048821e-01       15         139
## 128        waiting  1.347047e-01 7.136031e-01        5          38
## 129         months  1.132777e-01 7.364430e-01        3          20
## 130            can  9.738320e-02 7.549928e-01       19         181
## 131          hours  8.629570e-02 7.689403e-01        6          49
## 132         pretty  7.758898e-02 7.805917e-01        3          26
## 133           dark  7.062809e-02 7.904245e-01        3          21
## 134          smile  7.062809e-02 7.904245e-01        3          21
## 135          water  7.062809e-02 7.904245e-01        3          21
## 136           full  7.058178e-02 7.904916e-01        5          40
## 137            job  7.058178e-02 7.904916e-01        5          40
## 138         worked  6.272879e-02 8.022338e-01        4          36
## 139          think  6.184692e-02 8.036003e-01       15         144
## 140           next  3.944871e-02 8.425624e-01        4          37
## 141          sleep  3.918568e-02 8.430813e-01        3          22
## 142        talking  3.202142e-02 8.579807e-01        4          32
## 143         future  3.134327e-02 8.594767e-01        2          18
## 144         handed  3.134327e-02 8.594767e-01        2          18
## 145           late  3.134327e-02 8.594767e-01        2          18
## 146       probably  3.134327e-02 8.594767e-01        2          18
## 147       received  3.134327e-02 8.594767e-01        2          18
## 148           able  2.473189e-02 8.750370e-01        3          28
## 149          alone  2.473189e-02 8.750370e-01        3          28
## 150           live  1.765809e-02 8.942853e-01        3          23
## 151         living  1.765809e-02 8.942853e-01        3          23
## 152         little  1.328081e-02 9.082530e-01       12         119
## 153           make  1.270869e-02 9.102424e-01       11         109
## 154             em  1.095220e-02 9.166513e-01        2          19
## 155        feeling  1.095220e-02 9.166513e-01        2          19
## 156          rehab  1.095220e-02 9.166513e-01        2          19
## 157      shoulders  1.095220e-02 9.166513e-01        2          19
## 158           idea  9.918733e-03 9.206676e-01        3          29
## 159          white  9.918733e-03 9.206676e-01        3          29
## 160          heard  9.741597e-03 9.213768e-01        4          39
## 161        finally  5.797424e-03 9.393071e-01        5          44
## 162        carried  4.944384e-03 9.439419e-01        3          24
## 163         office  4.944384e-03 9.439419e-01        3          24
## 164           went  4.198068e-03 9.483392e-01       13         131
## 165            yes  3.187940e-03 9.549739e-01        5          50
## 166         ground  1.911346e-03 9.651284e-01        3          30
## 167         breath  1.273759e-03 9.715297e-01        2          20
## 168         wanted  5.396907e-04 9.814658e-01        6          61
## 169        reached  4.805192e-04 9.825112e-01        4          35
## 170        trouble  4.805192e-04 9.825112e-01        4          35
## 171           near  1.009331e-04 9.919842e-01        3          25
## 172        suppose  1.009331e-04 9.919842e-01        3          25
## 173         things  2.277671e-05 9.961921e-01        8          82
## 174           soon -1.098012e-04 9.916394e-01        3          31
## 175          least -7.689967e-04 9.778769e-01        3          37
## 176          clear -9.114875e-04 9.759148e-01        2          21
## 177          table -1.824388e-03 9.659304e-01        4          42
## 178          ahead -2.314249e-03 9.616313e-01        2          27
## 179      carefully -2.314249e-03 9.616313e-01        2          27
## 180        minutes -2.314249e-03 9.616313e-01        2          27
## 181            let -3.139136e-03 9.553195e-01        4          48
## 182      different -3.982732e-03 9.496798e-01        3          32
## 183          words -3.982732e-03 9.496798e-01        3          32
## 184            bit -5.800665e-03 9.392901e-01        3          38
## 185        looking -5.800665e-03 9.392901e-01        3          38
## 186          world -5.800665e-03 9.392901e-01        3          38
## 187          blood -8.701690e-03 9.256789e-01        2          22
## 188       business -8.701690e-03 9.256789e-01        2          22
## 189         moving -8.701690e-03 9.256789e-01        2          22
## 190         nobody -8.701690e-03 9.256789e-01        2          22
## 191            bad -1.087931e-02 9.169282e-01        2          28
## 192         behind -1.087931e-02 9.169282e-01        2          28
## 193           turn -1.087931e-02 9.169282e-01        2          28
## 194           five -1.305781e-02 9.090231e-01        3          33
## 195          heart -1.305781e-02 9.090231e-01        3          33
## 196           used -1.523720e-02 9.017594e-01        3          39
## 197         longer -2.366686e-02 8.777356e-01        2          23
## 198           days -2.518207e-02 8.739142e-01        2          29
## 199           side -2.518207e-02 8.739142e-01        2          29
## 200         turned -2.851377e-02 8.659066e-01        9          98
## 201           open -2.877173e-02 8.653072e-01        3          40
## 202           last -3.270224e-02 8.564950e-01        4          51
## 203          going -3.489137e-02 8.518235e-01        8          88
## 204         either -4.468524e-02 8.325839e-01        2          30
## 205       remember -4.468524e-02 8.325839e-01        2          30
## 206          taken -4.468524e-02 8.325839e-01        2          30
## 207       question -4.497987e-02 8.320411e-01        2          24
## 208        allowed -4.676162e-02 8.287973e-01        1          19
## 209           note -4.676162e-02 8.287973e-01        1          19
## 210     remembered -4.676162e-02 8.287973e-01        1          19
## 211        someone -4.676162e-02 8.287973e-01        1          19
## 212        walking -4.676162e-02 8.287973e-01        1          19
## 213         nodded -4.737361e-02 8.276981e-01        4          46
## 214          shook -4.737361e-02 8.276981e-01        4          46
## 215            air -6.749936e-02 7.950131e-01        3          36
## 216           hair -6.891664e-02 7.929207e-01        2          31
## 217           sent -6.891664e-02 7.929207e-01        2          31
## 218           ever -7.112035e-02 7.897125e-01        6          69
## 219           fast -7.193613e-02 7.885384e-01        2          25
## 220           even -7.441552e-02 7.850130e-01       17         187
## 221           know -7.640303e-02 7.822324e-01       20         219
## 222          comes -7.793042e-02 7.801219e-01        1          20
## 223           gets -7.793042e-02 7.801219e-01        1          20
## 224        learned -7.793042e-02 7.801219e-01        1          20
## 225          stars -7.793042e-02 7.801219e-01        1          20
## 226          never -8.094318e-02 7.760233e-01        9         102
## 227             go -8.465930e-02 7.710798e-01       10         113
## 228            got -1.046298e-01 7.463429e-01       15         168
## 229           also -1.151279e-01 7.343799e-01        1          21
## 230      certainly -1.151279e-01 7.343799e-01        1          21
## 231         toward -1.186668e-01 7.304850e-01        3          44
## 232            ask -1.575687e-01 6.914049e-01        1          22
## 233            car -1.575687e-01 6.914049e-01        1          22
## 234        fingers -1.575687e-01 6.914049e-01        1          22
## 235          happy -1.575687e-01 6.914049e-01        1          22
## 236           hour -1.575687e-01 6.914049e-01        1          22
## 237         making -1.575687e-01 6.914049e-01        1          22
## 238           show -1.575687e-01 6.914049e-01        1          22
## 239           must -1.603945e-01 6.887936e-01        6          73
## 240           rest -1.660455e-01 6.836504e-01        2          34
## 241        exactly -2.045980e-01 6.510348e-01        1          23
## 242        instead -2.045980e-01 6.510348e-01        1          23
## 243        picture -2.045980e-01 6.510348e-01        1          23
## 244          point -2.045980e-01 6.510348e-01        1          23
## 245        problem -2.045980e-01 6.510348e-01        1          23
## 246         coming -2.054679e-01 6.503431e-01        2          35
## 247          sense -2.054679e-01 6.503431e-01        2          35
## 248       together -2.054679e-01 6.503431e-01        2          35
## 249            yet -2.267256e-01 6.339624e-01        5          64
## 250           wait -2.479505e-01 6.185219e-01        2          36
## 251         collar -2.556662e-01 6.131133e-01        1          24
## 252        outside -2.556955e-01 6.130930e-01        3          48
## 253           come -2.765891e-01 5.989456e-01       10         122
## 254        special -2.932586e-01 5.881399e-01        2          37
## 255           done -2.954787e-01 5.867308e-01        5          66
## 256         helmet -3.103079e-01 5.774913e-01        1          25
## 257           hold -3.103079e-01 5.774913e-01        1          25
## 258          known -3.103079e-01 5.774913e-01        1          25
## 259          times -3.103079e-01 5.774913e-01        1          25
## 260           felt -3.324944e-01 5.641940e-01        5          67
## 261           well -3.397222e-01 5.599896e-01        9         113
## 262           find -3.475374e-01 5.555105e-01        6          79
## 263          close -3.681268e-01 5.440271e-01        1          26
## 264           feet -3.681268e-01 5.440271e-01        1          26
## 265           kids -3.681268e-01 5.440271e-01        1          26
## 266          leave -3.681268e-01 5.440271e-01        1          26
## 267          speak -3.681268e-01 5.440271e-01        1          26
## 268        strange -3.681268e-01 5.440271e-01        1          26
## 269            ten -3.681268e-01 5.440271e-01        1          26
## 270           word -3.915259e-01 5.314984e-01        2          39
## 271           food -4.287832e-01 5.125866e-01        1          27
## 272           girl -4.287832e-01 5.125866e-01        1          27
## 273          green -4.287832e-01 5.125866e-01        1          27
## 274         reason -4.287832e-01 5.125866e-01        1          27
## 275        sitting -4.287832e-01 5.125866e-01        1          27
## 276          sound -4.287832e-01 5.125866e-01        1          27
## 277          stuff -4.287832e-01 5.125866e-01        1          27
## 278           true -4.287832e-01 5.125866e-01        1          27
## 279          tried -4.441218e-01 5.051397e-01        2          40
## 280            way -4.743577e-01 4.909892e-01       11         140
## 281           case -4.919843e-01 4.830435e-01        1          28
## 282          guess -4.919843e-01 4.830435e-01        1          28
## 283          power -4.919843e-01 4.830435e-01        1          28
## 284           real -4.919843e-01 4.830435e-01        1          28
## 285       shoulder -4.919843e-01 4.830435e-01        1          28
## 286         signal -4.919843e-01 4.830435e-01        1          28
## 287        staring -4.919843e-01 4.830435e-01        1          28
## 288         really -4.978903e-01 4.804286e-01        4          59
## 289           face -5.521434e-01 4.574432e-01        7          96
## 290          given -5.574761e-01 4.552789e-01        1          29
## 291        somehow -5.574761e-01 4.552789e-01        1          29
## 292            two -6.290660e-01 4.276981e-01       10         133
## 293            far -6.740842e-01 4.116313e-01        2          44
## 294           feel -6.740842e-01 4.116313e-01        2          44
## 295         moment -6.832823e-01 4.084588e-01        5          75
## 296          moved -6.944758e-01 4.046462e-01        1          31
## 297           past -6.944758e-01 4.046462e-01        1          31
## 298          space -6.944758e-01 4.046462e-01        1          31
## 299            try -6.944758e-01 4.046462e-01        1          31
## 300          ready -7.656202e-01 3.815754e-01        1          32
## 301           tiny -7.656202e-01 3.815754e-01        1          32
## 302           half -7.991055e-01 3.713610e-01        2          46
## 303          thing -8.362749e-01 3.604643e-01        5          78
## 304           read -8.383209e-01 3.598774e-01        1          33
## 305            saw -8.665542e-01 3.519110e-01        4          66
## 306          dozen -9.124452e-01 3.394665e-01        1          34
## 307           pink -9.124452e-01 3.394665e-01        1          34
## 308            see -9.645669e-01 3.260389e-01       15         200
## 309           many -9.878750e-01 3.202623e-01        1          35
## 310          break -1.005635e+00 3.159508e-01        0          20
## 311          broke -1.005635e+00 3.159508e-01        0          20
## 312       expected -1.005635e+00 3.159508e-01        0          20
## 313      four-star -1.005635e+00 3.159508e-01        0          20
## 314           less -1.005635e+00 3.159508e-01        0          20
## 315      pathology -1.005635e+00 3.159508e-01        0          20
## 316       quarters -1.005635e+00 3.159508e-01        0          20
## 317        reports -1.005635e+00 3.159508e-01        0          20
## 318           sick -1.005635e+00 3.159508e-01        0          20
## 319          skill -1.005635e+00 3.159508e-01        0          20
## 320      somewhere -1.005635e+00 3.159508e-01        0          20
## 321         survey -1.005635e+00 3.159508e-01        0          20
## 322         slowly -1.065382e+00 3.019910e-01        2          50
## 323      something -1.087582e+00 2.970076e-01        9         132
## 324             dr -1.095761e+00 2.951982e-01        0          21
## 325           hell -1.095761e+00 2.951982e-01        0          21
## 326   intelligence -1.095761e+00 2.951982e-01        0          21
## 327 mahon-modified -1.095761e+00 2.951982e-01        0          21
## 328         nature -1.095761e+00 2.951982e-01        0          21
## 329            six -1.095761e+00 2.951982e-01        0          21
## 330       surgical -1.095761e+00 2.951982e-01        0          21
## 331           town -1.095761e+00 2.951982e-01        0          21
## 332         enough -1.112556e+00 2.915269e-01        5          83
## 333          looks -1.134888e+00 2.867351e-01        2          51
## 334         friend -1.142241e+00 2.851793e-01        1          37
## 335          admit -1.186563e+00 2.760239e-01        0          22
## 336        brucker -1.186563e+00 2.760239e-01        0          22
## 337         change -1.186563e+00 2.760239e-01        0          22
## 338   communicator -1.186563e+00 2.760239e-01        0          22
## 339     completely -1.186563e+00 2.760239e-01        0          22
## 340           goes -1.186563e+00 2.760239e-01        0          22
## 341           gray -1.186563e+00 2.760239e-01        0          22
## 342            lay -1.186563e+00 2.760239e-01        0          22
## 343          order -1.186563e+00 2.760239e-01        0          22
## 344       practice -1.186563e+00 2.760239e-01        0          22
## 345        seattle -1.186563e+00 2.760239e-01        0          22
## 346        studied -1.186563e+00 2.760239e-01        0          22
## 347    transmitter -1.186563e+00 2.760239e-01        0          22
## 348           seen -1.205440e+00 2.722371e-01        2          52
## 349           sort -1.220998e+00 2.691649e-01        1          38
## 350          great -1.276980e+00 2.584613e-01        2          53
## 351       happened -1.276980e+00 2.584613e-01        2          53
## 352        company -1.277953e+00 2.582800e-01        0          23
## 353           fear -1.277953e+00 2.582800e-01        0          23
## 354         passed -1.277953e+00 2.582800e-01        0          23
## 355        pattern -1.277953e+00 2.582800e-01        0          23
## 356        planets -1.277953e+00 2.582800e-01        0          23
## 357       produced -1.277953e+00 2.582800e-01        0          23
## 358       research -1.277953e+00 2.582800e-01        0          23
## 359         spread -1.277953e+00 2.582800e-01        0          23
## 360           look -1.289567e+00 2.561278e-01        5          86
## 361           data -1.369859e+00 2.418361e-01        0          24
## 362           desk -1.369859e+00 2.418361e-01        0          24
## 363      diagnosis -1.369859e+00 2.418361e-01        0          24
## 364          fight -1.369859e+00 2.418361e-01        0          24
## 365         second -1.369859e+00 2.418361e-01        0          24
## 366        standby -1.369859e+00 2.418361e-01        0          24
## 367        stepped -1.369859e+00 2.418361e-01        0          24
## 368          trees -1.369859e+00 2.418361e-01        0          24
## 369           blue -1.381280e+00 2.398833e-01        1          40
## 370         aboard -1.462219e+00 2.265773e-01        0          25
## 371            ago -1.462219e+00 2.265773e-01        0          25
## 372        bradley -1.462219e+00 2.265773e-01        0          25
## 373           dead -1.462219e+00 2.265773e-01        0          25
## 374          deeth -1.462219e+00 2.265773e-01        0          25
## 375      developed -1.462219e+00 2.265773e-01        0          25
## 376        disease -1.462219e+00 2.265773e-01        0          25
## 377           lock -1.462219e+00 2.265773e-01        0          25
## 378          trade -1.462219e+00 2.265773e-01        0          25
## 379           head -1.474029e+00 2.247111e-01        5          89
## 380        nothing -1.474029e+00 2.247111e-01        5          89
## 381          place -1.532945e+00 2.156710e-01        3          63
## 382          small -1.532945e+00 2.156710e-01        3          63
## 383            sir -1.544824e+00 2.139011e-01        1          42
## 384      beginning -1.554983e+00 2.124013e-01        0          26
## 385          death -1.554983e+00 2.124013e-01        0          26
## 386         farrel -1.554983e+00 2.124013e-01        0          26
## 387           hope -1.554983e+00 2.124013e-01        0          26
## 388          level -1.554983e+00 2.124013e-01        0          26
## 389         normal -1.554983e+00 2.124013e-01        0          26
## 390       somebody -1.554983e+00 2.124013e-01        0          26
## 391      spokesman -1.554983e+00 2.124013e-01        0          26
## 392       training -1.554983e+00 2.124013e-01        0          26
## 393           wave -1.554983e+00 2.124013e-01        0          26
## 394           body -1.627683e+00 2.020238e-01        1          43
## 395           call -1.627683e+00 2.020238e-01        1          43
## 396            men -1.627683e+00 2.020238e-01        1          43
## 397          drive -1.648105e+00 1.992170e-01        0          27
## 398            lot -1.648105e+00 1.992170e-01        0          27
## 399       medicine -1.648105e+00 1.992170e-01        0          27
## 400           shop -1.648105e+00 1.992170e-01        0          27
## 401          skull -1.648105e+00 1.992170e-01        0          27
## 402       thousand -1.648105e+00 1.992170e-01        0          27
## 403          found -1.692608e+00 1.932576e-01        4          79
## 404             us -1.702712e+00 1.919337e-01        9         144
## 405           cold -1.741548e+00 1.869428e-01        0          28
## 406            gun -1.741548e+00 1.869428e-01        0          28
## 407        hundred -1.741548e+00 1.869428e-01        0          28
## 408         minute -1.741548e+00 1.869428e-01        0          28
## 409         moruan -1.741548e+00 1.869428e-01        0          28
## 410       realized -1.741548e+00 1.869428e-01        0          28
## 411         ship's -1.741548e+00 1.869428e-01        0          28
## 412          woman -1.741548e+00 1.869428e-01        0          28
## 413         trying -1.795337e+00 1.802772e-01        1          45
## 414           much -1.796199e+00 1.801727e-01        5          94
## 415        another -1.827045e+00 1.764770e-01        3          67
## 416           life -1.827045e+00 1.764770e-01        3          67
## 417         afraid -1.835280e+00 1.755051e-01        0          29
## 418     broadcasts -1.835280e+00 1.755051e-01        0          29
## 419           part -1.835280e+00 1.755051e-01        0          29
## 420      questions -1.835280e+00 1.755051e-01        0          29
## 421     understand -1.835280e+00 1.755051e-01        0          29
## 422            get -1.862123e+00 1.723800e-01       16         233
## 423       appeared -1.929273e+00 1.648381e-01        0          30
## 424           fact -1.929273e+00 1.648381e-01        0          30
## 425       galactic -1.929273e+00 1.648381e-01        0          30
## 426         galaxy -1.929273e+00 1.648381e-01        0          30
## 427           hear -1.929273e+00 1.648381e-01        0          30
## 428           race -1.929273e+00 1.648381e-01        0          30
## 429           time -1.962688e+00 1.612261e-01       15         223
## 430          young -1.965309e+00 1.609466e-01        1          47
## 431         chance -2.023501e+00 1.548817e-01        0          31
## 432       earthmen -2.023501e+00 1.548817e-01        0          31
## 433          later -2.033268e+00 1.538894e-01        2          55
## 434          wrong -2.033268e+00 1.538894e-01        2          55
## 435       suddenly -2.051076e+00 1.520986e-01        1          48
## 436        machine -2.137323e+00 1.437525e-01        1          49
## 437     physicians -2.212581e+00 1.368895e-01        0          33
## 438           else -2.224022e+00 1.358786e-01        1          50
## 439        believe -2.307398e+00 1.287595e-01        0          34
## 440        certain -2.307398e+00 1.287595e-01        0          34
## 441        contact -2.307398e+00 1.287595e-01        0          34
## 442         howell -2.307398e+00 1.287595e-01        0          34
## 443        landing -2.307398e+00 1.287595e-01        0          34
## 444         answer -2.402378e+00 1.211509e-01        0          35
## 445            god -2.402378e+00 1.211509e-01        0          35
## 446    intelligent -2.402378e+00 1.211509e-01        0          35
## 447        patient -2.402378e+00 1.211509e-01        0          35
## 448          ships -2.402378e+00 1.211509e-01        0          35
## 449        surgery -2.402378e+00 1.211509e-01        0          35
## 450         almost -2.445210e+00 1.178841e-01        3          75
## 451           he’d -2.497510e+00 1.140265e-01        0          36
## 452        message -2.497510e+00 1.140265e-01        0          36
## 453       arnquist -2.592781e+00 1.073517e-01        0          37
## 454         graves -2.592781e+00 1.073517e-01        0          37
## 455        speaker -2.592781e+00 1.073517e-01        0          37
## 456          ain't -2.688180e+00 1.010952e-01        0          38
## 457        council -2.688180e+00 1.010952e-01        0          38
## 458           crew -2.688180e+00 1.010952e-01        0          38
## 459          light -2.688180e+00 1.010952e-01        0          38
## 460         plague -2.688180e+00 1.010952e-01        0          38
## 461         timgar -2.688180e+00 1.010952e-01        0          38
## 462          gorka -2.783699e+00 9.522800e-02        0          39
## 463         system -2.783699e+00 9.522800e-02        0          39
## 464           mind -2.793743e+00 9.463295e-02        2          64
## 465          among -2.879329e+00 8.972343e-02        0          40
## 466          dal's -2.879329e+00 8.972343e-02        0          40
## 467      operating -2.879329e+00 8.972343e-02        0          40
## 468          virus -2.879329e+00 8.972343e-02        0          40
## 469         better -2.948183e+00 8.597419e-02        4          96
## 470    information -2.975062e+00 8.455691e-02        0          41
## 471          mahon -2.975062e+00 8.455691e-02        0          41
## 472         report -2.975062e+00 8.455691e-02        0          41
## 473        general -3.070891e+00 7.970578e-02        0          42
## 474          haron -3.070891e+00 7.970578e-02        0          42
## 475         humans -3.070891e+00 7.970578e-02        0          42
## 476         reuben -3.070891e+00 7.970578e-02        0          42
## 477         screen -3.070891e+00 7.970578e-02        0          42
## 478        surgeon -3.070891e+00 7.970578e-02        0          42
## 479        garvian -3.166810e+00 7.514908e-02        0          43
## 480      physician -3.166810e+00 7.514908e-02        0          43
## 481          quite -3.166810e+00 7.514908e-02        0          43
## 482          first -3.257098e+00 7.111488e-02        8         156
## 483          betsy -3.262814e+00 7.086742e-02        0          44
## 484         matter -3.262814e+00 7.086742e-02        0          44
## 485         people -3.302263e+00 6.918449e-02        5         115
## 486        perhaps -3.358897e+00 6.684283e-02        0          45
## 487      creatures -3.455054e+00 6.305866e-02        0          46
## 488         across -3.580197e+00 5.847220e-02        1          56
## 489          voice -3.580197e+00 5.847220e-02        1          56
## 490       caldwell -3.647576e+00 5.615089e-02        0          48
## 491        control -3.674150e+00 5.526243e-02        1          57
## 492            red -3.743932e+00 5.299960e-02        0          49
## 493      inspector -3.936819e+00 4.724004e-02        0          51
## 494        trading -3.936819e+00 4.724004e-02        0          51
## 495          giles -4.033343e+00 4.460946e-02        0          52
## 496         tanner -4.033343e+00 4.460946e-02        0          52
## 497           post -4.051232e+00 4.413911e-02        1          61
## 498       contract -4.129918e+00 4.213138e-02        0          53
## 499          lecky -4.129918e+00 4.213138e-02        0          53
## 500        service -4.145792e+00 4.173812e-02        1          62
## 501      broadcast -4.323211e+00 3.759591e-02        0          55
## 502          field -4.323211e+00 3.759591e-02        0          55
## 503         brooks -4.419924e+00 3.552162e-02        0          56
## 504           mean -4.419924e+00 3.552162e-02        0          56
## 505          might -4.756826e+00 2.918232e-02        2          86
## 506          maybe -4.810429e+00 2.828800e-02        1          69
## 507         course -5.383253e+00 2.033093e-02        1          75
## 508          years -5.492756e+00 1.909541e-02        2          94
## 509  confederation -5.561838e+00 1.835613e-02        0          57
## 510       creature -5.561838e+00 1.835613e-02        0          57
## 511           star -5.757379e+00 1.641955e-02        0          59
## 512       machines -5.766430e+00 1.633518e-02        1          79
## 513          human -5.855159e+00 1.553146e-02        0          60
## 514           just -5.883854e+00 1.528034e-02       12         248
## 515        moklins -6.050739e+00 1.390048e-02        0          62
## 516         seemed -6.054200e+00 1.387326e-02        3         117
## 517         patrol -6.246345e+00 1.244498e-02        0          64
## 518           says -6.246627e+00 1.244301e-02        1          84
## 519           help -6.798047e+00 9.125765e-03        2         108
## 520       _lancet_ -6.833323e+00 8.947266e-03        0          70
## 521          began -6.833323e+00 8.947266e-03        0          70
## 522         moklin -7.616329e+00 5.784210e-03        0          78
## 523        matilda -8.007991e+00 4.657137e-03        0          82
## 524        doctors -8.203862e+00 4.180132e-03        0          84
## 525            now -8.591016e+00 3.378255e-03       12         284
## 526        medical -8.889617e+00 2.867969e-03        0          91
## 527           room -9.070585e+00 2.597539e-03        2         132
## 528        bellews -9.183612e+00 2.441917e-03        0          94
## 529         conger -9.281623e+00 2.314643e-03        0          95
## 530          earth -9.452436e+00 2.108688e-03        2         136
## 531         planet -1.016402e+01 1.432078e-03        0         104
## 532          fuzzy -1.085071e+01 9.875815e-04        0         111
## 533            man -1.236392e+01 4.377115e-04        1         147
## 534       sergeant -1.350238e+01 2.382610e-04        0         138
## 535       hospital -1.674990e+01 4.264451e-05        0         171
## 536          black -1.903686e+01 1.282174e-05        1         215
## 537         doctor -2.160301e+01 3.353252e-06        2         261
## 538           jack -2.227748e+01 2.359582e-06        0         227
## 539           ship -2.257419e+01 2.021828e-06        0         230
## 540          tiger -2.564376e+01 4.106215e-07        0         261
## 541            dal -5.023330e+01 1.365130e-12        0         507

Summary for keyness dataframe

# Print per-column summary statistics of the keyness data frame.
books_stat_keyness %>%
  summary() %>%
  print()
##    feature               chi2                 p              n_target      
##  Length:541         Min.   : -50.2333   Min.   :0.00000   Min.   :  0.000  
##  Class :character   1st Qu.:  -1.7415   1st Qu.:0.09523   1st Qu.:  0.000  
##  Mode  :character   Median :  -0.4288   Median :0.25828   Median :  2.000  
##                     Mean   :  11.5541   Mean   :0.36217   Mean   :  4.874  
##                     3rd Qu.:   0.0706   3rd Qu.:0.62522   3rd Qu.:  5.000  
##                     Max.   :1887.2050   Max.   :0.99619   Max.   :185.000  
##   n_reference    
##  Min.   :  0.00  
##  1st Qu.: 24.00  
##  Median : 32.00  
##  Mean   : 50.05  
##  3rd Qu.: 53.00  
##  Max.   :684.00
Plot keyness
# Build the keyness plot object from the keyness statistics, then render it.
plot_keyness <- books_stat_keyness %>%
  textplot_keyness()
print(plot_keyness)

Feature co-occurrence matrix
# Derive the feature co-occurrence matrix from the document-feature matrix
# and show its (truncated) printout.
books_fcm <- books_dfm %>%
  fcm()
print(books_fcm)
## Feature co-occurrence matrix of: 541 by 541 features.
##            features
## features    place four  see something exactly  know great though take  old
##   place       405  185 3071      1997     377  3255   983    976 1189  785
##   four          0   26  724       482      80   803   230    246  306  248
##   see           0    0 5759      7681    1438 12285  3991   3932 4441 2903
##   something     0    0    0      2579     912  8272  2696   2621 2873 1867
##   exactly       0    0    0         0      83  1437   481    470  543  337
##   know          0    0    0         0       0  6547  4182   4123 4687 3133
##   great         0    0    0         0       0     0   716   1392 1406  876
##   though        0    0    0         0       0     0     0    678 1480  988
##   take          0    0    0         0       0     0     0      0  889 1316
##   old           0    0    0         0       0     0     0      0    0  474
## [ reached max_feat ... 531 more features, reached max_nfeat ... 531 more features ]

Subsetting based on top features

# Names of the 50 top features of the co-occurrence matrix.
feat <- books_fcm %>%
  topfeatures(50) %>%
  names()

# Log of the column totals of those features in the dfm; reused later to
# scale vertex sizes in the network plot.
size <- books_dfm %>%
  dfm_select(feat, selection = "keep") %>%
  colSums() %>%
  log()

# Restrict the co-occurrence matrix to the same top features.
books_fcm_select <- books_fcm %>%
  fcm_select(pattern = feat, selection = "keep")
print(books_fcm_select)
## Feature co-occurrence matrix of: 50 by 50 features.
##         features
## features   now  time  said  just   two  room  came earth think seemed
##   now    13869 22769 68910 27701 12212 14413 12185 18293 14601  12218
##   time       0  9218 56605 22771 10045 11806 10092 14960 12011   9933
##   said       0     0 90727 67578 30911 35799 31553 44427 37144  29822
##   just       0     0     0 14379 12211 14620 11865 18777 14932  11874
##   two        0     0     0     0  2668  6360  5521  7905  6518   5276
##   room       0     0     0     0     0  3751  6300  9628  7604   6214
##   came       0     0     0     0     0     0  2834  7705  6443   5324
##   earth      0     0     0     0     0     0     0  6258  9604   7986
##   think      0     0     0     0     0     0     0     0  3912   6274
##   seemed     0     0     0     0     0     0     0     0     0   2674
## [ reached max_feat ... 40 more features, reached max_nfeat ... 40 more features ]
Create feature network plot
# Network plot of the reduced co-occurrence matrix. Vertex sizes are the
# log feature totals rescaled so that the largest vertex has size 3.
plot_fcm <- books_fcm_select %>%
  textplot_network(
    min_freq = 0.8,
    vertex_size = size / max(size) * 3
  )
print(plot_fcm)

Create frequency plot
# Dot plot of the 15 most frequent features, ordered by frequency and
# flipped so the feature names read horizontally on the vertical axis.
plot_freq <- ggplot(
  data = textstat_frequency(books_dfm, n = 15),
  mapping = aes(x = reorder(feature, frequency), y = frequency)
) +
  geom_point() +
  coord_flip() +
  labs(x = NULL, y = "Frequency") +
  theme_minimal()
print(plot_freq)

Lexical diversity
# Lexical diversity per document, computed from the document-feature matrix
# (the printed result has a TTR column).
books_lexdiv <- books_dfm %>%
  textstat_lexdiv()
print(books_lexdiv)
##                            document        TTR
## 1         Return to Pleasure Island 0.13917330
## 2                      Star Surgeon 0.03489763
## 3                          Divinity 0.25808879
## 4  The Machine That Saved The World 0.12849162
## 5                         The Skull 0.18506998
## 6                Where There's Hope 0.41176471
## 7               The Dwindling Years 0.26083815
## 8          The Luckiest Man in Denv 0.28627451
## 9                           Pen Pal 0.27806563
## 10              If You Was a Moklin 0.14186851
Plot lexical diversity
# Line plot of the type-token ratio (TTR), one point per row of
# `books_lexdiv`, with the default x axis suppressed (xaxt = "n") and
# replaced by a custom, labelled one.
# `xlab = ""` (not NULL) is needed to blank the x-axis title: plot.default()
# treats a NULL `xlab` as "not supplied" and falls back to the automatic
# label "Index", which would sit underneath the rotated tick labels.
plot(books_lexdiv$TTR, type = "l", xaxt = "n", xlab = "", ylab = "TTR")
grid()
# One tick per document; las = 3 draws the tick labels vertically.
# Labels come from `books_dfm$author` (presumably the `author` docvar of the
# dfm, in the same document order as `books_lexdiv` -- verify).
axis(1, at = seq_len(nrow(books_lexdiv)), labels = books_dfm$author, las = 3, hadj = 0)

Cluster Dendrogram
# Pairwise document distances from the dfm, converted to a `dist` object
# so they can be fed to hierarchical clustering.
books_dist <- books_dfm %>%
  textstat_dist() %>%
  as.dist()

# Hierarchical clustering of the documents, drawn as a dendrogram.
clust <- hclust(books_dist)
plot(clust, xlab = "Distance", ylab = NULL)

Collocation analysis
# Score collocations that occur at least 100 times in the token stream,
# then show the first ten rows of the result.
stat_colloc <- books_tokens %>%
  textstat_collocations(min_count = 100)
stat_colloc %>%
  head(10) %>%
  print()
##       collocation count count_nested length   lambda        z
## 1        had been   190            0      2 4.752868 45.51074
## 2    black doctor   164            0      2 8.216841 43.57646
## 3          of the   724            0      2 1.932639 42.20224
## 4          it was   245            0      2 2.919331 38.60291
## 5          he had   249            0      2 2.866186 37.67258
## 6  hospital earth   102            0      2 8.462323 37.19547
## 7       there was   146            0      2 3.798476 36.21688
## 8         he said   212            0      2 3.027291 36.20042
## 9          in the   461            0      2 2.103731 36.18028
## 10        i don't   112            0      2 4.651049 35.41933

Collocations of three words

# Three-token collocations among capitalised tokens only.
# The case-sensitive regex "^[A-Z]" selects tokens that start with an
# uppercase letter; padding = TRUE is passed so removed tokens leave a
# placeholder (per the quanteda docs), which keeps the remaining tokens
# from being treated as neighbours when they were not adjacent.
stat_colloc2 <- books_tokens %>%
  tokens_select(
    pattern = "^[A-Z]",
    valuetype = "regex",
    case_insensitive = FALSE,
    padding = TRUE
  ) %>%
  textstat_collocations(min_count = 5, size = 3)

# Show the first twenty rows of the result.
stat_colloc2 %>%
  head(20) %>%
  print()
##                collocation count count_nested length     lambda          z
## 1       doctor hugo tanner     8            0      3 -0.5685091 -0.3138335
## 2  transcriber's note this     6            0      3 -2.2562365 -0.8240342
## 3       the black doctor's     5            0      3 -2.3219757 -1.4348132
## 4        black doctor hugo     9            0      3 -3.1647519 -1.5307211
## 5   four-star black doctor     5            0      3 -3.2740270 -1.5775303
## 6            old man bland     5            0      3 -7.2460743 -2.3826886
## 7  general practice patrol    16            0      3 -9.3662428 -3.1002355
## 8      black doctor tanner    20            0      3 -4.6894066 -3.1355719
## 9    black doctor arnquist    22            0      3 -5.7237567 -3.7058302
## 10        the black doctor    36            0      3 -2.6180264 -3.9773838