This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Ctrl+Shift+Enter.
print("hello!")
## [1] "hello!"
Load libraries.
library(gutenbergr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.1 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(quanteda)
## Package version: 3.3.0
## Unicode version: 14.0
## ICU version: 70.1
## Parallel computing: 12 of 12 threads used.
## See https://quanteda.io for tutorials and examples.
library(quanteda.textplots)
library(quanteda.textstats)
library(openxlsx)
library(tictoc)
Dictionary path
# Location of the Excel workbook that read_dict() loads further down.
# NOTE(review): this is an absolute, machine-specific path - adjust it to
# wherever your copy of dicts.xlsx lives before running the notebook.
dict_path <- '/media/catana/Data/projects/teaching/nlp/dict/dicts.xlsx'
gutenfiles() to get text
# Download a random sample of n texts with the gutenbergr package.
#
# Arguments:
#   n                number of books to draw at random
#   lang             language code(s) to keep; 'all' skips the language filter
#   bshelf           bookshelf name(s) to keep (exact match against
#                    gutenberg_bookshelf); 'all' skips the bookshelf filter
#   author_birthdate keep only authors whose birthdate is greater than this
#                    value; 'all' skips the author filter
#
# Returns the result of gutenberg_download() for the sampled ids, with the
# 'title' and 'author' metadata fields attached.
#
# Stops if no text matches the filters; warns and samples every matching text
# if fewer than n texts match.
gutenfiles <-
  function(n = 20,
           lang = 'en',
           bshelf = 'Science Fiction',
           author_birthdate = 1850) {
    # set.seed(55)  # uncomment to make the random sample reproducible
    metadata_raw <- gutenberg_metadata

    # identical() keeps the 'all' test a single TRUE/FALSE even when a vector
    # of languages / bookshelves is supplied (a vectorised `!=` would make
    # `if` fail on length > 1 input).
    if (!identical(lang, 'all')) {
      metadata_raw <- metadata_raw %>%
        filter(language %in% lang)
    }
    if (!identical(bshelf, 'all')) {
      metadata_raw <- metadata_raw %>%
        filter(gutenberg_bookshelf %in% bshelf)
    }
    if (!identical(author_birthdate, 'all')) {
      new_authors <- gutenberg_authors %>%
        filter(birthdate > author_birthdate)
      metadata_raw <- metadata_raw %>%
        filter(gutenberg_author_id %in% new_authors$gutenberg_author_id)
    }

    gb_ids <- metadata_raw$gutenberg_id
    n_available <- length(gb_ids)
    if (n_available == 0) {
      stop('No Project Gutenberg texts match the requested filters.',
           call. = FALSE)
    }
    if (n > n_available) {
      warning('Only ', n_available, ' texts match the filters; ',
              'sampling all of them instead of ', n, '.',
              call. = FALSE)
      n <- n_available
    }

    # Sample positions rather than values: sample(x, n) treats a single
    # number x as the range 1:x, which would return ids that never passed
    # the filters when exactly one book is left.
    gb_id <- gb_ids[sample.int(n_available, n, replace = FALSE)]

    gutenberg_download(gb_id, meta_fields = c('title', 'author'))
  }
read_dict() to read dictionary from .xlsx file
# Build a quanteda dictionary from an .xlsx workbook.
# Every sheet becomes a top-level entry named after the sheet; every column of
# that sheet becomes a nested entry named after the column, holding the
# column's cells.
read_dict <- function(dict_path = "dicts.xlsx") {
  workbook <- loadWorkbook(dict_path)
  sheets <- names(workbook)

  # Convert one worksheet into a named list with one vector per column
  sheet_to_list <- function(sheet) {
    cells <- read.xlsx(workbook, sheet = sheet)
    # missing cells are replaced by empty strings
    cells[is.na(cells)] <- ""
    keys <- unique(colnames(cells))
    stats::setNames(lapply(keys, function(key) cells[[key]]), keys)
  }

  nested <- stats::setNames(lapply(sheets, sheet_to_list), sheets)
  dictionary(nested)
}
# get a bunch of books
books <- gutenfiles(n = 10)
## Determining mirror for Project Gutenberg from https://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
message('Number of rows before reshaping: ', nrow(books))
## Number of rows before reshaping: 14243
# Collapse the downloaded rows into one row per book: keep the first title and
# author seen for each gutenberg_id and glue all text rows together with spaces.
books_aggregated <- summarise(
  group_by(books, gutenberg_id),
  title = first(title),
  author = first(author),
  text = paste(text, collapse = " "),
  .groups = "drop"
)
message('Number of rows after reshaping: ', nrow(books_aggregated))
## Number of rows after reshaping: 10
# Build the corpus; the 'title' column supplies the document names
books_corpus <- corpus(books_aggregated, docid_field = "title")
# Document names of the corpus
books_docnames <- books_corpus %>% docnames()
# Document-level variables (metadata) of the corpus
books_docvars <- books_corpus %>% docvars()
# A single document variable can be extracted
# by naming it in `field`
books_docvars_authors <- books_corpus %>% docvars(field = "author")
# Split every document into sentences
corp_sent <- books_corpus %>% corpus_reshape(to = "sentences")
# Report how many documents the sentence-level corpus contains
message('Number of documents in the corpus of sentences: ', ndoc(corp_sent))
## Number of documents in the corpus of sentences: 9321
# Merge the sentences back into whole documents
corp_doc <- corp_sent %>% corpus_reshape(to = "documents")
# Summarise the original (document-level) corpus and show the result
books_summary <- books_corpus %>% summary()
print(books_summary)
## Corpus consisting of 10 documents, showing 10 documents:
##
## Text Types Tokens Sentences gutenberg_id
## Return to Pleasure Island 2730 13380 826 17027
## Star Surgeon 5868 59810 3518 18492
## Divinity 1692 7709 453 22623
## The Machine That Saved The World 3150 16578 1201 26174
## The Skull 1806 10106 1113 30255
## Where There's Hope 862 2682 215 30715
## The Dwindling Years 1656 6621 458 50103
## The Luckiest Man in Denv 1408 5555 410 50835
## Pen Pal 1523 6266 408 51286
## If You Was a Moklin 1672 9879 719 51752
## author
## Doctorow, Cory
## Nourse, Alan Edward
## Samachson, Joseph
## Leinster, Murray
## Dick, Philip K.
## Bixby, Jerome
## Del Rey, Lester
## Kornbluth, C. M. (Cyril M.)
## Marlowe, Stephen
## Leinster, Murray
# tokens() segments the texts of a corpus into tokens.
# Punctuation, symbols, numbers, URLs and separators are dropped here.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
books_tokens <- tokens(
  books_corpus,
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_numbers = TRUE,
  remove_url = TRUE,
  remove_separators = TRUE
)
Looking for words that start with “blast” or “start”
# Keywords in context for the patterns "blast*" and "start*"
kw_blast_start <- books_tokens %>%
  kwic(pattern = c("blast*", "start*"))
# Keep the context before, the keyword and the context after for the first 10 hits
kw_blast_start_sample <- head(
  select(as.data.frame(kw_blast_start), pre, keyword, post),
  10
)
print(kw_blast_start_sample)
## pre keyword post
## 1 have just sat up and started talking Orville should know about
## 2 they stopped Every time he started a new grade or a
## 3 long in the tooth He'd start pushing fudge-nut dips and by
## 4 in the eye at the start of the speech then switched
## 5 about the line The youngest started vibrating with excitement and the
## 6 his grip He looked up startled into Joe's grinning face Unlike
## 7 We're gonna flatten that sucker start fresh and build us a
## 8 look and then his chest started ringing He extracted a slim
## 9 then Good night George George started cooking dinner for two More
## 10 him search After that Joe started sending out a runner usually
Looking for the multi-word pattern “being d*”
# For a multi-word expression, separate the words with white space
# and wrap the character vector in phrase()
kw_being_do <- books_tokens %>%
  kwic(pattern = phrase("being d*"))
# Keep the context before, the keyword and the context after for the first 10 hits
kw_being_do_sample <- head(
  select(as.data.frame(kw_being_do), pre, keyword, post),
  10
)
print(kw_being_do_sample)
## pre keyword
## 1 him--one of the special jeeps being developed
## 2 than anything else he’d loved being driven
## post
## 1 at this particular installation--and its
## 2 Even after chauffeurs were a
# Drop English stopwords via tokens_select() with selection = "remove"
toks_nostop <- books_tokens %>%
  tokens_select(pattern = stopwords("en"), selection = "remove")
# Same stopword pattern, this time passed to tokens_remove()
toks_nostop2 <- books_tokens %>%
  tokens_remove(pattern = stopwords("en"))
# see also the `padding` and `window` parameters
Tokens with stopwords removed (sample):
print(sample(toks_nostop[[1]], 30))
## [1] "budget" "civilised" "points" "fretted" "tell"
## [6] "shape" "Alarm-bells" "strong" "necessary" "inside"
## [11] "boy" "third" "flattened" "refill" "freezer"
## [16] "Licensor" "huh" "building" "ground" "smile"
## [21] "top" "laughing" "September" "happy" "may"
## [26] "constituting" "merch" "explosion" "perform" "slapped"
# Read the dictionary workbook and look the book tokens up in it
dict <- read_dict(dict_path)
toks_dict <- books_tokens %>% tokens_lookup(dictionary = dict)
# Announce which document the sample printed next is taken from
message('Sample of dictionary entries found in the book - ', names(toks_dict)[1])
## Sample of dictionary entries found in the book - Return to Pleasure Island
# From the first document, keep the lookup results whose text matches
# body, animals or numerals (case-insensitive), then show the first 100.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
toks_dict_sample <- grep(
  'body|animals|numerals',
  toks_dict[[1]],
  ignore.case = TRUE,
  value = TRUE
)
toks_dict_sample <- head(toks_dict_sample, 100)
print(toks_dict_sample)
## [1] "NUMERALS.CARDINAL" "NUMERALS.CARDINAL" "NUMERALS.CARDINAL"
## [4] "BODY.UPPER" "NUMERALS.CARDINAL" "NUMERALS.ORDINAL"
## [7] "BODY.SENSES" "BODY.INTERNAL" "NUMERALS.CARDINAL"
## [10] "BODY.UPPER" "NUMERALS.CARDINAL" "NUMERALS.CARDINAL"
## [13] "NUMERALS.CARDINAL" "NUMERALS.CARDINAL" "BODY.UPPER"
## [16] "NUMERALS.ORDINAL" "NUMERALS.CARDINAL" "NUMERALS.ORDINAL"
## [19] "NUMERALS.ORDINAL" "NUMERALS.ORDINAL" "BODY.UPPER"
## [22] "BODY.UPPER" "NUMERALS.CARDINAL" "BODY.UPPER"
## [25] "BODY.UPPER" "BODY.UPPER" "BODY.UPPER"
## [28] "NUMERALS.CARDINAL" "BODY.UPPER" "BODY.UPPER"
## [31] "NUMERALS.CARDINAL" "BODY.UPPER" "BODY.UPPER"
## [34] "NUMERALS.CARDINAL" "NUMERALS.CARDINAL" "BODY.HEAD"
## [37] "BODY.HEAD" "BODY.UPPER" "BODY.HEAD"
## [40] "BODY.UPPER" "BODY.HEAD" "BODY.UPPER"
## [43] "BODY.HEAD" "BODY.HEAD" "NUMERALS.CARDINAL"
## [46] "NUMERALS.CARDINAL" "NUMERALS.CARDINAL" "BODY.UPPER"
## [49] "BODY.UPPER" "BODY.UPPER" "NUMERALS.CARDINAL"
## [52] "NUMERALS.CARDINAL" "BODY.UPPER" "NUMERALS.CARDINAL"
## [55] "NUMERALS.CARDINAL" "BODY.UPPER" "BODY.UPPER"
## [58] "BODY.HEAD" "BODY.EYE" "BODY.FLUIDS"
## [61] "BODY.UPPER" "NUMERALS.CARDINAL" "BODY.SENSES"
## [64] "BODY.HEAD" "BODY.EYE" "BODY.HEAD"
## [67] "BODY.HEAD" "BODY.INTERNAL" "BODY.UPPER"
## [70] "BODY.LOWER" "NUMERALS.CARDINAL" "NUMERALS.CARDINAL"
## [73] "BODY.LOWER" "NUMERALS.ORDINAL" "NUMERALS.CARDINAL"
## [76] "BODY.FLUIDS" "NUMERALS.CARDINAL" "NUMERALS.CARDINAL"
## [79] "BODY.HEAD" "BODY.LOWER" "BODY.UPPER"
## [82] "BODY.UPPER" "NUMERALS.CARDINAL" "NUMERALS.CARDINAL"
## [85] "NUMERALS.CARDINAL" "BODY.UPPER" "NUMERALS.CARDINAL"
## [88] "NUMERALS.CARDINAL" "NUMERALS.CARDINAL" "BODY.UPPER"
## [91] "NUMERALS.CARDINAL" "ANIMALS.MAMMAL" "ANIMALS.MAMMAL"
## [94] "ANIMALS.MAMMAL" "NUMERALS.CARDINAL" "BODY.HEAD"
## [97] "NUMERALS.CARDINAL" "BODY.UPPER" "NUMERALS.CARDINAL"
## [100] "BODY.EYE"
Create DFM from tokens
# Document-feature matrix of the book tokens: English stopwords are removed,
# then the matrix is trimmed with min_termfreq = 20
books_dfm <- dfm_trim(
  dfm_remove(dfm(books_tokens), pattern = stopwords("en")),
  min_termfreq = 20
)
Document-feature matrix:
print(books_dfm)
## Document-feature matrix of: 10 documents, 541 features (35.49% sparse) and 2 docvars.
## features
## docs place four see something exactly know great
## Return to Pleasure Island 3 3 15 9 1 20 2
## Star Surgeon 24 5 99 67 12 103 38
## Divinity 3 0 8 6 2 4 3
## The Machine That Saved The World 6 2 14 19 2 20 2
## The Skull 9 2 16 13 0 33 2
## Where There's Hope 6 0 5 2 0 5 0
## features
## docs though take old
## Return to Pleasure Island 9 15 13
## Star Surgeon 35 33 19
## Divinity 1 10 8
## The Machine That Saved The World 4 7 5
## The Skull 1 9 9
## Where There's Hope 0 1 1
## [ reached max_ndoc ... 4 more documents, reached max_nfeat ... 531 more features ]
Plot DFM
# Word cloud limited to 150 words; `color` sets the colours used (blue and red)
textplot_wordcloud(books_dfm, color = c('blue', 'red'), max_words = 150)
# Feature frequencies grouped by document name, 5 features per group
books_freqs <- textstat_frequency(books_dfm, groups = docname_, n = 5)
print(books_freqs)
## feature frequency rank docfreq group
## 1 god 34 1 1 Divinity
## 2 one 30 2 1 Divinity
## 3 helmet 25 3 1 Divinity
## 4 bradley 25 3 1 Divinity
## 5 first 22 5 1 Divinity
## 6 moklin 78 1 1 If You Was a Moklin
## 7 says 73 2 1 If You Was a Moklin
## 8 moklins 62 3 1 If You Was a Moklin
## 9 like 60 4 1 If You Was a Moklin
## 10 brooks 56 5 1 If You Was a Moklin
## 11 matilda 82 1 1 Pen Pal
## 12 haron 42 2 1 Pen Pal
## 13 gorka 39 3 1 Pen Pal
## 14 now 20 4 1 Pen Pal
## 15 said 20 4 1 Pen Pal
## 16 george 185 1 1 Return to Pleasure Island
## 17 bill 96 2 1 Return to Pleasure Island
## 18 said 83 3 1 Return to Pleasure Island
## 19 joe 77 4 1 Return to Pleasure Island
## 20 work 70 5 1 Return to Pleasure Island
## 21 dal 507 1 1 Star Surgeon
## 22 said 390 2 1 Star Surgeon
## 23 tiger 260 3 1 Star Surgeon
## 24 doctor 248 4 1 Star Surgeon
## 25 jack 227 5 1 Star Surgeon
## 26 giles 52 1 1 The Dwindling Years
## 27 he’d 36 2 1 The Dwindling Years
## 28 years 26 3 1 The Dwindling Years
## 29 even 23 4 1 The Dwindling Years
## 30 now 20 5 1 The Dwindling Years
## 31 said 43 1 1 The Luckiest Man in Denv
## 32 reuben 42 2 1 The Luckiest Man in Denv
## 33 may 27 3 1 The Luckiest Man in Denv
## 34 man 25 4 1 The Luckiest Man in Denv
## 35 one 24 5 1 The Luckiest Man in Denv
## 36 sergeant 138 1 1 The Machine That Saved The World
## 37 said 127 2 1 The Machine That Saved The World
## 38 bellews 94 3 1 The Machine That Saved The World
## 39 machines 76 4 1 The Machine That Saved The World
## 40 broadcast 55 5 1 The Machine That Saved The World
## 41 conger 95 1 1 The Skull
## 42 said 65 2 1 The Skull
## 43 know 33 3 1 The Skull
## 44 man 32 4 1 The Skull
## 45 speaker 31 5 1 The Skull
## 46 farrel 26 1 1 Where There's Hope
## 47 said 22 2 1 Where There's Hope
## 48 got 14 3 1 Where There's Hope
## 49 one 10 4 1 Where There's Hope
## 50 back 8 5 1 Where There's Hope
Number of features:
message('Number of features in the document-feature matrix: ', nfeat(books_dfm))
## Number of features in the document-feature matrix: 541
# Logical vector that is TRUE for the first book and FALSE for all others.
# docnames() is the public accessor for document names, so the internal
# @docvars slot of the dfm is no longer read directly.
b_logic <- docnames(books_dfm) == docnames(books_dfm)[1]
# Keyness of each feature, with the flagged documents as the target
books_stat_keyness <- textstat_keyness(books_dfm, target = b_logic)
print(books_stat_keyness)
## feature chi2 p n_target n_reference
## 1 george 1.887205e+03 0.000000e+00 185 2
## 2 bill 7.945362e+02 0.000000e+00 96 19
## 3 joe 6.746958e+02 0.000000e+00 77 11
## 4 orville 6.379538e+02 0.000000e+00 62 0
## 5 license 5.236493e+02 0.000000e+00 52 0
## 6 -- 3.691952e+02 0.000000e+00 37 0
## 7 tom 2.766100e+02 0.000000e+00 28 0
## 8 work 2.715298e+02 0.000000e+00 70 78
## 9 son 2.418272e+02 0.000000e+00 32 8
## 10 cabin 1.934705e+02 0.000000e+00 22 2
## 11 soft 1.680370e+02 0.000000e+00 25 9
## 12 rights 1.654450e+02 0.000000e+00 21 4
## 13 boys 1.359679e+02 0.000000e+00 18 4
## 14 terms 1.248107e+02 0.000000e+00 19 7
## 15 strong 9.408956e+01 0.000000e+00 15 6
## 16 father 5.851406e+01 2.020606e-14 12 8
## 17 works 4.394992e+01 3.368861e-11 11 10
## 18 left 4.192005e+01 9.508261e-11 21 46
## 19 care 2.842322e+01 9.748910e-08 13 24
## 20 ones 2.539908e+01 4.661392e-07 16 39
## 21 home 2.372082e+01 1.113711e-06 12 24
## 22 may 1.590030e+01 6.676830e-05 16 56
## 23 big 1.572995e+01 7.305803e-05 11 28
## 24 thinking 1.266626e+01 3.723126e-04 7 14
## 25 night 1.223409e+01 4.692423e-04 10 28
## 26 means 1.164410e+01 6.440656e-04 9 24
## 27 hand 1.073902e+01 1.048996e-03 14 56
## 28 morning 9.655889e+00 1.887455e-03 8 22
## 29 waited 9.655889e+00 1.887455e-03 8 22
## 30 say 9.006435e+00 2.690307e-03 17 80
## 31 hands 8.586596e+00 3.386465e-03 14 62
## 32 free 8.585092e+00 3.389262e-03 6 14
## 33 mouth 7.720172e+00 5.460718e-03 7 20
## 34 held 7.585335e+00 5.884504e-03 9 31
## 35 took 6.081947e+00 1.365701e-02 14 71
## 36 away 5.872355e+00 1.538046e-02 16 86
## 37 watched 5.551234e+00 1.846767e-02 8 30
## 38 put 5.480657e+00 1.922803e-02 11 53
## 39 already 5.254307e+00 2.189252e-02 9 37
## 40 right 4.825830e+00 2.803636e-02 25 161
## 41 smiled 4.805552e+00 2.836817e-02 10 45
## 42 ran 4.594436e+00 3.207590e-02 5 15
## 43 inside 4.416169e+00 3.559989e-02 7 27
## 44 whole 4.074276e+00 4.354091e-02 7 28
## 45 always 3.835594e+00 5.017522e-02 9 42
## 46 said 3.689751e+00 5.474778e-02 83 684
## 47 line 3.650637e+00 5.604779e-02 5 17
## 48 made 3.623977e+00 5.695256e-02 16 99
## 49 old 3.463035e+00 6.275502e-02 13 77
## 50 day 3.376667e+00 6.612565e-02 9 48
## 51 take 3.370037e+00 6.639228e-02 15 93
## 52 eyes 3.304895e+00 6.907374e-02 13 78
## 53 tell 2.953729e+00 8.567964e-02 11 65
## 54 back 2.910289e+00 8.801611e-02 32 239
## 55 led 2.896561e+00 8.876874e-02 5 19
## 56 kind 2.789388e+00 9.489047e-02 9 51
## 57 door 2.757063e+00 9.682585e-02 9 47
## 58 still 2.594032e+00 1.072670e-01 13 83
## 59 stared 2.510449e+00 1.130937e-01 8 41
## 60 along 2.472795e+00 1.158317e-01 7 34
## 61 talk 2.472795e+00 1.158317e-01 7 34
## 62 use 2.148104e+00 1.427461e-01 8 43
## 63 anything 2.095767e+00 1.477076e-01 11 71
## 64 arm 2.028967e+00 1.543255e-01 5 22
## 65 told 1.974049e+00 1.600188e-01 10 64
## 66 clearly 1.841162e+00 1.748148e-01 4 16
## 67 best 1.827389e+00 1.764362e-01 6 30
## 68 needed 1.827389e+00 1.764362e-01 6 30
## 69 hard 1.794985e+00 1.803200e-01 5 23
## 70 though 1.725279e+00 1.890149e-01 9 58
## 71 end 1.643788e+00 1.998065e-01 6 31
## 72 started 1.604330e+00 2.052907e-01 8 51
## 73 high 1.583852e+00 2.082067e-01 5 24
## 74 brought 1.573711e+00 2.096689e-01 7 39
## 75 keep 1.573711e+00 2.096689e-01 7 39
## 76 good 1.537096e+00 2.150506e-01 17 127
## 77 every 1.532758e+00 2.156990e-01 11 76
## 78 since 1.474973e+00 2.245626e-01 6 32
## 79 stood 1.474973e+00 2.245626e-01 6 32
## 80 new 1.424182e+00 2.327162e-01 13 94
## 81 important 1.393284e+00 2.378512e-01 5 25
## 82 stop 1.393284e+00 2.378512e-01 5 25
## 83 working 1.221292e+00 2.691072e-01 5 26
## 84 hardly 1.145080e+00 2.845814e-01 4 19
## 85 name 1.066134e+00 3.018202e-01 5 27
## 86 around 1.057137e+00 3.038691e-01 12 90
## 87 gave 1.046435e+00 3.063295e-01 6 35
## 88 meant 9.679492e-01 3.251922e-01 4 20
## 89 getting 9.262814e-01 3.358311e-01 5 28
## 90 picked 9.262814e-01 3.358311e-01 5 28
## 91 sat 9.262814e-01 3.358311e-01 5 28
## 92 want 9.050042e-01 3.414434e-01 13 101
## 93 without 8.618851e-01 3.532117e-01 11 84
## 94 sure 8.192838e-01 3.653896e-01 7 50
## 95 everything 8.166105e-01 3.661730e-01 6 37
## 96 angry 8.127779e-01 3.673002e-01 4 21
## 97 called 8.003833e-01 3.709788e-01 5 29
## 98 three 7.950273e-01 3.725843e-01 11 85
## 99 taking 6.770333e-01 4.106102e-01 4 22
## 100 stopped 5.903774e-01 4.422732e-01 7 48
## 101 thought 5.344642e-01 4.647360e-01 14 117
## 102 asked 5.112691e-01 4.745900e-01 7 54
## 103 like 5.104019e-01 4.749649e-01 27 240
## 104 kept 4.555015e-01 4.997339e-01 4 24
## 105 start 4.143168e-01 5.197865e-01 5 33
## 106 except 3.662702e-01 5.450444e-01 4 25
## 107 walked 3.662702e-01 5.450444e-01 4 25
## 108 arrived 3.252705e-01 5.684573e-01 3 17
## 109 long 2.924049e-01 5.886835e-01 13 114
## 110 give 2.797644e-01 5.968557e-01 6 44
## 111 set 2.797644e-01 5.968557e-01 6 44
## 112 looked 2.766563e-01 5.989012e-01 14 124
## 113 four 2.385988e-01 6.252200e-01 3 18
## 114 stand 2.385988e-01 6.252200e-01 3 18
## 115 supposed 2.385988e-01 6.252200e-01 3 18
## 116 need 2.054994e-01 6.503180e-01 7 60
## 117 others 1.926587e-01 6.607130e-01 6 51
## 118 gone 1.866626e-01 6.657090e-01 6 46
## 119 bring 1.686618e-01 6.813038e-01 3 19
## 120 grew 1.686618e-01 6.813038e-01 3 19
## 121 guy 1.686618e-01 6.813038e-01 3 19
## 122 opened 1.686618e-01 6.813038e-01 3 19
## 123 school 1.686618e-01 6.813038e-01 3 19
## 124 upon 1.685714e-01 6.813846e-01 4 28
## 125 one 1.584716e-01 6.905676e-01 35 335
## 126 knew 1.476716e-01 7.007705e-01 11 100
## 127 came 1.434428e-01 7.048821e-01 15 139
## 128 waiting 1.347047e-01 7.136031e-01 5 38
## 129 months 1.132777e-01 7.364430e-01 3 20
## 130 can 9.738320e-02 7.549928e-01 19 181
## 131 hours 8.629570e-02 7.689403e-01 6 49
## 132 pretty 7.758898e-02 7.805917e-01 3 26
## 133 dark 7.062809e-02 7.904245e-01 3 21
## 134 smile 7.062809e-02 7.904245e-01 3 21
## 135 water 7.062809e-02 7.904245e-01 3 21
## 136 full 7.058178e-02 7.904916e-01 5 40
## 137 job 7.058178e-02 7.904916e-01 5 40
## 138 worked 6.272879e-02 8.022338e-01 4 36
## 139 think 6.184692e-02 8.036003e-01 15 144
## 140 next 3.944871e-02 8.425624e-01 4 37
## 141 sleep 3.918568e-02 8.430813e-01 3 22
## 142 talking 3.202142e-02 8.579807e-01 4 32
## 143 future 3.134327e-02 8.594767e-01 2 18
## 144 handed 3.134327e-02 8.594767e-01 2 18
## 145 late 3.134327e-02 8.594767e-01 2 18
## 146 probably 3.134327e-02 8.594767e-01 2 18
## 147 received 3.134327e-02 8.594767e-01 2 18
## 148 able 2.473189e-02 8.750370e-01 3 28
## 149 alone 2.473189e-02 8.750370e-01 3 28
## 150 live 1.765809e-02 8.942853e-01 3 23
## 151 living 1.765809e-02 8.942853e-01 3 23
## 152 little 1.328081e-02 9.082530e-01 12 119
## 153 make 1.270869e-02 9.102424e-01 11 109
## 154 em 1.095220e-02 9.166513e-01 2 19
## 155 feeling 1.095220e-02 9.166513e-01 2 19
## 156 rehab 1.095220e-02 9.166513e-01 2 19
## 157 shoulders 1.095220e-02 9.166513e-01 2 19
## 158 idea 9.918733e-03 9.206676e-01 3 29
## 159 white 9.918733e-03 9.206676e-01 3 29
## 160 heard 9.741597e-03 9.213768e-01 4 39
## 161 finally 5.797424e-03 9.393071e-01 5 44
## 162 carried 4.944384e-03 9.439419e-01 3 24
## 163 office 4.944384e-03 9.439419e-01 3 24
## 164 went 4.198068e-03 9.483392e-01 13 131
## 165 yes 3.187940e-03 9.549739e-01 5 50
## 166 ground 1.911346e-03 9.651284e-01 3 30
## 167 breath 1.273759e-03 9.715297e-01 2 20
## 168 wanted 5.396907e-04 9.814658e-01 6 61
## 169 reached 4.805192e-04 9.825112e-01 4 35
## 170 trouble 4.805192e-04 9.825112e-01 4 35
## 171 near 1.009331e-04 9.919842e-01 3 25
## 172 suppose 1.009331e-04 9.919842e-01 3 25
## 173 things 2.277671e-05 9.961921e-01 8 82
## 174 soon -1.098012e-04 9.916394e-01 3 31
## 175 least -7.689967e-04 9.778769e-01 3 37
## 176 clear -9.114875e-04 9.759148e-01 2 21
## 177 table -1.824388e-03 9.659304e-01 4 42
## 178 ahead -2.314249e-03 9.616313e-01 2 27
## 179 carefully -2.314249e-03 9.616313e-01 2 27
## 180 minutes -2.314249e-03 9.616313e-01 2 27
## 181 let -3.139136e-03 9.553195e-01 4 48
## 182 different -3.982732e-03 9.496798e-01 3 32
## 183 words -3.982732e-03 9.496798e-01 3 32
## 184 bit -5.800665e-03 9.392901e-01 3 38
## 185 looking -5.800665e-03 9.392901e-01 3 38
## 186 world -5.800665e-03 9.392901e-01 3 38
## 187 blood -8.701690e-03 9.256789e-01 2 22
## 188 business -8.701690e-03 9.256789e-01 2 22
## 189 moving -8.701690e-03 9.256789e-01 2 22
## 190 nobody -8.701690e-03 9.256789e-01 2 22
## 191 bad -1.087931e-02 9.169282e-01 2 28
## 192 behind -1.087931e-02 9.169282e-01 2 28
## 193 turn -1.087931e-02 9.169282e-01 2 28
## 194 five -1.305781e-02 9.090231e-01 3 33
## 195 heart -1.305781e-02 9.090231e-01 3 33
## 196 used -1.523720e-02 9.017594e-01 3 39
## 197 longer -2.366686e-02 8.777356e-01 2 23
## 198 days -2.518207e-02 8.739142e-01 2 29
## 199 side -2.518207e-02 8.739142e-01 2 29
## 200 turned -2.851377e-02 8.659066e-01 9 98
## 201 open -2.877173e-02 8.653072e-01 3 40
## 202 last -3.270224e-02 8.564950e-01 4 51
## 203 going -3.489137e-02 8.518235e-01 8 88
## 204 either -4.468524e-02 8.325839e-01 2 30
## 205 remember -4.468524e-02 8.325839e-01 2 30
## 206 taken -4.468524e-02 8.325839e-01 2 30
## 207 question -4.497987e-02 8.320411e-01 2 24
## 208 allowed -4.676162e-02 8.287973e-01 1 19
## 209 note -4.676162e-02 8.287973e-01 1 19
## 210 remembered -4.676162e-02 8.287973e-01 1 19
## 211 someone -4.676162e-02 8.287973e-01 1 19
## 212 walking -4.676162e-02 8.287973e-01 1 19
## 213 nodded -4.737361e-02 8.276981e-01 4 46
## 214 shook -4.737361e-02 8.276981e-01 4 46
## 215 air -6.749936e-02 7.950131e-01 3 36
## 216 hair -6.891664e-02 7.929207e-01 2 31
## 217 sent -6.891664e-02 7.929207e-01 2 31
## 218 ever -7.112035e-02 7.897125e-01 6 69
## 219 fast -7.193613e-02 7.885384e-01 2 25
## 220 even -7.441552e-02 7.850130e-01 17 187
## 221 know -7.640303e-02 7.822324e-01 20 219
## 222 comes -7.793042e-02 7.801219e-01 1 20
## 223 gets -7.793042e-02 7.801219e-01 1 20
## 224 learned -7.793042e-02 7.801219e-01 1 20
## 225 stars -7.793042e-02 7.801219e-01 1 20
## 226 never -8.094318e-02 7.760233e-01 9 102
## 227 go -8.465930e-02 7.710798e-01 10 113
## 228 got -1.046298e-01 7.463429e-01 15 168
## 229 also -1.151279e-01 7.343799e-01 1 21
## 230 certainly -1.151279e-01 7.343799e-01 1 21
## 231 toward -1.186668e-01 7.304850e-01 3 44
## 232 ask -1.575687e-01 6.914049e-01 1 22
## 233 car -1.575687e-01 6.914049e-01 1 22
## 234 fingers -1.575687e-01 6.914049e-01 1 22
## 235 happy -1.575687e-01 6.914049e-01 1 22
## 236 hour -1.575687e-01 6.914049e-01 1 22
## 237 making -1.575687e-01 6.914049e-01 1 22
## 238 show -1.575687e-01 6.914049e-01 1 22
## 239 must -1.603945e-01 6.887936e-01 6 73
## 240 rest -1.660455e-01 6.836504e-01 2 34
## 241 exactly -2.045980e-01 6.510348e-01 1 23
## 242 instead -2.045980e-01 6.510348e-01 1 23
## 243 picture -2.045980e-01 6.510348e-01 1 23
## 244 point -2.045980e-01 6.510348e-01 1 23
## 245 problem -2.045980e-01 6.510348e-01 1 23
## 246 coming -2.054679e-01 6.503431e-01 2 35
## 247 sense -2.054679e-01 6.503431e-01 2 35
## 248 together -2.054679e-01 6.503431e-01 2 35
## 249 yet -2.267256e-01 6.339624e-01 5 64
## 250 wait -2.479505e-01 6.185219e-01 2 36
## 251 collar -2.556662e-01 6.131133e-01 1 24
## 252 outside -2.556955e-01 6.130930e-01 3 48
## 253 come -2.765891e-01 5.989456e-01 10 122
## 254 special -2.932586e-01 5.881399e-01 2 37
## 255 done -2.954787e-01 5.867308e-01 5 66
## 256 helmet -3.103079e-01 5.774913e-01 1 25
## 257 hold -3.103079e-01 5.774913e-01 1 25
## 258 known -3.103079e-01 5.774913e-01 1 25
## 259 times -3.103079e-01 5.774913e-01 1 25
## 260 felt -3.324944e-01 5.641940e-01 5 67
## 261 well -3.397222e-01 5.599896e-01 9 113
## 262 find -3.475374e-01 5.555105e-01 6 79
## 263 close -3.681268e-01 5.440271e-01 1 26
## 264 feet -3.681268e-01 5.440271e-01 1 26
## 265 kids -3.681268e-01 5.440271e-01 1 26
## 266 leave -3.681268e-01 5.440271e-01 1 26
## 267 speak -3.681268e-01 5.440271e-01 1 26
## 268 strange -3.681268e-01 5.440271e-01 1 26
## 269 ten -3.681268e-01 5.440271e-01 1 26
## 270 word -3.915259e-01 5.314984e-01 2 39
## 271 food -4.287832e-01 5.125866e-01 1 27
## 272 girl -4.287832e-01 5.125866e-01 1 27
## 273 green -4.287832e-01 5.125866e-01 1 27
## 274 reason -4.287832e-01 5.125866e-01 1 27
## 275 sitting -4.287832e-01 5.125866e-01 1 27
## 276 sound -4.287832e-01 5.125866e-01 1 27
## 277 stuff -4.287832e-01 5.125866e-01 1 27
## 278 true -4.287832e-01 5.125866e-01 1 27
## 279 tried -4.441218e-01 5.051397e-01 2 40
## 280 way -4.743577e-01 4.909892e-01 11 140
## 281 case -4.919843e-01 4.830435e-01 1 28
## 282 guess -4.919843e-01 4.830435e-01 1 28
## 283 power -4.919843e-01 4.830435e-01 1 28
## 284 real -4.919843e-01 4.830435e-01 1 28
## 285 shoulder -4.919843e-01 4.830435e-01 1 28
## 286 signal -4.919843e-01 4.830435e-01 1 28
## 287 staring -4.919843e-01 4.830435e-01 1 28
## 288 really -4.978903e-01 4.804286e-01 4 59
## 289 face -5.521434e-01 4.574432e-01 7 96
## 290 given -5.574761e-01 4.552789e-01 1 29
## 291 somehow -5.574761e-01 4.552789e-01 1 29
## 292 two -6.290660e-01 4.276981e-01 10 133
## 293 far -6.740842e-01 4.116313e-01 2 44
## 294 feel -6.740842e-01 4.116313e-01 2 44
## 295 moment -6.832823e-01 4.084588e-01 5 75
## 296 moved -6.944758e-01 4.046462e-01 1 31
## 297 past -6.944758e-01 4.046462e-01 1 31
## 298 space -6.944758e-01 4.046462e-01 1 31
## 299 try -6.944758e-01 4.046462e-01 1 31
## 300 ready -7.656202e-01 3.815754e-01 1 32
## 301 tiny -7.656202e-01 3.815754e-01 1 32
## 302 half -7.991055e-01 3.713610e-01 2 46
## 303 thing -8.362749e-01 3.604643e-01 5 78
## 304 read -8.383209e-01 3.598774e-01 1 33
## 305 saw -8.665542e-01 3.519110e-01 4 66
## 306 dozen -9.124452e-01 3.394665e-01 1 34
## 307 pink -9.124452e-01 3.394665e-01 1 34
## 308 see -9.645669e-01 3.260389e-01 15 200
## 309 many -9.878750e-01 3.202623e-01 1 35
## 310 break -1.005635e+00 3.159508e-01 0 20
## 311 broke -1.005635e+00 3.159508e-01 0 20
## 312 expected -1.005635e+00 3.159508e-01 0 20
## 313 four-star -1.005635e+00 3.159508e-01 0 20
## 314 less -1.005635e+00 3.159508e-01 0 20
## 315 pathology -1.005635e+00 3.159508e-01 0 20
## 316 quarters -1.005635e+00 3.159508e-01 0 20
## 317 reports -1.005635e+00 3.159508e-01 0 20
## 318 sick -1.005635e+00 3.159508e-01 0 20
## 319 skill -1.005635e+00 3.159508e-01 0 20
## 320 somewhere -1.005635e+00 3.159508e-01 0 20
## 321 survey -1.005635e+00 3.159508e-01 0 20
## 322 slowly -1.065382e+00 3.019910e-01 2 50
## 323 something -1.087582e+00 2.970076e-01 9 132
## 324 dr -1.095761e+00 2.951982e-01 0 21
## 325 hell -1.095761e+00 2.951982e-01 0 21
## 326 intelligence -1.095761e+00 2.951982e-01 0 21
## 327 mahon-modified -1.095761e+00 2.951982e-01 0 21
## 328 nature -1.095761e+00 2.951982e-01 0 21
## 329 six -1.095761e+00 2.951982e-01 0 21
## 330 surgical -1.095761e+00 2.951982e-01 0 21
## 331 town -1.095761e+00 2.951982e-01 0 21
## 332 enough -1.112556e+00 2.915269e-01 5 83
## 333 looks -1.134888e+00 2.867351e-01 2 51
## 334 friend -1.142241e+00 2.851793e-01 1 37
## 335 admit -1.186563e+00 2.760239e-01 0 22
## 336 brucker -1.186563e+00 2.760239e-01 0 22
## 337 change -1.186563e+00 2.760239e-01 0 22
## 338 communicator -1.186563e+00 2.760239e-01 0 22
## 339 completely -1.186563e+00 2.760239e-01 0 22
## 340 goes -1.186563e+00 2.760239e-01 0 22
## 341 gray -1.186563e+00 2.760239e-01 0 22
## 342 lay -1.186563e+00 2.760239e-01 0 22
## 343 order -1.186563e+00 2.760239e-01 0 22
## 344 practice -1.186563e+00 2.760239e-01 0 22
## 345 seattle -1.186563e+00 2.760239e-01 0 22
## 346 studied -1.186563e+00 2.760239e-01 0 22
## 347 transmitter -1.186563e+00 2.760239e-01 0 22
## 348 seen -1.205440e+00 2.722371e-01 2 52
## 349 sort -1.220998e+00 2.691649e-01 1 38
## 350 great -1.276980e+00 2.584613e-01 2 53
## 351 happened -1.276980e+00 2.584613e-01 2 53
## 352 company -1.277953e+00 2.582800e-01 0 23
## 353 fear -1.277953e+00 2.582800e-01 0 23
## 354 passed -1.277953e+00 2.582800e-01 0 23
## 355 pattern -1.277953e+00 2.582800e-01 0 23
## 356 planets -1.277953e+00 2.582800e-01 0 23
## 357 produced -1.277953e+00 2.582800e-01 0 23
## 358 research -1.277953e+00 2.582800e-01 0 23
## 359 spread -1.277953e+00 2.582800e-01 0 23
## 360 look -1.289567e+00 2.561278e-01 5 86
## 361 data -1.369859e+00 2.418361e-01 0 24
## 362 desk -1.369859e+00 2.418361e-01 0 24
## 363 diagnosis -1.369859e+00 2.418361e-01 0 24
## 364 fight -1.369859e+00 2.418361e-01 0 24
## 365 second -1.369859e+00 2.418361e-01 0 24
## 366 standby -1.369859e+00 2.418361e-01 0 24
## 367 stepped -1.369859e+00 2.418361e-01 0 24
## 368 trees -1.369859e+00 2.418361e-01 0 24
## 369 blue -1.381280e+00 2.398833e-01 1 40
## 370 aboard -1.462219e+00 2.265773e-01 0 25
## 371 ago -1.462219e+00 2.265773e-01 0 25
## 372 bradley -1.462219e+00 2.265773e-01 0 25
## 373 dead -1.462219e+00 2.265773e-01 0 25
## 374 deeth -1.462219e+00 2.265773e-01 0 25
## 375 developed -1.462219e+00 2.265773e-01 0 25
## 376 disease -1.462219e+00 2.265773e-01 0 25
## 377 lock -1.462219e+00 2.265773e-01 0 25
## 378 trade -1.462219e+00 2.265773e-01 0 25
## 379 head -1.474029e+00 2.247111e-01 5 89
## 380 nothing -1.474029e+00 2.247111e-01 5 89
## 381 place -1.532945e+00 2.156710e-01 3 63
## 382 small -1.532945e+00 2.156710e-01 3 63
## 383 sir -1.544824e+00 2.139011e-01 1 42
## 384 beginning -1.554983e+00 2.124013e-01 0 26
## 385 death -1.554983e+00 2.124013e-01 0 26
## 386 farrel -1.554983e+00 2.124013e-01 0 26
## 387 hope -1.554983e+00 2.124013e-01 0 26
## 388 level -1.554983e+00 2.124013e-01 0 26
## 389 normal -1.554983e+00 2.124013e-01 0 26
## 390 somebody -1.554983e+00 2.124013e-01 0 26
## 391 spokesman -1.554983e+00 2.124013e-01 0 26
## 392 training -1.554983e+00 2.124013e-01 0 26
## 393 wave -1.554983e+00 2.124013e-01 0 26
## 394 body -1.627683e+00 2.020238e-01 1 43
## 395 call -1.627683e+00 2.020238e-01 1 43
## 396 men -1.627683e+00 2.020238e-01 1 43
## 397 drive -1.648105e+00 1.992170e-01 0 27
## 398 lot -1.648105e+00 1.992170e-01 0 27
## 399 medicine -1.648105e+00 1.992170e-01 0 27
## 400 shop -1.648105e+00 1.992170e-01 0 27
## 401 skull -1.648105e+00 1.992170e-01 0 27
## 402 thousand -1.648105e+00 1.992170e-01 0 27
## 403 found -1.692608e+00 1.932576e-01 4 79
## 404 us -1.702712e+00 1.919337e-01 9 144
## 405 cold -1.741548e+00 1.869428e-01 0 28
## 406 gun -1.741548e+00 1.869428e-01 0 28
## 407 hundred -1.741548e+00 1.869428e-01 0 28
## 408 minute -1.741548e+00 1.869428e-01 0 28
## 409 moruan -1.741548e+00 1.869428e-01 0 28
## 410 realized -1.741548e+00 1.869428e-01 0 28
## 411 ship's -1.741548e+00 1.869428e-01 0 28
## 412 woman -1.741548e+00 1.869428e-01 0 28
## 413 trying -1.795337e+00 1.802772e-01 1 45
## 414 much -1.796199e+00 1.801727e-01 5 94
## 415 another -1.827045e+00 1.764770e-01 3 67
## 416 life -1.827045e+00 1.764770e-01 3 67
## 417 afraid -1.835280e+00 1.755051e-01 0 29
## 418 broadcasts -1.835280e+00 1.755051e-01 0 29
## 419 part -1.835280e+00 1.755051e-01 0 29
## 420 questions -1.835280e+00 1.755051e-01 0 29
## 421 understand -1.835280e+00 1.755051e-01 0 29
## 422 get -1.862123e+00 1.723800e-01 16 233
## 423 appeared -1.929273e+00 1.648381e-01 0 30
## 424 fact -1.929273e+00 1.648381e-01 0 30
## 425 galactic -1.929273e+00 1.648381e-01 0 30
## 426 galaxy -1.929273e+00 1.648381e-01 0 30
## 427 hear -1.929273e+00 1.648381e-01 0 30
## 428 race -1.929273e+00 1.648381e-01 0 30
## 429 time -1.962688e+00 1.612261e-01 15 223
## 430 young -1.965309e+00 1.609466e-01 1 47
## 431 chance -2.023501e+00 1.548817e-01 0 31
## 432 earthmen -2.023501e+00 1.548817e-01 0 31
## 433 later -2.033268e+00 1.538894e-01 2 55
## 434 wrong -2.033268e+00 1.538894e-01 2 55
## 435 suddenly -2.051076e+00 1.520986e-01 1 48
## 436 machine -2.137323e+00 1.437525e-01 1 49
## 437 physicians -2.212581e+00 1.368895e-01 0 33
## 438 else -2.224022e+00 1.358786e-01 1 50
## 439 believe -2.307398e+00 1.287595e-01 0 34
## 440 certain -2.307398e+00 1.287595e-01 0 34
## 441 contact -2.307398e+00 1.287595e-01 0 34
## 442 howell -2.307398e+00 1.287595e-01 0 34
## 443 landing -2.307398e+00 1.287595e-01 0 34
## 444 answer -2.402378e+00 1.211509e-01 0 35
## 445 god -2.402378e+00 1.211509e-01 0 35
## 446 intelligent -2.402378e+00 1.211509e-01 0 35
## 447 patient -2.402378e+00 1.211509e-01 0 35
## 448 ships -2.402378e+00 1.211509e-01 0 35
## 449 surgery -2.402378e+00 1.211509e-01 0 35
## 450 almost -2.445210e+00 1.178841e-01 3 75
## 451 he’d -2.497510e+00 1.140265e-01 0 36
## 452 message -2.497510e+00 1.140265e-01 0 36
## 453 arnquist -2.592781e+00 1.073517e-01 0 37
## 454 graves -2.592781e+00 1.073517e-01 0 37
## 455 speaker -2.592781e+00 1.073517e-01 0 37
## 456 ain't -2.688180e+00 1.010952e-01 0 38
## 457 council -2.688180e+00 1.010952e-01 0 38
## 458 crew -2.688180e+00 1.010952e-01 0 38
## 459 light -2.688180e+00 1.010952e-01 0 38
## 460 plague -2.688180e+00 1.010952e-01 0 38
## 461 timgar -2.688180e+00 1.010952e-01 0 38
## 462 gorka -2.783699e+00 9.522800e-02 0 39
## 463 system -2.783699e+00 9.522800e-02 0 39
## 464 mind -2.793743e+00 9.463295e-02 2 64
## 465 among -2.879329e+00 8.972343e-02 0 40
## 466 dal's -2.879329e+00 8.972343e-02 0 40
## 467 operating -2.879329e+00 8.972343e-02 0 40
## 468 virus -2.879329e+00 8.972343e-02 0 40
## 469 better -2.948183e+00 8.597419e-02 4 96
## 470 information -2.975062e+00 8.455691e-02 0 41
## 471 mahon -2.975062e+00 8.455691e-02 0 41
## 472 report -2.975062e+00 8.455691e-02 0 41
## 473 general -3.070891e+00 7.970578e-02 0 42
## 474 haron -3.070891e+00 7.970578e-02 0 42
## 475 humans -3.070891e+00 7.970578e-02 0 42
## 476 reuben -3.070891e+00 7.970578e-02 0 42
## 477 screen -3.070891e+00 7.970578e-02 0 42
## 478 surgeon -3.070891e+00 7.970578e-02 0 42
## 479 garvian -3.166810e+00 7.514908e-02 0 43
## 480 physician -3.166810e+00 7.514908e-02 0 43
## 481 quite -3.166810e+00 7.514908e-02 0 43
## 482 first -3.257098e+00 7.111488e-02 8 156
## 483 betsy -3.262814e+00 7.086742e-02 0 44
## 484 matter -3.262814e+00 7.086742e-02 0 44
## 485 people -3.302263e+00 6.918449e-02 5 115
## 486 perhaps -3.358897e+00 6.684283e-02 0 45
## 487 creatures -3.455054e+00 6.305866e-02 0 46
## 488 across -3.580197e+00 5.847220e-02 1 56
## 489 voice -3.580197e+00 5.847220e-02 1 56
## 490 caldwell -3.647576e+00 5.615089e-02 0 48
## 491 control -3.674150e+00 5.526243e-02 1 57
## 492 red -3.743932e+00 5.299960e-02 0 49
## 493 inspector -3.936819e+00 4.724004e-02 0 51
## 494 trading -3.936819e+00 4.724004e-02 0 51
## 495 giles -4.033343e+00 4.460946e-02 0 52
## 496 tanner -4.033343e+00 4.460946e-02 0 52
## 497 post -4.051232e+00 4.413911e-02 1 61
## 498 contract -4.129918e+00 4.213138e-02 0 53
## 499 lecky -4.129918e+00 4.213138e-02 0 53
## 500 service -4.145792e+00 4.173812e-02 1 62
## 501 broadcast -4.323211e+00 3.759591e-02 0 55
## 502 field -4.323211e+00 3.759591e-02 0 55
## 503 brooks -4.419924e+00 3.552162e-02 0 56
## 504 mean -4.419924e+00 3.552162e-02 0 56
## 505 might -4.756826e+00 2.918232e-02 2 86
## 506 maybe -4.810429e+00 2.828800e-02 1 69
## 507 course -5.383253e+00 2.033093e-02 1 75
## 508 years -5.492756e+00 1.909541e-02 2 94
## 509 confederation -5.561838e+00 1.835613e-02 0 57
## 510 creature -5.561838e+00 1.835613e-02 0 57
## 511 star -5.757379e+00 1.641955e-02 0 59
## 512 machines -5.766430e+00 1.633518e-02 1 79
## 513 human -5.855159e+00 1.553146e-02 0 60
## 514 just -5.883854e+00 1.528034e-02 12 248
## 515 moklins -6.050739e+00 1.390048e-02 0 62
## 516 seemed -6.054200e+00 1.387326e-02 3 117
## 517 patrol -6.246345e+00 1.244498e-02 0 64
## 518 says -6.246627e+00 1.244301e-02 1 84
## 519 help -6.798047e+00 9.125765e-03 2 108
## 520 _lancet_ -6.833323e+00 8.947266e-03 0 70
## 521 began -6.833323e+00 8.947266e-03 0 70
## 522 moklin -7.616329e+00 5.784210e-03 0 78
## 523 matilda -8.007991e+00 4.657137e-03 0 82
## 524 doctors -8.203862e+00 4.180132e-03 0 84
## 525 now -8.591016e+00 3.378255e-03 12 284
## 526 medical -8.889617e+00 2.867969e-03 0 91
## 527 room -9.070585e+00 2.597539e-03 2 132
## 528 bellews -9.183612e+00 2.441917e-03 0 94
## 529 conger -9.281623e+00 2.314643e-03 0 95
## 530 earth -9.452436e+00 2.108688e-03 2 136
## 531 planet -1.016402e+01 1.432078e-03 0 104
## 532 fuzzy -1.085071e+01 9.875815e-04 0 111
## 533 man -1.236392e+01 4.377115e-04 1 147
## 534 sergeant -1.350238e+01 2.382610e-04 0 138
## 535 hospital -1.674990e+01 4.264451e-05 0 171
## 536 black -1.903686e+01 1.282174e-05 1 215
## 537 doctor -2.160301e+01 3.353252e-06 2 261
## 538 jack -2.227748e+01 2.359582e-06 0 227
## 539 ship -2.257419e+01 2.021828e-06 0 230
## 540 tiger -2.564376e+01 4.106215e-07 0 261
## 541 dal -5.023330e+01 1.365130e-12 0 507
Summary of the keyness data frame
# Print per-column summary statistics of the keyness results.
books_stat_keyness %>%
  summary() %>%
  print()
## feature chi2 p n_target
## Length:541 Min. : -50.2333 Min. :0.00000 Min. : 0.000
## Class :character 1st Qu.: -1.7415 1st Qu.:0.09523 1st Qu.: 0.000
## Mode :character Median : -0.4288 Median :0.25828 Median : 2.000
## Mean : 11.5541 Mean :0.36217 Mean : 4.874
## 3rd Qu.: 0.0706 3rd Qu.:0.62522 3rd Qu.: 5.000
## Max. :1887.2050 Max. :0.99619 Max. :185.000
## n_reference
## Min. : 0.00
## 1st Qu.: 24.00
## Median : 32.00
## Mean : 50.05
## 3rd Qu.: 53.00
## Max. :684.00
# Build the keyness plot from the keyness statistics, then render it.
plot_keyness <- books_stat_keyness %>%
  textplot_keyness()
plot_keyness %>%
  print()
# Compute the feature co-occurrence matrix from the document-feature matrix
# and print a preview of it.
books_fcm <- books_dfm %>%
  fcm()
books_fcm %>%
  print()
## Feature co-occurrence matrix of: 541 by 541 features.
## features
## features place four see something exactly know great though take old
## place 405 185 3071 1997 377 3255 983 976 1189 785
## four 0 26 724 482 80 803 230 246 306 248
## see 0 0 5759 7681 1438 12285 3991 3932 4441 2903
## something 0 0 0 2579 912 8272 2696 2621 2873 1867
## exactly 0 0 0 0 83 1437 481 470 543 337
## know 0 0 0 0 0 6547 4182 4123 4687 3133
## great 0 0 0 0 0 0 716 1392 1406 876
## though 0 0 0 0 0 0 0 678 1480 988
## take 0 0 0 0 0 0 0 0 889 1316
## old 0 0 0 0 0 0 0 0 0 474
## [ reached max_feat ... 531 more features, reached max_nfeat ... 531 more features ]
Subsetting based on top features
# Names of the 50 top features of the co-occurrence matrix.
feat <- books_fcm %>%
  topfeatures(50) %>%
  names()
# Log of the column totals of the document-feature matrix, restricted to
# those features.
size <- books_dfm %>%
  dfm_select(feat, selection = "keep") %>%
  colSums() %>%
  log()
# Keep only the same features in the co-occurrence matrix and preview it.
books_fcm_select <- books_fcm %>%
  fcm_select(pattern = feat, selection = "keep")
books_fcm_select %>%
  print()
## Feature co-occurrence matrix of: 50 by 50 features.
## features
## features now time said just two room came earth think seemed
## now 13869 22769 68910 27701 12212 14413 12185 18293 14601 12218
## time 0 9218 56605 22771 10045 11806 10092 14960 12011 9933
## said 0 0 90727 67578 30911 35799 31553 44427 37144 29822
## just 0 0 0 14379 12211 14620 11865 18777 14932 11874
## two 0 0 0 0 2668 6360 5521 7905 6518 5276
## room 0 0 0 0 0 3751 6300 9628 7604 6214
## came 0 0 0 0 0 0 2834 7705 6443 5324
## earth 0 0 0 0 0 0 0 6258 9604 7986
## think 0 0 0 0 0 0 0 0 3912 6274
## seemed 0 0 0 0 0 0 0 0 0 2674
## [ reached max_feat ... 40 more features, reached max_nfeat ... 40 more features ]
# Network plot of the selected co-occurrence matrix; vertex sizes are `size`
# rescaled so that its maximum maps to 3.
plot_fcm <- books_fcm_select %>%
  textplot_network(
    min_freq = 0.8,
    vertex_size = size / max(size) * 3
  )
plot_fcm %>%
  print()
# Dot plot of the 15 top features by frequency, with features ordered by
# their frequency and the axes flipped so feature names read horizontally.
plot_freq <- ggplot(
  textstat_frequency(books_dfm, n = 15),
  aes(x = reorder(feature, frequency), y = frequency)
) +
  geom_point() +
  coord_flip() +
  labs(x = NULL, y = "Frequency") +
  theme_minimal()
print(plot_freq)
# Lexical diversity per document (the printed result has a TTR column).
books_lexdiv <- books_dfm %>%
  textstat_lexdiv()
books_lexdiv %>%
  print()
## document TTR
## 1 Return to Pleasure Island 0.13917330
## 2 Star Surgeon 0.03489763
## 3 Divinity 0.25808879
## 4 The Machine That Saved The World 0.12849162
## 5 The Skull 0.18506998
## 6 Where There's Hope 0.41176471
## 7 The Dwindling Years 0.26083815
## 8 The Luckiest Man in Denv 0.28627451
## 9 Pen Pal 0.27806563
## 10 If You Was a Moklin 0.14186851
# Line plot of the TTR values, one position per row of books_lexdiv. The
# default x axis is suppressed so it can be redrawn with author labels.
plot(
  books_lexdiv$TTR,
  type = "l",
  xaxt = "n",
  xlab = NULL,
  ylab = "TTR"
)
grid()
# Custom x axis: one tick per document, labelled with the `author` docvar.
axis(
  side = 1,
  at = seq_along(books_lexdiv$TTR),
  labels = books_dfm$author,
  las = 3,
  hadj = 0
)
# Pairwise document distances from the document-feature matrix, converted
# to a "dist" object for hierarchical clustering.
books_dist <- books_dfm %>%
  textstat_dist() %>%
  as.dist()
clust <- hclust(d = books_dist)
# Draw the resulting dendrogram.
plot(
  clust,
  xlab = "Distance",
  ylab = NULL
)
# Score collocations with a count of at least 100 in the tokens object,
# then show the first 10 rows of the result.
stat_colloc <- books_tokens %>%
  textstat_collocations(min_count = 100)
stat_colloc %>%
  head(10) %>%
  print()
## collocation count count_nested length lambda z
## 1 had been 190 0 2 4.752868 45.51074
## 2 black doctor 164 0 2 8.216841 43.57646
## 3 of the 724 0 2 1.932639 42.20224
## 4 it was 245 0 2 2.919331 38.60291
## 5 he had 249 0 2 2.866186 37.67258
## 6 hospital earth 102 0 2 8.462323 37.19547
## 7 there was 146 0 2 3.798476 36.21688
## 8 he said 212 0 2 3.027291 36.20042
## 9 in the 461 0 2 2.103731 36.18028
## 10 i don't 112 0 2 4.651049 35.41933
Three-word collocations among capitalized tokens
# Keep only tokens that start with an upper-case letter (case-sensitive
# regex), with padding = TRUE so removed tokens leave a placeholder, then
# score three-token collocations with a count of at least 5.
stat_colloc2 <- textstat_collocations(
  tokens_select(
    books_tokens,
    pattern = "^[A-Z]",
    valuetype = "regex",
    case_insensitive = FALSE,
    padding = TRUE
  ),
  min_count = 5,
  size = 3
)
# Show the first 20 rows of the result.
print(head(stat_colloc2, 20))
## collocation count count_nested length lambda z
## 1 doctor hugo tanner 8 0 3 -0.5685091 -0.3138335
## 2 transcriber's note this 6 0 3 -2.2562365 -0.8240342
## 3 the black doctor's 5 0 3 -2.3219757 -1.4348132
## 4 black doctor hugo 9 0 3 -3.1647519 -1.5307211
## 5 four-star black doctor 5 0 3 -3.2740270 -1.5775303
## 6 old man bland 5 0 3 -7.2460743 -2.3826886
## 7 general practice patrol 16 0 3 -9.3662428 -3.1002355
## 8 black doctor tanner 20 0 3 -4.6894066 -3.1355719
## 9 black doctor arnquist 22 0 3 -5.7237567 -3.7058302
## 10 the black doctor 36 0 3 -2.6180264 -3.9773838