# Load required libraries (textdata supplies the NRC lexicon used by get_sentiments)
pacman::p_load(pacman, tidytext, textdata, dplyr, tidyr, ggplot2, readr, topicmodels, udpipe, gridExtra, wordcloud, RColorBrewer, quanteda, quanteda.textstats)
# Load the CSV file
ford_data <- read_csv("The_Ford_Family.csv", col_names = FALSE)
## Rows: 987 Columns: 1
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): X1
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Rename the column for easier reference
colnames(ford_data) <- c("text")
# Tokenize the text into words
tokens <- ford_data %>%
  unnest_tokens(word, text)
# Remove stop words
data("stop_words")
tokens <- tokens %>%
  anti_join(stop_words)
## Joining with `by = join_by(word)`
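# A minimal sketch (optional): the standard stop_words table can be extended with
# corpus-specific filler words before the anti_join. The words below are
# illustrative placeholders, not terms drawn from the Ford text, so the result
# is not reassigned to tokens here.
custom_stop_words <- bind_rows(
  stop_words,
  tibble(word = c("placeholder_word_1", "placeholder_word_2"), lexicon = "custom")
)
# tokens <- tokens %>% anti_join(custom_stop_words, by = "word")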
# Count word frequencies
word_counts <- tokens %>%
  count(word, sort = TRUE)
# Display the most common words
head(word_counts, 30)
## # A tibble: 30 × 2
## word n
## <chr> <int>
## 1 mother 59
## 2 dad 40
## 3 uncle 40
## 4 time 38
## 5 family 33
## 6 ford 31
## 7 grandfather 31
## 8 street 31
## 9 house 30
## 10 children 28
## # ℹ 20 more rows
# Plot the most common words
word_counts %>%
  top_n(30, n) %>%
  ggplot(aes(x = reorder(word, n), y = n)) +
  geom_bar(stat = "identity", fill = "#666600") +
  coord_flip() +
  labs(
    title = "Most Common Words in A Collection of Stories by Frank William Ford",
    subtitle = "Analysis by Patrick Ford 2024",
    x = "Words",
    y = "Frequency"
  ) +
  theme_minimal()
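# A minimal sketch: the same counts expressed as a share of all retained
# (non-stop-word) tokens, which makes frequencies comparable across texts of
# different lengths.
word_counts %>%
  mutate(share = round(n / sum(n) * 100, 2)) %>%
  head(10)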

# Sentiment Analysis using Bing Lexicon
bing_sentiments <- tokens %>%
  inner_join(get_sentiments("bing"))
## Joining with `by = join_by(word)`
# Count positive and negative words
bing_sentiments_count <- bing_sentiments %>%
  count(sentiment, sort = TRUE)
# Add percentage column
bing_sentiments_count <- bing_sentiments_count %>%
  mutate(percentage = n / sum(n) * 100)
# Plot Bing sentiment counts and percentages
bing_plot <- ggplot(bing_sentiments_count, aes(x = sentiment, y = n, fill = sentiment)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = paste0(round(percentage, 1), "%")), vjust = 1) +
  labs(
    title = "Bing Sentiment Analysis of A Collection of Stories by Frank William Ford",
    subtitle = "Analysis by Patrick Ford 2024",
    x = "Sentiment",
    y = "Count"
  ) +
  theme_minimal()
# Sentiment Analysis using NRC Lexicon
# (relationship = "many-to-many" is expected: NRC maps words to multiple categories)
nrc_sentiments <- tokens %>%
  inner_join(get_sentiments("nrc"), relationship = "many-to-many")
## Joining with `by = join_by(word)`
# Count NRC sentiments
nrc_sentiments_count <- nrc_sentiments %>%
  count(sentiment, sort = TRUE)
# Add percentage column
nrc_sentiments_count <- nrc_sentiments_count %>%
  mutate(percentage = n / sum(n) * 100)
# Plot NRC sentiment counts and percentages with adjusted label positioning
nrc_plot <- ggplot(nrc_sentiments_count, aes(x = reorder(sentiment, n), y = n, fill = sentiment)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = paste0(round(percentage, 1), "%")), hjust = 1) +
  coord_flip() +
  labs(
    title = "NRC Sentiment Analysis of A Collection of Stories by Frank William Ford",
    subtitle = "Analysis by Patrick Ford 2024",
    x = "Sentiment",
    y = "Count"
  ) +
  theme_minimal()
# Arrange Bing and NRC plots side by side
grid.arrange(bing_plot, nrc_plot, nrow = 1)
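# A minimal sketch (assumes each row of the CSV is one passage of the stories):
# net Bing sentiment per passage, showing how the tone moves across the
# collection rather than only in aggregate.
bing_by_passage <- ford_data %>%
  mutate(passage = row_number()) %>%
  unnest_tokens(word, text) %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(passage, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(net = positive - negative)
ggplot(bing_by_passage, aes(x = passage, y = net)) +
  geom_col(fill = "#666600") +
  labs(
    title = "Net Bing Sentiment by Passage",
    subtitle = "Analysis by Patrick Ford 2024",
    x = "Passage",
    y = "Positive minus negative word count"
  ) +
  theme_minimal()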

# Generate Word Cloud
set.seed(1234)
wordcloud(words = word_counts$word, freq = word_counts$n, min.freq = 5,
          max.words = 150, random.order = FALSE, rot.per = 0.1,
          scale = c(2.5, 0.5), colors = brewer.pal(8, "Dark2"))
mtext("Word Cloud of A Collection of Stories by Frank William Ford", side = 3, adj = 0, line = 1, cex = 1, font = 2)
mtext("Analysis by Patrick Ford 2024", side = 3, adj = 0, line = -1, cex = 0.9, font = 3)

# Prepare data for Topic Modeling
# Create a document-term matrix, treating each row of the CSV as one document
# (the pooled tokens table no longer carries a document id)
dtm <- ford_data %>%
  mutate(document = row_number()) %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = "word") %>%
  count(document, word) %>%
  cast_dtm(document, word, n)
# Set the number of topics
num_topics <- 6
# Run LDA
lda_model <- LDA(dtm, k = num_topics, control = list(seed = 1234))
# Get the top terms for each topic
lda_terms <- tidy(lda_model, matrix = "beta")
# Display the top terms for each topic in a table format
top_terms <- lda_terms %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)
# Print the top terms for each topic
top_terms %>%
  group_by(topic) %>%
  summarize(terms = paste(term, collapse = ", ")) %>%
  print()
## # A tibble: 6 × 2
## topic terms
## <int> <chr>
## 1 1 joyce, house, horse, street, mother, grandfather, family, dad, close, c…
## 2 2 uncle, king, left, mother, time, wife, war, house, england, dad
## 3 3 mother, uncle, grandfather, dad, time, ford, children, horse, house, lo…
## 4 4 mother, family, time, gloucester, day, life, love, home, street, father
## 5 5 ford, uncle, grandfather, time, family, dad, horse, kids, street, fruit
## 6 6 gloucester, left, dad, time, children, war, bill, ern, family, school
# Plot the top terms for each topic
top_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(x = term, y = beta, fill = as.factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free_y") +
  coord_flip() +
  labs(
    title = "Top Terms in Each Topic in A Collection of Stories by Frank William Ford",
    subtitle = "Analysis by Patrick Ford 2024",
    x = "Terms",
    y = "Beta"
  ) +
  scale_x_reordered() +
  theme_minimal()
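# A minimal sketch: the per-document topic proportions (gamma) from the same
# LDA fit, tallying which topic dominates each document.
lda_gamma <- tidy(lda_model, matrix = "gamma")
lda_gamma %>%
  group_by(document) %>%
  slice_max(gamma, n = 1, with_ties = FALSE) %>%
  ungroup() %>%
  count(topic, name = "dominant_in_documents")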

# Textual Complexity: Flesch-Kincaid Readability
ford_text <- paste(ford_data$text, collapse = " ")
readability <- textstat_readability(ford_text, measure = "Flesch.Kincaid")
print(paste("Flesch-Kincaid Readability Score; A Collection of Stories by Frank William Ford:",
            readability$Flesch.Kincaid))
## [1] "Flesch-Kincaid Readability Score; A Collection of Stories by Frank William Ford: 4.98625170783992"
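# A minimal sketch: readability scored per row of the CSV (each row treated as
# one passage) rather than for the collapsed text, so unusually simple or
# complex passages stand out.
readability_by_passage <- textstat_readability(ford_data$text, measure = "Flesch.Kincaid")
summary(readability_by_passage$Flesch.Kincaid)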
# Bigram Analysis
bigrams <- ford_data %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  separate(bigram, into = c("word1", "word2"), sep = " ") %>%
  filter(!is.na(word1), !is.na(word2)) %>% # Remove NA rows from single-word lines
  filter(!word1 %in% stop_words$word, !word2 %in% stop_words$word) %>%
  count(word1, word2, sort = TRUE)
# Plot Most Common Bigrams
bigrams %>%
  top_n(20, n) %>%
  ggplot(aes(x = reorder(paste(word1, word2, sep = " "), n), y = n)) +
  geom_bar(stat = "identity", fill = "#cc9900") +
  coord_flip() +
  labs(
    title = "Most Common Bigrams in A Collection of Stories by Frank William Ford",
    subtitle = "Analysis by Patrick Ford 2024",
    x = "Bigram",
    y = "Frequency"
  ) +
  theme_minimal()
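# A minimal sketch: the same n-gram pipeline extended to trigrams, which keeps
# together short phrases that the bigram split breaks apart.
trigrams <- ford_data %>%
  unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
  separate(trigram, into = c("word1", "word2", "word3"), sep = " ") %>%
  filter(!is.na(word1), !is.na(word2), !is.na(word3)) %>%
  filter(!word1 %in% stop_words$word,
         !word2 %in% stop_words$word,
         !word3 %in% stop_words$word) %>%
  count(word1, word2, word3, sort = TRUE)
head(trigrams, 10)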

# Named Entity Recognition (NER), approximated via udpipe part-of-speech tagging
# Download and load the English model
model <- udpipe_download_model(language = "english")
## Downloading udpipe model from https://raw.githubusercontent.com/jwijffels/udpipe.models.ud.2.5/master/inst/udpipe-ud-2.5-191206/english-ewt-ud-2.5-191206.udpipe to /cloud/project/english-ewt-ud-2.5-191206.udpipe
## - This model has been trained on version 2.5 of data from https://universaldependencies.org
## - The model is distributed under the CC-BY-SA-NC license: https://creativecommons.org/licenses/by-nc-sa/4.0
## - Visit https://github.com/jwijffels/udpipe.models.ud.2.5 for model license details.
## - For a list of all models and their licenses (most models you can download with this package have either a CC-BY-SA or a CC-BY-SA-NC license) read the documentation at ?udpipe_download_model. For building your own models: visit the documentation by typing vignette('udpipe-train', package = 'udpipe')
## Downloading finished, model stored at '/cloud/project/english-ewt-ud-2.5-191206.udpipe'
ud_model <- udpipe_load_model(file = model$file_model)
# Annotate text
annotations <- udpipe_annotate(ud_model, x = ford_data$text)
annotations_df <- as.data.frame(annotations)
# Extract proper nouns (PROPN) as approximate named entities
named_entities <- annotations_df %>%
  filter(upos == "PROPN") %>%
  count(token, sort = TRUE)
# Print Named Entities
print(named_entities)
## token n
## 1 Ford 29
## 2 Gloucester 26
## 3 Joyce 24
## 4 Grandfather 22
## 5 King 21
## 6 Ern 20
## 7 England 17
## 8 David 14
## 9 Tom 13
## 10 Auntie 12
## 11 Bill 12
## 12 John 12
## 13 Albert 11
## 14 Charlie 11
## 15 Granny 11
## 16 Fred 10
## 17 Gran 10
## 18 Alf 9
## 19 Gladys 9
## 20 Mary 9
## 21 Sam 9
## 22 War 9
## 23 Clare 8
## 24 Grandmother 8
## 25 Frank 7
## 26 Henry 7
## 27 Karen 7
## 28 Mount 7
## 29 Sunday 7
## 30 America 6
## 31 Don 6
## 32 God 6
## 33 London 6
## 34 Paula 6
## 35 Street 6
## 36 World 6
## 37 Alice 5
## 38 Dora 5
## 39 George 5
## 40 Harry 5
## 41 Mr 5
## 42 Rugby 5
## 43 Alfs 4
## 44 Christmas 4
## 45 City 4
## 46 English 4
## 47 First 4
## 48 France 4
## 49 Mrs 4
## 50 Royal 4
## 51 Uncle 4
## 52 Voyce 4
## 53 street 4
## 54 Barradine 3
## 55 Beryl 3
## 56 Black 3
## 57 Brother 3
## 58 Edith 3
## 59 Elver 3
## 60 Glady 3
## 61 Great 3
## 62 India 3
## 63 Ireland 3
## 64 Juliette 3
## 65 Kings 3
## 66 Martha 3
## 67 Monday 3
## 68 Sergeant 3
## 69 Sister 3
## 70 Ted 3
## 71 Victoria 3
## 72 Watch 3
## 73 Westgate 3
## 74 Air 2
## 75 Americans 2
## 76 Archdeacon 2
## 77 Army 2
## 78 Avonmouth 2
## 79 Bottle 2
## 80 Captain 2
## 81 Cats 2
## 82 Cork 2
## 83 County 2
## 84 Em 2
## 85 Father 2
## 86 Force 2
## 87 Germans 2
## 88 Hell 2
## 89 Hogg 2
## 90 Jonathan 2
## 91 Jug 2
## 92 Kitchener 2
## 93 Later 2
## 94 Lyza 2
## 95 Major 2
## 96 Market 2
## 97 Meadow 2
## 98 Michael 2
## 99 North 2
## 100 Patrick 2
## 101 Prison 2
## 102 Queen 2
## 103 Rosemary 2
## 104 Service 2
## 105 Severn 2
## 106 Somme 2
## 107 Super 2
## 108 Terry 2
## 109 Thompson 2
## 110 Toni 2
## 111 Torquay 2
## 112 Villas 2
## 113 Violet 2
## 114 Weston 2
## 115 Will 2
## 116 William 2
## 117 Winnie 2
## 118 auntie 2
## 119 ALFS 1
## 120 Abbey 1
## 121 Antoinette 1
## 122 Aprile 1
## 123 Ass 1
## 124 August 1
## 125 Aunt 1
## 126 Australia 1
## 127 Barge 1
## 128 Bedford 1
## 129 Belgium 1
## 130 Bible 1
## 131 Biggles 1
## 132 Birmingham 1
## 133 Boughton 1
## 134 Bristol 1
## 135 Brussels 1
## 136 Bull 1
## 137 Buttercups 1
## 138 Cancer 1
## 139 Cary 1
## 140 Charles 1
## 141 Charter 1
## 142 Cheltenham 1
## 143 Chest 1
## 144 China 1
## 145 Chips 1
## 146 Clarice 1
## 147 Corp 1
## 148 Council 1
## 149 Coventry 1
## 150 Cream 1
## 151 Daisies 1
## 152 Damson 1
## 153 Dan 1
## 154 Daughters 1
## 155 Davis 1
## 156 De 1
## 157 Diddler 1
## 158 Doll 1
## 159 Doreen 1
## 160 Dorney 1
## 161 Dunkirk 1
## 162 E.N. 1
## 163 Eastgate 1
## 164 Eedie 1
## 165 Eighth 1
## 166 Elizabeth 1
## 167 Emerald 1
## 168 En 1
## 169 Ether 1
## 170 FORD 1
## 171 FRANK 1
## 172 Folk 1
## 173 Fordy 1
## 174 Fruit 1
## 175 GOT 1
## 176 Gardener 1
## 177 General 1
## 178 Gentleman 1
## 179 Germany 1
## 180 Glady's 1
## 181 Gloucesters 1
## 182 Gloucestershire 1
## 183 Gobey 1
## 184 Grans 1
## 185 Grant 1
## 186 Griffiths 1
## 187 Guildhall 1
## 188 Gun 1
## 189 H.P. 1
## 190 HOUSE 1
## 191 Harada 1
## 192 Harold 1
## 193 Harrison 1
## 194 Henrys 1
## 195 Hillsley 1
## 196 Him 1
## 197 Hiroshima 1
## 198 Hitler 1
## 199 Hollywood 1
## 200 INTO 1
## 201 Ice 1
## 202 Infantry 1
## 203 Infirmary 1
## 204 Irishman 1
## 205 Irishmen 1
## 206 Isle 1
## 207 Joke 1
## 208 Jokers 1
## 209 Karate 1
## 210 Kentucky 1
## 211 Khayyam 1
## 212 Kite 1
## 213 Kosher 1
## 214 Lee 1
## 215 Lemon 1
## 216 Life 1
## 217 Limpy 1
## 218 Lode 1
## 219 Lord 1
## 220 Lorry 1
## 221 MARRIED 1
## 222 MOVE 1
## 223 Machine 1
## 224 Mall 1
## 225 Mantles 1
## 226 Mare 1
## 227 Mastiff 1
## 228 Mayor 1
## 229 Mitsazuki 1
## 230 Mix 1
## 231 Mons 1
## 232 Montpellier 1
## 233 Mum 1
## 234 Mustard 1
## 235 NUMBER 1
## 236 Navy 1
## 237 Nell 1
## 238 Nicholas 1
## 239 Office 1
## 240 Old 1
## 241 Omar 1
## 242 Palomino 1
## 243 Peanut 1
## 244 Pegasus 1
## 245 Peter 1
## 246 Pilot 1
## 247 Poland 1
## 248 Port 1
## 249 Post 1
## 250 Priory 1
## 251 Ragged 1
## 252 Rememb 1
## 253 Rex 1
## 254 Road 1
## 255 Romany 1
## 256 Royston 1
## 257 Ruby 1
## 258 S.A 1
## 259 STORIES 1
## 260 Salvation 1
## 261 Sara 1
## 262 Saturday 1
## 263 Saturdays 1
## 264 School 1
## 265 Second 1
## 266 Seven 1
## 267 Soon 1
## 268 Spice 1
## 269 St 1
## 270 St. 1
## 271 States 1
## 272 Steve 1
## 273 Stroud 1
## 274 Stuka 1
## 275 Sundays 1
## 276 Tewkesbury 1
## 277 Thomas 1
## 278 Toms 1
## 279 Trojan 1
## 280 U.S.A 1
## 281 United 1
## 282 Vaseline 1
## 283 Veteran 1
## 284 WAR. 1
## 285 WILLIAM 1
## 286 Wagon 1
## 287 Wales 1
## 288 Walrus 1
## 289 Wanna 1
## 290 Warrington 1
## 291 Webb 1
## 292 Woodcocks 1
## 293 Works 1
## 294 Yank 1
## 295 Yorkshire 1
## 296 boys 1
## 297 cart 1
## 298 jam 1
## 299 park 1
## 300 pedestrians 1
## 301 river 1
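# A minimal sketch: the PROPN counts above keep "Street"/"street" and
# "Auntie"/"auntie" as separate tokens; folding case before counting merges
# such duplicates.
named_entities_folded <- annotations_df %>%
  filter(upos == "PROPN") %>%
  count(token = tolower(token), sort = TRUE)
head(named_entities_folded, 20)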
# Plot Named Entities
named_entities %>%
  top_n(20, n) %>%
  ggplot(aes(x = reorder(token, n), y = n)) +
  geom_bar(stat = "identity", fill = "#3399ff") +
  coord_flip() +
  labs(
    title = "Most Frequent Named Entities in A Collection of Stories by Frank William Ford",
    subtitle = "Analysis by Patrick Ford 2024",
    x = "Entity",
    y = "Frequency"
  ) +
  theme_minimal()

# Generate a Word Cloud for Named Entities
wordcloud(words = named_entities$token, freq = named_entities$n, min.freq = 2,
          max.words = 150, random.order = FALSE, rot.per = 0.1,
          scale = c(2.5, 0.5), colors = brewer.pal(8, "Dark2"))
mtext("Word Cloud of Named Entities in A Collection of Stories by Frank William Ford", side = 3, adj = 0, line = 1, cex = 1, font = 2)
mtext("Analysis by Patrick Ford 2024", side = 3, adj = 0, line = -1, cex = 0.9, font = 3)
