# Load required libraries (pacman::p_load() installs any that are missing;
# textdata is required by get_sentiments("nrc") for the NRC lexicon)
pacman::p_load(tidytext, textdata, dplyr, tidyr, ggplot2, readr, topicmodels, udpipe, gridExtra, wordcloud, RColorBrewer, quanteda, quanteda.textstats)

# Load the CSV file
ford_data <- read_csv("The_Ford_Family.csv", col_names = FALSE)
## Rows: 987 Columns: 1
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): X1
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Rename the column for easier reference
colnames(ford_data) <- c("text")

# Tokenize the text into words
tokens <- ford_data %>%
  unnest_tokens(word, text)

# Remove stop words
data("stop_words")
tokens <- tokens %>%
  anti_join(stop_words)
## Joining with `by = join_by(word)`
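
# An optional sketch: the standard stop list can be extended with
# corpus-specific words; the two words below are illustrative assumptions,
# not drawn from the text, and the result is kept in a separate object
custom_stops <- tibble(word = c("twas", "oer"), lexicon = "custom")
tokens_custom <- tokens %>%
  anti_join(custom_stops, by = "word")
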
# Count word frequencies
word_counts <- tokens %>%
  count(word, sort = TRUE)

# Display the 30 most common words (tibble printing shows the first 10 rows)
head(word_counts, 30)
## # A tibble: 30 × 2
##    word            n
##    <chr>       <int>
##  1 mother         59
##  2 dad            40
##  3 uncle          40
##  4 time           38
##  5 family         33
##  6 ford           31
##  7 grandfather    31
##  8 street         31
##  9 house          30
## 10 children       28
## # ℹ 20 more rows
# Plot the most common words
word_counts %>%
  top_n(30, n) %>%
  ggplot(aes(x = reorder(word, n), y = n)) +
  geom_col(fill = "#666600") +
  coord_flip() +
  labs(
    title = "Most Common Words in A Collection of Stories by Frank William Ford",
    subtitle = "Analysis by Patrick Ford 2024",
    x = "Words",
    y = "Frequency"
  ) +
  theme_minimal()

# Sentiment Analysis using Bing Lexicon
bing_sentiments <- tokens %>%
  inner_join(get_sentiments("bing"))
## Joining with `by = join_by(word)`
# Count positive and negative words
bing_sentiments_count <- bing_sentiments %>%
  count(sentiment, sort = TRUE)

# Add percentage column
bing_sentiments_count <- bing_sentiments_count %>%
  mutate(percentage = n / sum(n) * 100)

# Plot Bing sentiment counts and percentages
bing_plot <- ggplot(bing_sentiments_count, aes(x = sentiment, y = n, fill = sentiment)) +
  geom_col() +
  geom_text(aes(label = paste0(round(percentage, 1), "%")), vjust = 1) +
  labs(
    title = "Sentiment Analysis of A Collection of Stories by Frank William Ford using Bing Lexicon",
    subtitle = "Analysis by Patrick Ford 2024",
    x = "Sentiment",
    y = "Count"
  ) +
  theme_minimal()
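
# A follow-up sketch: which individual words contribute most to each Bing
# sentiment? Assumes the `bing_sentiments` object built above.
bing_sentiments %>%
  count(word, sentiment, sort = TRUE) %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10) %>%
  ungroup() %>%
  mutate(word = reorder_within(word, n, sentiment)) %>%
  ggplot(aes(x = word, y = n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ sentiment, scales = "free_y") +
  coord_flip() +
  scale_x_reordered() +
  labs(x = "Word", y = "Contribution to sentiment")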

# Sentiment Analysis using NRC Lexicon
# (a word can map to several NRC sentiments, so the join is declared many-to-many)
nrc_sentiments <- tokens %>%
  inner_join(get_sentiments("nrc"), relationship = "many-to-many")
## Joining with `by = join_by(word)`
# Count NRC sentiments
nrc_sentiments_count <- nrc_sentiments %>%
  count(sentiment, sort = TRUE)

# Add percentage column
nrc_sentiments_count <- nrc_sentiments_count %>%
  mutate(percentage = n / sum(n) * 100)

# Plot NRC sentiment counts and percentages with adjusted label positioning
nrc_plot <- ggplot(nrc_sentiments_count, aes(x = reorder(sentiment, n), y = n, fill = sentiment)) +
  geom_col() +
  geom_text(aes(label = paste0(round(percentage, 1), "%")), hjust = 1) +
  coord_flip() +
  labs(
    title = "NRC Sentiment Analysis of A Collection of Stories by Frank William Ford",
    subtitle = "Analysis by Patrick Ford 2024",
    x = "Sentiment",
    y = "Count"
  ) +
  theme_minimal()

# Arrange Bing and NRC plots side by side
grid.arrange(bing_plot, nrc_plot, nrow = 1)
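
# A further sketch: net Bing sentiment across the collection. The text is
# re-tokenised with a line index and grouped into blocks of 20 lines; the
# block size is an arbitrary smoothing assumption.
sentiment_trajectory <- ford_data %>%
  mutate(line = row_number()) %>%
  unnest_tokens(word, text) %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(block = line %/% 20, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(net = positive - negative)

ggplot(sentiment_trajectory, aes(x = block, y = net)) +
  geom_col(fill = "#666600") +
  labs(x = "Block of 20 lines", y = "Net Bing sentiment (positive - negative)") +
  theme_minimal()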

# Generate Word Cloud 
set.seed(1234)
wordcloud(words = word_counts$word, freq = word_counts$n, min.freq = 5,
          max.words = 150,  
          random.order = FALSE, rot.per = 0.1,  
          scale = c(2.5, 0.5),  
          colors = brewer.pal(8, "Dark2"))
mtext("Word Cloud of A Collection of Stories by Frank William Ford", side = 3, adj = 0, line = 1, cex = 1, font = 2)
mtext("Analysis by Patrick Ford 2024", side = 3, adj = 0, line = -1, cex = 0.9, font = 3)

# Prepare data for Topic Modeling
# Create a document-term matrix, treating each row of the CSV as a document
# (counting by row_number() on the flat token list would make every token its
# own one-word document, so the id is assigned before tokenising)
dtm <- ford_data %>%
  mutate(document = row_number()) %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = "word") %>%
  count(document, word) %>%
  cast_dtm(document, word, n)

# Set the number of topics
num_topics <- 6

# Run LDA
lda_model <- LDA(dtm, k = num_topics, control = list(seed = 1234))
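
# The topic count is fixed at six above; a rough sketch for comparing
# candidate values of k by model perplexity (lower is generally better):
candidate_k <- c(2, 4, 6, 8, 10)
perplexities <- sapply(candidate_k, function(k) {
  perplexity(LDA(dtm, k = k, control = list(seed = 1234)))
})
plot(candidate_k, perplexities, type = "b",
     xlab = "Number of topics (k)", ylab = "Perplexity")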

# Get the top terms for each topic
lda_terms <- tidy(lda_model, matrix = "beta")

# Display the top terms for each topic in a table format
top_terms <- lda_terms %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)

# Print the top terms for each topic
top_terms %>%
  group_by(topic) %>%
  summarize(terms = paste(term, collapse = ", ")) %>%
  print()
## # A tibble: 6 × 2
##   topic terms                                                                   
##   <int> <chr>                                                                   
## 1     1 joyce, house, horse, street, mother, grandfather, family, dad, close, c…
## 2     2 uncle, king, left, mother, time, wife, war, house, england, dad         
## 3     3 mother, uncle, grandfather, dad, time, ford, children, horse, house, lo…
## 4     4 mother, family, time, gloucester, day, life, love, home, street, father 
## 5     5 ford, uncle, grandfather, time, family, dad, horse, kids, street, fruit 
## 6     6 gloucester, left, dad, time, children, war, bill, ern, family, school
# Plot the top terms for each topic
top_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(x = term, y = beta, fill = as.factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free_y") +
  coord_flip() +
  labs(
    title = "Top Terms in Each Topic in A Collection of Stories by Frank William Ford",
    subtitle = "Analysis by Patrick Ford 2024",
    x = "Terms",
    y = "Beta"
  ) +
  scale_x_reordered() +
  theme_minimal()
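
# A complementary sketch: per-document topic proportions (the "gamma" matrix),
# counting how many documents each topic dominates.
lda_gamma <- tidy(lda_model, matrix = "gamma")

lda_gamma %>%
  group_by(document) %>%
  slice_max(gamma, n = 1) %>%
  ungroup() %>%
  count(topic, sort = TRUE)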

# Textual Complexity: Flesch-Kincaid Readability
ford_text <- paste(ford_data$text, collapse = " ")
readability <- textstat_readability(ford_text, measure = "Flesch.Kincaid")

# Extract the numeric score; pasting the whole data frame also prints the document name
print(paste("Flesch-Kincaid Readability Score; A Collection of Stories by Frank William Ford:",
            round(readability$Flesch.Kincaid, 2)))
## [1] "Flesch-Kincaid Readability Score; A Collection of Stories by Frank William Ford: 4.99"
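
# A broader sketch: textstat_readability() can compute several measures at
# once, e.g. Flesch reading ease and the FOG index alongside Flesch-Kincaid.
textstat_readability(ford_text, measure = c("Flesch", "Flesch.Kincaid", "FOG"))
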
# Bigram Analysis
bigrams <- ford_data %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  separate(bigram, into = c("word1", "word2"), sep = " ") %>%
  filter(!is.na(word1), !is.na(word2)) %>%  # Remove NA words
  filter(!word1 %in% stop_words$word, !word2 %in% stop_words$word) %>%
  count(word1, word2, sort = TRUE)

# Plot Most Common Bigrams
bigrams %>%
  top_n(20, n) %>%
  ggplot(aes(x = reorder(paste(word1, word2, sep = " "), n), y = n)) +
  geom_col(fill = "#cc9900") +
  coord_flip() +
  labs(
    title = "Most Common Bigrams in A Collection of Stories by Frank William Ford",
    subtitle = "Analysis by Patrick Ford 2024",
    x = "Bigram",
    y = "Frequency"
  ) +
  theme_minimal()
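
# A sketch of the bigrams as a network. Assumes the igraph and ggraph packages
# are installed; the n >= 4 threshold is an arbitrary choice for legibility.
pacman::p_load(igraph, ggraph)
bigram_graph <- bigrams %>%
  filter(n >= 4) %>%
  graph_from_data_frame()

set.seed(1234)
ggraph(bigram_graph, layout = "fr") +
  geom_edge_link(alpha = 0.6) +
  geom_node_point(colour = "#cc9900", size = 2) +
  geom_node_text(aes(label = name), repel = TRUE, size = 3) +
  theme_void()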

# Approximate Named Entity Recognition (NER) using udpipe
# (entities are approximated as proper nouns via part-of-speech tags)
# Download and load the English model
model <- udpipe_download_model(language = "english")
## Downloading udpipe model from https://raw.githubusercontent.com/jwijffels/udpipe.models.ud.2.5/master/inst/udpipe-ud-2.5-191206/english-ewt-ud-2.5-191206.udpipe to /cloud/project/english-ewt-ud-2.5-191206.udpipe
##  - This model has been trained on version 2.5 of data from https://universaldependencies.org
##  - The model is distributed under the CC-BY-SA-NC license: https://creativecommons.org/licenses/by-nc-sa/4.0
##  - Visit https://github.com/jwijffels/udpipe.models.ud.2.5 for model license details.
##  - For a list of all models and their licenses (most models you can download with this package have either a CC-BY-SA or a CC-BY-SA-NC license) read the documentation at ?udpipe_download_model. For building your own models: visit the documentation by typing vignette('udpipe-train', package = 'udpipe')
## Downloading finished, model stored at '/cloud/project/english-ewt-ud-2.5-191206.udpipe'
ud_model <- udpipe_load_model(file = model$file_model)

# Annotate text
annotations <- udpipe_annotate(ud_model, x = ford_data$text)
annotations_df <- as.data.frame(annotations)

# Extract named entities, approximated as proper nouns (upos == "PROPN")
named_entities <- annotations_df %>%
  filter(upos == "PROPN") %>%
  count(token, sort = TRUE)

# Print Named Entities
print(named_entities)
##               token  n
## 1              Ford 29
## 2        Gloucester 26
## 3             Joyce 24
## 4       Grandfather 22
## 5              King 21
## 6               Ern 20
## 7           England 17
## 8             David 14
## 9               Tom 13
## 10           Auntie 12
## 11             Bill 12
## 12             John 12
## 13           Albert 11
## 14          Charlie 11
## 15           Granny 11
## 16             Fred 10
## 17             Gran 10
## 18              Alf  9
## 19           Gladys  9
## 20             Mary  9
## 21              Sam  9
## 22              War  9
## 23            Clare  8
## 24      Grandmother  8
## 25            Frank  7
## 26            Henry  7
## 27            Karen  7
## 28            Mount  7
## 29           Sunday  7
## 30          America  6
## 31              Don  6
## 32              God  6
## 33           London  6
## 34            Paula  6
## 35           Street  6
## 36            World  6
## 37            Alice  5
## 38             Dora  5
## 39           George  5
## 40            Harry  5
## 41               Mr  5
## 42            Rugby  5
## 43             Alfs  4
## 44        Christmas  4
## 45             City  4
## 46          English  4
## 47            First  4
## 48           France  4
## 49              Mrs  4
## 50            Royal  4
## 51            Uncle  4
## 52            Voyce  4
## 53           street  4
## 54        Barradine  3
## 55            Beryl  3
## 56            Black  3
## 57          Brother  3
## 58            Edith  3
## 59            Elver  3
## 60            Glady  3
## 61            Great  3
## 62            India  3
## 63          Ireland  3
## 64         Juliette  3
## 65            Kings  3
## 66           Martha  3
## 67           Monday  3
## 68         Sergeant  3
## 69           Sister  3
## 70              Ted  3
## 71         Victoria  3
## 72            Watch  3
## 73         Westgate  3
## 74              Air  2
## 75        Americans  2
## 76       Archdeacon  2
## 77             Army  2
## 78        Avonmouth  2
## 79           Bottle  2
## 80          Captain  2
## 81             Cats  2
## 82             Cork  2
## 83           County  2
## 84               Em  2
## 85           Father  2
## 86            Force  2
## 87          Germans  2
## 88             Hell  2
## 89             Hogg  2
## 90         Jonathan  2
## 91              Jug  2
## 92        Kitchener  2
## 93            Later  2
## 94             Lyza  2
## 95            Major  2
## 96           Market  2
## 97           Meadow  2
## 98          Michael  2
## 99            North  2
## 100         Patrick  2
## 101          Prison  2
## 102           Queen  2
## 103        Rosemary  2
## 104         Service  2
## 105          Severn  2
## 106           Somme  2
## 107           Super  2
## 108           Terry  2
## 109        Thompson  2
## 110            Toni  2
## 111         Torquay  2
## 112          Villas  2
## 113          Violet  2
## 114          Weston  2
## 115            Will  2
## 116         William  2
## 117          Winnie  2
## 118          auntie  2
## 119            ALFS  1
## 120           Abbey  1
## 121      Antoinette  1
## 122          Aprile  1
## 123             Ass  1
## 124          August  1
## 125            Aunt  1
## 126       Australia  1
## 127           Barge  1
## 128         Bedford  1
## 129         Belgium  1
## 130           Bible  1
## 131         Biggles  1
## 132      Birmingham  1
## 133        Boughton  1
## 134         Bristol  1
## 135        Brussels  1
## 136            Bull  1
## 137      Buttercups  1
## 138          Cancer  1
## 139            Cary  1
## 140         Charles  1
## 141         Charter  1
## 142      Cheltenham  1
## 143           Chest  1
## 144           China  1
## 145           Chips  1
## 146         Clarice  1
## 147            Corp  1
## 148         Council  1
## 149        Coventry  1
## 150           Cream  1
## 151         Daisies  1
## 152          Damson  1
## 153             Dan  1
## 154       Daughters  1
## 155           Davis  1
## 156              De  1
## 157         Diddler  1
## 158            Doll  1
## 159          Doreen  1
## 160          Dorney  1
## 161         Dunkirk  1
## 162            E.N.  1
## 163        Eastgate  1
## 164           Eedie  1
## 165          Eighth  1
## 166       Elizabeth  1
## 167         Emerald  1
## 168              En  1
## 169           Ether  1
## 170            FORD  1
## 171           FRANK  1
## 172            Folk  1
## 173           Fordy  1
## 174           Fruit  1
## 175             GOT  1
## 176        Gardener  1
## 177         General  1
## 178       Gentleman  1
## 179         Germany  1
## 180         Glady's  1
## 181     Gloucesters  1
## 182 Gloucestershire  1
## 183           Gobey  1
## 184           Grans  1
## 185           Grant  1
## 186       Griffiths  1
## 187       Guildhall  1
## 188             Gun  1
## 189            H.P.  1
## 190           HOUSE  1
## 191          Harada  1
## 192          Harold  1
## 193        Harrison  1
## 194          Henrys  1
## 195        Hillsley  1
## 196             Him  1
## 197       Hiroshima  1
## 198          Hitler  1
## 199       Hollywood  1
## 200            INTO  1
## 201             Ice  1
## 202        Infantry  1
## 203       Infirmary  1
## 204        Irishman  1
## 205        Irishmen  1
## 206            Isle  1
## 207            Joke  1
## 208          Jokers  1
## 209          Karate  1
## 210        Kentucky  1
## 211         Khayyam  1
## 212            Kite  1
## 213          Kosher  1
## 214             Lee  1
## 215           Lemon  1
## 216            Life  1
## 217           Limpy  1
## 218            Lode  1
## 219            Lord  1
## 220           Lorry  1
## 221         MARRIED  1
## 222            MOVE  1
## 223         Machine  1
## 224            Mall  1
## 225         Mantles  1
## 226            Mare  1
## 227         Mastiff  1
## 228           Mayor  1
## 229       Mitsazuki  1
## 230             Mix  1
## 231            Mons  1
## 232     Montpellier  1
## 233             Mum  1
## 234         Mustard  1
## 235          NUMBER  1
## 236            Navy  1
## 237            Nell  1
## 238        Nicholas  1
## 239          Office  1
## 240             Old  1
## 241            Omar  1
## 242        Palomino  1
## 243          Peanut  1
## 244         Pegasus  1
## 245           Peter  1
## 246           Pilot  1
## 247          Poland  1
## 248            Port  1
## 249            Post  1
## 250          Priory  1
## 251          Ragged  1
## 252          Rememb  1
## 253             Rex  1
## 254            Road  1
## 255          Romany  1
## 256         Royston  1
## 257            Ruby  1
## 258             S.A  1
## 259         STORIES  1
## 260       Salvation  1
## 261            Sara  1
## 262        Saturday  1
## 263       Saturdays  1
## 264          School  1
## 265          Second  1
## 266           Seven  1
## 267            Soon  1
## 268           Spice  1
## 269              St  1
## 270             St.  1
## 271          States  1
## 272           Steve  1
## 273          Stroud  1
## 274           Stuka  1
## 275         Sundays  1
## 276      Tewkesbury  1
## 277          Thomas  1
## 278            Toms  1
## 279          Trojan  1
## 280           U.S.A  1
## 281          United  1
## 282        Vaseline  1
## 283         Veteran  1
## 284            WAR.  1
## 285         WILLIAM  1
## 286           Wagon  1
## 287           Wales  1
## 288          Walrus  1
## 289           Wanna  1
## 290      Warrington  1
## 291            Webb  1
## 292       Woodcocks  1
## 293           Works  1
## 294            Yank  1
## 295       Yorkshire  1
## 296            boys  1
## 297            cart  1
## 298             jam  1
## 299            park  1
## 300     pedestrians  1
## 301           river  1
# Plot Named Entities
named_entities %>%
  top_n(20, n) %>%
  ggplot(aes(x = reorder(token, n), y = n)) +
  geom_col(fill = "#3399ff") +
  coord_flip() +
  labs(
    title = "Most Frequent Named Entities in A Collection of Stories by Frank William Ford",
    subtitle = "Analysis by Patrick Ford 2024",
    x = "Entity",
    y = "Frequency"
  ) +
  theme_minimal()

# Generate a Word Cloud for Named Entities
set.seed(1234)  # reproducible layout, as with the first word cloud
wordcloud(words = named_entities$token, freq = named_entities$n, min.freq = 2,
          max.words = 150, random.order = FALSE, rot.per = 0.1,  
          scale = c(2.5, 0.5),  
          colors = brewer.pal(8, "Dark2"))
mtext("Word Cloud of Named Entities in A Collection of Stories by Frank William Ford", side = 3, adj = 0, line = 1, cex = 1, font = 2)
mtext("Analysis by Patrick Ford 2024", side = 3, adj = 0, line = -1, cex = 0.9, font = 3)