Capstone EDA

Data Preparation

First we load appropriate libraries.

library(dplyr)
library(ggplot2)
library(ngram)
library(patchwork) # for plotting

We create a separate corpus for the given text files. To preprocess and create the n-grams we use the ngram package. The preprocessing involves removing white space, removing punctuation, removing numbers and makes everything lower case. We then use the ngram package to create 1,2 and 3 gram tables and save them to disk to save memory.

# import blogs text and create n-grams
blogs <- readLines("./final/en_US/en_US.blogs.txt", encoding = "UTF-8") %>%
  lapply(ngram::preprocess, remove.punct = TRUE, remove.numbers=TRUE) %>%
  (ngram::concatenate)

# unigrams
blog_unigrams <- ngram(blogs, n=1)
blog_unigrams_freq <- get.phrasetable(blog_unigrams)
write.csv(blog_unigrams_freq, file = "blog_unigrams_freq.csv", row.names = FALSE)

# bigrams
blog_bigrams <- ngram(blogs, n=2)
blog_bigrams_freq <- get.phrasetable(blog_bigrams)
write.csv(blog_bigrams_freq, file = "blog_bigrams_freq.csv", row.names = FALSE)

# trigrams
blog_trigrams <- ngram(blogs, n=3)
blog_trigrams_freq <- get.phrasetable(blog_trigrams)
write.csv(blog_trigrams_freq, file = "blog_trigrams_freq.csv", row.names = FALSE)

# repeat for news text
news <- readLines("./final/en_US/en_US.news.txt", encoding = "UTF-8") %>%
  lapply(ngram::preprocess, remove.punct = TRUE, remove.numbers=TRUE) %>%
  (ngram::concatenate)

# unigrams
news_unigrams <- ngram(news, n=1)
news_unigrams_freq <- get.phrasetable(news_unigrams)
write.csv(news_unigrams_freq, file = "news_unigrams_freq.csv", row.names = FALSE)
# bigrams
news_bigrams <- ngram(news, n=2)
news_bigrams_freq <- get.phrasetable(news_bigrams)
write.csv(news_bigrams_freq, file = "news_bigrams_freq.csv", row.names = FALSE)

# trigrams
news_trigrams <- ngram(news, n=3)
news_trigrams_freq <- get.phrasetable(news_trigrams)
write.csv(news_trigrams_freq, file = "news_trigrams_freq.csv", row.names = FALSE)

# repeat for twitter text
twitter <- readLines("./final/en_US/en_US.twitter.txt", encoding = "UTF-8")  %>%
  lapply(ngram::preprocess, remove.punct = TRUE, remove.numbers=TRUE) %>%
  (ngram::concatenate)

# unigrams
twitter_unigrams <- ngram(twitter, n=1)
twitter_unigrams_freq <- get.phrasetable(twitter_unigrams)
write.csv(twitter_unigrams_freq, file = "twitter_unigrams_freq.csv", row.names = FALSE)

# bigrams
twitter_bigrams <- ngram(twitter, n=2)
twitter_bigrams_freq <- get.phrasetable(twitter_bigrams)
write.csv(twitter_bigrams_freq, file = "twitter_bigrams_freq.csv", row.names = FALSE)

# trigrams
twitter_trigrams <- ngram(twitter, n=3)
twitter_trigrams_freq <- get.phrasetable(twitter_trigrams)
write.csv(twitter_trigrams_freq, file = "twitter_trigrams_freq.csv", row.names = FALSE)

Exploratory Data Analysis

Now we load the frequency tables for the 1, 2 and 3 grams.

blogs_unigrams_freq <- read.csv("./blog_unigrams_freq.csv")
blogs_bigrams_freq <- read.csv("./blog_bigrams_freq.csv")
blogs_trigrams_freq <- read.csv("./blog_trigrams_freq.csv")

news_unigrams_freq <- read.csv("./news_unigrams_freq.csv")
news_bigrams_freq <- read.csv("./news_bigrams_freq.csv")
news_trigrams_freq <- read.csv("./news_trigrams_freq.csv")

twitter_unigrams_freq <- read.csv("./twitter_unigrams_freq.csv")
twitter_bigrams_freq <- read.csv("./twitter_bigrams_freq.csv")
twitter_trigrams_freq <- read.csv("./twitter_trigrams_freq.csv")

head(blogs_unigrams_freq)

##   ngrams    freq       prop
## 1   the  1855769 0.05040434
## 2   and  1086109 0.02949969
## 3    to  1065697 0.02894528
## 4     a   896934 0.02436153
## 5    of   875027 0.02376651
## 6     i   769468 0.02089944

tail(blogs_unigrams_freq)

##               ngrams freq         prop
## 400266       gaylen     1 2.716089e-08
## 400267 ideasprompts     1 2.716089e-08
## 400268      hexbugs     1 2.716089e-08
## 400269    froshsoph     1 2.716089e-08
## 400270 daysometimes     1 2.716089e-08
## 400271      landulf     1 2.716089e-08

Notice how the third column gives us the percentage coverage of each term. We leverage this to write a function to see how many terms are necessary to cover 50, 90, 95 and 99% of the data for each grouping and n gram:

threshold <- function(x) {
  i <- 1
  j <- 0
  pct <- 0
  
  # figure out how many terms are needed in blog unigram to cover 50, 90, 95 and 99%
  # of the corpora
  while(pct < .99){
    pct <- x[i, 3] + pct
    if(pct >= .5 && j == 0) {
      print(paste("To cover 50% of the data you need",i,"terms"))
      j <- j + 1
    }
    if(pct >= .9 && j == 1) {
      print(paste("To cover 90% of the data you need",i,"terms"))
      j <- j + 1
    }
    if(pct >= .95 && j == 2) {
      print(paste("To cover 95% of the data you need",i,"terms"))
      j <- j + 1
    }
    i <- i + 1
  }
  print(paste("To cover 99% of the data you need",i,"terms"))
}

Now we figure out how many terms are needed for each 1, 2 and 3 grams of each text group (blogs, news, twitter) to represent 50, 90, 95 and 99% of the text respectively. We reassign the 99% coverage and if we need to decrease in the future for efficiency we can easily do so. For each group (blogs, news, tweets) and n gram we plot a histogram of the top 20 terms and their raw counts.

For the blog unigrams:

threshold(blogs_unigrams_freq)

## [1] "To cover 50% of the data you need 105 terms"
## [1] "To cover 90% of the data you need 6737 terms"
## [1] "To cover 95% of the data you need 16845 terms"
## [1] "To cover 99% of the data you need 105685 terms"

For efficiency we keep the terms that cover 99% of the data.

blogs_unigrams_freq <- blogs_unigrams_freq[1:105685,1:3]

The top 20 terms are:

##    ngrams    freq        prop
## 1    the  1855769 0.050404340
## 2    and  1086109 0.029499689
## 3     to  1065697 0.028945280
## 4      a   896934 0.024361527
## 5     of   875027 0.023766513
## 6      i   769468 0.020899437
## 7     in   593633 0.016123601
## 8   that   459499 0.012480402
## 9     is   431833 0.011728969
## 10    it   400905 0.010888937
## 11   for   362866 0.009855764
## 12   you   296852 0.008062765
## 13  with   286176 0.007772795
## 14   was   278002 0.007550782
## 15    on   274040 0.007443171
## 16    my   270180 0.007338330
## 17  this   257977 0.007006885
## 18    as   223357 0.006066575
## 19  have   218541 0.005935768
## 20    be   208303 0.005657695

And we continue, for the blog bigrams:

threshold(blogs_bigrams_freq)

## [1] "To cover 50% of the data you need 30445 terms"
## [1] "To cover 90% of the data you need 3008715 terms"
## [1] "To cover 95% of the data you need 4849597 terms"
## [1] "To cover 99% of the data you need 6322304 terms"

blogs_bigrams_freq <- blogs_bigrams_freq[1:6322304,1:3]

The top 20 terms are:

##       ngrams   freq         prop
## 1    of the  187243 0.0050856869
## 2    in the  154356 0.0041924467
## 3    to the   86200 0.0023412689
## 4    on the   75426 0.0020486374
## 5     to be   68042 0.0018480814
## 6   and the   58641 0.0015927419
## 7   for the   58175 0.0015800849
## 8     i was   49350 0.0013403900
## 9     and i   49141 0.0013347134
## 10   i have   47820 0.0012988339
## 11    it is   47730 0.0012963894
## 12   it was   47460 0.0012890559
## 13   at the   47427 0.0012881596
## 14     is a   45600 0.0012385367
## 15     in a   45549 0.0012371515
## 16 with the   43996 0.0011949706
## 17     i am   42126 0.0011441797
## 18   that i   38235 0.0010384967
## 19 from the   38130 0.0010356448
## 20     of a   34005 0.0009236061

For the blog trigrams:

threshold(blogs_trigrams_freq)

## [1] "To cover 50% of the data you need 2440111 terms"
## [1] "To cover 90% of the data you need 16557247 terms"
## [1] "To cover 95% of the data you need 18398129 terms"
## [1] "To cover 99% of the data you need 19870836 terms"

blogs_trigrams_freq <- blogs_trigrams_freq[1:19870836,1:3]

The top 20 terms are:

##            ngrams  freq         prop
## 1     one of the  14418 0.0003916058
## 2       a lot of  12230 0.0003321777
## 3     as well as   6886 0.0001870299
## 4        to be a   6841 0.0001858077
## 5       it was a   6793 0.0001845039
## 6    some of the   6708 0.0001821953
## 7     out of the   6488 0.0001762199
## 8     the end of   6469 0.0001757038
## 9     be able to   6227 0.0001691309
## 10   a couple of   5994 0.0001628024
## 11     i want to   5959 0.0001618518
## 12 the fact that   5419 0.0001471849
## 13     this is a   5282 0.0001434638
## 14     i have to   5265 0.0001430021
## 15   the rest of   5184 0.0001408021
## 16    there is a   5095 0.0001383847
## 17      i have a   5076 0.0001378687
## 18   part of the   4969 0.0001349625
## 19       it is a   4967 0.0001349082
## 20   i have been   4942 0.0001342291

Now for the histogram of the 20 most common terms for the blog 1, 2 and 3 grams:

News unigrams:

threshold(news_unigrams_freq)

## [1] "To cover 50% of the data you need 191 terms"
## [1] "To cover 90% of the data you need 8479 terms"
## [1] "To cover 95% of the data you need 19661 terms"
## [1] "To cover 99% of the data you need 93709 terms"

news_unigrams_freq <- news_unigrams_freq[1:93709,1:3]

The top 20 terms are:

##    ngrams    freq        prop
## 1    the  1970687 0.058870107
## 2     to   901227 0.026922251
## 3    and   884180 0.026413008
## 4      a   875419 0.026151291
## 5     of   771564 0.023048843
## 6     in   673960 0.020133130
## 7    for   352942 0.010543396
## 8   that   346539 0.010352120
## 9     is   284063 0.008485781
## 10    on   266887 0.007972685
## 11  with   254686 0.007608206
## 12  said   250342 0.007478438
## 13   was   228905 0.006838053
## 14    he   228364 0.006821892
## 15    it   218682 0.006532662
## 16    at   211945 0.006331409
## 17    as   187787 0.005609739
## 18   his   157567 0.004706981
## 19     i   157075 0.004692283
## 20    be   152175 0.004545906

News bigrams:

threshold(news_bigrams_freq)

## [1] "To cover 50% of the data you need 50246 terms"
## [1] "To cover 90% of the data you need 3348001 terms"
## [1] "To cover 95% of the data you need 5021760 terms"
## [1] "To cover 99% of the data you need 6360767 terms"

news_bigrams_freq <- news_bigrams_freq[1:6360767,1:3]

The top 20 terms are:

##       ngrams   freq         prop
## 1    of the  186848 0.0055816890
## 2    in the  180201 0.0053831239
## 3    to the   84723 0.0025309205
## 4    on the   72976 0.0021800037
## 5   for the   69318 0.0020707287
## 6    at the   58513 0.0017479522
## 7   and the   52361 0.0015641742
## 8      in a   51388 0.0015351079
## 9     to be   46903 0.0014011280
## 10 with the   43602 0.0013025176
## 11 from the   37221 0.0011118987
## 12   with a   34331 0.0010255661
## 13  he said   33287 0.0009943788
## 14     of a   33239 0.0009929449
## 15     as a   31653 0.0009455665
## 16    for a   30509 0.0009113919
## 17     is a   28665 0.0008563063
## 18   by the   28212 0.0008427739
## 19 that the   28156 0.0008411010
## 20  will be   27948 0.0008348874

News trigrams:

threshold(news_trigrams_freq)

## [1] "To cover 50% of the data you need 2864032 terms"
## [1] "To cover 90% of the data you need 16077338 terms"
## [1] "To cover 95% of the data you need 17751097 terms"
## [1] "To cover 99% of the data you need 19090105 terms"

news_trigrams_freq <- news_trigrams_freq[1:19090105,1:3]

The top 20 terms are:

##                ngrams  freq         prop
## 1         one of the  14580 0.0004355467
## 2           a lot of  11543 0.0003448227
## 3         as well as   6242 0.0001864666
## 4        part of the   5699 0.0001702456
## 5         the end of   5624 0.0001680051
## 6   according to the   5606 0.0001674674
## 7         out of the   5566 0.0001662725
## 8        some of the   5468 0.0001633449
## 9            to be a   5386 0.0001608954
## 10      in the first   5220 0.0001559365
## 11       going to be   5136 0.0001534271
## 12 the united states   4479 0.0001338007
## 13          it was a   4311 0.0001287820
## 14    the first time   4169 0.0001245401
## 15        be able to   4075 0.0001217320
## 16         said in a   3723 0.0001112168
## 17        end of the   3598 0.0001074826
## 18       of the year   3484 0.0001040771
## 19       he said the   3477 0.0001038680
## 20     for the first   3449 0.0001030316

Now for the histogram of the 20 most common terms for the news 1, 2 and 3 grams:

Twitter unigrams:

threshold(twitter_unigrams_freq)

## [1] "To cover 50% of the data you need 120 terms"
## [1] "To cover 90% of the data you need 5588 terms"
## [1] "To cover 95% of the data you need 15692 terms"
## [1] "To cover 99% of the data you need 186124 terms"

twitter_unigrams_freq <- twitter_unigrams_freq[1:186124,1:3]

The top 20 terms are:

##    ngrams   freq        prop
## 1    the  934168 0.031822751
## 2     to  786629 0.026796785
## 3      i  713682 0.024311821
## 4      a  608612 0.020732575
## 5    you  543694 0.018521121
## 6    and  433686 0.014773661
## 7    for  384535 0.013099316
## 8     in  377033 0.012843758
## 9     of  358981 0.012228810
## 10    is  357543 0.012179824
## 11    it  291391 0.009926334
## 12    my  290517 0.009896561
## 13    on  276263 0.009410993
## 14  that  232906 0.007934022
## 15    me  200065 0.006815282
## 16    be  187176 0.006376214
## 17    at  185524 0.006319938
## 18  with  172995 0.005893134
## 19  your  170771 0.005817372
## 20  have  168051 0.005724715

Twitter bigrams:

threshold(twitter_bigrams_freq)

## [1] "To cover 50% of the data you need 31605 terms"
## [1] "To cover 90% of the data you need 3172120 terms"
## [1] "To cover 95% of the data you need 4639887 terms"
## [1] "To cover 99% of the data you need 5814102 terms"

twitter_bigrams_freq <- twitter_bigrams_freq[1:5814102,1:3]

The top 20 terms are:

##         ngrams  freq         prop
## 1      in the  78372 0.0026697690
## 2     for the  74000 0.0025208353
## 3      of the  56939 0.0019396465
## 4      on the  48567 0.0016544515
## 5       to be  46895 0.0015974942
## 6      to the  43515 0.0014823534
## 7  thanks for  42778 0.0014572472
## 8      at the  37340 0.0012719999
## 9      i love  35452 0.0012076845
## 10   going to  34183 0.0011644556
## 11     have a  33566 0.0011434373
## 12     if you  32943 0.0011222146
## 13  thank you  32886 0.0011202728
## 14     i have  31217 0.0010634178
## 15      for a  29666 0.0010105824
## 16       i am  29380 0.0010008397
## 17     i dont  28941 0.0009858851
## 18     to get  26982 0.0009191511
## 19     to see  26871 0.0009153698
## 20       is a  26758 0.0009115204

Twitter trigrams:

threshold(twitter_trigrams_freq)

## [1] "To cover 50% of the data you need 2325326 terms"
## [1] "To cover 90% of the data you need 14052504 terms"
## [1] "To cover 95% of the data you need 15520271 terms"
## [1] "To cover 99% of the data you need 16694486 terms"

twitter_trigrams_freq <- twitter_trigrams_freq[1:16694486,1:3]

The top 20 terms are:

##                 ngrams  freq         prop
## 1      thanks for the  23533 0.0008016597
## 2  looking forward to   8713 0.0002968113
## 3       thank you for   8602 0.0002930301
## 4        cant wait to   8289 0.0002823676
## 5          i love you   8250 0.0002810391
## 6      for the follow   7803 0.0002658119
## 7         going to be   7394 0.0002518791
## 8           i want to   7048 0.0002400925
## 9            a lot of   6227 0.0002121249
## 10            to be a   5993 0.0002041536
## 11          i need to   5824 0.0001983966
## 12           i have a   5694 0.0001939681
## 13        im going to   5626 0.0001916516
## 14         one of the   5551 0.0001890967
## 15       have a great   5376 0.0001831353
## 16         to see you   5305 0.0001807166
## 17          i have to   4944 0.0001684191
## 18        is going to   4667 0.0001589830
## 19        i dont know   4652 0.0001584720
## 20         you have a   4423 0.0001506710

Interestingly we see for the twitter dataset there’s much more emotional qualitative language like “i love you” and such, which makes sense given the informal context.

Next Steps

From here the plan is the following:

Implement the trigram Katz Backoff Model with Good-Turing estimation.
Create a shiny app that takes a sentence and gives a choice of three words as a prediction for the next word.
Make a slide deck to present the final algorithm.

Capstone EDA

Scott Hootman-Ng

2025-08-15

Data Preparation

Exploratory Data Analysis

Next Steps