Introduction

The goal of this capstone project is to create a predictive text model using a large text corpus of documents as training data. Natural language processing techniques will be used to perform the analysis.

Loading the Datasets (Blogs, News, Twitter)

# Read only the first 5,000 lines of each file to keep memory use manageable
blogs <- readLines("en_US.blogs.txt", warn = FALSE, n = 5000)
news <- readLines("en_US.news.txt", warn = FALSE, n = 5000)
twitter <- readLines("en_US.twitter.txt", warn = FALSE, n = 5000)

Sampling the Data

library(stringi) # for word and character statistics
# File sizes in megabytes
size_blogs <- file.info("en_US.blogs.txt")$size / 1024^2 # Megabytes
size_news <- file.info("en_US.news.txt")$size  / 1024^2 # Megabytes
size_twitter <- file.info("en_US.twitter.txt")$size / 1024^2 # Megabytes
# Number of lines loaded (the full files contain roughly 899,288; 1,010,242; and 2,360,148 lines)
len_blogs <- length(blogs) # 5,000 lines read
len_news <- length(news)  # 5,000 lines read
len_twitter <- length(twitter) # 5,000 lines read
# Number of characters
nchar_blogs <- sum(nchar(blogs))
nchar_news <- sum(nchar(news))
nchar_twitter <- sum(nchar(twitter))
# Word counts for the loaded subsets
# (the full files contain about 37,546,246; 34,762,395; and 30,093,410 words)
nword_blogs <- sum(stri_count_words(blogs))
nword_news <- sum(stri_count_words(news))
nword_twitter <- sum(stri_count_words(twitter))
# Create a summary table
data.frame(file.name = c("blogs", "news", "twitter"),
           files.size.MB = c(size_blogs, size_news, size_twitter),
           num.lines = c(len_blogs, len_news, len_twitter),
           num.character = c(nchar_blogs, nchar_news, nchar_twitter),
           num.words = c(nword_blogs, nword_news, nword_twitter))
##   file.name files.size.MB num.lines num.character num.words
## 1     blogs      200.4242      5000       1139725    206913
## 2      news      196.2775      5000       1013388    173566
## 3   twitter      159.3641      5000        340150     63252
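
Note that the line, character, and word counts above describe only the 5,000-line subsets read in the first chunk, while the file sizes refer to the complete files. As a rough sketch (assuming enough memory is available; the *_full object names are illustrative), the full files could be profiled the same way:

# Profile the complete files (memory permitting); the full blogs file alone
# has roughly 900,000 lines, so this takes noticeably longer
blogs_full   <- readLines("en_US.blogs.txt", warn = FALSE, skipNul = TRUE)
news_full    <- readLines("en_US.news.txt", warn = FALSE, skipNul = TRUE)
twitter_full <- readLines("en_US.twitter.txt", warn = FALSE, skipNul = TRUE)

data.frame(file.name = c("blogs", "news", "twitter"),
           num.lines = c(length(blogs_full), length(news_full), length(twitter_full)),
           num.words = c(sum(stri_count_words(blogs_full)),
                         sum(stri_count_words(news_full)),
                         sum(stri_count_words(twitter_full))))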

Now we remove all non-English (non-ASCII) characters and then compile a sample dataset made up of 1% of each of the three loaded subsets.

set.seed(12345)
# Strip characters that cannot be represented in ASCII
blogs1 <- iconv(blogs, "latin1", "ASCII", sub = "")
news1 <- iconv(news, "latin1", "ASCII", sub = "")
twitter1 <- iconv(twitter, "latin1", "ASCII", sub = "")
# Combine a 1% random sample from each loaded subset
sample_data <- c(sample(blogs1, length(blogs1) * 0.01),
                 sample(news1, length(news1) * 0.01),
                 sample(twitter1, length(twitter1) * 0.01))
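
A quick sanity check on the combined sample: with 5,000 lines per subset, 1% of each gives about 150 lines in total. Writing it to disk is optional but lets later sessions reuse the same sample; the file name sample_data.txt is just an illustrative choice.

length(sample_data)                        # roughly 150 lines (50 per subset)
writeLines(sample_data, "sample_data.txt") # optional: persist the sample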

Summary of the Data

Because the full datasets are too large to process comfortably, the sample() function was used above to draw a 1% sample from each file's 5,000-line subset.

install.packages("quanteda")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.5'
## (as 'lib' is unspecified)
library(quanteda)
## Package version: 4.3.1
## Unicode version: 13.0
## ICU version: 66.1
## Parallel computing: disabled
## See https://quanteda.io for tutorials and examples.
# Create a corpus from the blogs subset and inspect it
corpus_blogs <- corpus(blogs)
summary(corpus_blogs)
## Corpus consisting of 5000 documents, showing 100 documents:
## 
##     Text Types Tokens Sentences
##    text1    19     20         1
##    text2     6      7         1
##    text3   104    154         7
##    text4    36     43         1
##    text5    91    119         5
##    text6    13     13         1
##    text7     6      6         1
##    text8    52     62         3
##    text9    45     49         3
##   text10    94    152         7
##   text11   104    188         9
##   text12    53     69         8
##   text13    35     44         5
##   text14    62     78         2
##   text15     8      8         1
##   text16    17     19         1
##   text17    61     83         3
##   text18    20     22         1
##   text19    69    106         4
##   text20     9      9         1
##   text21    53     77         6
##   text22   135    240        13
##   text23    39     47         1
##   text24    42     56         1
##   text25    12     15         1
##   text26    62     91         5
##   text27    20     23         1
##   text28   103    155        11
##   text29     4      4         1
##   text30    45     52         3
##   text31    30     36         1
##   text32     6      6         1
##   text33    39     61         3
##   text34    42     49         2
##   text35     8      8         1
##   text36    24     28         1
##   text37    30     41         2
##   text38     5      5         1
##   text39    41     53         4
##   text40    31     34         1
##   text41    59     69         3
##   text42    42     50         3
##   text43    51     62         2
##   text44    38     41         1
##   text45     6      6         1
##   text46    73    100         5
##   text47    55     69         5
##   text48     8      8         1
##   text49     3      3         1
##   text50   157    251         6
##   text51    64     94         6
##   text52    33     34         1
##   text53    22     23         1
##   text54    36     40         1
##   text55   113    170        12
##   text56     4      4         2
##   text57    73     97         5
##   text58    17     19         1
##   text59    33     42         2
##   text60     6      6         1
##   text61    60     88         2
##   text62     8      8         1
##   text63    43     51         2
##   text64    54     71         3
##   text65   171    304        17
##   text66     7      7         1
##   text67    31     37         2
##   text68     1      1         1
##   text69    70     92         3
##   text70    40     52         5
##   text71     2      2         1
##   text72    39     56         2
##   text73     5      5         1
##   text74    23     23         1
##   text75    19     19         1
##   text76    61     78         4
##   text77    38     59         4
##   text78    66     82         1
##   text79    22     25         2
##   text80    12     12         1
##   text81     8      8         1
##   text82    28     40         3
##   text83    42     45         2
##   text84     3      3         1
##   text85    21     27         2
##   text86    15     16         1
##   text87    34     40         1
##   text88    20     22         1
##   text89    20     22         1
##   text90     6      6         1
##   text91     9      9         1
##   text92    68     88         2
##   text93    20     27         1
##   text94    17     20         1
##   text95    17     18         1
##   text96    16     16         1
##   text97     9      9         1
##   text98    78    126         7
##   text99     8      8         1
##  text100   117    196         7
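
Because summary() on a quanteda corpus returns a data frame, corpus-level statistics can be derived from it directly. A minimal sketch (the corp_stats name is illustrative):

corp_stats <- summary(corpus_blogs, n = 5000)  # summarise all 5,000 blog lines
mean(corp_stats$Tokens)      # average number of tokens per line
mean(corp_stats$Sentences)   # average number of sentences per line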

Cleaning and Building the Corpus

library(quanteda)

# sample_data should be a character vector of text lines
# Convert to corpus
corpus <- corpus(sample_data)

# Tokenize & clean
tokens_clean <- tokens(
  corpus,
  remove_punct = TRUE,     # remove punctuation
  remove_symbols = TRUE,   # remove symbols like $, %, etc.
  remove_numbers = TRUE    # remove digits
)

# Convert to lowercase
tokens_clean <- tokens_tolower(tokens_clean)

# Remove English stopwords
tokens_clean <- tokens_remove(tokens_clean, stopwords("english"))

# Create a document-feature matrix (similar to tm::DocumentTermMatrix)
dfm_clean <- dfm(tokens_clean)

# Quick peek at first few rows/columns
dfm_clean[1:5, 1:10]
## Document-feature matrix of: 5 documents, 10 features (80.00% sparse) and 0 docvars.
##        features
## docs    worried wasnt going like unhealthy love comic book heroes unrealistic
##   text1       2     1     1    2         1    1     1    1      1           1
##   text2       0     0     0    0         0    0     0    0      0           0
##   text3       0     0     0    0         0    0     0    0      0           0
##   text4       0     0     0    0         0    0     0    0      0           0
##   text5       0     0     0    0         0    0     0    0      0           0
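
The matrix is highly sparse, since most features appear in only one document. If that becomes an issue for later modelling, quanteda's dfm_trim() can drop rare features; a minimal sketch that keeps only tokens seen at least twice across the whole sample (dfm_small is an illustrative name):

dfm_small <- dfm_trim(dfm_clean, min_termfreq = 2)  # drop features seen only once
dim(dfm_clean)  # documents x features before trimming
dim(dfm_small)  # typically far fewer features after trimming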

Building N-Grams

library(quanteda)

# sample_data should be a character vector of text lines
corpus <- corpus(sample_data)

# Tokenize + lowercase + remove punctuation/numbers
toks <- tokens(corpus, remove_punct = TRUE, remove_numbers = TRUE)
toks <- tokens_tolower(toks)
toks <- tokens_remove(toks, stopwords("english"))

# Create n-grams
toks1 <- toks
toks2 <- tokens_ngrams(toks, n = 2)
toks3 <- tokens_ngrams(toks, n = 3)

# Create document-feature matrices
dfm1 <- dfm(toks1)
dfm2 <- dfm(toks2)
dfm3 <- dfm(toks3)

# Get top 20 frequency tables using topfeatures (no extra package needed)
freq1 <- topfeatures(dfm1, 20)   # top 20 unigrams
freq2 <- topfeatures(dfm2, 20)   # top 20 bigrams
freq3 <- topfeatures(dfm3, 20)   # top 20 trigrams

# Show results
freq1
##    just    time     day    year    said    even     one   going    like  people 
##      13      12      11      11      11      10       9       8       8       8 
##    know     new      go     can    also   great million    find    look  little 
##       8       8       8       7       7       7       7       6       6       6
freq2
##           $_million         people_know        angry_hunger           years_old 
##                   4                   2                   2                   2 
##            sim_card     choosing_papers        paul_andrews   looking_ice-cream 
##                   2                   2                   2                   2 
##           find_iron          iron_tooth       nicole_kidman              f_word 
##                   2                   2                   2                   2 
##           new_house             new_law       million_euros             euros_$ 
##                   2                   2                   2                   2 
##            year_ago         high_school received_widespread          league_one 
##                   2                   2                   2                   2
freq3
##                  find_iron_tooth                  million_euros_$ 
##                                2                                2 
##                  euros_$_million              worried_wasnt_going 
##                                2                                1 
##                 wasnt_going_like             going_like_unhealthy 
##                                1                                1 
##              like_unhealthy_love             unhealthy_love_comic 
##                                1                                1 
##                  love_comic_book                comic_book_heroes 
##                                1                                1 
##          book_heroes_unrealistic  heroes_unrealistic_expectations 
##                                1                                1 
## unrealistic_expectations_worried       expectations_worried_seems 
##                                1                                1 
##             worried_seems_people                seems_people_know 
##                                1                                1 
##              people_know_already                know_already_seen 
##                                1                                1 
##             already_seen_enjoyed                seen_enjoyed_even 
##                                1                                1
# Sum bigram frequencies across all documents
two_corpus_num <- colSums(as.matrix(dfm2))

# Create a data frame
two_corpus_table <- data.frame(
  Word = names(two_corpus_num),
  frequency = two_corpus_num
)

# Sort by descending frequency
two_corpus_sort <- two_corpus_table[order(-two_corpus_table$frequency), ]

# Show top results
head(two_corpus_sort)
##                            Word frequency
## $_million             $_million         4
## people_know         people_know         2
## angry_hunger       angry_hunger         2
## years_old             years_old         2
## sim_card               sim_card         2
## choosing_papers choosing_papers         2
# Sum trigram frequencies across all documents
thr_corpus_num <- colSums(as.matrix(dfm3))

# Create a data frame
thr_corpus_table <- data.frame(
  Word = names(thr_corpus_num),
  frequency = thr_corpus_num
)

# Sort by descending frequency
thr_corpus_sort <- thr_corpus_table[order(-thr_corpus_table$frequency), ]

# Show top results
head(thr_corpus_sort)
##                                      Word frequency
## find_iron_tooth           find_iron_tooth         2
## million_euros_$           million_euros_$         2
## euros_$_million           euros_$_million         2
## worried_wasnt_going   worried_wasnt_going         1
## wasnt_going_like         wasnt_going_like         1
## going_like_unhealthy going_like_unhealthy         1
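
The same frequency tables can also be produced in one step with textstat_frequency() from the quanteda.textstats companion package (an optional alternative, assuming that package is installed):

# install.packages("quanteda.textstats")  # if not already installed
library(quanteda.textstats)
textstat_frequency(dfm2, n = 20)  # top 20 bigrams as a ready-made data frame
textstat_frequency(dfm3, n = 20)  # top 20 trigrams as a ready-made data frame
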
# Install ggplot2 (only need to do this once)
install.packages("ggplot2")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.5'
## (as 'lib' is unspecified)
# Load the library
library(ggplot2)

Exploratory Analysis (Graphs & Visualizations)

The frequency distributions of the unigram, bigram, and trigram categories are visualized in the bar plots below.

library(ggplot2)

# Plot top 20 trigrams
ggplot(head(thr_corpus_sort, 20), aes(x = reorder(Word, frequency), y = frequency)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +
  labs(title = "Top 20 Trigrams", x = "Trigram", y = "Frequency") +
  theme_minimal()

Unigrams

library(quanteda)
library(ggplot2)

# Assuming sample_data is your character vector
corpus1 <- corpus(sample_data)

# Tokenize + lowercase + remove punctuation/numbers
toks1 <- tokens(corpus1, remove_punct = TRUE, remove_numbers = TRUE)
toks1 <- tokens_tolower(toks1)
toks1 <- tokens_remove(toks1, stopwords("english"))

# Create unigram DFM
dfm1 <- dfm(toks1)

# Calculate unigram frequencies
one_corpus_num <- colSums(as.matrix(dfm1))  # sum across documents
one_corpus_table <- data.frame(
  Word = names(one_corpus_num),
  frequency = one_corpus_num
)
one_corpus_sort <- one_corpus_table[order(-one_corpus_table$frequency), ]

# Plot top 10 unigrams
one_g <- ggplot(one_corpus_sort[1:10, ], 
                aes(x = reorder(Word, -frequency), y = frequency, fill = frequency)) +
  geom_bar(stat = "identity") +
  labs(title = "Top 10 Unigrams", x = "Words", y = "Frequency") +
  theme(axis.text.x = element_text(angle = 90)) +
  coord_flip()  # flips the bars horizontally for readability

one_g

Bigrams

library(quanteda)
library(ggplot2)

# Assuming sample_data is your character vector
corpus2 <- corpus(sample_data)

# Tokenize + lowercase + remove punctuation/numbers
toks2 <- tokens(corpus2, remove_punct = TRUE, remove_numbers = TRUE)
toks2 <- tokens_tolower(toks2)
toks2 <- tokens_remove(toks2, stopwords("english"))

# Create bigram DFM
dfm2 <- dfm(tokens_ngrams(toks2, n = 2))  # avoid %>%, which requires magrittr to be loaded

# Calculate bigram frequencies
two_corpus_num <- colSums(as.matrix(dfm2))  # sum across documents
two_corpus_table <- data.frame(
  Word = names(two_corpus_num),
  frequency = two_corpus_num
)
two_corpus_sort <- two_corpus_table[order(-two_corpus_table$frequency), ]

# Plot top 10 bigrams
two_g <- ggplot(two_corpus_sort[1:10, ], 
                aes(x = reorder(Word, -frequency), y = frequency, fill = frequency)) +
  geom_bar(stat = "identity") +
  labs(title = "Top 10 Bigrams", x = "Words", y = "Frequency") +
  theme(axis.text.x = element_text(angle = 90)) +
  coord_flip()  # horizontal bars

two_g

Trigrams

library(quanteda)
library(ggplot2)

# Assuming sample_data is your character vector
corpus3 <- corpus(sample_data)

# Tokenize + lowercase + remove punctuation/numbers
toks3 <- tokens(corpus3, remove_punct = TRUE, remove_numbers = TRUE)
toks3 <- tokens_tolower(toks3)
toks3 <- tokens_remove(toks3, stopwords("english"))

# Create trigram DFM
dfm3 <- dfm(tokens_ngrams(toks3, n = 3))  # avoid %>%, which requires magrittr to be loaded

# Calculate trigram frequencies
thr_corpus_num <- colSums(as.matrix(dfm3))  # sum across documents
thr_corpus_table <- data.frame(
  Word = names(thr_corpus_num),
  frequency = thr_corpus_num
)
thr_corpus_sort <- thr_corpus_table[order(-thr_corpus_table$frequency), ]

# Plot top 10 trigrams
thr_g <- ggplot(thr_corpus_sort[1:10, ], 
                 aes(x = reorder(Word, -frequency), y = frequency, fill = frequency)) +
  geom_bar(stat = "identity") +
  labs(title = "Top 10 Trigrams", x = "Words", y = "Frequency") +
  theme(axis.text.x = element_text(angle = 90)) +
  coord_flip()  # horizontal bars

thr_g

Summary

The exploratory analysis shows that even a 1% sample of the loaded text is enough to surface the most frequent unigrams, bigrams, and trigrams, and that frequencies fall off quickly beyond the top-ranked terms. The next step is to use these n-gram frequency tables to build the predictive text model described in the introduction: one natural approach is to look up the last one or two words typed in the trigram table and to back off to bigrams, and then unigrams, when no longer match is found.
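
A minimal sketch of that lookup, assuming the two_corpus_sort and thr_corpus_sort tables built above are still in memory; the predict_next_word() function and its back-off rule are illustrative, not the final model.

predict_next_word <- function(phrase,
                              bigrams = two_corpus_sort,
                              trigrams = thr_corpus_sort) {
  words <- unlist(strsplit(tolower(phrase), "\\s+"))
  # Try a trigram match on the last two words typed
  if (length(words) >= 2) {
    prefix <- paste(tail(words, 2), collapse = "_")
    hits <- trigrams[startsWith(as.character(trigrams$Word), paste0(prefix, "_")), ]
    if (nrow(hits) > 0) return(sub(".*_", "", hits$Word[which.max(hits$frequency)]))
  }
  # Back off to a bigram match on the last word typed
  prefix <- tail(words, 1)
  hits <- bigrams[startsWith(as.character(bigrams$Word), paste0(prefix, "_")), ]
  if (nrow(hits) > 0) return(sub(".*_", "", hits$Word[which.max(hits$frequency)]))
  NA_character_  # no match in either table
}

predict_next_word("million euros")  # e.g. "$", given the trigram counts shown above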