The goal of this capstone project is to create a predictive text model using a large text corpus of documents as training data. Natural language processing techniques will be used to perform the analysis.
blogs <- readLines("en_US.blogs.txt", warn = FALSE, n = 5000)
news <- readLines("en_US.news.txt", warn = FALSE, n = 5000)
twitter <- readLines("en_US.twitter.txt", warn = FALSE, n = 5000)
library(stringi) # for fast string and word-count statistics
# File sizes
size_blogs <- file.info("en_US.blogs.txt")$size / 1024^2 # Megabytes
size_news <- file.info("en_US.news.txt")$size / 1024^2 # Megabytes
size_twitter <- file.info("en_US.twitter.txt")$size / 1024^2 # Megabytes
# Number of lines (num.lines)
len_blogs <- length(blogs)     # 5,000 lines read here (full file: 899,288 lines)
len_news <- length(news)       # 5,000 lines read here (full file: 1,010,242 lines)
len_twitter <- length(twitter) # 5,000 lines read here (full file: 2,360,148 lines)
# Number of characters
nchar_blogs <- sum(nchar(blogs))
nchar_news <- sum(nchar(news))
nchar_twitter <- sum(nchar(twitter))
# Counting the words (num.words)
nword_blogs <- sum(stri_count_words(blogs))     # words in the blogs subset (full file: 37,546,246)
nword_news <- sum(stri_count_words(news))       # words in the news subset (full file: 34,762,395)
nword_twitter <- sum(stri_count_words(twitter)) # words in the twitter subset (full file: 30,093,410)
# Create a summary table
data.frame(file.name     = c("blogs", "news", "twitter"),
           files.size.MB = c(size_blogs, size_news, size_twitter),
           num.lines     = c(len_blogs, len_news, len_twitter),
           num.character = c(nchar_blogs, nchar_news, nchar_twitter),
           num.words     = c(nword_blogs, nword_news, nword_twitter))
## file.name files.size.MB num.lines num.character num.words
## 1 blogs 200.4242 5000 1139725 206913
## 2 news 196.2775 5000 1013388 173566
## 3 twitter 159.3641 5000 340150 63252
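Note that readLines() above only pulls in the first 5,000 lines of each file, so the table describes those subsets rather than the full files. Below is a minimal sketch (assuming the full files sit in the working directory, and using a hypothetical helper name) of how full-file line and word counts could be computed without holding an entire file in memory.
# Hypothetical helper: stream a file in chunks and accumulate line/word counts
count_lines_words <- function(path, chunk_size = 100000) {
  con <- file(path, open = "r")
  on.exit(close(con))
  n_lines <- 0
  n_words <- 0
  repeat {
    chunk <- readLines(con, n = chunk_size, warn = FALSE)
    if (length(chunk) == 0) break
    n_lines <- n_lines + length(chunk)
    n_words <- n_words + sum(stri_count_words(chunk))
  }
  c(lines = n_lines, words = n_words)
}
# Example: count_lines_words("en_US.blogs.txt")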
Now, we will remove all non-ASCII (non-English) characters and then build a sample dataset composed of 1% of the lines from each of the three datasets.
set.seed(12345)
blogs1 <- iconv(blogs, "latin1", "ASCII", sub = "")
news1 <- iconv(news, "latin1", "ASCII", sub = "")
twitter1 <- iconv(twitter, "latin1", "ASCII", sub = "")
# Sample only 1% of the lines from each file
sample_data <- c(sample(blogs1, length(blogs1) * 0.01),
                 sample(news1, length(news1) * 0.01),
                 sample(twitter1, length(twitter1) * 0.01))
Because the full datasets are too large to process efficiently, the sample() function is used to draw a 1% sample of each file.
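As an alternative sketch (a hypothetical helper, not used above), each line can be kept independently with probability 0.01 via rbinom(), which also works when streaming a file in chunks.
sample_lines <- function(lines, prob = 0.01) {
  # Keep each line independently with probability `prob`
  keep <- as.logical(rbinom(length(lines), size = 1, prob = prob))
  lines[keep]
}
# Example: sample_blogs <- sample_lines(blogs1)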
install.packages("quanteda")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.5'
## (as 'lib' is unspecified)
library(quanteda)
## Package version: 4.3.1
## Unicode version: 13.0
## ICU version: 66.1
## Parallel computing: disabled
## See https://quanteda.io for tutorials and examples.
# Create a corpus from the 5,000-line blogs subset and summarize it
corpus_blogs <- corpus(blogs)
summary(corpus_blogs)
## Corpus consisting of 5000 documents, showing 100 documents:
##
## Text Types Tokens Sentences
## text1 19 20 1
## text2 6 7 1
## text3 104 154 7
## text4 36 43 1
## text5 91 119 5
## text6 13 13 1
## text7 6 6 1
## text8 52 62 3
## text9 45 49 3
## text10 94 152 7
## text11 104 188 9
## text12 53 69 8
## text13 35 44 5
## text14 62 78 2
## text15 8 8 1
## text16 17 19 1
## text17 61 83 3
## text18 20 22 1
## text19 69 106 4
## text20 9 9 1
## text21 53 77 6
## text22 135 240 13
## text23 39 47 1
## text24 42 56 1
## text25 12 15 1
## text26 62 91 5
## text27 20 23 1
## text28 103 155 11
## text29 4 4 1
## text30 45 52 3
## text31 30 36 1
## text32 6 6 1
## text33 39 61 3
## text34 42 49 2
## text35 8 8 1
## text36 24 28 1
## text37 30 41 2
## text38 5 5 1
## text39 41 53 4
## text40 31 34 1
## text41 59 69 3
## text42 42 50 3
## text43 51 62 2
## text44 38 41 1
## text45 6 6 1
## text46 73 100 5
## text47 55 69 5
## text48 8 8 1
## text49 3 3 1
## text50 157 251 6
## text51 64 94 6
## text52 33 34 1
## text53 22 23 1
## text54 36 40 1
## text55 113 170 12
## text56 4 4 2
## text57 73 97 5
## text58 17 19 1
## text59 33 42 2
## text60 6 6 1
## text61 60 88 2
## text62 8 8 1
## text63 43 51 2
## text64 54 71 3
## text65 171 304 17
## text66 7 7 1
## text67 31 37 2
## text68 1 1 1
## text69 70 92 3
## text70 40 52 5
## text71 2 2 1
## text72 39 56 2
## text73 5 5 1
## text74 23 23 1
## text75 19 19 1
## text76 61 78 4
## text77 38 59 4
## text78 66 82 1
## text79 22 25 2
## text80 12 12 1
## text81 8 8 1
## text82 28 40 3
## text83 42 45 2
## text84 3 3 1
## text85 21 27 2
## text86 15 16 1
## text87 34 40 1
## text88 20 22 1
## text89 20 22 1
## text90 6 6 1
## text91 9 9 1
## text92 68 88 2
## text93 20 27 1
## text94 17 20 1
## text95 17 18 1
## text96 16 16 1
## text97 9 9 1
## text98 78 126 7
## text99 8 8 1
## text100 117 196 7
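By default, summary() on a quanteda corpus prints the first 100 documents; the display can be kept shorter with the n argument, for example:
# Show only the first 5 documents of the corpus summary
summary(corpus_blogs, n = 5)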
library(quanteda)
# sample_data should be a character vector of text lines
# Convert to corpus
corpus <- corpus(sample_data)
# Tokenize & clean
tokens_clean <- tokens(
  corpus,
  remove_punct = TRUE,   # remove punctuation
  remove_symbols = TRUE, # remove symbols like $, %, etc.
  remove_numbers = TRUE  # remove digits
)
# Convert to lowercase
tokens_clean <- tokens_tolower(tokens_clean)
# Remove English stopwords
tokens_clean <- tokens_remove(tokens_clean, stopwords("english"))
# Create a document-feature matrix (similar to tm::DocumentTermMatrix)
dfm_clean <- dfm(tokens_clean)
# Quick peek at first few rows/columns
dfm_clean[1:5, 1:10]
## Document-feature matrix of: 5 documents, 10 features (80.00% sparse) and 0 docvars.
## features
## docs worried wasnt going like unhealthy love comic book heroes unrealistic
## text1 2 1 1 2 1 1 1 1 1 1
## text2 0 0 0 0 0 0 0 0 0 0
## text3 0 0 0 0 0 0 0 0 0 0
## text4 0 0 0 0 0 0 0 0 0 0
## text5 0 0 0 0 0 0 0 0 0 0
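Optionally (not done in this report), very rare features can be dropped before building n-grams; a short sketch using quanteda's dfm_trim():
# Keep only features that appear at least twice across the sample
dfm_small <- dfm_trim(dfm_clean, min_termfreq = 2)
dim(dfm_small)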
library(quanteda)
# sample_data should be a character vector of text lines
corpus <- corpus(sample_data)
# Tokenize + lowercase + remove punctuation/numbers
toks <- tokens(corpus, remove_punct = TRUE, remove_numbers = TRUE)
toks <- tokens_tolower(toks)
toks <- tokens_remove(toks, stopwords("english"))
# Create n-grams
toks1 <- toks
toks2 <- tokens_ngrams(toks, n = 2)
toks3 <- tokens_ngrams(toks, n = 3)
# Create document-feature matrices
dfm1 <- dfm(toks1)
dfm2 <- dfm(toks2)
dfm3 <- dfm(toks3)
# Get top 20 frequency tables using topfeatures (no extra package needed)
freq1 <- topfeatures(dfm1, 20) # top 20 unigrams
freq2 <- topfeatures(dfm2, 20) # top 20 bigrams
freq3 <- topfeatures(dfm3, 20) # top 20 trigrams
# Show results
freq1
## just time day year said even one going like people
## 13 12 11 11 11 10 9 8 8 8
## know new go can also great million find look little
## 8 8 8 7 7 7 7 6 6 6
freq2
## $_million people_know angry_hunger years_old
## 4 2 2 2
## sim_card choosing_papers paul_andrews looking_ice-cream
## 2 2 2 2
## find_iron iron_tooth nicole_kidman f_word
## 2 2 2 2
## new_house new_law million_euros euros_$
## 2 2 2 2
## year_ago high_school received_widespread league_one
## 2 2 2 2
freq3
## find_iron_tooth million_euros_$
## 2 2
## euros_$_million worried_wasnt_going
## 2 1
## wasnt_going_like going_like_unhealthy
## 1 1
## like_unhealthy_love unhealthy_love_comic
## 1 1
## love_comic_book comic_book_heroes
## 1 1
## book_heroes_unrealistic heroes_unrealistic_expectations
## 1 1
## unrealistic_expectations_worried expectations_worried_seems
## 1 1
## worried_seems_people seems_people_know
## 1 1
## people_know_already know_already_seen
## 1 1
## already_seen_enjoyed seen_enjoyed_even
## 1 1
two_table <- dfm2            # the bigram document-feature matrix
two_corpus <- rownames(dfm2) # all document names (used to index every row)
# Sum frequencies across documents
two_corpus_num <- colSums(as.matrix(two_table[two_corpus, ]))
# Create a data frame
two_corpus_table <- data.frame(
  Word = names(two_corpus_num),
  frequency = two_corpus_num
)
# Sort by descending frequency
two_corpus_sort <- two_corpus_table[order(-two_corpus_table$frequency), ]
# Show top results
head(two_corpus_sort)
## Word frequency
## $_million $_million 4
## people_know people_know 2
## angry_hunger angry_hunger 2
## years_old years_old 2
## sim_card sim_card 2
## choosing_papers choosing_papers 2
thr_table <- dfm3            # the trigram document-feature matrix
thr_corpus <- rownames(dfm3) # all document names (used to index every row)
# Sum frequencies across documents (columns)
thr_corpus_num <- colSums(as.matrix(thr_table[thr_corpus, ]))
# Create a data frame
thr_corpus_table <- data.frame(
  Word = names(thr_corpus_num),
  frequency = thr_corpus_num
)
# Sort by descending frequency
thr_corpus_sort <- thr_corpus_table[order(-thr_corpus_table$frequency), ]
# Show top results
head(thr_corpus_sort)
## Word frequency
## find_iron_tooth find_iron_tooth 2
## million_euros_$ million_euros_$ 2
## euros_$_million euros_$_million 2
## worried_wasnt_going worried_wasnt_going 1
## wasnt_going_like wasnt_going_like 1
## going_like_unhealthy going_like_unhealthy 1
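The colSums(as.matrix(...)) step above converts the sparse DFM to a dense matrix, which can be memory-hungry on larger samples. A sparse-friendly sketch of the same frequency tables, assuming the companion package quanteda.textstats is installed:
library(quanteda.textstats)
# textstat_frequency() returns feature, frequency, rank and docfreq columns
freq_bi <- textstat_frequency(dfm2, n = 20)  # top 20 bigrams
freq_tri <- textstat_frequency(dfm3, n = 20) # top 20 trigrams
head(freq_bi)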
# Install ggplot2 (only need to do this once)
install.packages("ggplot2")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.5'
## (as 'lib' is unspecified)
# Load the library
library(ggplot2)
The frequency distribution of each n-gram category was visualized in three separate bar plots.
library(ggplot2)
# Plot top 20 trigrams
ggplot(head(thr_corpus_sort, 20), aes(x = reorder(Word, frequency), y = frequency)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +
  labs(title = "Top 20 Trigrams", x = "Trigram", y = "Frequency") +
  theme_minimal()
library(quanteda)
library(ggplot2)
# Assuming sample_data is your character vector
corpus1 <- corpus(sample_data)
# Tokenize + lowercase + remove punctuation/numbers
toks1 <- tokens(corpus1, remove_punct = TRUE, remove_numbers = TRUE)
toks1 <- tokens_tolower(toks1)
toks1 <- tokens_remove(toks1, stopwords("english"))
# Create unigram DFM
dfm1 <- dfm(toks1)
# Calculate unigram frequencies
one_corpus_num <- colSums(as.matrix(dfm1)) # sum across documents
one_corpus_table <- data.frame(
  Word = names(one_corpus_num),
  frequency = one_corpus_num
)
one_corpus_sort <- one_corpus_table[order(-one_corpus_table$frequency), ]
# Plot top 10 unigrams
one_g <- ggplot(one_corpus_sort[1:10, ],
                aes(x = reorder(Word, -frequency), y = frequency, fill = frequency)) +
  geom_bar(stat = "identity") +
  labs(title = "Top 10 Unigrams", x = "Words", y = "Frequency") +
  theme(axis.text.x = element_text(angle = 90)) +
  coord_flip() # flips the bars horizontally for readability
one_g
library(quanteda)
library(ggplot2)
# Assuming sample_data is your character vector
corpus2 <- corpus(sample_data)
# Tokenize + lowercase + remove punctuation/numbers
toks2 <- tokens(corpus2, remove_punct = TRUE, remove_numbers = TRUE)
toks2 <- tokens_tolower(toks2)
toks2 <- tokens_remove(toks2, stopwords("english"))
# Create bigram DFM
dfm2 <- tokens_ngrams(toks2, n = 2) %>% dfm()
# Calculate bigram frequencies
two_corpus_num <- colSums(as.matrix(dfm2)) # sum across documents
two_corpus_table <- data.frame(
  Word = names(two_corpus_num),
  frequency = two_corpus_num
)
two_corpus_sort <- two_corpus_table[order(-two_corpus_table$frequency), ]
# Plot top 10 bigrams
two_g <- ggplot(two_corpus_sort[1:10, ],
                aes(x = reorder(Word, -frequency), y = frequency, fill = frequency)) +
  geom_bar(stat = "identity") +
  labs(title = "Top 10 Bigrams", x = "Words", y = "Frequency") +
  theme(axis.text.x = element_text(angle = 90)) +
  coord_flip() # horizontal bars
two_g
library(quanteda)
library(ggplot2)
# Assuming sample_data is your character vector
corpus3 <- corpus(sample_data)
# Tokenize + lowercase + remove punctuation/numbers
toks3 <- tokens(corpus3, remove_punct = TRUE, remove_numbers = TRUE)
toks3 <- tokens_tolower(toks3)
toks3 <- tokens_remove(toks3, stopwords("english"))
# Create trigram DFM
dfm3 <- tokens_ngrams(toks3, n = 3) %>% dfm()
# Calculate trigram frequencies
thr_corpus_num <- colSums(as.matrix(dfm3)) # sum across documents
thr_corpus_table <- data.frame(
  Word = names(thr_corpus_num),
  frequency = thr_corpus_num
)
thr_corpus_sort <- thr_corpus_table[order(-thr_corpus_table$frequency), ]
# Plot top 10 trigrams
thr_g <- ggplot(thr_corpus_sort[1:10, ],
                aes(x = reorder(Word, -frequency), y = frequency, fill = frequency)) +
  geom_bar(stat = "identity") +
  labs(title = "Top 10 Trigrams", x = "Words", y = "Frequency") +
  theme(axis.text.x = element_text(angle = 90)) +
  coord_flip() # horizontal bars
thr_g
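As a closing sketch built on the unigram table above, the cumulative frequency shows how many unique words are needed to cover 50% and 90% of all word occurrences in the sample, a rough guide to the vocabulary size the eventual prediction model will need.
# Cumulative share of word occurrences covered by the most frequent words
cum_cov <- cumsum(one_corpus_sort$frequency) / sum(one_corpus_sort$frequency)
c(words_for_50_pct = which(cum_cov >= 0.5)[1],
  words_for_90_pct = which(cum_cov >= 0.9)[1])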