The dataset consists of text files in four different languages: English (en_US), German (de_DE), Russian (ru_RU), and Finnish (fi_FI). These files contain text from blogs, news, and Twitter.
# Define the URL and destination file path
data_url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
destfile <- "Coursera-SwiftKey.zip"
# Check if the file exists before downloading
if (!file.exists(destfile)) {
download.file(data_url, destfile, mode = "wb")
}
# Unzip the dataset
unzip(destfile, exdir = "data")
# Define file paths
files <- list.files("data/final/en_US", full.names = TRUE)
# Read a sample from the dataset
con_twitter <- file(files[3], "r")
con_blogs <- file(files[1], "r")
con_news <- file(files[2], "r")
en_twitter <- readLines(con_twitter, encoding = "UTF-8", skipNul = TRUE, warn = FALSE)
en_blogs <- readLines(con_blogs, encoding = "UTF-8", skipNul = TRUE, warn = FALSE)
en_news <- readLines(con_news, encoding = "UTF-8", skipNul = TRUE, warn = FALSE)
close(con_twitter)
close(con_blogs)
close(con_news)
# Check the first few lines of each file
head(en_twitter, 5)
## [1] "How are you? Btw thanks for the RT. You gonna be in DC anytime soon? Love to see you. Been way, way too long."
## [2] "When you meet someone special... you'll know. Your heart will beat more rapidly and you'll smile for no reason."
## [3] "they've decided its more fun if I don't."
## [4] "So Tired D; Played Lazer Tag & Ran A LOT D; Ughh Going To Sleep Like In 5 Minutes ;)"
## [5] "Words from a complete stranger! Made my birthday even better :)"
head(en_blogs, 5)
## [1] "In the years thereafter, most of the Oil fields and platforms were named after pagan “gods”."
## [2] "We love you Mr. Brown."
## [3] "Chad has been awesome with the kids and holding down the fort while I work later than usual! The kids have been busy together playing Skylander on the XBox together, after Kyan cashed in his $$$ from his piggy bank. He wanted that game so bad and used his gift card from his birthday he has been saving and the money to get it (he never taps into that thing either, that is how we know he wanted it so bad). We made him count all of his money to make sure that he had enough! It was very cute to watch his reaction when he realized he did! He also does a very good job of letting Lola feel like she is playing too, by letting her switch out the characters! She loves it almost as much as him."
## [4] "so anyways, i am going to share some home decor inspiration that i have been storing in my folder on the puter. i have all these amazing images stored away ready to come to life when we get our home."
## [5] "With graduation season right around the corner, Nancy has whipped up a fun set to help you out with not only your graduation cards and gifts, but any occasion that brings on a change in one's life. I stamped the images in Memento Tuxedo Black and cut them out with circle Nestabilities. I embossed the kraft and red cardstock with TE's new Stars Impressions Plate, which is double sided and gives you 2 fantastic patterns. You can see how to use the Impressions Plates in this tutorial Taylor created. Just one pass through your die cut machine using the Embossing Pad Kit is all you need to do - super easy!"
head(en_news, 5)
## [1] "He wasn't home alone, apparently."
## [2] "The St. Louis plant had to close. It would die of old age. Workers had been making cars there since the onset of mass automotive production in the 1920s."
## [3] "WSU's plans quickly became a hot topic on local online sites. Though most people applauded plans for the new biomedical center, many deplored the potential loss of the building."
## [4] "The Alaimo Group of Mount Holly was up for a contract last fall to evaluate and suggest improvements to Trenton Water Works. But campaign finance records released this week show the two employees donated a total of $4,500 to the political action committee (PAC) Partners for Progress in early June. Partners for Progress reported it gave more than $10,000 in both direct and in-kind contributions to Mayor Tony Mack in the two weeks leading up to his victory in the mayoral runoff election June 15."
## [5] "And when it's often difficult to predict a law's impact, legislators should think twice before carrying any bill. Is it absolutely necessary? Is it an issue serious enough to merit their attention? Will it definitely not make the situation worse?"
set.seed(123)
sample_data <- function(file_path, sample_size = 0.1) {
con <- file(file_path, "r")
lines <- readLines(con, warn = FALSE)
close(con)
sample_lines <- sample(lines, size = floor(length(lines) * sample_size))
return(sample_lines)
}
sample_twitter <- sample_data(files[3])
sample_blogs <- sample_data(files[1])
sample_news <- sample_data(files[2])
library(tokenizers)
## Warning: package 'tokenizers' was built under R version 4.3.3
tokenize_text <- function(text) {
tokens <- unlist(tokenize_words(text, lowercase = TRUE, stopwords = NULL))
return(tokens)
}
twitter_tokens <- tokenize_text(sample_twitter)
blogs_tokens <- tokenize_text(sample_blogs)
news_tokens <- tokenize_text(sample_news)
# Load a list of profanity words (example list)
profanity_words <- c("fuck", "shit", "fucking")
filter_profanity <- function(tokens) {
return(tokens[!tokens %in% profanity_words])
}
twitter_tokens_clean <- filter_profanity(twitter_tokens)
blogs_tokens_clean <- filter_profanity(blogs_tokens)
news_tokens_clean <- filter_profanity(news_tokens)
This step ensures that we: - Load and explore the dataset - Sample a subset of the data to make processing efficient - Tokenize the text into words - Filter out profanity and unwanted words
The cleaned data will be used in the next steps to build a predictive text model.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
word_frequency <- function(tokens) {
freq_table <- as.data.frame(table(tokens)) %>%
arrange(desc(Freq))
return(freq_table)
}
twitter_freq <- word_frequency(twitter_tokens_clean)
blogs_freq <- word_frequency(blogs_tokens_clean)
news_freq <- word_frequency(news_tokens_clean)
plot_word_freq <- function(freq_table, title) {
ggplot(freq_table[1:20,], aes(x = reorder(tokens, -Freq), y = Freq)) +
geom_bar(stat = "identity", fill = "steelblue") +
coord_flip() +
labs(title = title, x = "Words", y = "Frequency")
}
plot_word_freq(twitter_freq, "Top 20 Words in Twitter Sample")
plot_word_freq(blogs_freq, "Top 20 Words in Blogs Sample")
plot_word_freq(news_freq, "Top 20 Words in News Sample")
library(tokenizers)
ngram_frequency <- function(text, n) {
ngrams <- unlist(tokenize_ngrams(text, n = n))
freq_table <- as.data.frame(table(ngrams)) %>%
arrange(desc(Freq))
return(freq_table)
}
twitter_bigrams <- ngram_frequency(sample_twitter, 2)
twitter_trigrams <- ngram_frequency(sample_twitter, 3)
coverage_analysis <- function(freq_table, coverage = 0.5) {
total_words <- sum(freq_table$Freq)
cumulative_sum <- cumsum(freq_table$Freq)
required_words <- which(cumulative_sum / total_words >= coverage)[1]
return(required_words)
}
words_50 <- coverage_analysis(twitter_freq, 0.5)
words_90 <- coverage_analysis(twitter_freq, 0.9)
The goal of this task is to build a predictive text model using n-grams to predict the next word based on previous words.
We will start by building a basic n-gram model (unigram, bigram, trigram) to predict the next word based on previous words.
library(ngram)
## Warning: package 'ngram' was built under R version 4.3.2
# Function to create n-grams
create_ngrams <- function(tokens, n = 2) {
if (length(tokens) < n) {
return(NULL) # If there are fewer than 'n' tokens, return NULL
}
# Combine tokens into a single string (for ngram package)
text <- paste(tokens, collapse = " ")
# Create the n-grams
ngrams <- ngram::ngram(text, n = n)
return(ngrams)
}
# Sample data from tokens (adjust based on your needs)
sample_twitter_tokens <- twitter_tokens_clean[1:100000]
# Generate n-grams
unigrams <- create_ngrams(sample_twitter_tokens, 1)
bigrams <- create_ngrams(sample_twitter_tokens, 2)
trigrams <- create_ngrams(sample_twitter_tokens, 3)
# Extract the n-grams from the ngram object
unigram_tokens <- get.ngrams(unigrams)
bigram_tokens <- get.ngrams(bigrams)
trigram_tokens <- get.ngrams(trigrams)
# Now, calculate the frequency using table()
unigram_freq <- table(unigram_tokens)
bigram_freq <- table(bigram_tokens)
trigram_freq <- table(trigram_tokens)
# View the top 10 most frequent n-grams
cat("Top 10 Unigrams:\n")
## Top 10 Unigrams:
print(head(sort(unigram_freq, decreasing = TRUE), 10))
## unigram_tokens
## __ ___ ____ ________ __________ 0 0.50
## 1 1 1 1 1 1 1
## 0_o 00 00pm
## 1 1 1
cat("Top 10 Bigrams:\n")
## Top 10 Bigrams:
print(head(sort(bigram_freq, decreasing = TRUE), 10))
## bigram_tokens
## __ i __ indeed __ rt __ what __ when ___ 3 ___ its ___ lol
## 1 1 1 1 1 1 1 1
## ___ why ____ no
## 1 1
cat("Top 10 Trigrams:\n")
## Top 10 Trigrams:
print(head(sort(trigram_freq, decreasing = TRUE), 10))
## trigram_tokens
## __ i take __ indeed i __ rt even __ what about __ when you
## 1 1 1 1 1
## ___ 3 loving ___ its cray ___ lol drinkin ___ why aren't ____ no you
## 1 1 1 1 1
# Function for Laplace smoothing
laplace_smoothing <- function(ngram_freq, vocab_size, alpha = 1) {
smoothed_freq <- ngram_freq + alpha
smoothed_prob <- smoothed_freq / (sum(smoothed_freq) + vocab_size * alpha)
return(smoothed_prob)
}
# Apply Laplace smoothing to unigram, bigram, trigram frequencies
vocab_size <- length(unique(twitter_tokens_clean)) # total number of unique words
smoothed_unigrams <- laplace_smoothing(unigram_freq, vocab_size)
smoothed_bigrams <- laplace_smoothing(bigram_freq, vocab_size)
smoothed_trigrams <- laplace_smoothing(trigram_freq, vocab_size)
# Display the smoothed n-grams' probabilities
cat("Smoothed Unigrams:\n")
## Smoothed Unigrams:
print(head(smoothed_unigrams, 10))
## unigram_tokens
## __ ___ ____ ________ __________ 0
## 1.621297e-05 1.621297e-05 1.621297e-05 1.621297e-05 1.621297e-05 1.621297e-05
## 0.50 0_o 00 00pm
## 1.621297e-05 1.621297e-05 1.621297e-05 1.621297e-05
cat("Smoothed Bigrams:\n")
## Smoothed Bigrams:
print(head(smoothed_bigrams, 10))
## bigram_tokens
## __ i __ indeed __ rt __ what __ when ___ 3
## 8.734158e-06 8.734158e-06 8.734158e-06 8.734158e-06 8.734158e-06 8.734158e-06
## ___ its ___ lol ___ why ____ no
## 8.734158e-06 8.734158e-06 8.734158e-06 8.734158e-06
cat("Smoothed Trigrams:\n")
## Smoothed Trigrams:
print(head(smoothed_trigrams, 10))
## trigram_tokens
## __ i take __ indeed i __ rt even __ what about __ when you
## 7.042898e-06 7.042898e-06 7.042898e-06 7.042898e-06 7.042898e-06
## ___ 3 loving ___ its cray ___ lol drinkin ___ why aren't ____ no you
## 7.042898e-06 7.042898e-06 7.042898e-06 7.042898e-06 7.042898e-06
# Function to calculate perplexity
calculate_perplexity <- function(test_data, model) {
log_likelihood <- 0
total_words <- 0
for (i in 1:(length(test_data) - 1)) {
word_pair <- paste(test_data[i], test_data[i + 1])
prob <- model[word_pair]
if (!is.na(prob)) {
log_likelihood <- log_likelihood + log(prob)
total_words <- total_words + 1
}
}
perplexity <- exp(-log_likelihood / total_words)
return(perplexity)
}
# Example of evaluating model on test data (using a sample sequence of words)
test_data_sample <- sample_twitter[1:10000] # Taking a smaller sample for testing
perplexity_value <- calculate_perplexity(test_data_sample, smoothed_bigrams)
cat("Perplexity Value:\n")
## Perplexity Value:
print(perplexity_value)
## [1] NaN
# Check the memory usage of the bigram model
cat("Memory usage of smoothed bigrams:\n")
## Memory usage of smoothed bigrams:
print(object.size(smoothed_bigrams), units = "Mb")
## 5 Mb
# Profile the runtime of a prediction
Rprof("profile.out")
# Backoff model not defined in the original code, skipping the call
# backoff_model(query_example, smoothed_trigrams, smoothed_bigrams, smoothed_unigrams)
Rprof(NULL)
# Summarize profiling results
summaryRprof("profile.out")
## $by.self
## [1] self.time self.pct total.time total.pct
## <0 rows> (or 0-length row.names)
##
## $by.total
## [1] total.time total.pct self.time self.pct
## <0 rows> (or 0-length row.names)
##
## $sample.interval
## [1] 0.02
##
## $sampling.time
## [1] 0