Context

The objective of this milestone report is to document, step by step, how the dataset is prepared for training a text-prediction model.

Three data sources are provided for this project (Blogs, News, Twitter). Although the files come in several languages, only the English-language files are used here.

Loading the libraries

library(stringi); library(xtable); library(tm); library(tidytext); library(tokenizers)
library(stringr); library(dplyr); library(ggplot2)
library(quanteda); library(quanteda.textstats); library(ngram)

Download the data

dataset_url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
dest_file <- "dataset.zip"

if (!file.exists(dest_file)) {
        download.file(dataset_url, destfile=dest_file, method="curl")
}
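
As a quick sanity check before extracting, the archive's contents can be listed without unzipping it. This is only an optional sketch, not part of the original pipeline:

# List the files inside the downloaded archive (no extraction) to confirm
# the en_US text files are present before unzipping.
if (file.exists(dest_file)) {
        head(unzip(dest_file, list = TRUE)$Name, 20)
}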

Unzip the data

dest_dir <- "/Users/metehansoysal/Documents/Projects/R Programming Class/Capstone Project/final"
if (!dir.exists(dest_dir)) {
        path <- "/Users/metehansoysal/Documents/Projects/R Programming Class/Capstone Project"
        unzip(dest_file, exdir = path)        
}

File Processing

Assign the data file paths to variables and read the files into R

blog_file <- "/Users/metehansoysal/Documents/Projects/R Programming Class/Capstone Project/final/en_US/en_US.blogs.txt"
news_file <- "/Users/metehansoysal/Documents/Projects/R Programming Class/Capstone Project/final/en_US/en_US.news.txt"
twitter_file <- "/Users/metehansoysal/Documents/Projects/R Programming Class/Capstone Project/final/en_US/en_US.twitter.txt"

con_blog <- file(blog_file, open = "r")
con_news <- file(news_file, open = "r")
con_twitter <- file(twitter_file, open = "r")

rblog <- readLines(con = con_blog, encoding = "UTF-8", skipNul = TRUE)
rnews <- readLines(con = con_news, encoding = "UTF-8", skipNul = TRUE)
rtwitter <- readLines(con = con_twitter, encoding = "UTF-8", skipNul = TRUE)

close(con_blog); close(con_news); close(con_twitter)
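
The three read/close blocks above could also be wrapped in a small helper so each connection is guaranteed to be closed even if reading fails. This is an optional refactoring sketch; read_text_file() is a hypothetical helper, not part of the report's pipeline:

# Hypothetical helper: open a connection, read all lines, and close the
# connection on exit (even on error) via on.exit().
read_text_file <- function(path) {
        con <- file(path, open = "r")
        on.exit(close(con))
        readLines(con, encoding = "UTF-8", skipNul = TRUE)
}

# Equivalent to the three readLines() calls above, e.g.:
# rblog <- read_text_file(blog_file)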

File Sizes in MB

blog_size <- round(file.info(blog_file)$size / 1024^2)
news_size <- round(file.info(news_file)$size / 1024^2)
twitter_size <- round(file.info(twitter_file)$size / 1024^2)

file_sizes <- c(blog_size, news_size, twitter_size)

Number of lines

number_of_lines <- sapply(list(rblog, rnews, rtwitter), length)

Number of Words Per Line

words_per_line <- lapply(list(rblog, rnews, rtwitter), stri_count_words)

ggplot(data.frame(words = words_per_line[[1]]), aes(x = words)) +
  geom_histogram(binwidth = 50, color = "black", fill = "white") +
  labs(title = "Blog", x = "Words per Line", y = "Frequency")

ggplot(data.frame(words = words_per_line[[2]]), aes(x = words)) +
  geom_histogram(binwidth = 50, color = "black", fill = "white") +
  labs(title = "News", x = "Words per Line", y = "Frequency")

ggplot(data.frame(words = words_per_line[[3]]), aes(x = words)) +
  geom_histogram(binwidth = 5, color = "black", fill = "white") +
  labs(title = "Twitter", x = "Words per Line", y = "Frequency")

wpl_summary <- sapply(list(rblog, rnews, rtwitter), 
                      function(x) summary(stri_count_words(x))[c('Min.', 'Mean', 'Max.')])
rownames(wpl_summary) <- c("Min", "Mean", "Max")
colnames(wpl_summary) <- c("Blog", "News", "Twitter")
wpl_summary
##            Blog       News  Twitter
## Min     0.00000    1.00000  1.00000
## Mean   41.75109   34.40997 12.75065
## Max  6726.00000 1796.00000 47.00000
rm(words_per_line)

Number of Words

# Row 4 of stri_stats_latex() is the word count ("Words")
number_of_words <- sapply(list(rblog, rnews, rtwitter), stri_stats_latex)[4,]
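
For a rough cross-check, the same counts can be obtained by summing the per-line word counts. This sketch is not part of the original pipeline, and the two methods can differ slightly because they tokenize words differently:

# Alternative word count: sum of per-line word counts from stri_count_words().
sapply(list(rblog, rnews, rtwitter), function(x) sum(stri_count_words(x)))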

Summary

sum_files <- data.frame(
                File_Name = c("Blog", "News", "Twitter"),
                File_Size_MB = file_sizes,
                Lines = number_of_lines,
                Words = number_of_words
)
print(sum_files)
##   File_Name File_Size_MB   Lines    Words
## 1      Blog          200  899288 37570839
## 2      News          196 1010242 34494539
## 3   Twitter          159 2360148 30451170
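
Since xtable is loaded above but not otherwise used, the summary could also be rendered as a formatted table in the knitted document. A minimal optional sketch (knitr::kable() is shown as an alternative and assumes the report is knitted with knitr):

# Optional: render the summary as a formatted table in the knitted document.
# print(xtable(sum_files), type = "html")   # using the xtable package loaded above
knitr::kable(sum_files, caption = "Summary of the three source files")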

Sampling the Dataset

Sampling at 1% of Each Dataset

set.seed(1234)
sample_size <- 0.01

rblog_sampled <- sample(rblog, round(length(rblog) * sample_size), replace = FALSE)
rnews_sampled <- sample(rnews, round(length(rnews) * sample_size), replace = FALSE)
rtwit_sampled <- sample(rtwitter, round(length(rtwitter) * sample_size), replace = FALSE)
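
To keep later runs reproducible without re-reading the full files, the sampled vectors could be cached to disk. This is an optional sketch; the "samples" directory name is arbitrary:

# Optional: cache the 1% samples so later sessions can skip the full files.
if (!dir.exists("samples")) dir.create("samples")
saveRDS(rblog_sampled, "samples/rblog_sampled.rds")
saveRDS(rnews_sampled, "samples/rnews_sampled.rds")
saveRDS(rtwit_sampled, "samples/rtwit_sampled.rds")
# Reload later with, e.g., rblog_sampled <- readRDS("samples/rblog_sampled.rds")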

Cleaning the Non-English Characters

# remove all non-English characters from the sampled data
rblog_sampled <- iconv(rblog_sampled, "latin1", "ASCII", sub = "")
rnews_sampled <- iconv(rnews_sampled, "latin1", "ASCII", sub = "")
rtwit_sampled <- iconv(rtwit_sampled, "latin1", "ASCII", sub = "")
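
A roughly equivalent way to strip non-ASCII characters, shown only as an optional cross-check, is a regular-expression replacement with stringi (already loaded); the example string below is purely illustrative:

# Alternative: drop every character outside the printable ASCII range with a regex.
# The same call could replace the iconv() lines above.
stri_replace_all_regex("café – naïve tweet", "[^\\x20-\\x7E]", "")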

Building the Corpus and Cleaning

Initial Corpus Build

corpus <- VCorpus(VectorSource(c(rblog_sampled, rnews_sampled, rtwit_sampled)))
rm(rblog, rnews, rtwitter)

Remove profanity words, URLs, and other unwanted patterns from the corpus

# Load the profanity words
profanity_words <- readLines("https://github.com/coffee-and-fun/google-profanity-words/raw/main/data/list.txt")

profanity <- iconv(profanity_words, "latin1", "ASCII", sub = "")

# Lowercase first so the (lowercase) profanity and stopword lists also match capitalized words
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeWords, profanity)

# Replace URLs, Twitter handles, and email addresses with spaces, then remove stopwords
replace_with_space <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
corpus <- tm_map(corpus, replace_with_space, "http\\S+|www\\.\\S+")
corpus <- tm_map(corpus, replace_with_space, "@\\w+")
corpus <- tm_map(corpus, replace_with_space, "\\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}\\b")
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, content_transformer(removePunctuation))
corpus <- tm_map(corpus, content_transformer(removeNumbers))
corpus <- tm_map(corpus, content_transformer(stripWhitespace))
rm(rblog_sampled, rnews_sampled, rtwit_sampled, profanity, profanity_words)
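
Before building the term-document matrix it is worth eyeballing a couple of cleaned documents to confirm the transformations behaved as expected. A quick optional check:

# Spot-check the first two cleaned documents.
writeLines(sapply(corpus[1:2], as.character))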

Finding and Plotting the Highest-Frequency Words

# Create a term-document matrix
tdm <- TermDocumentMatrix(corpus)

# Inspect the term-document matrix (prints a summary and a small sample of documents)
inspect(tdm)
## <<TermDocumentMatrix (terms: 54792, documents: 42695)>>
## Non-/sparse entries: 523436/2338821004
## Sparsity           : 100%
## Maximal term length: 48
## Weighting          : term frequency (tf)
## Sample             :
##       Docs
## Terms  1143 1594 4106 4706 6077 6349 6844 7172 8167 849
##   can     0    0    2    2    0    1    2    1    1   1
##   get     1    0    0    2    0    0    0    2    3   0
##   good    2    0    0    1    0    0    0    0    0   0
##   just    1    2    0    1    0    0    0    0    3   1
##   like    3    4    1    4    0    3    1    1    0   0
##   new     0    0    1    0    0    0    0    1    0   0
##   one     2    1    1    2    0    3    0    2    0   4
##   said    0    0    0    0    0    0    4    0    0   0
##   the     1    1    4    2    3    2    3    0    2   0
##   time    1    3    0    1    1    0    0    0    0   1
# Compute the total frequency of each term directly on the sparse matrix
# (slam is installed as a dependency of tm; this avoids building a huge dense data frame)
term_totals <- slam::row_sums(tdm)
freq_df <- data.frame(term = names(term_totals), freq = as.numeric(term_totals))

# Sort the data frame by frequency in descending order
freq_df_sorted <- freq_df %>% arrange(desc(freq))

# Select the top 10 most frequent terms
top_terms <- head(freq_df_sorted, n = 10)

# Create a bar plot of the top 10 most frequent terms
ggplot(top_terms, aes(x = reorder(term, -freq), y = freq)) + 
  geom_bar(stat = "identity") +
  ggtitle("Top 10 Most Frequent Words") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  xlab("Term") + ylab("Frequency")

rm(tdm, term_totals, freq_df, freq_df_sorted)
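
As an optional cross-check, the same ranking can be reproduced with a quanteda document-feature matrix (quanteda and quanteda.textstats are loaded above but not otherwise used). The rankings should broadly agree; the chk_ objects are temporary:

# Cross-check the term frequencies with quanteda.
chk_dfm <- dfm(tokens(sapply(corpus, as.character)))
head(textstat_frequency(chk_dfm), 10)
rm(chk_dfm)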

Tokenizing and Building N-Grams

Creating Unigrams, Bigrams, and Trigrams & Listing the Top 10

# Convert the corpus to a character vector
corpus_chr <- sapply(corpus, as.character)

# Tokenize the corpus
unigrams <- tokenize_ngrams(corpus_chr, n = 1)
bigrams <- tokenize_ngrams(corpus_chr, n = 2)
trigrams <- tokenize_ngrams(corpus_chr, n = 3)

# Get top 10 items from each list
top_unigrams <- head(sort(table(unlist(unigrams)), decreasing = TRUE), 10)
top_bigrams <- head(sort(table(unlist(bigrams)), decreasing = TRUE), 10)
top_trigrams <- head(sort(table(unlist(trigrams)), decreasing = TRUE), 10)

# Print the top 10 items from each list
cat("Top 10 Unigrams: ", paste(names(top_unigrams), collapse = ", "), "\n")
## Top 10 Unigrams:  i, the, said, just, one, like, can, im, get, time
cat("Top 10 Bigrams: ", paste(names(top_bigrams), collapse = ", "), "\n")
## Top 10 Bigrams:  i think, i know, i love, i just, i can, i want, right now, i like, i really, i feel
cat("Top 10 Trigrams: ", paste(names(top_trigrams), collapse = ", "))
## Top 10 Trigrams:  i feel like, i know i, i think i, i wish i, i dont know, i just want, happy mothers day, feel like i, i thought i, i dont think
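
The top n-grams could also be visualised in the same style as the word-frequency chart above. A brief optional sketch for the bigrams (the unigram and trigram tables plot the same way):

# Optional: bar plot of the ten most frequent bigrams.
bigram_df <- data.frame(bigram = names(top_bigrams), freq = as.numeric(top_bigrams))
ggplot(bigram_df, aes(x = reorder(bigram, -freq), y = freq)) +
  geom_bar(stat = "identity") +
  ggtitle("Top 10 Most Frequent Bigrams") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  xlab("Bigram") + ylab("Frequency")
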
# Create an empty list to store the n-gram models
ngram_models <- list()

# Estimate each n-gram's probability by its relative frequency (maximum likelihood)
for (n in 1:3) {
  if (n == 1) {
    ngram_models[[n]] <- prop.table(table(unlist(unigrams)))
  } else if (n == 2) {
    ngram_models[[n]] <- prop.table(table(unlist(bigrams)))
  } else if (n == 3) {
    ngram_models[[n]] <- prop.table(table(unlist(trigrams)))
  }
}
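
As a preview of how these tables will feed the prediction model, the sketch below shows one way the trigram relative frequencies could back a next-word lookup. predict_next() is a hypothetical helper written only for illustration, not part of the pipeline above:

# Hypothetical helper: given a two-word prefix, return the most probable
# continuations according to the trigram table in ngram_models[[3]].
predict_next <- function(prefix, model = ngram_models[[3]], top = 3) {
  hits <- model[startsWith(names(model), paste0(prefix, " "))]
  if (length(hits) == 0) return(character(0))
  hits <- sort(hits, decreasing = TRUE)
  # Drop the prefix and the following space to keep only the predicted word
  head(substring(names(hits), nchar(prefix) + 2), top)
}

# Example: the top-trigram list above suggests "know" and "think" should rank highly.
predict_next("i dont")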