The objective of this milestone report is to document a step-by-step process for preparing a dataset that will be used to train a prediction model.
Three data sources are provided for this project (Blogs, News, Twitter). Although the dataset includes several languages, only the English-language files are used here.
library(stringi); library(xtable); library(tm); library(tidytext); library(tokenizers)
library(stringr); library(dplyr); library(ggplot2)
library(quanteda); library(quanteda.textstats); library(ngram)
dataset_url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
dest_file <- "dataset.zip"
if (!file.exists(dest_file)) {
  download.file(dataset_url, destfile = dest_file, method = "curl")
}
dest_dir <- "/Users/metehansoysal/Documents/Projects/R Programming Class/Capstone Project/final"
if (!dir.exists(dest_dir)) {
  path <- "/Users/metehansoysal/Documents/Projects/R Programming Class/Capstone Project"
  unzip(dest_file, exdir = path)
}
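To confirm that the archive was extracted correctly, the English files can be listed before reading them; a minimal sketch, assuming the same directory layout used above:

# List the extracted English-language files
list.files(file.path(dest_dir, "en_US"))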
blog_file <- "/Users/metehansoysal/Documents/Projects/R Programming Class/Capstone Project/final/en_US/en_US.blogs.txt"
news_file <- "/Users/metehansoysal/Documents/Projects/R Programming Class/Capstone Project/final/en_US/en_US.news.txt"
twitter_file <- "/Users/metehansoysal/Documents/Projects/R Programming Class/Capstone Project/final/en_US/en_US.twitter.txt"
con_blog <- file(blog_file, open = "r")
con_news <- file(news_file, open = "r")
con_twitter <- file(twitter_file, open = "r")
rblog <- readLines(con = con_blog, encoding = "UTF-8", skipNul = TRUE)
rnews <- readLines(con = con_news, encoding = "UTF-8", skipNul = TRUE)
rtwitter <- readLines(con = con_twitter, encoding = "UTF-8", skipNul = TRUE)
close(con_blog); close(con_news); close(con_twitter)
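Before any further processing, it can be useful to check the in-memory footprint of the three character vectors, which is typically larger than the on-disk sizes computed next; a minimal sketch:

# Approximate in-memory size of each source, in megabytes
sapply(list(Blog = rblog, News = rnews, Twitter = rtwitter),
       function(x) format(object.size(x), units = "MB"))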
blog_size <- round(file.info(blog_file)$size / 1024^2)
news_size <- round(file.info(news_file)$size / 1024^2)
twitter_size <- round(file.info(twitter_file)$size / 1024^2)
file_sizes <- c(blog_size, news_size, twitter_size)
number_of_lines <- sapply(list(rblog, rnews, rtwitter), length)
words_per_line <- lapply(list(rblog, rnews, rtwitter), stri_count_words)
ggplot(data.frame(words = words_per_line[[1]]), aes(x = words)) +
  geom_histogram(binwidth = 50, color = "black", fill = "white") +
  labs(title = "Blog", x = "Words per Line", y = "Frequency")
ggplot(data.frame(words = words_per_line[[2]]), aes(x = words)) +
  geom_histogram(binwidth = 50, color = "black", fill = "white") +
  labs(title = "News", x = "Words per Line", y = "Frequency")
ggplot(data.frame(words = words_per_line[[3]]), aes(x = words)) +
  geom_histogram(binwidth = 5, color = "black", fill = "white") +
  labs(title = "Twitter", x = "Words per Line", y = "Frequency")
wpl_summary <- sapply(list(rblog, rnews, rtwitter),
                      function(x) summary(stri_count_words(x))[c('Min.', 'Mean', 'Max.')])
rownames(wpl_summary) <- c("Min", "Mean", "Max")
colnames(wpl_summary) <- c("Blog", "News", "Twitter")
wpl_summary
## Blog News Twitter
## Min 0.00000 1.00000 1.00000
## Mean 41.75109 34.40997 12.75065
## Max 6726.00000 1796.00000 47.00000
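The minimum of zero for the blog file suggests some lines contain no words at all; a quick check, as a minimal sketch:

# Count blog lines that contain no words
sum(stri_count_words(rblog) == 0)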
rm(words_per_line)
number_of_words <- sapply(list(rblog, rnews, rtwitter), stri_stats_latex)[4,]
sum_files <- data.frame(
  File_Name = c("Blog", "News", "Twitter"),
  File_Size_MB = file_sizes,
  Lines = number_of_lines,
  Words = number_of_words
)
print(sum_files)
## File_Name File_Size_MB Lines Words
## 1 Blog 200 899288 37570839
## 2 News 196 1010242 34494539
## 3 Twitter 159 2360148 30451170
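Since xtable is already loaded, the same summary could be rendered as a formatted table in the knitted report; a minimal sketch (in an R Markdown chunk this would need the option results = 'asis'):

# Render the summary data frame as an HTML table
print(xtable(sum_files), type = "html", include.rownames = FALSE)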
set.seed(1234)
sample_size <- 0.01
rblog_sampled <- sample(rblog, length(rblog) * sample_size, replace = FALSE)
rnews_sampled <- sample(rnews, length(rnews) * sample_size, replace = FALSE)
rtwit_sampled <- sample(rtwitter, length(rtwitter) * sample_size, replace = FALSE)
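Because sampling happens on every run, the 1% samples could be cached to disk so later chunks can be re-knit without reloading the full files; a minimal sketch (the file name is an arbitrary choice):

# Cache the sampled lines; reload them instead of re-sampling on later runs
sample_file <- "sampled_data.rds"
if (!file.exists(sample_file)) {
  saveRDS(list(blog = rblog_sampled, news = rnews_sampled, twitter = rtwit_sampled),
          sample_file)
}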
# Drop non-ASCII (non-English) characters from the sampled data
rblog_sampled <- iconv(rblog_sampled, "latin1", "ASCII", sub = "")
rnews_sampled <- iconv(rnews_sampled, "latin1", "ASCII", sub = "")
rtwit_sampled <- iconv(rtwit_sampled, "latin1", "ASCII", sub = "")
corpus <- VCorpus(VectorSource(c(rblog_sampled, rnews_sampled, rtwit_sampled)))
rm(rblog, rnews, rtwitter)
# Load the profanity words
profanity_words <- readLines("https://github.com/coffee-and-fun/google-profanity-words/raw/main/data/list.txt")
profanity <- iconv(profanity_words, "latin1", "ASCII", sub = "")
corpus <- tm_map(corpus, removeWords, profanity)
# Clean the corpus: remove URLs, Twitter handles, email addresses, and English stopwords,
# then lower-case the text and strip punctuation, numbers, and extra whitespace
replace_with_space <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
corpus <- tm_map(corpus, replace_with_space, "http\\S+|www\\.\\S+")
corpus <- tm_map(corpus, replace_with_space, "@\\w+")
corpus <- tm_map(corpus, replace_with_space, "\\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Z|a-z]{2,}\\b")
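# Note: stopwords are removed before the text is lower-cased, so capitalised
# stopwords (e.g. "The", "I") survive this step; this is why "i" and "the"
# still appear among the most frequent unigrams further below.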
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, content_transformer(removePunctuation))
corpus <- tm_map(corpus, content_transformer(removeNumbers))
corpus <- tm_map(corpus, content_transformer(stripWhitespace))
rm(rblog_sampled, rnews_sampled, rtwit_sampled, profanity, profanity_words)
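To sanity-check the cleaning steps, a few documents can be printed before building the term-document matrix; a minimal sketch (the indices are arbitrary):

# Print a few cleaned documents to verify the transformations
for (i in 1:3) {
  cat(as.character(corpus[[i]]), "\n")
}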
# Create a term-document matrix
tdm <- TermDocumentMatrix(corpus)
# Inspect a summary and a small sample of the term-document matrix
inspect(tdm)
## <<TermDocumentMatrix (terms: 54792, documents: 42695)>>
## Non-/sparse entries: 523436/2338821004
## Sparsity : 100%
## Maximal term length: 48
## Weighting : term frequency (tf)
## Sample :
## Docs
## Terms 1143 1594 4106 4706 6077 6349 6844 7172 8167 849
## can 0 0 2 2 0 1 2 1 1 1
## get 1 0 0 2 0 0 0 2 3 0
## good 2 0 0 1 0 0 0 0 0 0
## just 1 2 0 1 0 0 0 0 3 1
## like 3 4 1 4 0 3 1 1 0 0
## new 0 0 1 0 0 0 0 1 0 0
## one 2 1 1 2 0 3 0 2 0 4
## said 0 0 0 0 0 0 4 0 0 0
## the 1 1 4 2 3 2 3 0 2 0
## time 1 3 0 1 1 0 0 0 0 1
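The matrix above is essentially 100% sparse, so it is kept in its sparse form below. If a dense version were ever needed (for example to build a word cloud), most of the rarely occurring terms could be dropped first; a minimal sketch (the 0.999 threshold is an arbitrary choice):

# Drop terms that are absent from more than 99.9% of documents
tdm_small <- removeSparseTerms(tdm, sparse = 0.999)
dim(tdm_small)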
# Compute the total frequency of each term directly on the sparse matrix
# (coercing the full TDM to a dense data frame would not fit in memory;
# slam is available as a dependency of tm)
term_freq <- slam::row_sums(tdm)
# Sort the terms by frequency in descending order
freq_df_sorted <- data.frame(term = names(term_freq), freq = as.numeric(term_freq)) %>%
  arrange(desc(freq))
# Select the top 10 most frequent terms
top_terms <- head(freq_df_sorted, n = 10)
# Create a bar plot of the top 10 most frequent terms
ggplot(top_terms, aes(x = reorder(term, -freq), y = freq)) +
  geom_bar(stat = "identity") +
  ggtitle("Top 10 Most Frequent Words") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  xlab("Term") + ylab("Frequency")
rm(tdm, term_freq, freq_df_sorted)
# Convert the corpus to a character vector
corpus_chr <- sapply(corpus, as.character)
# Tokenize the corpus
unigrams <- tokenize_ngrams(corpus_chr, n = 1)
bigrams <- tokenize_ngrams(corpus_chr, n = 2)
trigrams <- tokenize_ngrams(corpus_chr, n = 3)
# Get top 10 items from each list
top_unigrams <- head(sort(table(unlist(unigrams)), decreasing = TRUE), 10)
top_bigrams <- head(sort(table(unlist(bigrams)), decreasing = TRUE), 10)
top_trigrams <- head(sort(table(unlist(trigrams)), decreasing = TRUE), 10)
# Print the top 10 items from each list
cat("Top 10 Unigrams: ", paste(names(top_unigrams), collapse = ", "), "\n")
## Top 10 Unigrams: i, the, said, just, one, like, can, im, get, time
cat("Top 10 Bigrams: ", paste(names(top_bigrams), collapse = ", "), "\n")
## Top 10 Bigrams: i think, i know, i love, i just, i can, i want, right now, i like, i really, i feel
cat("Top 10 Trigrams: ", paste(names(top_trigrams), collapse = ", "))
## Top 10 Trigrams: i feel like, i know i, i think i, i wish i, i dont know, i just want, happy mothers day, feel like i, i thought i, i dont think
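The bigram counts can be plotted the same way as the unigram frequencies above; a minimal sketch:

# Bar plot of the 10 most frequent bigrams
bigram_df <- data.frame(bigram = names(top_bigrams), freq = as.numeric(top_bigrams))
ggplot(bigram_df, aes(x = reorder(bigram, -freq), y = freq)) +
  geom_bar(stat = "identity") +
  ggtitle("Top 10 Most Frequent Bigrams") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  xlab("Bigram") + ylab("Frequency")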
# Create an empty list to store the n-gram models
ngram_models <- list()
# Loop through each n-gram order and estimate the relative frequency
# (maximum-likelihood probability) of every n-gram from the tokenized lists
ngram_lists <- list(unigrams, bigrams, trigrams)
for (n in 1:3) {
  ngram_models[[n]] <- prop.table(table(unlist(ngram_lists[[n]])))
}
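To illustrate how these tables feed into the eventual prediction model, the sketch below looks up the most probable continuations of a short prefix, backing off from the trigram table to the bigram table when no trigram matches. The helper predict_next_word() is hypothetical and not part of the model built in this report.

# Hypothetical helper: suggest the most probable next words for a given prefix,
# trying the trigram table first and backing off to the bigram table.
predict_next_word <- function(prefix, models = ngram_models, top_n = 3) {
  words <- strsplit(tolower(trimws(prefix)), "\\s+")[[1]]
  if (length(words) >= 2) {
    # Try trigrams that start with the last two words of the prefix
    pattern <- paste0("^", paste(tail(words, 2), collapse = " "), " ")
    hits <- models[[3]][grepl(pattern, names(models[[3]]))]
    if (length(hits) > 0) {
      top <- names(sort(hits, decreasing = TRUE))[seq_len(min(top_n, length(hits)))]
      return(sub(pattern, "", top))
    }
  }
  # Back off to bigrams that start with the last word of the prefix
  pattern <- paste0("^", tail(words, 1), " ")
  hits <- models[[2]][grepl(pattern, names(models[[2]]))]
  top <- names(sort(hits, decreasing = TRUE))[seq_len(min(top_n, length(hits)))]
  sub(pattern, "", top)
}

# Example lookup using one of the frequent bigram prefixes seen above
predict_next_word("i feel")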