This report presents an exploratory analysis of the data used to build a word prediction model. The goal is to build a basic n-gram model that predicts the next word from the previous 1, 2, or 3 words. The data come from three sources: 1. Blogs, 2. Twitter, 3. News. At this intermediate stage of the project, the data are sampled, preprocessed, and tokenized to generate n-grams. The report also outlines the plan for building the word prediction app.
| source | size_MB | line_count | word_count |
|---|---|---|---|
| blogs | 200.4242 | 899288 | 37334131 |
| twitter | 159.3641 | 2360148 | 30373583 |
| news | 196.2775 | 77259 | 2643969 |
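The raw files are read and summarized before sampling; a minimal sketch of how the table above could be produced is shown below. The file names, the use of stringi::stri_count_words for word counts, and the knitr::kable call are assumptions for illustration, not code taken from this report.
# read the raw files and summarize size, line count, and word count
library(stringi)
files <- c(blogs = "en_US.blogs.txt", twitter = "en_US.twitter.txt", news = "en_US.news.txt")
blogs   <- readLines(files["blogs"],   encoding = "UTF-8", skipNul = TRUE)
twitter <- readLines(files["twitter"], encoding = "UTF-8", skipNul = TRUE)
news    <- readLines(files["news"],    encoding = "UTF-8", skipNul = TRUE)
df_raw <- data.frame(
  source     = names(files),
  size_MB    = file.size(files) / 1024^2,
  line_count = c(length(blogs), length(twitter), length(news)),
  word_count = c(sum(stri_count_words(blogs)),
                 sum(stri_count_words(twitter)),
                 sum(stri_count_words(news))),
  row.names  = NULL
)
knitr::kable(df_raw)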
A random sample of 5% of each source is taken to represent the larger data set. The following table summarizes the samples.
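The chunk below relies on the helper functions fn_sample and fn_words, which are defined earlier in the analysis. Hedged sketches of what they are assumed to do (simple random sampling of lines and word counting, respectively) are given here for reference only.
# assumed helpers (illustrative; the actual definitions appear earlier in the analysis)
# fn_sample: draw a random subset of lines, given a sampling fraction
fn_sample <- function(x, percent) sample(x, size = round(length(x) * percent))
# fn_words: count the total number of words in a character vector
fn_words <- function(x) sum(stringi::stri_count_words(x))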
# set seed for reproducibility and sample 5% of each source
set.seed(123)
percent <- 0.05
blogs_sample <- fn_sample(blogs, percent)
twitter_sample <- fn_sample(twitter, percent)
news_sample <- fn_sample(news, percent)
# free memory used by the full data sets
rm(blogs, twitter, news)
# create a list to store the samples
list_sample <- list(blogs = blogs_sample, twitter = twitter_sample, news = news_sample)
# create a data frame summarizing the samples
df_sample <- data.frame(source = c("blogs", "twitter", "news"), line_count = NA, word_count = NA)
# get line and word counts for each sample
df_sample$line_count <- sapply(list_sample, length)
df_sample$word_count <- sapply(list_sample, fn_words)
rm(list_sample)
kable(df_sample)
| source | line_count | word_count |
|---|---|---|
| blogs | 44964 | 1873491 |
| twitter | 118007 | 1521049 |
| news | 3862 | 132905 |
Data preprocessing performs the following tasks: removes URLs, hashtags, Twitter handles, and non-ASCII characters; converts the text to lowercase; removes English stopwords, profanity, punctuation, and numbers; and normalizes whitespace.
# Preprocess Data
### helper functions
stringi_toLower <- function(x) stringi::stri_trans_tolower(x)
remove_URL <- function(x) gsub("http[s]?://\\S+", "", x)
remove_HashTags <- function(x) gsub("#\\S+", "", x)
remove_TwitterHandles <- function(x) gsub("@\\S+", "", x)
remove_nonAscii <- function(x) gsub("[^\x01-\x7F]", "", x)
fix_whitespaces <- function(x) qdapRegex::rm_white(x)
## List of Bad Words and Top Swear Words Banned by Google
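# fn_rlines is defined earlier in the analysis; it is assumed to be a thin
# wrapper around readLines(), e.g.:
# fn_rlines <- function(path) readLines(path, encoding = "UTF-8", skipNul = TRUE)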
profanity_words <- fn_rlines("list.txt")
# function to preprocess a character vector into a cleaned corpus
fn_preprocess <- function(list_text){
  corpus_text <- tm::Corpus(tm::VectorSource(list_text))
  corpus_text <- tm::tm_map(corpus_text, tm::content_transformer(remove_URL))
  corpus_text <- tm::tm_map(corpus_text, tm::content_transformer(remove_HashTags))
  corpus_text <- tm::tm_map(corpus_text, tm::content_transformer(remove_TwitterHandles))
  corpus_text <- tm::tm_map(corpus_text, tm::content_transformer(remove_nonAscii))
  corpus_text <- tm::tm_map(corpus_text, tm::content_transformer(stringi_toLower))
  corpus_text <- tm::tm_map(corpus_text, tm::removeWords, tm::stopwords("en"))
  corpus_text <- tm::tm_map(corpus_text, tm::removeWords, profanity_words)
  corpus_text <- tm::tm_map(corpus_text, tm::removePunctuation)
  corpus_text <- tm::tm_map(corpus_text, tm::removeNumbers)
  corpus_text <- tm::tm_map(corpus_text, tm::content_transformer(fix_whitespaces))
  return(corpus_text)
}
# ensure the samples are treated as UTF-8 before preprocessing
Encoding(blogs_sample) <- "UTF-8"
Encoding(twitter_sample) <- "UTF-8"
Encoding(news_sample) <- "UTF-8"
corpus_blogs <- fn_preprocess(blogs_sample)
corpus_twitter <- fn_preprocess(twitter_sample)
corpus_news <- fn_preprocess(news_sample)
# free memory used by the raw samples
rm(blogs_sample, twitter_sample, news_sample)
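Before tokenizing, it is worth spot-checking that the transformations behaved as expected. A quick, illustrative check (not part of the original chunk) is to print one cleaned document:
# preview the first cleaned blog entry
as.character(corpus_blogs[[1]])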
The R package ‘ngram’ is used to tokenize the data and build the n-gram phrase tables.
# collapse each cleaned corpus into a single string for the ngram package
str_blogs <- concatenate(lapply(corpus_blogs, "[", 1))
str_twitter <- concatenate(lapply(corpus_twitter, "[", 1))
str_news <- concatenate(lapply(corpus_news, "[", 1))
rm(corpus_blogs, corpus_twitter, corpus_news)
# combine the three sources into one string
full_string <- concatenate(str_blogs, str_twitter, str_news)
rm(str_blogs, str_twitter, str_news)
## 1-grams
ng1 <- ngram(full_string, n = 1)
df_1gram <- get.phrasetable(ng1)
df_1gram <- df_1gram[order(df_1gram$freq, decreasing = TRUE), ]
## 2-grams
ng2 <- ngram(full_string, n = 2)
df_2gram <- get.phrasetable(ng2)
## 3-grams
ng3 <- ngram(full_string, n = 3)
df_3gram <- get.phrasetable(ng3)
## 4-grams
ng4 <- ngram(full_string, n = 4)
df_4gram <- get.phrasetable(ng4)
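Although the prediction model itself is future work, the phrase tables above already contain what is needed for a simple frequency-based lookup. The sketch below is illustrative only: the fn_predict name and the backoff logic are assumptions, and the input is expected to be cleaned the same way as the corpus (lowercase, no punctuation).
# illustrative next-word lookup with simple backoff from 3-grams to 2-grams
fn_predict <- function(prev_words, df_3gram, df_2gram) {
  prev_words <- tolower(trimws(prev_words))
  # try trigrams that start with the last two words typed
  hits <- df_3gram[grepl(paste0("^", prev_words, " "), df_3gram$ngrams), ]
  if (nrow(hits) == 0) {
    # back off to bigrams keyed on the last word only
    last_word <- tail(strsplit(prev_words, " ")[[1]], 1)
    hits <- df_2gram[grepl(paste0("^", last_word, " "), df_2gram$ngrams), ]
  }
  if (nrow(hits) == 0) return(NA_character_)
  # pick the most frequent matching phrase and return its final word
  hits <- hits[order(hits$freq, decreasing = TRUE), ]
  tail(strsplit(trimws(hits$ngrams[1]), " ")[[1]], 1)
}
# example: fn_predict("happy new", df_3gram, df_2gram)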
The word frequencies for 1-grams, 2-grams, and 3-grams are shown in the following plots (the 30 highest-frequency phrases are plotted for each):
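The plots are generated from the phrase tables; a minimal example of how such a plot can be produced is shown below for the 1-grams (the use of ggplot2 here is an assumption about the plotting approach).
library(ggplot2)
# plot the 30 most frequent 1-grams (df_1gram is already sorted by frequency)
top30 <- head(df_1gram, 30)
ggplot(top30, aes(x = reorder(ngrams, freq), y = freq)) +
  geom_col() +
  coord_flip() +
  labs(x = "1-gram", y = "frequency", title = "Top 30 1-grams")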
Some important points to consider: