Task #1: Getting and Cleaning Data

Tasks to Accomplish

  1. Tokenization - identifying appropriate tokens such as words, punctuation, and numbers, and writing a function that takes a file as input and returns a tokenized version of it (a minimal sketch follows this list).

  2. Profanity filtering - removing profanity and other words you do not want to predict.
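
For the first item, a minimal sketch of such a function is shown below. It assumes the tokenizers package (loaded in the next code block) and a plain-text file path; the helper name tokenize_file is only illustrative, and this is a starting point rather than the final tokenizer.

# read a text file and return a list of word tokens per line
tokenize_file <- function(path) {
  lines <- readLines(path, encoding = "UTF-8", skipNul = TRUE)
  tokenizers::tokenize_words(lines, lowercase = TRUE, strip_punct = TRUE)
}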

First, let's load the required libraries and read in the data.

suppressMessages(library(stringi))
suppressMessages(library(tm))
suppressMessages(library(textstem))
suppressMessages(library(tokenizers))
suppressMessages(library(openNLP))
suppressMessages(library(quanteda))
suppressMessages(library(ggplot2))
suppressMessages(library(RWeka))
suppressMessages(library(tidytext))
suppressMessages(library(wordcloud))
suppressMessages(library(stringr))
set.seed(12345)


twitter_US <- suppressWarnings(readLines("final/en_US/en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE))
blogs_us <- suppressWarnings(readLines("final/en_US/en_US.blogs.txt", encoding = "UTF-8", skipNul = TRUE))
news_us <- suppressWarnings(readLines("final/en_US/en_US.news.txt", encoding = "UTF-8", skipNul = TRUE))
profanity <- suppressWarnings(readLines("profanity.txt"))

Now, let's check a summary of the three files: their size, number of lines, and word counts.

twitter_lines <- length(twitter_US)
blogs_lines <- length(blogs_us)
news_lines <- length(news_us)

twitter_size <- file.info("final/en_US/en_US.twitter.txt")$size / 1024^2
blogs_size <- file.info("final/en_US/en_US.blogs.txt")$size / 1024^2
news_size <- file.info("final/en_US/en_US.news.txt")$size / 1024^2

twitter_words <- sum(stri_count_words(twitter_US))
blogs_words <- sum(stri_count_words(blogs_us))
news_words <- sum(stri_count_words(news_us))

summary_data <- data.frame(
  type = c("Twitter", "Blogs", "News"),
  size_in_MB = c(twitter_size, blogs_size, news_size),
  total_lines = c(twitter_lines, blogs_lines, news_lines),
  total_words = c(twitter_words, blogs_words, news_words)
)
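
To display this summary in the knitted report, the data frame can be printed directly or rendered as a table with knitr::kable (knitr is assumed to be available, which it is whenever the report is knitted):

knitr::kable(summary_data, digits = 1)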

The full data sets are too large to work with directly, so let's sample 1,000 lines from each file.

twitter_US <- sample(twitter_US, 1000)
blogs_us <- sample(blogs_us, 1000)
news_us <- sample(news_us, 1000)
sampled_data <- c(twitter_US, blogs_us, news_us)
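
As an aside, one could instead sample a fixed fraction of each file, which preserves the relative sizes of the three sources. A minimal sketch, with an arbitrary 1% fraction and the illustrative helper name sample_fraction:

# sample a fixed fraction of the lines in a character vector
sample_fraction <- function(lines, frac = 0.01) {
  sample(lines, size = ceiling(length(lines) * frac))
}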

Now we clean the data with the tm package: build a corpus, normalize the text, and remove URLs, punctuation, numbers, stop words, and profanity.

Corpus_Data <- VCorpus(VectorSource(sampled_data))
# replace anything matching a pattern with a space
toSpace <- content_transformer(function(x, pattern) {gsub(pattern, " ", x)})
Corpus_Data <- tm_map(Corpus_Data, toSpace, "http[^[:space:]]*")    # drop URLs only, not the rest of the line
Corpus_Data <- tm_map(Corpus_Data, content_transformer(tolower))    # wrap base functions so the corpus stays valid
Corpus_Data <- tm_map(Corpus_Data, removePunctuation)
Corpus_Data <- tm_map(Corpus_Data, removeNumbers)
Corpus_Data <- tm_map(Corpus_Data, removeWords, stopwords("en"))
Corpus_Data <- tm_map(Corpus_Data, removeWords, profanity)          # profanity filtering
Corpus_Data <- tm_map(Corpus_Data, stripWhitespace)
Corpus_Data <- tm_map(Corpus_Data, content_transformer(lemmatize_strings))
Corpus_Data_plain <- tm_map(Corpus_Data, PlainTextDocument)
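
It is worth spot-checking a few cleaned documents before moving on; printing the first ones is enough for that:

# look at the first three cleaned documents
lapply(Corpus_Data_plain[1:3], as.character)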

Task #2: Exploratory Data Analysis

  1. Exploratory analysis - perform a thorough exploratory analysis of the data, understanding the distribution of words and relationship between the words in the corpora.

  2. Understand frequencies of words and word pairs - build figures and tables to understand variation in the frequencies of words and word pairs in the data.

For this task, we convert the corpus into a Term-Document Matrix (TDM) and then use n-gram tokenizers to calculate the frequencies of single words, word pairs, and word triples.

UnigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
TrigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))

tdm.unigram <- TermDocumentMatrix(Corpus_Data_plain, control = list(tokenize = UnigramTokenizer))
tdm.bigram <- TermDocumentMatrix(Corpus_Data_plain, control = list(tokenize = BigramTokenizer))
tdm.trigram <- TermDocumentMatrix(Corpus_Data_plain, control = list(tokenize = TrigramTokenizer))
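
A quick sanity check before building the full frequency tables is to list the terms that appear at least a given number of times; findFreqTerms from tm does this directly (the threshold of 10 below is arbitrary):

findFreqTerms(tdm.unigram, lowfreq = 10)
findFreqTerms(tdm.bigram, lowfreq = 10)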

freq1 <- sort(rowSums(as.matrix(tdm.unigram)), decreasing = TRUE)
freq.df1 <- data.frame(word = names(freq1), freq = freq1)

freq2 <- sort(rowSums(as.matrix(tdm.bigram)), decreasing = TRUE)
freq.df2 <- data.frame(word = names(freq2), freq = freq2)

freq3 <- sort(rowSums(as.matrix(tdm.trigram)), decreasing = TRUE)
freq.df3 <- data.frame(word = names(freq3), freq = freq3)
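
Since ggplot2 is already loaded, the same frequency tables can also be shown as bar charts, which make the variation in frequencies easier to read than a cloud. A sketch for the top 20 bigrams (the cut-off of 20 is arbitrary):

top_bigrams <- head(freq.df2, 20)
ggplot(top_bigrams, aes(x = reorder(word, freq), y = freq)) +
  geom_col() +
  coord_flip() +
  labs(x = "Bigram", y = "Frequency", title = "Top 20 bigrams in the sampled corpus")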

Now word clouds are plotted to explore the most frequent unigrams, bigrams, and trigrams.

pal <- brewer.pal(8, "Blues")
pal <- pal[-(1:3)]

wordcloud(freq.df1$word, freq.df1$freq, max.words = 100, random.order = FALSE, colors = pal)

wordcloud(freq.df2$word, freq.df2$freq, max.words = 100, random.order = FALSE, colors = pal)

wordcloud(freq.df3$word, freq.df3$freq, max.words = 100, random.order = FALSE, colors = pal)
A few long trigrams could not be fit on the page and are omitted from the trigram cloud; wordcloud warns about and skips such terms, which does not affect the rest of the analysis.