Introduction

In this report we look at three corpora of US English text: a set of blog posts, a set of news articles, and a set of Twitter messages.

Data Set

We downloaded the data from http://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip. The archive is about 548 MB and takes roughly 5 minutes to download.
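If the archive is not already on disk, it can also be fetched and unpacked directly from R; a minimal sketch (destination file name assumed, folder layout taken from the zip):

# download and unpack the dataset once (sketch; skipped if already present)
url <- "http://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
if (!file.exists("Coursera-SwiftKey.zip")) {
  download.file(url, destfile = "Coursera-SwiftKey.zip", mode = "wb")
  unzip("Coursera-SwiftKey.zip")   # extracts into final/<locale>/, e.g. final/en_US
}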

# library for character string analysis
library(stringi)
library(stringr)

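# plotting (ggplot2), tokenization (RWeka), text mining (tm), interactive
# tables (DT), stemming (SnowballC) and word clouds (wordcloud)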
suppressWarnings(suppressMessages(library(ggplot2)))
suppressWarnings(suppressMessages(library(RWeka)))
suppressWarnings(suppressMessages(library(tm)))
suppressWarnings(suppressMessages(library(DT)))
suppressWarnings(suppressMessages(library(SnowballC)))
suppressWarnings(suppressMessages(library(wordcloud)))

Set Working Directory

We are interested in the English version, so we set the working directory to the folder on the local drive where the data was extracted.

setwd("E:\\capstone\\final\\en_US")

File information

blogs_txt <- round(file.info("en_US.blogs.txt")$size / 1024^2,0)
news_txt <- round(file.info("en_US.news.txt")$size / 1024^2,0)
twitter_txt <- round(file.info("en_US.twitter.txt")$size / 1024^2,0)
  1. Size of blogs is 200 MB
  2. Size of news is 196 MB
  3. Size of twitter is 159 MB

Reading the text files

To speed up knitting, the R code that reads and writes the full data sets is commented out, as it takes a long time to run. When running for the first time, read the comments and remove the #.

## [1] "   899288  37272578 ./en_US.blogs.txt"  
## [2] "  1010242  34309642 ./en_US.news.txt"   
## [3] "  2360148  30341028 ./en_US.twitter.txt"
## [4] "    42521    551219 ./text.corpus.txt"  
## [5] "  4312199 102474467 total"
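The code that reads the raw files is not shown above (the line and word counts appear to come from wc); a minimal sketch of how the three corpora might be loaded, assuming the objects blogs, news and twitter used later in this report:

# read the raw files; skipNul avoids failures on embedded NUL characters
blogs   <- readLines("en_US.blogs.txt",   encoding = "UTF-8", skipNul = TRUE)
news    <- readLines("en_US.news.txt",    encoding = "UTF-8", skipNul = TRUE)
twitter <- readLines("en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)

# rough sanity check against the counts above (may differ slightly from wc -w)
length(blogs); sum(stri_count_words(blogs))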

Plot and summary of line length (characters per line) in each text file

## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out = z$out[z
## $group == : Outlier (-Inf) in boxplot 1 is not drawn
## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out = z$out[z
## $group == : Outlier (-Inf) in boxplot 2 is not drawn

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0    44.0    88.0   133.7   153.0 40830.0
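The code behind this boxplot and summary is not shown; a sketch of how the per-line character counts might be computed with stringi (plotting on a log scale would explain the -Inf warnings above for empty lines):

# per-line character counts across the three corpora (sketch)
char_stats <- data.frame(
  file  = rep(c("blogs", "news", "twitter"),
              c(length(blogs), length(news), length(twitter))),
  chars = c(stri_length(blogs), stri_length(news), stri_length(twitter))
)
summary(char_stats$chars)
boxplot(log10(chars) ~ file, data = char_stats,
        ylab = "log10(characters per line)")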

Plot and summary of the number of words per line in each text file

## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out = z$out[z
## $group == : Outlier (-Inf) in boxplot 1 is not drawn
## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out = z$out[z
## $group == : Outlier (-Inf) in boxplot 2 is not drawn
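The construction of word_stats is not shown in the report; a minimal sketch, assuming the same three character vectors and stringi's word counter:

# per-line word counts, labelled by source file (sketch)
word_stats <- data.frame(
  file = rep(c("blogs", "news", "twitter"),
             c(length(blogs), length(news), length(twitter))),
  wordcount = c(stri_count_words(blogs),
                stri_count_words(news),
                stri_count_words(twitter))
)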

summary(word_stats$wordcount)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    8.00   17.00   24.46   28.00 6850.00
summary(word_stats[word_stats$file=="twitter","wordcount"])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    7.00   12.00   13.13   19.00   47.00
summary(word_stats[word_stats$file=="blogs","wordcount"])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    9.00   29.00   42.17   60.00 6850.00
summary(word_stats[word_stats$file=="news","wordcount"])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   19.00   32.00   35.16   47.00 1928.00

Remove Profanities

We transform the data to remove profane language, using the word list from http://www.cs.cmu.edu/~biglou/resources/bad-words.txt to filter it out.

badwords <- read.delim("http://www.cs.cmu.edu/~biglou/resources/bad-words.txt",sep = ":",header = FALSE)
badwords <- badwords[,1]

Sample data

# Binomial sampling of the data to create the sample files
# we keep 1% of each corpus (the percentage passed to train_function below)

# how long it takes to select and save the sample data
sample_start_time <- proc.time()

train_function <- function(data, percent)
{
  return(data[as.logical(rbinom(length(data),1,percent))])
}
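# note: rbinom sampling is random; a set.seed() call before sampling
# (e.g. set.seed(123), an assumed value) would make the 1% sample reproducible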

sample_blogs   <- train_function(blogs, 0.01)
sample_news    <- train_function(news, 0.01)
sample_twitter <- train_function(twitter, 0.01)


dir.create("sample", showWarnings = FALSE)
write(sample_blogs, "sample/sample_blogs.txt")
write(sample_news, "sample/sample_news.txt")
write(sample_twitter, "sample/sample_twitter.txt")

system("wc -w -l ./sample/*.txt", intern =T)
## [1] "   8981  370529 ./sample/sample_blogs.txt"  
## [2] "  10124  346746 ./sample/sample_news.txt"   
## [3] "  23448  300316 ./sample/sample_twitter.txt"
## [4] "  42553 1017591 total"
# save all sample data to disk with saveRDS for future use
saveRDS(sample_twitter,"sample_twitter.RData")
saveRDS(sample_blogs,"sample_blogs.RData")
saveRDS(sample_news,"sample_news.RData")

sample_end_time <- proc.time() - sample_start_time
sample_end_time
##    user  system elapsed 
##    1.21    0.17    1.62

Cleaning Data

# reload data using samples based on the above sampling percentage.

blogs <- readRDS("sample_blogs.RData")
news <- readRDS("sample_news.RData")
twitter <- readRDS("sample_twitter.RData")

corpus <- c(blogs,news,twitter)
text.corpus <- Corpus(VectorSource(list(corpus)))
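# wrapping the combined vector in list() makes the whole sample a single tm document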


# A very good example is http://datascienceplus.com/building-wordclouds-in-r/#disqus_thread
# transform all the data to lower case and remove numbers, punctuation etc. using the text mining package (tm)

# how long it takes to clean the data
clean_start_time <- proc.time()

text.corpus <- tm_map(text.corpus, content_transformer(tolower))
text.corpus <- tm_map(text.corpus, removePunctuation)
text.corpus <- tm_map(text.corpus, removeNumbers)
text.corpus <- tm_map(text.corpus, removeWords, stopwords("english"))
text.corpus <- tm_map(text.corpus, removeWords, badwords)
text.corpus <- tm_map(text.corpus, stripWhitespace)

text.corpus <- tm_map(text.corpus, content_transformer(function(x) gsub('[])(;:#%$^*\\~{}[&+=@/"`|<>_]+', "", x)))
text.corpus <- tm_map(text.corpus,stemDocument)


# ensure the preprocessed corpus is treated as plain text documents
text.corpus <- tm_map(text.corpus, PlainTextDocument)

writeCorpus(text.corpus, filenames="text.corpus.txt")

end_clean_time <- proc.time() - clean_start_time
end_clean_time
##    user  system elapsed 
##   15.68    0.00   15.68

Tokenization

To tokenize the data, we create the different n-grams of the sample corpus. The objective is to predict the next word in a sentence.

Unigram Analysis (1 word analysis)

This will show us which words are the most frequent and what their frequencies are. We use the efficient ngram_tokenizer published by Maciej Szymkiewicz and pass the argument 1 to get the unigrams. This creates a unigram data frame, which we then manipulate so we can chart the frequencies using ggplot.
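The tokenizer itself lives in a separate file (sourced below). For readers without that file, a rough illustration of what such an n-gram tokenizer could look like, built on stringi and not the original implementation, is:

# illustrative n-gram tokenizer (an assumption, not the sourced original):
# returns a function that splits text into words and pastes sliding windows of n words
ngram_tokenizer <- function(n = 1) {
  stopifnot(is.numeric(n), n >= 1)
  function(x) {
    words <- unlist(stri_split_boundaries(x, type = "word", skip_word_none = TRUE))
    if (length(words) < n) return(character(0))
    idx <- embed(seq_along(words), n)[, n:1, drop = FALSE]  # windows of n word positions
    apply(idx, 1, function(i) paste(words[i], collapse = " "))
  }
}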

# Efficient ngram_tokenizer published by Maciej Szymkiewicz
# https://github.com/zero323/r-snippets/blob/master/R/ngram_tokenizer.R

source("E:\\capstone\\final\\en_US\\ngram_tokenizer.r")

# reload data using samples
text.corpus <- readLines("text.corpus.txt")
# how long it takes to run the unigram analysis
unigram_start <- proc.time()

unigram_tokenizer <- ngram_tokenizer(1)
wordlist <- unigram_tokenizer(text.corpus)

unigram_df <- data.frame(V1 = as.vector(names(table(unlist(wordlist)))), V2 = as.numeric(table(unlist(wordlist))))
names(unigram_df) <- c("word","freq")
unigram_df <- unigram_df[with(unigram_df, order(-unigram_df$freq)),]
row.names(unigram_df) <- NULL

unigram_end <- proc.time() - unigram_start
unigram_end
##    user  system elapsed 
##    7.83    0.11    7.94
ggplot(head(unigram_df,20), aes(x=reorder(word,-freq), y=freq)) +
  geom_bar(stat="Identity", fill="red") +
  geom_text(aes(label=freq), vjust = -1.0) +
  ggtitle("Most frequent Unigrams (1 word) in the Sample Corpus") +
  ylab("Frequency") +
  xlab("Term (Word)")

datatable(head(unigram_df,100), class = 'cell-border stripe', caption = 'Table 1: Unigram')
## Warning in instance$preRenderHook(instance): It seems your data is too
## big for client-side DataTables. You may consider server-side processing:
## http://rstudio.github.io/DT/server.html

Bigram (2 consecutive words) Analysis

We follow exactly the same process as in the unigram analysis, but this time we pass the argument 2.

# how long it takes to run the bigram analysis
bigram_start <- proc.time()


bigram_tokenizer <- ngram_tokenizer(2)
wordlist <- bigram_tokenizer(text.corpus)
bigram_df <- data.frame(V1 = as.vector(names(table(unlist(wordlist)))), V2 = as.numeric(table(unlist(wordlist))))
names(bigram_df) <- c("word","freq")
bigram_df <- bigram_df[with(bigram_df, order(-bigram_df$freq)),]
row.names(bigram_df) <- NULL

bigram_end <- proc.time() - bigram_start
bigram_end
##    user  system elapsed 
##   30.97    0.03   31.00
ggplot(head(bigram_df,20), aes(x=reorder(word,-freq), y=freq)) +
  geom_bar(stat="Identity", fill="blue") +
  geom_text(aes(label=freq), vjust = -1.0) +
  ggtitle("Most frequent Bigrams (2 words) in the Sample Corpus") +
  ylab("Frequency") +
  xlab("Term (Words)")

datatable(head(bigram_df,100), class = 'cell-border stripe', caption = 'Table 2: Bigram')
## Warning in instance$preRenderHook(instance): It seems your data is too
## big for client-side DataTables. You may consider server-side processing:
## http://rstudio.github.io/DT/server.html

Trigram (3 consecutive words) Analysis

We follow exactly the same process as in the unigram analysis, but this time we pass the argument 3.

# how long it takes to run the trigram analysis
trigram_start <- proc.time()

trigram_tokenizer <- ngram_tokenizer(3)
wordlist <- trigram_tokenizer(text.corpus)
trigram_df <- data.frame(V1 = as.vector(names(table(unlist(wordlist)))), V2 = as.numeric(table(unlist(wordlist))))
names(trigram_df) <- c("word","freq")
trigram_df <- trigram_df[with(trigram_df, order(-trigram_df$freq)),]
row.names(trigram_df) <- NULL

trigram_end <- proc.time() - trigram_start
trigram_end
##    user  system elapsed 
##   38.19    0.03   38.22
ggplot(head(trigram_df,20), aes(x=reorder(word,-freq), y=freq)) +
  geom_bar(stat="Identity", fill="darkgreen") +
  geom_text(aes(label=freq), vjust = -1.0) +
  ggtitle("Most Frequent Trigrams (3 Words) in the Sample Corpus") +
  ylab("Frequency") +
  xlab("Term")

datatable(head(trigram_df,100), class = 'cell-border stripe', caption = 'Table 3: Trigram')
## Warning in instance$preRenderHook(instance): It seems your data is too
## big for client-side DataTables. You may consider server-side processing:
## http://rstudio.github.io/DT/server.html

What about more than 3 consecutive words?

# how long it takes to run the 4-gram analysis
nth_gram_start <- proc.time()

nth_gram_tokenizer <- ngram_tokenizer(4)
wordlist <- nth_gram_tokenizer(text.corpus)
nth_gram_df <- data.frame(V1 = as.vector(names(table(unlist(wordlist)))), V2 = as.numeric(table(unlist(wordlist))))
names(nth_gram_df) <- c("word","freq")
nth_gram_df <- nth_gram_df[with(nth_gram_df, order(-nth_gram_df$freq)),]
row.names(nth_gram_df) <- NULL

nth_gram_end <- proc.time() - nth_gram_start
nth_gram_end
##    user  system elapsed 
##   45.10    0.11   45.22
ggplot(head(nth_gram_df,20), aes(x=reorder(word,-freq), y=freq)) +
  geom_bar(stat="Identity", fill="pink") +
  geom_text(aes(label=freq), vjust = -1.0) +
  ggtitle("Most Frequent nth grams (4 Words) in the Sample Corpus") +
  ylab("Frequency") +
  xlab("Term")

datatable(head(nth_gram_df,100), class = 'cell-border stripe', caption = 'Table 4: 4-gram')
## Warning in instance$preRenderHook(instance): It seems your data is too
## big for client-side DataTables. You may consider server-side processing:
## http://rstudio.github.io/DT/server.html

Word Cloud
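A word cloud of the most frequent unigrams can be drawn from unigram_df with the wordcloud package loaded at the start; a short sketch (the word limit, seed and colour palette are arbitrary choices):

# word cloud of the 100 most frequent unigrams in the sample corpus
suppressWarnings(suppressMessages(library(RColorBrewer)))
set.seed(1234)   # fix the layout so the cloud is reproducible
wordcloud(words = unigram_df$word, freq = unigram_df$freq,
          max.words = 100, random.order = FALSE,
          colors = brewer.pal(8, "Dark2"))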

Next Goal

The next goal is to build a prediction algorithm and a Shiny application using a subset of the data. The 2-gram and 3-gram tables can be used to predict the next word in a sequence. We can extend this to 4- or 5-grams, but n-grams alone remain insufficient because language has long-distance dependencies.
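As an illustration of how the n-gram tables could drive prediction, a naive lookup with a simple back-off (a sketch of the idea, not the final algorithm, assuming plain lowercase input with no regex metacharacters) might look like this:

# naive next-word prediction: try the trigram table, back off to bigrams (sketch)
predict_next <- function(phrase, trigram_df, bigram_df) {
  words <- tail(unlist(strsplit(tolower(phrase), "\\s+")), 2)
  pattern <- paste0("^", paste(words, collapse = " "), " ")
  hits <- trigram_df[grepl(pattern, trigram_df$word), ]
  if (nrow(hits) == 0) {
    # back off: bigrams starting with the last word typed
    hits <- bigram_df[grepl(paste0("^", tail(words, 1), " "), bigram_df$word), ]
  }
  if (nrow(hits) == 0) return(NA_character_)
  # the tables are already sorted by frequency, so take the top match
  tail(strsplit(as.character(hits$word[1]), " ")[[1]], 1)
}

predict_next("thanks for the", trigram_df, bigram_df)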