In this report we look at three corpora of US English text: a set of blog posts, a set of news articles, and a set of Twitter messages.
We downloaded the data from http://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip. The archive is about 548 MB and takes roughly five minutes to download.
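The download and extraction step is not shown in the code that follows; a minimal sketch of it (the destination file name is an assumption) would be:
# one-time download and extraction of the Coursera-SwiftKey data set
url <- "http://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
if (!file.exists("Coursera-SwiftKey.zip")) {
  download.file(url, destfile = "Coursera-SwiftKey.zip", mode = "wb")
  unzip("Coursera-SwiftKey.zip")   # unpacks a 'final' folder with one sub-folder per locale
}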
# library for character string analysis
library(stringi)
library(stringr)
suppressWarnings(suppressMessages(library(ggplot2)))
suppressWarnings(suppressMessages(library(RWeka)))
suppressWarnings(suppressMessages(library(tm)))
suppressWarnings(suppressMessages(library(DT)))
suppressWarnings(suppressMessages(library(SnowballC)))
suppressWarnings(suppressMessages(library(wordcloud)))
We are interested in the English (en_US) version, so we set the working directory to the folder on the local drive where the data was extracted.
setwd("E:\\capstone\\final\\en_US")
# sizes of the three raw en_US files in megabytes
blogs_txt <- round(file.info("en_US.blogs.txt")$size / 1024^2,0)
news_txt <- round(file.info("en_US.news.txt")$size / 1024^2,0)
twitter_txt <- round(file.info("en_US.twitter.txt")$size / 1024^2,0)
To speed up reading, writing, and loading of the data, some of the R code below is commented out because it takes a long time to run. For a first run, read the comments and remove the # to execute those lines.
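The commented-out portion presumably looks something like the sketch below (an assumption inferred from the line and word counts shown underneath): the three files are read with readLines() and their sizes are checked with wc.
# sketch of the commented-out loading step (remove the # on a first run)
# blogs   <- readLines("en_US.blogs.txt",   encoding = "UTF-8", skipNul = TRUE)
# news    <- readLines("en_US.news.txt",    encoding = "UTF-8", skipNul = TRUE)
# twitter <- readLines("en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)
# line and word counts of every text file in the working directory
# system("wc -w -l ./*.txt", intern = TRUE)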
## [1] " 899288 37272578 ./en_US.blogs.txt"
## [2] " 1010242 34309642 ./en_US.news.txt"
## [3] " 2360148 30341028 ./en_US.twitter.txt"
## [4] " 42521 551219 ./text.corpus.txt"
## [5] " 4312199 102474467 total"
The commented-out exploration code also produces the following summary, apparently of characters per line across the three files:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
##     0.0    44.0    88.0   133.7   153.0 40830.0
summary(word_stats$wordcount)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
##    0.00    8.00   17.00   24.46   28.00 6850.00
summary(word_stats[word_stats$file=="twitter","wordcount"])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
##    1.00    7.00   12.00   13.13   19.00   47.00
summary(word_stats[word_stats$file=="blogs","wordcount"])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
##    0.00    9.00   29.00   42.17   60.00 6850.00
summary(word_stats[word_stats$file=="news","wordcount"])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
##    0.00   19.00   32.00   35.16   47.00 1928.00
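The word_stats data frame summarized above is built in the commented-out code; a plausible reconstruction (an assumption on our part, not the original code) is one row per line of text, tagged with its source file and word count:
# hypothetical reconstruction of word_stats: per-line word counts by source file
word_stats <- rbind(
  data.frame(file = "blogs",   wordcount = stri_count_words(blogs)),
  data.frame(file = "news",    wordcount = stri_count_words(news)),
  data.frame(file = "twitter", wordcount = stri_count_words(twitter))
)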
We transform the data to remove profane language, using the word list from http://www.cs.cmu.edu/~biglou/resources/bad-words.txt to filter it out.
badwords <- read.delim("http://www.cs.cmu.edu/~biglou/resources/bad-words.txt",sep = ":",header = FALSE)
badwords <- badwords[,1]
# binomial sampling of the data to create the sample files
# take a fixed percentage of each data set (1% in the calls below)
# time how long it takes to select and save the sample data
sample_start_time <- proc.time()
train_function <- function(data, percent)
{
return(data[as.logical(rbinom(length(data),1,percent))])
}
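Note that rbinom() draws a random sample, so the counts below will differ slightly between runs; for a reproducible sample a seed could be set first (this was not done in the original run):
set.seed(1234)   # hypothetical seed; any fixed value makes the 1% sample repeatable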
sample_blogs <- train_function(blogs, 0.01)
sample_news <- train_function(news, 0.01)
sample_twitter <- train_function(twitter, 0.01)
dir.create("sample", showWarnings = FALSE)
write(sample_blogs, "sample/sample_blogs.txt")
write(sample_news, "sample/sample_news.txt")
write(sample_twitter, "sample/sample_twitter.txt")
system("wc -w -l ./sample/*.txt", intern =T)
## [1] " 8981 370529 ./sample/sample_blogs.txt"
## [2] " 10124 346746 ./sample/sample_news.txt"
## [3] " 23448 300316 ./sample/sample_twitter.txt"
## [4] " 42553 1017591 total"
# Save all SAMPLE data in Rdata for future use
saveRDS(sample_twitter,"sample_twitter.RData")
saveRDS(sample_blogs,"sample_blogs.RData")
saveRDS(sample_news,"sample_news.RData")
sample_end_time <- proc.time() - sample_start_time
sample_end_time
## user system elapsed
## 1.21 0.17 1.62
# reload data using samples based on the above sampling percentage.
blogs <- readRDS("sample_blogs.RData")
news <- readRDS("sample_news.RData")
twitter <- readRDS("sample_twitter.RData")
corpus <- c(blogs,news,twitter)
text.corpus <- Corpus(VectorSource(list(corpus)))
# A good worked example is http://datascienceplus.com/building-wordclouds-in-r/#disqus_thread
# transform all the data to lower case and remove numbers, punctuation, etc. with the text mining (tm) package
# time how long it takes to clean the data
clean_start_time <- proc.time()
text.corpus <- tm_map(text.corpus, content_transformer(tolower))
text.corpus <- tm_map(text.corpus, removePunctuation)
text.corpus <- tm_map(text.corpus, removeNumbers)
text.corpus <- tm_map(text.corpus, removeWords, stopwords("english"))
text.corpus <- tm_map(text.corpus, removeWords, badwords)
text.corpus <- tm_map(text.corpus, stripWhitespace)
text.corpus <- tm_map(text.corpus,function(x) gsub('[])(;:#%$^*\\~{}[&+=@/"`|<>_]+',"",x))
text.corpus <- tm_map(text.corpus,stemDocument)
# treat your preprocessed documents as text documents
text.corpus <- tm_map(text.corpus, PlainTextDocument)
writeCorpus(text.corpus, filenames="text.corpus.txt")
end_clean_time <- proc.time() - clean_start_time
end_clean_time
## user system elapsed
## 15.68 0.00 15.68
To tokenize the data, we create the different n-grams of the sample corpus; the objective is to predict the next word in a sentence.
This shows which words are the most frequent and how often they occur. Although the RWeka package provides an n-gram tokenizer, we use the efficient ngram_tokenizer published by Maciej Szymkiewicz. Passing 1 as the argument gives the unigrams and produces a unigram data frame, which we then manipulate so we can chart the frequencies with ggplot.
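The sourced ngram_tokenizer.r file is not reproduced in this report. As a rough stand-in for readers without it, a tokenizer factory with the same calling pattern could look like the sketch below; this is our own simplification based on stringi word boundaries, not Szymkiewicz's published code, which is what the analysis actually sources.
# simplified stand-in for ngram_tokenizer(): returns a function that splits
# text into lower-case words and pastes consecutive words into n-grams
ngram_tokenizer <- function(n = 1L) {
  function(x) {
    words <- unlist(stringi::stri_extract_all_words(stringi::stri_trans_tolower(x)))
    words <- words[!is.na(words)]
    if (length(words) < n) return(character(0))
    vapply(seq_len(length(words) - n + 1L),
           function(i) paste(words[i:(i + n - 1L)], collapse = " "),
           character(1))
  }
}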
# Effcient Ngrams_Tokenizer published by Maciej Szymkiewicz
# https://github.com/zero323/r-snippets/blob/master/R/ngram_tokenizer.R
source("E:\\capstone\\final\\en_US\\ngram_tokenizer.r")
# reload the cleaned sample corpus from disk
text.corpus <- readLines("text.corpus.txt")
# time how long it takes to build the unigrams
unigram_start <- proc.time()
unigram_tokenizer <- ngram_tokenizer(1)
wordlist <- unigram_tokenizer(text.corpus)
unigram_df <- data.frame(V1 = as.vector(names(table(unlist(wordlist)))), V2 = as.numeric(table(unlist(wordlist))))
names(unigram_df) <- c("word","freq")
unigram_df <- unigram_df[with(unigram_df, order(-unigram_df$freq)),]
row.names(unigram_df) <- NULL
unigram_end <- proc.time() - unigram_start
unigram_end
## user system elapsed
## 7.83 0.11 7.94
ggplot(head(unigram_df,20), aes(x=reorder(word,-freq), y=freq)) +
geom_bar(stat="identity", fill="red") +
geom_text(aes(label=freq), vjust = -1.0) +
ggtitle("Most frequent Unigrams (1 word) in the Sample Corpus") +
ylab("Frequency") +
xlab("Term (Word)")
datatable(head(unigram_df,100), class = 'cell-border stripe', caption = 'Table 1: Unigram')
We follow exactly the same process as the Unigram Analysis, but this time we will pass the argument 2.
# time how long it takes to build the bigrams
bigram_start <- proc.time()
bigram_tokenizer <- ngram_tokenizer(2)
wordlist <- bigram_tokenizer(text.corpus)
bigram_df <- data.frame(V1 = as.vector(names(table(unlist(wordlist)))), V2 = as.numeric(table(unlist(wordlist))))
names(bigram_df) <- c("word","freq")
bigram_df <- bigram_df[with(bigram_df, order(-bigram_df$freq)),]
row.names(bigram_df) <- NULL
bigram_end <- proc.time() - bigram_start
bigram_end
## user system elapsed
## 30.97 0.03 31.00
ggplot(head(bigram_df,20), aes(x=reorder(word,-freq), y=freq)) +
geom_bar(stat="identity", fill="blue") +
geom_text(aes(label=freq), vjust = -1.0) +
ggtitle("Most frequent Bigrams (2 words) in the Sample Corpus") +
ylab("Frequency") +
xlab("Term (Words)")
datatable(head(bigram_df,100), class = 'cell-border stripe', caption = 'Table 2: Bigram')
We follow exactly the same process as the Unigram Analysis, but this time we will pass the argument 3.
# time how long it takes to build the trigrams
trigram_start <- proc.time()
trigram_tokenizer <- ngram_tokenizer(3)
wordlist <- trigram_tokenizer(text.corpus)
trigram_df <- data.frame(V1 = as.vector(names(table(unlist(wordlist)))), V2 = as.numeric(table(unlist(wordlist))))
names(trigram_df) <- c("word","freq")
trigram_df <- trigram_df[with(trigram_df, order(-trigram_df$freq)),]
row.names(trigram_df) <- NULL
trigram_end <- proc.time() - trigram_start
trigram_end
## user system elapsed
## 38.19 0.03 38.22
ggplot(head(trigram_df,20), aes(x=reorder(word,-freq), y=freq)) +
geom_bar(stat="identity", fill="darkgreen") +
geom_text(aes(label=freq), vjust = -1.0) +
ggtitle("Most Frequent Trigrams (3 Words) in the Sample Corpus") +
ylab("Frequency") +
xlab("Term")
datatable(head(trigram_df,100), class = 'cell-border stripe', caption = 'Table 3: Trigram')
We repeat the process once more, passing the argument 4 to obtain 4-grams.
# time how long it takes to build the 4-grams
nth_gram_start <- proc.time()
nth_gram_tokenizer <- ngram_tokenizer(4)
wordlist <- nth_gram_tokenizer(text.corpus)
nth_gram_df <- data.frame(V1 = as.vector(names(table(unlist(wordlist)))), V2 = as.numeric(table(unlist(wordlist))))
names(nth_gram_df) <- c("word","freq")
nth_gram_df <- nth_gram_df[with(nth_gram_df, order(-nth_gram_df$freq)),]
row.names(nth_gram_df) <- NULL
nth_gram_end <- proc.time() - nth_gram_start
nth_gram_end
## user system elapsed
## 45.10 0.11 45.22
ggplot(head(nth_gram_df,20), aes(x=reorder(word,-freq), y=freq)) +
geom_bar(stat="identity", fill="pink") +
geom_text(aes(label=freq), vjust = -1.0) +
ggtitle("Most Frequent 4-grams (4 Words) in the Sample Corpus") +
ylab("Frequency") +
xlab("Term")
datatable(head(nth_gram_df,100), class = 'cell-border stripe', caption = 'Table 4: 4 gram')
The next goal is to build a better prediction algorithm and a Shiny application using a subset of the data. The 2-gram and 3-gram tables can be used to predict the next word in a sequence; we could extend this to 4- or 5-grams, but even that is not sufficient on its own, because language has long-distance dependencies.
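To illustrate how the n-gram tables could drive such a prediction, here is a minimal back-off sketch; predict_next_word is a hypothetical helper, not part of the code above, and it assumes bigram_df and trigram_df are sorted by descending frequency as created earlier.
# minimal back-off lookup: try the trigram table for the last two words typed,
# then fall back to the bigram table for just the last word
predict_next_word <- function(last_two, trigrams = trigram_df, bigrams = bigram_df) {
  grams <- as.character(trigrams$word)
  hit <- grams[grepl(paste0("^", last_two, " "), grams)]
  if (length(hit) > 0) return(sub(".* ", "", hit[1]))  # last word of the top trigram
  last_one <- sub(".* ", "", last_two)                 # back off to the final word only
  grams <- as.character(bigrams$word)
  hit <- grams[grepl(paste0("^", last_one, " "), grams)]
  if (length(hit) > 0) sub(".* ", "", hit[1]) else NA_character_
}
# e.g. predict_next_word("thanks for") returns the most frequent observed completion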