This data set, provided by SwiftKey, comprises a large corpus of text from three main sources: Twitter, Blogs, and News. We give a brief description of each source and then inspect the n-grams of the corpus to find the most common combinations of words. The resulting n-grams are used for word prediction, which would ease daily texting.
The data can be obtained from the provided URL. Besides the data, we need to eliminate inappropriate words; this is achieved using the “Carnegie Mellon” profanity list.
if (!file.exists("final")){
download.file("https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip","SwiftKey.zip")
unzip("SwiftKey.zip")
}
if (!file.exists("profanity.csv")) {
download.file("https://www.cs.cmu.edu/~biglou/resources/bad-words.txt",destfile="./profanity.csv")
}
profanity_list <- read.csv("profanity.csv",header=FALSE,stringsAsFactors = F)[,1]
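As a quick sanity check (not part of the original report), we can confirm that the profanity list loaded as expected:

# Sanity check: number of entries and a peek at the first few terms.
length(profanity_list)
head(profanity_list)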
The following packages are required for text mining, creating ngrams, manipulating strings, and plotting data.
library(tm)
library(stringr)
library(qdap)
library(RWeka)
library(ngram)
library(dplyr)
library(ggplot2)
First, we import the “.txt” files into R using the readLines function. Then, we compute summaries for each source, such as line count, word count, and object size. A sample of the text from each source is also displayed.
# Import data from a given source.
read_data <- function(data_source){
  file_connection <- file(paste("final/en_US/en_US.", data_source, ".txt", sep = ""), open = "r")
  Content <- readLines(file_connection, skipNul = TRUE)
  close(file_connection)
  Content
}
inspect_data <- function(data){
  line_count <- length(data)
  word_count <- wordcount(data)
  object_size <- format(object.size(data), units = "Mb")
  c(line_count, word_count, object_size)
}
sources = c("blogs","news","twitter")
data_summary = c()
all_data = c()
for (source in sources){
  data = read_data(source)
  print(paste("Sample from ", source))
  print(str(data))
  data_summary <- rbind(data_summary, c(source, inspect_data(data)))
  all_data = c(all_data, data)
}
## [1] "Sample from blogs"
## chr [1:899288] "In the years thereafter, most of the Oil fields and platforms were named after pagan â\200œgodsâ\200\235." ...
## NULL
## Warning in readLines(file_connection, skipNul = TRUE): incomplete final line
## found on 'final/en_US/en_US.news.txt'
## [1] "Sample from news"
## chr [1:77259] "He wasn't home alone, apparently." ...
## NULL
## [1] "Sample from twitter"
## chr [1:2360148] "How are you? Btw thanks for the RT. You gonna be in DC anytime soon? Love to see you. Been way, way too long." ...
## NULL
data_summary = data.frame(data_summary)
colnames(data_summary) = c("Source","Line Count", "Word Count", "Object Size")
print("Data Summary")
## [1] "Data Summary"
print(data_summary)
## Source Line Count Word Count Object Size
## 1 blogs 899288 37334131 255.4 Mb
## 2 news 77259 2643969 19.8 Mb
## 3 twitter 2360148 30373583 319 Mb
The data summary suggests that the blogs and twitter sources contain a comparable number of words; however, blogs has far fewer lines. The news source, on the other hand, has relatively few words and lines.
These helper functions are used for cleaning the corpus (the “clean_corpus” function): the text is converted to ASCII, hyphens are replaced with spaces, punctuation, numbers, and profanity are removed, and extra whitespace is stripped. The “corpus_token” function then creates the tokens with RWeka's NGramTokenizer, sorts them by decreasing frequency, and keeps the terms that cover 80% of the occurrences. Finally, the “word_split” function adds two columns that separate the last word of each n-gram instance from the rest.
# Based on John Joyce: helper function (nlp_clean.R)
# https://github.com/jjoyce1000/Natural-Language-Processing
## Function to clean a corpus with the tm package.
clean_corpus <- function(corpus) {
  corpus <- tm_map(corpus, content_transformer(function(x) iconv(x, from = "latin1", to = "ASCII", sub = "")))
  corpus <- tm_map(corpus, content_transformer(function(x) {gsub("-", " ", x)}))
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, removeWords, profanity_list)
  corpus <- tm_map(corpus, stripWhitespace)
  corpus <- tm_map(corpus, content_transformer(function(x) str_trim(x)))
  corpus
}
# Tokenize the cleaned corpus and sort by frequency.
corpus_token <- function(corpus, ngram_num) {
  my_tokenizer <- function(corpus) NGramTokenizer(corpus, Weka_control(min = ngram_num, max = ngram_num))
  tdm <- TermDocumentMatrix(corpus, control = list(tokenize = my_tokenizer))
  top_words <- rowSums(as.matrix(tdm))
  top_words <- data.frame(frequency = sort(top_words, decreasing = TRUE))
  top_words$coverage <- cumsum(top_words$frequency)/sum(top_words$frequency)*100
  coverage_80 <- sum(top_words$coverage <= 80)
  ngram <- rownames(top_words)
  rownames(top_words) <- NULL
  top_words <- data.frame(cbind(ngram[1:coverage_80], top_words[1:coverage_80, 1]))
  colnames(top_words) <- c("ngram", "frequency")
  convert_factor <- sapply(top_words, is.factor)
  top_words[convert_factor] <- lapply(top_words[convert_factor], as.character)
  top_words
}
## Extract the last word of an instance.
word_split <- function(input, ngram_num) {
  initial <- word(input$ngram, 1, ngram_num - 1)
  last_word <- word(input$ngram, ngram_num, ngram_num)
  ngram_split <- cbind(input, initial, last_word)
  ngram_split
}
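As a quick illustration (using a made-up two-row table, not data from the corpus), the “word_split” function separates each n-gram into its leading words and its final word:

# Illustrative only: a tiny hand-made trigram table.
example_trigrams <- data.frame(ngram = c("thanks for the", "one of the"),
                               frequency = c(12, 9),
                               stringsAsFactors = FALSE)
word_split(example_trigrams, 3)
# Expected columns: ngram, frequency, initial ("thanks for", "one of"),
# and last_word ("the", "the").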
To increase memory efficiency, the corpus is sampled randomly several times. This yields results similar to processing the whole corpus at once, but at a much lower memory cost. N-grams are created by first extracting sentences from the sampled text, which is then cleaned using the helper functions. Tokens are generated for the different n-gram orders, and for each order the samples are combined into a single data frame. In this project, we draw a 0.02 percent sample 100 times.
N = length(all_data)
sample_percentage = 0.02
no_samples = 100
gram_1 = data.frame()
gram_2 = data.frame()
gram_3 = data.frame()
gram_4 = data.frame()
gram_5 = data.frame()
for (i in 1:no_samples){
  print(i)
  sampled_data <- sample(all_data, N * (sample_percentage/100))
  sent_data <- sent_detect_nlp(sampled_data)
  corpus <- VCorpus(VectorSource(sent_data))
  corpus_cleaned <- clean_corpus(corpus)
  gram_1 <- rbind(gram_1, corpus_token(corpus_cleaned, 1))
  gram_2 <- rbind(gram_2, corpus_token(corpus_cleaned, 2))
  gram_3 <- rbind(gram_3, corpus_token(corpus_cleaned, 3))
  gram_4 <- rbind(gram_4, corpus_token(corpus_cleaned, 4))
  gram_5 <- rbind(gram_5, corpus_token(corpus_cleaned, 5))
}
gram_list <- list(gram_1, gram_2, gram_3, gram_4, gram_5)
We summarize the n-grams from the different samples by grouping on the ngram column and summing frequencies. The results are saved in RDS format for later use in word prediction.
for (j in 1:5){
  gram <- gram_list[[j]]
  gram$frequency <- as.integer(gram$frequency)
  gram <- gram %>% group_by(ngram) %>% summarise(frequency = sum(frequency)) %>% arrange(desc(frequency))
  split_gram <- word_split(gram, j)
  saveRDS(split_gram, paste("./ngram_", j, ".rds", sep = ""))
}
By computing the cumulative sum of the unigram frequencies, we can estimate word coverage.
unigrams <- readRDS("ngram_1.rds")
unigrams$coverage <- cumsum(unigrams$frequency)/sum(unigrams$frequency)*100
unigrams$idu <- as.numeric(row.names(unigrams))
coverage_50 <- sum(unigrams$coverage <= 50)
coverage_95 <- sum(unigrams$coverage <= 95)
ggplot(unigrams, aes(x = idu, y = coverage)) + geom_line(lwd = 2) +
  geom_hline(yintercept = 50, linetype = "dashed", color = "red") +
  geom_vline(xintercept = coverage_50, linetype = "dashed", color = "red") +
  geom_hline(yintercept = 95, linetype = "dashed", color = "blue") +
  geom_vline(xintercept = coverage_95, linetype = "dashed", color = "blue") +
  labs(title = "Word Coverage", y = "Percentage Coverage", x = "Num of Words")
The red lines show that about 246 words are needed for 50% coverage, while the blue lines show that about 21,017 words are needed for 95% coverage.
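These counts come directly from the coverage_50 and coverage_95 values computed above; the exact numbers will vary slightly between runs because of the random sampling.

# Word counts behind the 50% and 95% coverage thresholds.
c(words_for_50_pct = coverage_50, words_for_95_pct = coverage_95)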
The following plots show the 20 most common n-grams in the analyzed corpus. Most of them consist of stop words (words that are common in usage but do not convey much insight about the text); a quick check after the bigram plot below illustrates this.
bigrams <- readRDS("ngram_2.rds")
ggplot(bigrams[1:20,], aes(x = reorder(ngram, frequency), frequency)) + geom_bar(stat = "identity") + coord_flip() +
  labs(title = "Common Bigrams", x = "Bigram Combination", y = "Frequency Count")
trigrams <- readRDS("ngram_3.rds")
ggplot(trigrams[1:20,], aes(x = reorder(ngram, frequency), frequency)) + geom_bar(stat = "identity") + coord_flip() +
  labs(title = "Common Trigrams", x = "Trigram Combination", y = "Frequency Count")
quadgrams <- readRDS("ngram_4.rds")
ggplot(quadgrams[1:20,], aes(x = reorder(ngram, frequency), frequency)) + geom_bar(stat = "identity") + coord_flip() +
  labs(title = "Common Quadgrams", x = "Quadgram Combination", y = "Frequency Count")
quadgrams <- readRDS("ngram_5.rds")
ggplot(quadgrams[1:20,],aes(x=reorder(ngram,frequency),frequency))+geom_bar(stat="identity")+coord_flip()+
labs(title = "Common Pentagrams",y="Pentagram Combination",x="Frequency Count")
In the coming weeks, the obtained n-grams will be used in a prediction model with the following features: