This report explores the SwiftKey dataset provided to us for building a text prediction model. The main objective of this phase is to understand the distribution of words and the relationships between them in the corpora, as a first step towards building the predictive model.
The following questions will be addressed in the next sections:
1- What are the distributions of word frequencies?
2- What are the frequencies of 2-grams and 3-grams in the dataset?
3- How many unique words do we need in a frequency sorted dictionary to cover 50% or 90% of all word instances in the language?
In this section, we explore the given dataset, starting with summaries of each document's content, then looking deeper at the frequency of n-grams in samples of the documents.
The following table shows, for each document, the size in bytes, the number of lines (each line is a tweet, blog post, or news article), and the number of characters.
| | Size | Number_of_Lines | Number_of_Char |
|---|---|---|---|
| Twitter | 316037344 | 2360148 | 162384825 |
| Blogs | 260564320 | 899288 | 208361438 |
| News | 20111392 | 77259 | 15683765 |
Since the documents are large and would take a long time to process, we will take a sample of each for further analysis.
Now we extract a sample from each document (0.5% of Twitter/Blogs and 5% of News) and perform the following: clean the text (strip special characters while keeping apostrophes, transliterate to ASCII, convert to lower case, and remove numbers, punctuation, and extra whitespace), then tokenize each sample into unigrams, bigrams, and trigrams (see the GetNgrams function in the code below).
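For reference, these rates imply roughly the following sample sizes (a quick check based on the line counts in the table above):

```r
# approximate number of sampled lines per document, from the chosen rates and the line counts above
round(c(Twitter = 0.005 * 2360148, Blogs = 0.005 * 899288, News = 0.05 * 77259))
# about 11,800 tweets, 4,500 blog lines and 3,860 news lines
```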
Then we can plot the most frequent unigrams, bigrams, and trigrams as follows:
Having counted the n-gram frequencies in the sample documents, we can estimate how many unique words are needed in a frequency-sorted dictionary to cover 50% or 90% of all word instances. The following graph shows how the coverage increases with the number of unique words in each document.
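As a minimal sketch of how this coverage is computed (the full version is the GetCumPercent function in the code below), assuming `freqs` is a vector of unigram counts from one sample:

```r
# sort counts in decreasing order and take the cumulative share of all word instances
coverage <- cumsum(sort(freqs, decreasing = TRUE)) / sum(freqs)
words_for_50 <- which(coverage >= 0.5)[1]  # unique words needed for 50% coverage
words_for_90 <- which(coverage >= 0.9)[1]  # unique words needed for 90% coverage
```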
The following table shows the total number of words in each sample and the number of unique words needed for 50% and 90% coverage. The News sample shows a higher diversity of words, which makes sense given the language used in news articles.
| | Total_Words | Words_to_cover_50_percent | Words_to_cover_90_percent |
|---|---|---|---|
| Twitter_Sample | 147950 | 123 | 4768 |
| Blogs_Sample | 185478 | 108 | 5917 |
| News_Sample | 129688 | 192 | 6784 |
This preliminary analysis gave us an overview of the dataset. It will serve as the starting point for the next steps towards building the text prediction model.
# load libraries
library(dplyr)
library(knitr)
library(stringi)
library(stringr)
library(tm)
library(SnowballC)
library(RWeka)
library(qdap)
library(purrr)
library(tidyr)
library(ggplot2)
library(cowplot)
# read given documents
twitter<-readLines('../data/text_source/en_US.twitter.txt')
blogs<-readLines('../data/text_source/en_US.blogs.txt')
news<-readLines('../data/text_source/en_US.news.txt')
# function to compute a summary of a document
GetDocSummary <- function(x)
{
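# object.size reports the in-memory size of the object in bytes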
c(Size=object.size(x),
Number_of_Lines=length(x),
Number_of_Char=sum(nchar(x)))
}
# put summaries for all documents in one df
Doc_summary <- rbind(GetDocSummary(twitter),
GetDocSummary(blogs),
GetDocSummary(news)) %>%
as.data.frame(row.names=c("Twitter","Blogs","News"))
# a function that takes x: a character vector and ns: the fraction of lines to sample
GetNgrams <- function(x,ns=0.005)
{
set.seed(1005)
# sample a fraction ns of the lines, without replacement
x_sample <- sample(x, ns*length(x), replace=FALSE)
# form corpus
docs<-Corpus(VectorSource(x_sample))
# pre-processing
docs <- tm_map(docs, content_transformer(strip), char.keep="'") # strip special characters but keep apostrophes
docs <- tm_map(docs, content_transformer(stri_trans_general), id="latin-ascii") # transliterate to ASCII to preserve as much of the text as possible
docs <- tm_map(docs, content_transformer(tolower)) ## convert to lower case
docs <- tm_map(docs, removeNumbers) ## remove numbers
docs <- tm_map(docs, stripWhitespace) ## remove extra whitespace
docs <- tm_map(docs, removePunctuation) ## remove punctuation
# find unigrams
ng1 <- NGramTokenizer(docs, Weka_control(min=1, max=1))%>%
table %>%
as.data.frame(row.names=NULL, stringsAsFactors=FALSE) %>%
arrange(desc(Freq))
ng1 <- rename(ng1,Unigram=.)
# find bigrams
ng2 <- NGramTokenizer(docs, Weka_control(min=2, max=2)) %>%
table %>%
as.data.frame(row.names=NULL, stringsAsFactors=FALSE) %>%
arrange(desc(Freq))
ng2 <- rename(ng2,Bigram=.)
# find trigrams
ng3 <- NGramTokenizer(docs, Weka_control(min=3, max=3)) %>%
table %>%
as.data.frame(row.names=NULL, stringsAsFactors=FALSE) %>%
arrange(desc(Freq))
ng3 <- rename(ng3,Trigram=.)
# form and return a list of unigrams, bigrams, trigrams
return(list(ng1,ng2,ng3))
}
# Get n-grams for each sample
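# note: news is sampled at a higher rate (5%) since far fewer lines were read from it than from twitter/blogs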
twitter_ng <- GetNgrams(twitter,0.005)
blogs_ng <- GetNgrams(blogs,0.005)
news_ng <- GetNgrams(news,0.05)
# remove original docs
rm(twitter)
rm(blogs)
rm(news)
# a function that takes x: a data frame with n-gram and Freq columns
# ylabel: the axis label (name of the n-gram type)
# and produces a bar plot of frequencies
gx <- function(x,ylabel,title=" ")
{
ggplot(data=x , aes(x=reorder(x[,1],Freq),y=Freq, fill=-Freq))+
geom_bar(stat="identity")+
coord_flip()+
theme_classic()+
xlab(ylabel)+
ggtitle(title)+
guides(fill=FALSE)
}
# plot unigrams
plot_grid(gx(twitter_ng[[1]][1:20,],"Unigram"),
gx(blogs_ng[[1]][1:20,],"Unigram"),
gx(news_ng[[1]][1:20,],"Unigram"),
ncol=3,
labels = c("Twitter","Blogs","News"))
# plot bigrams
plot_grid(gx(twitter_ng[[2]][1:20,],"Bigram"),
gx(blogs_ng[[2]][1:20,],"Bigram"),
gx(news_ng[[2]][1:20,],"Bigram"),
ncol=3,
labels = c("Twitter","Blogs","News"))
# plot trigrams
plot_grid(gx(twitter_ng[[3]][1:20,],"Trigram"),
gx(blogs_ng[[3]][1:20,],"Trigram"),
gx(news_ng[[3]][1:20,],"Trigram"),
ncol=3,
labels = c("Twitter","Blogs","News"))
# a function to get the cumulative % of word instances covered by the most frequent words
GetCumPercent <- function(x){
x %>%
arrange(desc(Freq)) %>%
mutate(Words=seq_along(Freq),
percentage=100*cumsum(Freq)/sum(Freq, na.rm=T)) %>%
select(Words,percentage)
}
# find % cumulative sum in each document
twitter_cum <- GetCumPercent(twitter_ng[[1]]) %>%
rename(Twitter_uni_percent=percentage)
blogs_cum <- GetCumPercent(blogs_ng[[1]]) %>%
rename(blogs_uni_percent=percentage)
news_cum <- GetCumPercent(news_ng[[1]]) %>%
rename(news_uni_percent=percentage)
# create a df with cumulative % from all documents
word_percent <- Reduce(function(x,y) merge(x,y,
by="Words",
all.x=TRUE,
all.y=TRUE),
list(twitter_cum,blogs_cum,news_cum
))
# convert from wide to long
word_percent_long <- gather(word_percent,"Type","Percentage",2:4)
# remove na
word_percent_long <- word_percent_long %>%
filter(!is.na(Percentage))
#plot coverage
ggplot(word_percent_long,aes(x=Percentage,y=Words,col=Type))+
geom_line()+
theme_classic()
# function to get the number of unique words needed to cover n percent of word instances in document x
getWordPercent <- function(x,n)
{
which(floor(x[,2])==n)[1]
}
# total words in each sample
FreqDict <- data.frame(Total_Words=c(sum(twitter_ng[[1]]$Freq),
sum(blogs_ng[[1]]$Freq),
sum(news_ng[[1]]$Freq)),
# 50% coverage
Words_to_cover_50_percent=sapply(list(twitter_cum,
blogs_cum,
news_cum),
getWordPercent, n=50),
# 90% coverage
Words_to_cover_90_percent=sapply(list(twitter_cum,
blogs_cum,
news_cum),
getWordPercent, n=90),
row.names = c("Twitter_Sample","Blogs_Sample","News_Sample")
)