Overview

This report explores the SwiftKey dataset provided to us for building a predictive text model. The main objective of this phase is to understand the distribution of words and the relationships between them in the corpora, as a first step towards building the predictive model.

The following questions are addressed in the next sections:

1- What are the distributions of word frequencies?

2- What are the frequencies of 2-grams and 3-grams in the dataset?

3- How many unique words do we need in a frequency sorted dictionary to cover 50% or 90% of all word instances in the language?

Exploratory Analysis

In this section, we explore the given dataset, starting with summaries of each document's content, and then looking deeper at the frequency of n-grams in samples of the documents.

Basic Summary

The following table shows, for each document, its size in bytes, the number of lines (each line is a tweet, blog post or news article), and the number of characters.

              Size   Number_of_Lines   Number_of_Char
Twitter  316037344           2360148        162384825
Blogs    260564320            899288        208361438
News      20111392             77259         15683765

Since the documents are large and would take a long time to process, we will take a sample of each for further analysis.
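
As a minimal sketch of the sampling step (assuming twitter holds the lines read from en_US.twitter.txt, as in the appendix; the seed and fraction match the appendix code):

set.seed(1005)
# draw 0.5% of the Twitter lines without replacement
twitter_sample <- sample(twitter, size = 0.005 * length(twitter), replace = FALSE)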

n-grams Frequencies

Now we will extract samples from each document (0.5% of the Twitter and Blogs lines and 5% of the News lines) and perform the following (see the tokenization sketch after this list):

  • convert the sample into a corpus
  • perform initial cleaning (convert to lower case; remove numbers, symbols, punctuation, etc.)
  • find unigrams, bigrams and trigrams
  • return the n-grams in a list
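
As a minimal illustration of the tokenization step with RWeka (a toy sentence rather than the actual sample):

library(RWeka)
# toy sentence, not the real corpus
txt <- tolower("the quick brown fox jumps over the lazy dog and the quick brown cat")
bigrams <- NGramTokenizer(txt, Weka_control(min = 2, max = 2))
head(sort(table(bigrams), decreasing = TRUE))   # most frequent bigrams first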

Then we can plot the most frequent Unigrams, Bigrams and Trigrams as follows:

Unigrams

Bigrams

Trigrams

Word Instance Coverage

Having counted the n-gram frequencies in the sampled documents, we can estimate how many unique words are needed in a frequency-sorted dictionary to cover 50% or 90% of all word instances. The following graph shows how coverage increases with the number of words in each document.
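
As a minimal sketch of the coverage calculation (toy frequencies, not the actual counts; the appendix uses a floor() comparison, but the idea is the same):

freq <- c(50, 25, 10, 5, 5, 3, 2)            # hypothetical word counts, sorted descending
coverage <- 100 * cumsum(freq) / sum(freq)   # cumulative % of all word instances
min(which(coverage >= 50))                   # unique words needed for 50% coverage
min(which(coverage >= 90))                   # unique words needed for 90% coverage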

The following table shows the total number of words in each sample and the number of unique words needed for 50% and 90% coverage. The News sample shows higher word diversity, which makes sense given the language used in news articles.

                 Total_Words   Words_to_cover_50_percent   Words_to_cover_90_percent
Twitter_Sample        147950                         123                        4768
Blogs_Sample          185478                         108                        5917
News_Sample           129688                         192                        6784

Next Steps

This preliminary analysis gave us an overview of the dataset. It is the starting point for the next steps towards building the text prediction model. We will need to:

Appendix: Code

# load libraries

library(dplyr)
library(knitr)
library(stringi)
library(stringr)
library(tm)  
library(SnowballC)  
library(RWeka)
library(qdap)
library(purrr)
library(tidyr)
library(ggplot2)
library(cowplot)
# read given documents
twitter<-readLines('../data/text_source/en_US.twitter.txt')
blogs<-readLines('../data/text_source/en_US.blogs.txt')
news<-readLines('../data/text_source/en_US.news.txt')

# function to compute a document summary (size in bytes, line count, character count)
GetDocSummary <- function(x)
{
        c(Size=object.size(x),
              Number_of_Lines=length(x),
              Number_of_Char=sum(nchar(x)))
}

# put summaries for all documents in one df
Doc_summary <- rbind(GetDocSummary(twitter),
      GetDocSummary(blogs),
      GetDocSummary(news)) %>% 
        as.data.frame(row.names=c("Twitter","Blogs","News"))
# a function that takes x: a character vector and ns: the fraction of lines to sample
GetNgrams <- function(x,ns=0.005)
        {
        
        set.seed(1005)
        
        #sample doc
        x_sample <- sample(x, ns*length(x), replace=FALSE)
        
        # form corpus
        docs<-Corpus(VectorSource(x_sample))
        
        # pre-processing
        docs <- tm_map(docs, content_transformer(strip), char.keep="'")                  # strip special characters, keeping apostrophes
        docs <- tm_map(docs, content_transformer(stri_trans_general), id="latin-ascii")  # transliterate to ASCII
        docs <- tm_map(docs, content_transformer(tolower))    # convert to lower case
        docs <- tm_map(docs, removeNumbers)                   # remove numbers
        docs <- tm_map(docs, stripWhitespace)                 # collapse extra whitespace
        docs <- tm_map(docs, removePunctuation)               # remove punctuation
        
        # find unigrams
        ng1 <- NGramTokenizer(docs, Weka_control(min=1, max=1))%>%
                 table %>%
                 as.data.frame(row.names=NULL, stringsAsFactors=FALSE) %>% 
                 arrange(desc(Freq))
                
        ng1 <- rename(ng1,Unigram=.)

        # find bigrams
        ng2 <- NGramTokenizer(docs, Weka_control(min=2, max=2)) %>%
                table %>%
                as.data.frame(row.names=NULL, stringsAsFactors=FALSE) %>% 
                arrange(desc(Freq))
        
        ng2 <- rename(ng2,Bigram=.)
        
        # find trigrams
        ng3 <- NGramTokenizer(docs, Weka_control(min=3, max=3)) %>%
                table %>%
                as.data.frame(row.names=NULL, stringsAsFactors=FALSE) %>% 
                arrange(desc(Freq))
        
         ng3 <- rename(ng3,Trigram=.)

         # form and return a list of unigrams, bigrams, trigrams
         return(list(ng1,ng2,ng3))
        
}

# Get n-grams for each sample
twitter_ng <- GetNgrams(twitter,0.005)
blogs_ng <-  GetNgrams(blogs,0.005) 
news_ng <- GetNgrams(news,0.05) 

# remove original docs
rm(twitter)
rm(blogs)
rm(news)
# a function that takes x: a data frame of n-gram counts (n-gram, Freq)
#                   ylabel: the label for the n-gram axis
# and produces a bar plot of the frequencies
gx <- function(x, ylabel, title=" ")
{
        ggplot(data=x, aes(x=reorder(x[,1], Freq), y=Freq, fill=-Freq)) +
                geom_bar(stat="identity") +
                coord_flip() +
                theme_classic() +
                xlab(ylabel) +
                ggtitle(title) +
                guides(fill=FALSE)
}


# plot unigrams
plot_grid(gx(twitter_ng[[1]][1:20,],"Unigram"),
          gx(blogs_ng[[1]][1:20,],"Unigram"),
          gx(news_ng[[1]][1:20,],"Unigram"),
          ncol=3,
          labels = c("Twitter","Blogs","News"))

# plot bigrams
plot_grid(gx(twitter_ng[[2]][1:20,],"Bigram"),
          gx(blogs_ng[[2]][1:20,],"Bigram"),
          gx(news_ng[[2]][1:20,],"Bigram"),
          ncol=3,
          labels = c("Twitter","Blogs","News"))

# plot trigrams
plot_grid(gx(twitter_ng[[3]][1:20,],"Trigram"),
          gx(blogs_ng[[3]][1:20,],"Trigram"),
          gx(news_ng[[3]][1:20,],"Trigram"),
          ncol=3,
          labels = c("Twitter","Blogs","News"))
# a function to compute the cumulative percentage of word instances covered
GetCumPercent <- function(x){
        x %>%
                arrange(desc(Freq)) %>%
                mutate(Words=seq_along(Freq),
                       percentage=100*cumsum(Freq)/sum(Freq, na.rm=T)) %>%
                select(Words, percentage)
}

# find the cumulative percentage in each document
twitter_cum <- GetCumPercent(twitter_ng[[1]]) %>%
        rename(Twitter_uni_percent=percentage)

blogs_cum <- GetCumPercent(blogs_ng[[1]]) %>%
        rename(blogs_uni_percent=percentage)

news_cum <- GetCumPercent(news_ng[[1]]) %>%
        rename(news_uni_percent=percentage)

# create a df with cumulative % from all documents
word_percent <- Reduce(function(x,y) merge(x,y,
                           by="Words",
                           all.x=TRUE,
                           all.y=TRUE),
       list(twitter_cum,blogs_cum,news_cum
            ))

# convert from wide to long
word_percent_long <- gather(word_percent,"Type","Percentage",2:4)

# remove na
word_percent_long <- word_percent_long %>%
        filter(!is.na(Percentage))

#plot coverage
ggplot(word_percent_long,aes(x=Percentage,y=Words,col=Type))+
        geom_line()+
        theme_classic()
# function to get the number of unique words needed to cover n percent of word instances in document x
getWordPercent <- function(x,n)
{
   which(floor(x[,2])==n)[1]
}

# total words in each sample, with the 50% and 90% coverage thresholds
FreqDict <- data.frame(Total_Words=c(sum(twitter_ng[[1]]$Freq),
                                     sum(blogs_ng[[1]]$Freq),
                                     sum(news_ng[[1]]$Freq)),
                       # 50% coverage
                       Words_to_cover_50_percent=sapply(list(twitter_cum, blogs_cum, news_cum),
                                                        getWordPercent, n=50),
                       # 90% coverage
                       Words_to_cover_90_percent=sapply(list(twitter_cum, blogs_cum, news_cum),
                                                        getWordPercent, n=90),
                       row.names = c("Twitter_Sample","Blogs_Sample","News_Sample"))
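
# the tables in the report body were presumably rendered with knitr::kable
# (an assumption; these calls are not part of the original pipeline)
kable(Doc_summary)
kable(FreqDict)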