In this document we describe exploratory analysis of English texts corpus from the Data Science Specialization Capstone Project.

Exploring data

We have three files with texts from different sources. Data obtaining and loading process fully described in Avanced section of this document.

par(mfrow = c(4, 1))
l1 <- c(length(twitter), length(news), length(blogs))
names(l1) <- c("Twitter", "News", "Blogs")
barplot(l1, main = "Texts (lines) per corpus")

n1 <- unlist(lapply(twitter, nchar))
n2 <- unlist(lapply(news, nchar))
n3 <- unlist(lapply(blogs, nchar))
boxplot(n1, main = "Letters per text (line) for Twitter", ylab = "Letters")
boxplot(n2, main = "Letters per text (line) for News", ylab = "Letters")
boxplot(n3, main = "Letters per text (line) for Blogs", ylab = "Letters")

Twitter has a lot more texts (2360148), but messages are restricted to 140 characters, 68.68 on average. News and Blogs have a lot more characters (201.16 and 229.99 on average respectively) and smaller corpus size (1010242 and 899288).

Words

For words statistics we will use relatively small sample size and process it to remove punctuation and split texts on sentences:

set.seed(198609)
max.size <- 10000
twitter.sample <- processTexts(sample(twitter, max.size))
news.sample <- processTexts(sample(news, max.size))
blogs.sample <- processTexts(sample(blogs, max.size))

Function processTexts described extensively in Advanced section.

We want to be able to predict whether word is whole upper case (abbreviation), begin with capital letter or is common word. For dictionary we’ll use original text:

words.twitter <- WordTokenizer(twitter.sample, control = Weka_control(dilimiters = " "))
words.news <- WordTokenizer(news.sample, control = Weka_control(dilimiters = " "))
words.blogs <- WordTokenizer(blogs.sample, control = Weka_control(dilimiters = " "))

Number of words per 10000 sampled documents:

rbind(data.frame(stat = "Number of words", twitter = length(words.twitter), news = length(words.news), blogs = length(words.blogs)),
      data.frame(stat = "Unique words", twitter = length(unique(words.twitter)), news = length(unique(words.news)), blogs = length(unique(words.blogs))))
##              stat twitter  news blogs
## 1 Number of words   12975 34473 42072
## 2    Unique words    3916  8418  8628

Top 10 popular words for Twitter per 10000 sampled documents:

dictionary.twitter <- data.table(Word = words.twitter)[Word != "[number]", list(amount = .N), by = Word][order(-amount), `:=`(p.amount = round(100 * amount / sum(amount), 2), pcum.amount = round(100 * cumsum(amount) / sum(amount), 2))]

dictionary.twitter[order(-amount)][1:10, list(Word, "Amount" = amount, "% of total" = p.amount, "Cumulative % of total" = pcum.amount)]
##     Word Amount % of total Cumulative % of total
##  1:  the    361       2.83                  2.83
##  2:    I    351       2.75                  5.58
##  3:   to    316       2.47                  8.05
##  4:    a    239       1.87                  9.92
##  5:  you    200       1.57                 11.49
##  6:  and    163       1.28                 12.76
##  7:   in    154       1.21                 13.97
##  8:   of    154       1.21                 15.18
##  9:   is    152       1.19                 16.37
## 10:  for    145       1.14                 17.50

Note here that using cased words in dictionary will allow us to predict that “I” should be capitalized.

Top 10 popular words in news:

dictionary.news <- data.table(Word = words.news)[Word != "[number]", list(amount = .N), by = Word][order(-amount), `:=`(p.amount = round(100 * amount / sum(amount), 2), pcum.amount = round(100 * cumsum(amount) / sum(amount), 2))]

dictionary.news[order(-amount)][1:10, list(Word, "Amount" = amount, "% of total" = p.amount, "Cumulative % of total" = pcum.amount)]
##     Word Amount % of total Cumulative % of total
##  1:  the   1652       4.93                  4.93
##  2:   to    872       2.60                  7.54
##  3:    a    858       2.56                 10.10
##  4:  and    821       2.45                 12.55
##  5:   of    782       2.34                 14.89
##  6:   in    624       1.86                 16.75
##  7:    s    359       1.07                 17.82
##  8:  for    350       1.05                 18.87
##  9: that    317       0.95                 19.81
## 10:   is    264       0.79                 20.60

Top 10 popular words in blogs:

dictionary.blogs <- data.table(Word = words.blogs)[Word != "[number]", list(amount = .N), by = Word][order(-amount), `:=`(p.amount = round(100 * amount / sum(amount), 2), pcum.amount = round(100 * cumsum(amount) / sum(amount), 2))]

dictionary.blogs[order(-amount)][1:10, list(Word, "Amount" = amount, "% of total" = p.amount, "Cumulative % of total" = pcum.amount)]
##     Word Amount % of total Cumulative % of total
##  1:  the   1876       4.52                  4.52
##  2:  and   1132       2.72                  7.24
##  3:   to   1102       2.65                  9.89
##  4:   of    987       2.38                 12.27
##  5:    a    961       2.31                 14.58
##  6:    I    904       2.18                 16.76
##  7:   in    541       1.30                 18.06
##  8:   it    451       1.09                 19.15
##  9: that    448       1.08                 20.22
## 10:   is    441       1.06                 21.29

Note here that news have no “I” in top words as news are usually impersonal.

Here we provide different dictionary sizes including only popular words:

data.table("Percent of words in dictionary" = c(25, 50, 75, 90),
           "Twitter, words" = c(nrow(dictionary.twitter[pcum.amount < 25]), nrow(dictionary.twitter[pcum.amount < 50]), nrow(dictionary.twitter[pcum.amount < 75]), nrow(dictionary.twitter[pcum.amount < 90])),
           "News, words" = c(nrow(dictionary.news[pcum.amount < 25]), nrow(dictionary.news[pcum.amount < 50]), nrow(dictionary.news[pcum.amount < 75]), nrow(dictionary.news[pcum.amount < 90])),
           "Blogs, words" = c(nrow(dictionary.blogs[pcum.amount < 25]), nrow(dictionary.blogs[pcum.amount < 50]), nrow(dictionary.blogs[pcum.amount < 75]), nrow(dictionary.blogs[pcum.amount < 90])))
##    Percent of words in dictionary Twitter, words News, words Blogs, words
## 1:                             25             20          16           14
## 2:                             50            151         215          130
## 3:                             75            974        1691         1200
## 4:                             90           2637        5066         4470

N-grams

To predict next word we would use bi- (frequency for pairs of words) and tri-grams (frequency for triplets of words). Sizes of this dictionaries is quite large and here we’ll transform word to lowercase:

twitter.sample <- tolower(twitter.sample)
bi.twitter <- NGramTokenizer(twitter.sample, control = Weka_control(min = 2, max = 2, dilimiters = " "))
tri.twitter <- NGramTokenizer(twitter.sample, control = Weka_control(min = 3, max = 3, dilimiters = " "))
news.sample <- tolower(news.sample)
bi.news <- NGramTokenizer(news.sample, control = Weka_control(min = 2, max = 2, dilimiters = " "))
tri.news <- NGramTokenizer(news.sample, control = Weka_control(min = 3, max = 3, dilimiters = " "))
blogs.sample <- tolower(blogs.sample)
bi.blogs <- NGramTokenizer(blogs.sample, control = Weka_control(min = 2, max = 2, dilimiters = " "))
tri.blogs <- NGramTokenizer(blogs.sample, control = Weka_control(min = 3, max = 3, dilimiters = " "))

Number of pairs and triplets:

rbind(data.frame(stat = "Number of pairs", twitter = length(bi.twitter), news = length(bi.news), blogs = length(bi.blogs)),
      data.frame(stat = "Unique pairs", twitter = length(unique(bi.twitter)), news = length(unique(bi.news)), blogs = length(unique(bi.blogs))),
      data.frame(stat = "Number of triplets", twitter = length(tri.twitter), news = length(tri.news), blogs = length(tri.blogs)),
      data.frame(stat = "Unique triplets", twitter = length(unique(tri.twitter)), news = length(unique(tri.news)), blogs = length(unique(tri.blogs))))
##                 stat twitter  news blogs
## 1    Number of pairs   11199 32139 38971
## 2       Unique pairs    8737 24144 27034
## 3 Number of triplets    9583 30007 36040
## 4    Unique triplets    9245 28779 34158

Here we provide different dictionary sizes including only popular pairs:

pairs.twitter <- data.table(pair = bi.twitter)[, list(amount = .N), by = pair][order(-amount), `:=`(p.amount = round(100 * amount / sum(amount), 2), pcum.amount = round(100 * cumsum(amount) / sum(amount)))]
pairs.news <- data.table(pair = bi.news)[, list(amount = .N), by = pair][order(-amount), `:=`(p.amount = round(100 * amount / sum(amount), 2), pcum.amount = round(100 * cumsum(amount) / sum(amount)))]
pairs.blogs <- data.table(pair = bi.blogs)[, list(amount = .N), by = pair][order(-amount), `:=`(p.amount = round(100 * amount / sum(amount), 2), pcum.amount = round(100 * cumsum(amount) / sum(amount)))]

data.table("Percent of pairs in dictionary" = c(25, 50, 75, 90),
           "Twitter, pairs" = c(nrow(pairs.twitter[pcum.amount < 25]), nrow(pairs.twitter[pcum.amount < 50]), nrow(pairs.twitter[pcum.amount < 75]), nrow(pairs.twitter[pcum.amount < 90])),
           "News, pairs" = c(nrow(pairs.news[pcum.amount < 25]), nrow(pairs.news[pcum.amount < 50]), nrow(pairs.news[pcum.amount < 75]), nrow(pairs.news[pcum.amount < 90])),
           "Blogs, pairs" = c(nrow(pairs.blogs[pcum.amount < 25]), nrow(pairs.blogs[pcum.amount < 50]), nrow(pairs.blogs[pcum.amount < 75]), nrow(pairs.blogs[pcum.amount < 90])))
##    Percent of pairs in dictionary Twitter, pairs News, pairs Blogs, pairs
## 1:                             25            637        1324         1043
## 2:                             50           3081        7913         7353
## 3:                             75           5881       15948        17096
## 4:                             90           7561       20769        22942

Here we provide different dictionary sizes including only popular triplets:

triplets.twitter <- data.table(triplet = tri.twitter)[, list(amount = .N), by = triplet][order(-amount), `:=`(p.amount = round(100 * amount / sum(amount), 2), pcum.amount = round(100 * cumsum(amount) / sum(amount)))]
triplets.news <- data.table(triplet = tri.news)[, list(amount = .N), by = triplet][order(-amount), `:=`(p.amount = round(100 * amount / sum(amount), 2), pcum.amount = round(100 * cumsum(amount) / sum(amount)))]
triplets.blogs <- data.table(triplet = tri.blogs)[, list(amount = .N), by = triplet][order(-amount), `:=`(p.amount = round(100 * amount / sum(amount), 2), pcum.amount = round(100 * cumsum(amount) / sum(amount)))]

data.table("Percent of triplets in dictionary" = c(25, 50, 75, 90),
           "Twitter, tri" = c(nrow(triplets.twitter[pcum.amount < 25]), nrow(triplets.twitter[pcum.amount < 50]), nrow(triplets.twitter[pcum.amount < 75]), nrow(triplets.twitter[pcum.amount < 90])),
           "News, tri" = c(nrow(triplets.news[pcum.amount < 25]), nrow(triplets.news[pcum.amount < 50]), nrow(triplets.news[pcum.amount < 75]), nrow(triplets.news[pcum.amount < 90])),
           "Blogs, tri" = c(nrow(triplets.blogs[pcum.amount < 25]), nrow(triplets.blogs[pcum.amount < 50]), nrow(triplets.blogs[pcum.amount < 75]), nrow(triplets.blogs[pcum.amount < 90])))
##    Percent of triplets in dictionary Twitter, tri News, tri Blogs, tri
## 1:                                25         2009      6123       6947
## 2:                                50         4405     13625      15957
## 3:                                75         6801     21127      24967
## 4:                                90         8238     25628      30373

Application plans

Our main goal is to create application that can predict next word from some previous input or without it.

  1. Create word dictionaries for each data sets, so we can use appropriate dictionary depending on application;

  2. Create word pairs (bi-grams) dictionary to predict next word by previous for each data set;

  3. Create word triplets (tri-grams) dictionary to predict next word by previous two for each data set;

  4. Cut dictionaries to sizes that allow fast response in predicting, also it’ll help to get rid of profanity.

  5. If we can get current user application - use appropriate dictionary or mix them for all unknown applications.

Advanced details

Here we provide some technical details.

External packages and functions

This report uses some external packages:

suppressPackageStartupMessages(require(data.table))
suppressPackageStartupMessages(require(RWeka))

Obtaining data

The data for this project can be obtained from https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip:

if (!file.exists("data/")) {
    dir.create("data") # create data directory
}

if (!file.exists("data/data.zip")) {
    download.file("https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip", "data/data.zip", method = "curl") # download file as we have no
}

if (!file.exists("data/en_US")) {
    unzip("data/data.zip", exdir = "data") # unzip files
}

Loading data

The data provided is quite big and loading it in R can take a lot of time:

transform(file.info(list.files("data/final/en_US/", full.names = T))[, "size", drop = FALSE], size = paste0(round(size / 1024 / 1024, 2), "MB"))
##                                         size
## data/final/en_US//en_US.blogs.txt   200.42MB
## data/final/en_US//en_US.news.txt    196.28MB
## data/final/en_US//en_US.twitter.txt 159.36MB

Loading:

suppressWarnings(twitter <- readLines("data/final/en_US/en_US.twitter.txt"))
suppressWarnings(blogs <- readLines("data/final/en_US/en_US.blogs.txt"))
suppressWarnings(news <- readLines("data/final/en_US/en_US.news.txt"))

Processing lines

This function used to process lines.

processTexts <- function(texts) {
    # We'll process not a whole text lines, but sentencies. 
    # Just before dividing lines by sentencies we want to exclude text in brackets as different sentencies.
    # I think that text in bracket can be out of context and can't be used in predicting by previous words. 
    texts <- c(gsub("[(].+?[)]", "", texts), regmatches(texts, regexpr("[(].+?[)]", texts)))
    
    # Divide lines to sentencies:
    texts <- unlist(strsplit(texts, "[!.?]"))
    
    #Exclude punctuation as it'll not help in predicting:
    texts <- gsub("\\]", " ", gsub("\\[", " ", gsub("[…|•“”!\"#&$%\\(\\)*+./:;<=>?@^_`\\{|\\}~,/\\-]", " ", texts)))
    
    #We don't want to predict numbers, but we can gain additional prediction power using placeholder for all numbers:
    texts <- gsub("[0-9]+", " [number] ", texts)

    # Clear whitespaces:
    texts <- gsub("[ ]{2, }", " ", texts)
    
    texts
}

System information

This report can be rebuilt using this environment:

sessionInfo()
## R version 3.1.2 (2014-10-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## 
## locale:
##  [1] LC_CTYPE=ru_RU.UTF-8          LC_NUMERIC=C                 
##  [3] LC_TIME=ru_RU.UTF-8           LC_COLLATE=ru_RU.UTF-8       
##  [5] LC_MONETARY=ru_RU.UTF-8       LC_MESSAGES=ru_RU.UTF-8      
##  [7] LC_PAPER=ru_RU.UTF-8          LC_NAME=ru_RU.UTF-8          
##  [9] LC_ADDRESS=ru_RU.UTF-8        LC_TELEPHONE=ru_RU.UTF-8     
## [11] LC_MEASUREMENT=ru_RU.UTF-8    LC_IDENTIFICATION=ru_RU.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] RWeka_0.4-23     data.table_1.9.4
## 
## loaded via a namespace (and not attached):
##  [1] chron_2.3-45       digest_0.6.4       evaluate_0.5.5    
##  [4] formatR_1.0        grid_3.1.2         htmltools_0.2.6   
##  [7] knitr_1.7          plyr_1.8.1         Rcpp_0.11.3       
## [10] reshape2_1.4       rJava_0.9-6        rmarkdown_0.3.3   
## [13] RWekajars_3.7.11-1 stringr_0.6.2      tools_3.1.2       
## [16] yaml_2.1.13