Introduction

This report is part of the Capstone project of the Data Science Specialization.

In this report I will:
1. Present basic summary statistics about the loaded datasets.
2. Report any interesting findings made so far.
3. Get feedback on my plans for creating a prediction algorithm and Shiny app.

About the datasets: they consist of U.S. English blogs, news and Twitter text, which can be downloaded from this link

Data processing

Preparatory work

Loading the necessary libraries, setting up parallel computation and setting the seed:

library(doParallel)
library(quanteda)
library(ggplot2)

CPU <- parallel::detectCores()
registerDoParallel(makeCluster(CPU))

set.seed(12345)

Brief summary of the datasets

pathToFiles <- "./Coursera-SwiftKey/final/en_US"
listFiles <- list.files(pathToFiles, full.names = TRUE)
filesNames <- list.files(pathToFiles)


briefSummary <- data.frame()
linesExample <- data.frame()

for (i in 1:length(listFiles)) {
  fileName <- listFiles[i]
  con <- file(fileName,open="r")
  lin <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
  close(con)
  
  linesExample[i, 1] <- lin[i]   # keep one example line from each file
  
  briefSummary[i, 1] <- length(lin)                               # number of lines
  briefSummary[i, 2] <- sum(length(unlist(strsplit(lin, " "))))   # rough word count
}

rownames(briefSummary) <- filesNames
colnames(briefSummary) <- c('Number of lines', 'Number of words')

briefSummary # datasets' summary table 
##                   Number of lines Number of words
## en_US.blogs.txt            899288        37334131
## en_US.news.txt              77259         2643969
## en_US.twitter.txt         2360148        30373583
linesExample[, 1] # one example line from each dataset
## [1] "In the years thereafter, most of the Oil fields and platforms were named after pagan <U+0093>gods<U+0094>."                                                             
## [2] "The St. Louis plant had to close. It would die of old age. Workers had been making cars there since the onset of mass automotive production in the 1920s."
## [3] "they've decided its more fun if I don't."

So, we can draw the following conclusions:
1. Processing the full datasets requires a large amount of memory and/or computing time, so we have to sample the data.
2. The text in the datasets has to be preprocessed (see the sketch after this list), because it contains:
- uppercase and lowercase letters;
- numbers;
- punctuation marks;
- whitespaces;
- Twitter signs;
- hyphens;
- stop words.
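
To illustrate the kind of cleaning that is needed, here is a minimal base-R sketch (an illustration only — the actual pipeline below relies on quanteda's dfm() options, and stop words are removed there via stopwords("english")):

# Minimal cleaning sketch (hypothetical helper, not part of the pipeline)
cleanText <- function(x) {
  x <- tolower(x)                             # uppercase -> lowercase
  x <- gsub("[#@][^[:space:]]+", " ", x)      # Twitter hashtags and handles
  x <- gsub("[[:digit:]]+", " ", x)           # numbers
  x <- gsub("-", " ", x, fixed = TRUE)        # hyphens -> spaces
  x <- gsub("[[:punct:]]+", " ", x)           # punctuation
  gsub("[[:space:]]+", " ", trimws(x))        # collapse extra whitespace
}

cleanText("Check out #rstats: 2 GREAT tips - really!")
## expected: "check out great tips really"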

Sampling datasets

For the purposes of this report, and given the computing power of my computer, we will use only a randomly sampled 5% of each dataset:

# loading and sampling blogs dataset
fileName <- "./Coursera-Swiftkey/final/en_US/en_US.blogs.txt"
con <- file(fileName, open="r")
blogs <- readLines(con, encoding="UTF-8", skipNul = TRUE)
close(con)
blogsSample <- sample(blogs, (5/100 * length(blogs)), replace = FALSE)

# loading and sampling news dataset
fileName <- "./Coursera-SwiftKey/final/en_US/en_US.news.txt"    
con <- file(fileName,open="r")
news <- readLines(con, encoding="UTF-8")
close(con)
newsSample <- sample(news, (5/100 * length(news)), replace = FALSE)

# loading and sampling twitter dataset
fileName <- "./Coursera-SwiftKey/final/en_US/en_US.twitter.txt"
con <- file(fileName,open="r")
twitter <- readLines(con, encoding="UTF-8", skipNul = TRUE)
close(con)
twitterSample <- sample(twitter, (5/100 * length(twitter)), replace = FALSE)

# updating summary table
briefSummary[1, 3] <- length(blogsSample)
briefSummary[2, 3] <- length(newsSample)
briefSummary[3, 3] <- length(twitterSample)

briefSummary[1, 4] <- sum(length(unlist(strsplit(blogsSample, " "))))
briefSummary[2, 4] <- sum(length(unlist(strsplit(newsSample, " "))))
briefSummary[3, 4] <- sum(length(unlist(strsplit(twitterSample, " "))))

colnames(briefSummary) <- c('Number of lines (dataset)',
                            'Number of words (dataset)',
                            'Number of lines (sample)',
                            'Number of words (sample)')

briefSummary # datasets' summary table 
##                   Number of lines (dataset) Number of words (dataset)
## en_US.blogs.txt                      899288                  37334131
## en_US.news.txt                        77259                   2643969
## en_US.twitter.txt                   2360148                  30373583
##                   Number of lines (sample) Number of words (sample)
## en_US.blogs.txt                       44964                  1856757
## en_US.news.txt                         3862                   133166
## en_US.twitter.txt                    118007                  1517919

1-Grams

Now we will split the texts into smaller units (tokens), such as words or phrases. A closely related concept for building predictive models is the N-gram: a running sequence of N words. Before doing this, we transform each sample into a corpus object; we then clean the data and build a document-feature matrix (dfm). In this part we do this only for single words (1-grams); a tiny illustration of N-grams is given below.
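
To make the idea concrete, here is a small standalone illustration with a made-up sentence (not taken from the data); quanteda joins the words of an N-gram with an underscore:

# 1-grams vs 2-grams for a single hypothetical sentence
words <- c("thanks", "for", "the", "follow")

words                                               # 1-grams: single words
## [1] "thanks" "for"    "the"    "follow"

paste(words[-length(words)], words[-1], sep = "_")  # 2-grams: adjacent word pairs
## [1] "thanks_for" "for_the"    "the_follow"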

# building the corpus and the 1-gram dfm for each sample
blogsCorpus <- corpus(blogsSample)
blogs1Gram <- dfm(blogsCorpus, ngrams = 1, what = "word", 
                   removeNumbers = TRUE, removePunct = TRUE, removeSeparators = TRUE,
                   removeTwitter = TRUE, removeHyphens = TRUE, ignoredFeatures=stopwords("english"),
                   stem=TRUE)
## Creating a dfm from a corpus ...
##    ... lowercasing
##    ... tokenizing
##    ... indexing documents: 44,964 documents
##    ... indexing features: 65,123 feature types
##    ... removed 174 features, from 174 supplied (glob) feature types
##    ... stemming features (English), trimmed 21680 feature variants
##    ... created a 44964 x 43270 sparse dfm
##    ... complete. 
## Elapsed time: 6.11 seconds.
blogs1GramTop <- topfeatures(blogs1Gram, n = 30, decreasing = TRUE)

newsCorpus <- corpus(newsSample)
news1Gram <- dfm(newsCorpus, ngrams = 1, what = "word", 
                   removeNumbers = TRUE, removePunct = TRUE, removeSeparators = TRUE,
                   removeTwitter = TRUE, removeHyphens = TRUE, ignoredFeatures=stopwords("english"),
                   stem=TRUE)
## Creating a dfm from a corpus ...
##    ... lowercasing
##    ... tokenizing
##    ... indexing documents: 3,862 documents
##    ... indexing features: 17,515 feature types
##    ... removed 165 features, from 174 supplied (glob) feature types
##    ... stemming features (English), trimmed 5267 feature variants
##    ... created a 3862 x 12084 sparse dfm
##    ... complete. 
## Elapsed time: 0.28 seconds.
news1GramTop <- topfeatures(news1Gram, n = 30, decreasing = TRUE)

twitterCorpus <- corpus(twitterSample)
twitter1Gram <- dfm(twitterCorpus, ngrams = 1, what = "word", 
                   removeNumbers = TRUE, removePunct = TRUE, removeSeparators = TRUE,
                   removeTwitter = TRUE, removeHyphens = TRUE, ignoredFeatures=stopwords("english"),
                   stem=TRUE)
## Creating a dfm from a corpus ...
##    ... lowercasing
##    ... tokenizing
##    ... indexing documents: 118,007 documents
##    ... indexing features: 62,671 feature types
##    ... removed 172 features, from 174 supplied (glob) feature types
##    ... stemming features (English), trimmed 15129 feature variants
##    ... created a 118007 x 47371 sparse dfm
##    ... complete. 
## Elapsed time: 4.1 seconds.
twitter1GramTop <- topfeatures(twitter1Gram, n = 30, decreasing = TRUE)

Let’s plot the top 30 most frequent words:

blogsDataFrameTop_1Gram <- data.frame(feature = names(blogs1GramTop), 
                                count = blogs1GramTop,
                                source = rep("Blogs", length(blogs1GramTop)))
newsDataFrameTop_1Gram <- data.frame(feature = names(news1GramTop), 
                                count = news1GramTop,
                                source = rep("News", length(news1GramTop)))
twitterDataFrameTop_1Gram <- data.frame(feature = names(twitter1GramTop), 
                                count = twitter1GramTop,
                                source = rep("Twitter", length(twitter1GramTop)))
allTop_1Gram <- rbind(blogsDataFrameTop_1Gram, newsDataFrameTop_1Gram, twitterDataFrameTop_1Gram)
g1 <- ggplot(allTop_1Gram, aes(feature, count, fill = source))
g1 + geom_bar(stat = "identity", position = "dodge") +
  coord_flip() +
  facet_wrap(~ source) +
  labs(x = "Most frequent 1-Grams", y = "Count")

2-Grams

Here we will extract 2-grams (two-word sequences) from the samples:

# building the 2-gram dfm for each sample
blogs2Gram <- dfm(blogsCorpus, ngrams = 2, what = "word", 
                   removeNumbers = TRUE, removePunct = TRUE, removeSeparators = TRUE,
                   removeTwitter = TRUE, removeHyphens = TRUE, ignoredFeatures=stopwords("english"),
                   stem=TRUE)
## Creating a dfm from a corpus ...
##    ... lowercasing
##    ... tokenizing
##    ... indexing documents: 44,964 documents
##    ... indexing features: 669,708 feature types
##    ... removed 366,604 features, from 174 supplied (glob) feature types
##    ... stemming features (English), trimmed 10830 feature variants
##    ... created a 44964 x 292275 sparse dfm
##    ... complete. 
## Elapsed time: 29.72 seconds.
blogs2GramTop <- topfeatures(blogs2Gram, n = 30, decreasing = TRUE)

news2Gram <- dfm(newsCorpus, ngrams = 2, what = "word", 
                   removeNumbers = TRUE, removePunct = TRUE, removeSeparators = TRUE,
                   removeTwitter = TRUE, removeHyphens = TRUE, ignoredFeatures=stopwords("english"),
                   stem=TRUE)
## Creating a dfm from a corpus ...
##    ... lowercasing
##    ... tokenizing
##    ... indexing documents: 3,862 documents
##    ... indexing features: 84,737 feature types
##    ... removed 50,520 features, from 174 supplied (glob) feature types
##    ... stemming features (English), trimmed 475 feature variants
##    ... created a 3862 x 33743 sparse dfm
##    ... complete. 
## Elapsed time: 2.97 seconds.
news2GramTop <- topfeatures(news2Gram, n = 30, decreasing = TRUE)

twitter2Gram <- dfm(twitterCorpus, ngrams = 2, what = "word", 
                   removeNumbers = TRUE, removePunct = TRUE, removeSeparators = TRUE,
                   removeTwitter = TRUE, removeHyphens = TRUE, ignoredFeatures=stopwords("english"),
                   stem=TRUE)
## Creating a dfm from a corpus ...
##    ... lowercasing
##    ... tokenizing
##    ... indexing documents: 118,007 documents
##    ... indexing features: 537,332 feature types
##    ... removed 260,982 features, from 174 supplied (glob) feature types
##    ... stemming features (English), trimmed 9052 feature variants
##    ... created a 118007 x 267299 sparse dfm
##    ... complete. 
## Elapsed time: 24.13 seconds.
twitter2GramTop <- topfeatures(twitter2Gram, n = 30, decreasing = TRUE)

Let’s plot the top 30 most frequent 2-grams:

blogsDataFrameTop_2Gram <- data.frame(feature = names(blogs2GramTop), 
                                count = blogs2GramTop,
                                source = rep("Blogs", length(blogs2GramTop)))
newsDataFrameTop_2Gram <- data.frame(feature = names(news2GramTop), 
                                count = news2GramTop,
                                source = rep("News", length(news2GramTop)))
twitterDataFrameTop_2Gram <- data.frame(feature = names(twitter2GramTop), 
                                count = twitter2GramTop,
                                source = rep("Twitter", length(twitter2GramTop)))
allTop_2Gram <- rbind(blogsDataFrameTop_2Gram, newsDataFrameTop_2Gram, twitterDataFrameTop_2Gram)
g1 <- ggplot(allTop_2Gram, aes(feature, count, fill = source))
g1 + geom_bar(stat = "identity", position = "dodge") +
  coord_flip() +
  facet_wrap(~ source) +
  labs(x = "Most frequent 2-Grams", y = "Count")

3-Grams

Here we will extract 3-grams (three-word sequences) from the samples:

# building the 3-gram dfm for each sample
blogs3Gram <- dfm(blogsCorpus, ngrams = 3, what = "word", 
                   removeNumbers = TRUE, removePunct = TRUE, removeSeparators = TRUE,
                   removeTwitter = TRUE, removeHyphens = TRUE, ignoredFeatures=stopwords("english"),
                   stem=TRUE)
## Creating a dfm from a corpus ...
##    ... lowercasing
##    ... tokenizing
##    ... indexing documents: 44,964 documents
##    ... indexing features: 1,362,412 feature types
##    ... removed 1,205,187 features, from 174 supplied (glob) feature types
##    ... stemming features (English), trimmed 274 feature variants
##    ... created a 44964 x 156952 sparse dfm
##    ... complete. 
## Elapsed time: 42.09 seconds.
blogs3GramTop <- topfeatures(blogs3Gram, n = 30, decreasing = TRUE)

news3Gram <- dfm(newsCorpus, ngrams = 3, what = "word", 
                   removeNumbers = TRUE, removePunct = TRUE, removeSeparators = TRUE,
                   removeTwitter = TRUE, removeHyphens = TRUE, ignoredFeatures=stopwords("english"),
                   stem=TRUE)
## Creating a dfm from a corpus ...
##    ... lowercasing
##    ... tokenizing
##    ... indexing documents: 3,862 documents
##    ... indexing features: 116,092 feature types
##    ... removed 97,664 features, from 174 supplied (glob) feature types
##    ... stemming features (English), trimmed 20 feature variants
##    ... created a 3862 x 18409 sparse dfm
##    ... complete. 
## Elapsed time: 3.06 seconds.
news3GramTop <- topfeatures(news3Gram, n = 30, decreasing = TRUE)

twitter3Gram <- dfm(twitterCorpus, ngrams = 3, what = "word", 
                   removeNumbers = TRUE, removePunct = TRUE, removeSeparators = TRUE,
                   removeTwitter = TRUE, removeHyphens = TRUE, ignoredFeatures=stopwords("english"),
                   stem=TRUE)
## Creating a dfm from a corpus ...
##    ... lowercasing
##    ... tokenizing
##    ... indexing documents: 118,007 documents
##    ... indexing features: 970,104 feature types
##    ... removed 799,657 features, from 174 supplied (glob) feature types
##    ... stemming features (English), trimmed 309 feature variants
##    ... created a 118007 x 170139 sparse dfm
##    ... complete. 
## Elapsed time: 35.75 seconds.
twitter3GramTop <- topfeatures(twitter3Gram, n = 30, decreasing = TRUE)

Let’s plot the top 30 most frequent 3-grams:

blogsDataFrameTop_3Gram <- data.frame(feature = names(blogs3GramTop), 
                                count = blogs3GramTop,
                                source = rep("Blogs", length(blogs3GramTop)))
newsDataFrameTop_3Gram <- data.frame(feature = names(news3GramTop), 
                                count = news3GramTop,
                                source = rep("News", length(news3GramTop)))
twitterDataFrameTop_3Gram <- data.frame(feature = names(twitter3GramTop), 
                                count = twitter3GramTop,
                                source = rep("Twitter", length(twitter3GramTop)))
allTop_3Gram <- rbind(blogsDataFrameTop_3Gram, newsDataFrameTop_3Gram, twitterDataFrameTop_3Gram)
g1 <- ggplot(allTop_3Gram, aes(feature, count, fill = source))
g1 + geom_bar(stat = "identity", position = "dodge") +
  coord_flip() +
  facet_wrap(~ source) +
  labs(x = "Most frequent 3-Grams", y = "Count")

Conclusions

This report has shown the process of loading, exploring and cleaning the data that will be used to build a text prediction model.
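
As a first idea for the prediction algorithm, one could look up the most frequent N-grams that start with the user's last word(s). The sketch below assumes a hypothetical data frame bigramCounts with columns first, second and count, built from 2-gram frequencies like those computed above; it is an illustration only, not the final algorithm:

# return the n most frequent words observed after lastWord
predictNextWord <- function(lastWord, bigramCounts, n = 3) {
  hits <- bigramCounts[bigramCounts$first == tolower(lastWord), ]
  hits <- hits[order(hits$count, decreasing = TRUE), ]
  head(hits$second, n)
}

# toy data for demonstration
bigramCounts <- data.frame(first  = c("right", "right", "look", "thank"),
                           second = c("now", "here", "forward", "you"),
                           count  = c(120, 45, 80, 300),
                           stringsAsFactors = FALSE)
predictNextWord("right", bigramCounts)
## expected: "now" "here"
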
The app’s user interface will be built with Shiny. There will be an input box that triggers the algorithm each time a letter is entered; the interface then returns the most likely next words as computed by the model.
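
A minimal Shiny skeleton for such an app could look as follows (a sketch only — the final layout and prediction function, here the hypothetical predictNextWord() from above, are still to be designed):

library(shiny)

ui <- fluidPage(
  titlePanel("Next word prediction"),
  textInput("phrase", "Type a phrase:", value = ""),
  textOutput("prediction")
)

server <- function(input, output) {
  output$prediction <- renderText({
    words <- strsplit(trimws(input$phrase), "[[:space:]]+")[[1]]
    if (length(words) == 0) return("")
    # textInput updates on every keystroke, so the prediction refreshes as the user types
    paste(predictNextWord(tail(words, 1), bigramCounts), collapse = ", ")
  })
}

# shinyApp(ui, server)  # run locally or deploy to shinyapps.io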