Introduction

This report examines text and corpus properties of the English-language SwiftKey data sets in preparation for natural language processing (NLP) and predictive text modeling.

Data is imported from three source files (en_US.blogs.txt, en_US.news.txt, and en_US.twitter.txt), together with a list of swear words used for profanity filtering.

The text analysis will:

  1. Import the data
  2. Clean the data
  3. Take a sample of the data
  4. Write the sample to disk
  5. Create a corpus from the samples
  6. Clean up the text in the corpus
  7. Gather and present properties of the text files
  8. Create N-grams showing most frequent terms
  9. Present graphical results of top 1-gram, 2-gram, and 3-gram terms

Library Load

R version: 3.5.1 (2018-07-02) – “Feather Spray”

Packages used:

Package   Version
tm        0.7.5
stringr   1.3.1
dplyr     0.7.7
ggplot2   3.0.0
RWeka     0.4.39

To prevent Java (used by RWeka) from running out of memory, we increase the Java heap limit (the Xmx parameter) before RWeka is loaded.

library(tm)
library(stringr)
library(dplyr)
library(ggplot2)
# Increase Java memory limits before loading RWeka
options(java.parameters = "-Xmx8000m")
library(RWeka)
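
The versions listed above can be confirmed in the current session; this quick check is an addition for reproducibility, not part of the original analysis.

## Report loaded package versions (reproducibility check)
sapply(c("tm", "stringr", "dplyr", "ggplot2", "RWeka"),
       function(pkg) as.character(packageVersion(pkg)))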

Initial Data Manipulation

Import Data and Quick Deduplication

Read the source files as UTF-8, skipping embedded nulls, and import a list of swear words for later profanity filtering. Then remove duplicate lines from each source.

## Import Data
blogFile <- "C:/R/Datasets/swiftkey/en_US.blogs.txt"
newsFile <- "C:/R/Datasets/swiftkey/en_US.news.txt"
twitFile <- "C:/R/Datasets/swiftkey/en_US.twitter.txt"

blogs <- readLines(blogFile, encoding = "UTF-8", skipNul = TRUE)
news <- readLines(newsFile, encoding = "UTF-8", skipNul = TRUE)
twitter <- readLines(twitFile, encoding = "UTF-8", skipNul = TRUE)
swearwords <- readLines("C:/R/Datasets/swiftkey/swear.txt")

## Remove duplicate lines
twitter <- unique(twitter)
blogs <- unique(blogs)
news <- unique(news)

Create Dataset with Text File Summary

Create a data frame summarizing the source text files: size, line, character, and word counts.

# Text Statistics
## File Information
fileNames <- c("Blogs", "News", "Twitter")
## File sizes in megabytes
fileSizes <- round(file.size(c(blogFile, newsFile, twitFile))
                   / (1024^2), 2)
noLines <- sapply(list(blogs, news, twitter), length)
noChars <- sapply(list(blogs, news, twitter),
                  function(x){sum(nchar(x))})
## Count whitespace-separated words on each line
noWords <- sapply(list(blogs, news, twitter),
                  function(x){sum(str_count(x, "\\S+"))})
textSummary <- data.frame("FileName" = fileNames,
                          "SizeinMB" = fileSizes,
                          "NumberLines" = noLines,
                          "Characters" = noChars,
                          "Words" = noWords)
textSummary
##   FileName SizeinMB NumberLines Characters   Words
## 1    Blogs   205.23      899288  206824505  899288
## 2     News   200.99       77259   15639408   77259
## 3  Twitter   163.19     2305923  160656274 2305923

Create Samples

Create 10% samples from source text files, and write sampled text to disk.

## Create Samples
set.seed(12345)
twi <- sample(twitter, size = floor(0.1 * length(twitter)))
blo <- sample(blogs, size = floor(0.1 * length(blogs)))
nws <- sample(news, size = floor(0.1 * length(news)))

## Write samples to disk
writeLines(twi, "C:/R/Datasets/swiftkey/samples/twi.txt")
writeLines(blo, "C:/R/Datasets/swiftkey/samples/blo.txt")
writeLines(nws, "C:/R/Datasets/swiftkey/samples/nws.txt")
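
Note that the writeLines() calls above assume the samples folder already exists. If it does not, it can be created first; the dir.create() step below is a small convenience added here, not part of the original workflow.

## Create the samples folder if it is missing (assumed local path)
sampleDir <- "C:/R/Datasets/swiftkey/samples"
if (!dir.exists(sampleDir)) dir.create(sampleDir, recursive = TRUE)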

Corpus

Create Corpus

Create a corpus from the sample text files on disk and display basic corpus information.

## Create Corpus
fileLoc <- "C:/R/Datasets/swiftkey/samples"
textData <- VCorpus(DirSource(fileLoc, encoding = "UTF-8"))

## Inspect Corpus
inspect(textData)
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 3
## 
## [[1]]
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 20790578
## 
## [[2]]
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 1585061
## 
## [[3]]
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 16138513

Transform and Clean Up Corpus

Strip extra whitespace, remove numbers, convert all text to lower case, remove English stopwords, remove punctuation, and remove profanity.
When the predictive model is built, stopwords will be left in, since they carry information needed for next-word prediction.

# Corpus Transformation
textData <- tm_map(textData, stripWhitespace)
textData <- tm_map(textData, removeNumbers)
textData <- tm_map(textData, content_transformer(tolower))
textData <- tm_map(textData, removeWords, stopwords("english"))
textData <- tm_map(textData, content_transformer(removePunctuation))
textData <- tm_map(textData, removeWords, swearwords)
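
As a quick sanity check (an addition, not part of the original report), a few cleaned lines of one document can be inspected. Document 3 is assumed to be the Twitter sample, following the order shown by inspect() above (blo.txt, nws.txt, twi.txt).

## Spot-check a few cleaned lines from the Twitter sample (document 3)
head(content(textData[[3]]), 3)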

Create N-grams

Create a document-term matrix, then build 1-gram, 2-gram, and 3-gram term-document matrices and extract the most frequent terms from each.

## Document Term Matrix
dtm <- DocumentTermMatrix(textData)
inspect(removeSparseTerms(dtm, 0.4))
## <<DocumentTermMatrix (documents: 3, terms: 44431)>>
## Non-/sparse entries: 106257/27036
## Sparsity           : 20%
## Maximal term length: 23
## Weighting          : term frequency (tf)
## Sample             :
##          Terms
## Docs       can   get good  just  like  love  now   one time  will
##   blo.txt 9635  7052 4792  9988  9800  4445 5932 12391 8896 11331
##   nws.txt  453   346  223   405   417   102  269   635  406   837
##   twi.txt 8749 10966 9707 14856 12184 10293 8092  8126 7389  9450
# N-Grams
## 1-gram
uni <- function(x) NGramTokenizer(x, Weka_control(min = 1,
                                                  max = 1))
uniTDM <- TermDocumentMatrix(textData,
                             control = list(tokenize = uni))
uniFT <- findFreqTerms(uniTDM,
                       lowfreq = 20)
uniTerms <- rowSums(as.matrix(uniTDM[uniFT,]))
uniTerms <- data.frame(Term = names(uniTerms),
                       Frequency = uniTerms)
uniTerms <- arrange(uniTerms,
                    desc(Frequency))
oneGram <- head(uniTerms, 10)
oneGram
##    Term Frequency
## 1  just     25249
## 2  like     22401
## 3  will     21618
## 4   one     21152
## 5   can     18837
## 6   get     18364
## 7  time     16691
## 8  love     14840
## 9  good     14722
## 10  now     14293
## 2-gram
bi <- function(x) NGramTokenizer(x, Weka_control(min = 2,
                                                 max = 2))
biTDM <- TermDocumentMatrix(textData,
                            control = list(tokenize = bi))
biFT <- findFreqTerms(biTDM,
                      lowfreq = 20)
biTerms <- rowSums(as.matrix(biTDM[biFT,]))
biTerms <- data.frame(Term = names(biTerms),
                      Frequency = biTerms)
biTerms <- arrange(biTerms,
                   desc(Frequency))
twoGram <- head(biTerms, 10)
twoGram
##               Term Frequency
## 1        right now      2216
## 2       last night      1510
## 3        feel like      1165
## 4  looking forward      1071
## 5         new york       925
## 6       looks like       866
## 7          can get       863
## 8         just got       796
## 9         let know       790
## 10      first time       773
## 3-gram
tri <- function(x) NGramTokenizer(x, Weka_control(min = 3,
                                                  max = 3))
triTDM <- TermDocumentMatrix(textData,
                             control = list(tokenize = tri))
triFT <- findFreqTerms(triTDM,
                       lowfreq = 20)
triTerms <- rowSums(as.matrix(triTDM[triFT,]))
triTerms <- data.frame(Term = names(triTerms),
                       Frequency = triTerms)
triTerms <- arrange(triTerms,
                    desc(Frequency))
threeGram <- head(triTerms, 10)
threeGram
##                      Term Frequency
## 1       happy mothers day       301
## 2             let us know       231
## 3          happy new year       169
## 4           new york city       136
## 5           cinco de mayo        92
## 6  looking forward seeing        87
## 7          new york times        78
## 8           just got back        71
## 9         st patricks day        71
## 10   happy valentines day        66

Graphing Text Analysis

The following graphs show the most frequently used terms in each of the N-gram sets generated above.
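
The code below is a sketch of one way these plots can be produced with ggplot2, using the oneGram, twoGram, and threeGram data frames built above; the fill color and layout are illustrative choices rather than those of the original report.

## Helper to plot the top terms of an N-gram frequency table
plotNGram <- function(df, title) {
  ggplot(df, aes(x = reorder(Term, Frequency), y = Frequency)) +
    geom_col(fill = "steelblue") +
    coord_flip() +
    labs(title = title, x = "Term", y = "Frequency")
}

plotNGram(oneGram, "Top 10 1-gram Terms")
plotNGram(twoGram, "Top 10 2-gram Terms")
plotNGram(threeGram, "Top 10 3-gram Terms")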