This report covers text and corpus properties of the source data for NLP.
Data is imported from three files: en_US.blogs.txt, en_US.news.txt, and en_US.twitter.txt.
The text analysis will summarise the source files, draw 10% samples, build and clean a corpus, create a document-term matrix, and generate 1-gram, 2-gram, and 3-gram frequency tables with plots of the most frequent terms.
R version: 3.5.1 (2018-07-02) – “Feather Spray”
Packages used:
| Package | Version |
|---|---|
| tm | 0.7.5 |
| stringr | 1.3.1 |
| dplyr | 0.7.7 |
| ggplot2 | 3.0.0 |
| RWeka | 0.4.39 |
To prevent Java (used by RWeka) from running out of memory, the Java heap limit (the -Xmx parameter) is raised before RWeka is loaded.
library(tm)
library(stringr)
library(dplyr)
library(ggplot2)
# Increase Java memory limits before loading RWeka
options(java.parameters = "-Xmx8000m")
library(RWeka)
Read the text files as UTF-8 with embedded nul characters skipped, and import a list of swearwords for the profanity filter. Duplicate lines are then removed.
## Import Data
blogFile <- "C:/R/Datasets/swiftkey/en_US.blogs.txt"
newsFile <- "C:/R/Datasets/swiftkey/en_US.news.txt"
twitFile <- "C:/R/Datasets/swiftkey/en_US.twitter.txt"
blogs <- readLines(blogFile, encoding = "UTF-8", skipNul = TRUE)
news <- readLines(newsFile, encoding = "UTF-8", skipNul = TRUE)
twitter <- readLines(twitFile, encoding = "UTF-8", skipNul = TRUE)
swearwords <- readLines("C:/R/Datasets/swiftkey/swear.txt")
## Remove duplicate lines
twitter <- unique(twitter)
blogs <- unique(blogs)
news <- unique(news)
Create a data frame with summary statistics on the source text files.
# Text Statistics
## File Information
fileNames <- c("Blogs", "News", "Twitter")
fileSizes <- round(file.size(c(blogFile, newsFile, twitFile)) / (1024 * 1000), 2)
noLines <- sapply(list(blogs, news, twitter), length)
noChars <- sapply(list(blogs, news, twitter), function(x) sum(nchar(x)))
# Count whitespace-separated words per file
noWords <- sapply(list(blogs, news, twitter), function(x) sum(str_count(x, "\\S+")))
textSummary <- data.frame(FileName = fileNames,
                          SizeinMB = fileSizes,
                          NumberLines = noLines,
                          Characters = noChars,
                          Words = noWords)
textSummary
## FileName SizeinMB NumberLines Characters Words
## 1 Blogs 205.23 899288 206824505 899288
## 2 News 200.99 77259 15639408 77259
## 3 Twitter 163.19 2305923 160656274 2305923
Create 10% samples from the source text files and write the sampled text to disk.
## Create Samples
set.seed(12345)
twi <- sample(twitter, size = (0.1 * length(twitter)))
blo <- sample(blogs, size = (0.1 * length(blogs)))
nws <- sample(news, size = (0.1 * length(news)))
# Write samples to disk
writeLines(twi, "C:/R/Datasets/swiftkey/samples/twi.txt")
writeLines(blo, "C:/R/Datasets/swiftkey/samples/blo.txt")
writeLines(nws, "C:/R/Datasets/swiftkey/samples/nws.txt")
Create a corpus from the sampled text files on disk and display corpus information.
## Create Corpus
fileLoc <- "C:/R/Datasets/swiftkey/samples"
textData <- VCorpus(DirSource(fileLoc, encoding = "UTF-8"))
## Inspect Corpus
inspect(textData)
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 3
##
## [[1]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 20790578
##
## [[2]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 1585061
##
## [[3]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 16138513
The corpus transformations strip extra whitespace, remove numbers, convert text to lower case, remove English stopwords, remove punctuation, and remove profanity. When the predictive model is built later, stopwords will be left in, since common words are needed for next-word prediction (see the sketch after the transformation code below).
# Corpus Transformation
textData <- tm_map(textData, stripWhitespace)
textData <- tm_map(textData, removeNumbers)
textData <- tm_map(textData, content_transformer(tolower))
textData <- tm_map(textData, removeWords, stopwords("english"))
textData <- tm_map(textData, content_transformer(removePunctuation))
textData <- tm_map(textData, removeWords, swearwords)
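For reference, a minimal sketch of how the same cleaning pipeline might look for the prediction model, with the stopword step omitted; the object name predData is hypothetical and this variant is not run here.
# Variant pipeline for the prediction model: stopwords are kept (predData is a hypothetical name)
predData <- VCorpus(DirSource(fileLoc, encoding = "UTF-8"))
predData <- tm_map(predData, stripWhitespace)
predData <- tm_map(predData, removeNumbers)
predData <- tm_map(predData, content_transformer(tolower))
predData <- tm_map(predData, removePunctuation)
predData <- tm_map(predData, removeWords, swearwords)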
Create a document-term matrix, then build 1-gram, 2-gram, and 3-gram frequency tables.
## Document Term Matrix
dtm <- DocumentTermMatrix(textData)
inspect(removeSparseTerms(dtm, 0.4))
## <<DocumentTermMatrix (documents: 3, terms: 44431)>>
## Non-/sparse entries: 106257/27036
## Sparsity : 20%
## Maximal term length: 23
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs can get good just like love now one time will
## blo.txt 9635 7052 4792 9988 9800 4445 5932 12391 8896 11331
## nws.txt 453 346 223 405 417 102 269 635 406 837
## twi.txt 8749 10966 9707 14856 12184 10293 8092 8126 7389 9450
# N-Grams
## 1-gram
uni <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
uniTDM <- TermDocumentMatrix(textData, control = list(tokenize = uni))
uniFT <- findFreqTerms(uniTDM, lowfreq = 20)
uniTerms <- rowSums(as.matrix(uniTDM[uniFT, ]))
uniTerms <- data.frame(Term = names(uniTerms), Frequency = uniTerms)
uniTerms <- arrange(uniTerms, desc(Frequency))
oneGram <- head(uniTerms, 10)
oneGram
## Term Frequency
## 1 just 25249
## 2 like 22401
## 3 will 21618
## 4 one 21152
## 5 can 18837
## 6 get 18364
## 7 time 16691
## 8 love 14840
## 9 good 14722
## 10 now 14293
## 2-gram
bi <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
biTDM <- TermDocumentMatrix(textData, control = list(tokenize = bi))
biFT <- findFreqTerms(biTDM, lowfreq = 20)
biTerms <- rowSums(as.matrix(biTDM[biFT, ]))
biTerms <- data.frame(Term = names(biTerms), Frequency = biTerms)
biTerms <- arrange(biTerms, desc(Frequency))
twoGram <- head(biTerms, 10)
twoGram
## Term Frequency
## 1 right now 2216
## 2 last night 1510
## 3 feel like 1165
## 4 looking forward 1071
## 5 new york 925
## 6 looks like 866
## 7 can get 863
## 8 just got 796
## 9 let know 790
## 10 first time 773
## 3-gram
tri <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
triTDM <- TermDocumentMatrix(textData, control = list(tokenize = tri))
triFT <- findFreqTerms(triTDM, lowfreq = 20)
triTerms <- rowSums(as.matrix(triTDM[triFT, ]))
triTerms <- data.frame(Term = names(triTerms), Frequency = triTerms)
triTerms <- arrange(triTerms, desc(Frequency))
threeGram <- head(triTerms, 10)
threeGram
## Term Frequency
## 1 happy mothers day 301
## 2 let us know 231
## 3 happy new year 169
## 4 new york city 136
## 5 cinco de mayo 92
## 6 looking forward seeing 87
## 7 new york times 78
## 8 just got back 71
## 9 st patricks day 71
## 10 happy valentines day 66
The following graphs show the most frequently used terms in each of the N-grams generated.
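As an illustration, a minimal ggplot2 sketch for one such chart, using the oneGram data frame built above; the same pattern applies to twoGram and threeGram, and the plot title and axis labels are placeholders.
# Horizontal bar chart of the ten most frequent 1-grams
ggplot(oneGram, aes(x = reorder(Term, Frequency), y = Frequency)) +
  geom_col() +
  coord_flip() +
  labs(title = "Most frequent 1-grams", x = "Term", y = "Frequency")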