This is the submission report for the Data Science Capstone course on Coursera with the Johns Hopkins Bloomberg School of Public Health.
The Natural Language Processing dataset is provided by SwiftKey; we work with the English set of files.
The objective of this report is to show some traits of these English text files. These traits relate to the words and the word sequences inside the files. From this exploration of the text features, a predictive method can be developed to estimate the next word of a partially typed English sentence.
Download the dataset:
https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip
Create a local directory and unzip the archive there. I use the directory ~/Documents/_DataScienceCapstone/final/ on my MacBook (i5, 8 GB of RAM):
The directory structure is ~/Documents/_DataScienceCapstone/final/ with the locale subdirectories /en_US, /de_DE, /fi_FI and /ru_RU.
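A minimal sketch of this download and unzip step from R (the paths are my local layout; adjust them for your machine, and note the zip file is large):

# Download and unzip the SwiftKey dataset (run once)
base <- path.expand("~/Documents/_DataScienceCapstone")
url  <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
zipf <- file.path(base, "Coursera-SwiftKey.zip")
if (!file.exists(zipf)) {
  download.file(url, destfile = zipf)
  unzip(zipf, exdir = base)   # should create the final/ directory listed above
}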
Load the data by passing the file names: en_US.twitter.txt, en_US.news.txt, en_US.blogs.txt.
setwd("~/Documents/_DataScienceCapstone/final/en_US")
conn <- file("en_US.twitter.txt")
twt <- readLines(conn)
close(conn)
In Natural Language Processing, tokenization of the text is the entry point. We focus on the English set of files and identify words, punctuation and numbers. The tokenization step can be thought of as a function of the form Token_List <- function(file_name, size), reading a sample of size lines from file_name and returning its tokens.
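As an illustrative sketch only, the Token_List idea above can be written as a small helper that reads size lines from file_name and splits them into rough tokens; the actual analysis below uses the tm and RWeka pipeline instead.

# Naive tokenizer: read `size` lines and split on anything that is not a
# letter, digit or apostrophe (illustration only; the report relies on the
# tm/RWeka functions defined further down)
Token_List <- function(file_name, size) {
  con   <- file(file_name, "r")
  lines <- readLines(con, size, skipNul = TRUE)
  close(con)
  tokens <- unlist(strsplit(tolower(lines), "[^a-z0-9']+"))
  tokens[tokens != ""]
}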
Basic Summary of the files
| File_Name | File_Size | Line_Count | Word_Count |
|---|---|---|---|
| en_US.twitter.txt | 167 MB | 2360148 | 30374206 |
| en_US.news.txt | 206 MB | 1010242 | 34372720 |
| en_US.blogs.txt | 210 MB | 899288 | 37334690 |
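These figures can be reproduced along the following lines; the word counts here are simple whitespace-token counts, so they may differ slightly from other tools.

# Rough file summary: size on disk (decimal MB), line count, whitespace-delimited word count
summarize_file <- function(file_name) {
  lines <- readLines(file_name, skipNul = TRUE, warn = FALSE)
  data.frame(File_Name  = file_name,
             File_Size  = sprintf("%.0f MB", file.info(file_name)$size / 1e6),
             Line_Count = length(lines),
             Word_Count = sum(sapply(strsplit(lines, "\\s+"), length)))
}
do.call(rbind, lapply(c("en_US.twitter.txt", "en_US.news.txt", "en_US.blogs.txt"),
                      summarize_file))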
# 0) Initialization
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(rpart)
library(ggplot2)
library(SnowballC)
library(slam)
library(RColorBrewer)
library(reshape2)
library(rCharts)
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
##
## The following object is masked from 'package:ggplot2':
##
## annotate
library(RWeka)
library(wordcloud)
library("qdap")
## Loading required package: qdapDictionaries
## Loading required package: qdapRegex
##
## Attaching package: 'qdapRegex'
##
## The following object is masked from 'package:ggplot2':
##
## %+%
##
## Loading required package: qdapTools
##
## Attaching package: 'qdap'
##
## The following objects are masked from 'package:tm':
##
## as.DocumentTermMatrix, as.TermDocumentMatrix
##
## The following object is masked from 'package:NLP':
##
## ngrams
##
## The following object is masked from 'package:base':
##
## Filter
library("RTextTools")
## Loading required package: SparseM
##
## Attaching package: 'SparseM'
##
## The following object is masked from 'package:base':
##
## backsolve
##
##
## Attaching package: 'RTextTools'
##
## The following objects are masked from 'package:SnowballC':
##
## getStemLanguages, wordStem
library(magrittr)  # provides the %>% pipe used in remRepWord() below
library(plyr)
##
## Attaching package: 'plyr'
##
## The following object is masked from 'package:qdapTools':
##
## id
setwd("~/Documents/_DataScienceCapstone/final/en_US")
set.seed(1)
options(mc.cores=1)
# 1) Read the sample from the file with the indicated size
rLines <- function (FileName, Size) {
con <- file(FileName, "r")
linn <- readLines(con,Size)
close(con)
return(linn)
}
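For example, a quick sanity check that the file reads correctly (output not shown):

# Peek at the first three lines of the Twitter file
rLines("en_US.twitter.txt", 3)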
# 2) Clean the data: lower-case, remove numbers, punctuation and English stop
#    words, and collapse repeated letters and repeated words (profanity
#    filtering and dictionary restriction are left for the model-building phase)
# Collapse runs of the same letter, e.g. "sooo" -> "so"
remRepLetter <- function(x) {
gsub('([[:alpha:]])\\1+', '\\1', x)
}
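A quick example of what this regular expression does; note that it also collapses legitimate double letters ("happy" becomes "hapy"), a side effect worth keeping in mind.

remRepLetter("I am sooooo happyyyy")
# "I am so hapy" - repeated letters are collapsed, including the double "p"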
# Drop duplicated words within a line, using the qdap/qdapRegex helpers
remRepWord <- function(str) {
str %>%
tolower() %>%
word_split() %>%
sapply(., function(x) unbag(unique(x))) %>%
rm_white_endmark() %>%
rm_default(pattern="(^[a-z]{1})", replacement = "\\U\\1") %>%
unname()
}
CleanCorp <- function(data){
  corpus <- Corpus(VectorSource(data))
  corpus <- tm_map(corpus, removeWords, stopwords("english"))
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, removePunctuation)
  # custom cleaners must be wrapped in content_transformer() for tm_map
  corpus <- tm_map(corpus, content_transformer(remRepWord))
  corpus <- tm_map(corpus, stripWhitespace)
  corpus <- tm_map(corpus, PlainTextDocument)
  corpus <- tm_map(corpus, content_transformer(tolower))
  return(corpus)
}
# Read the set of document corpus
Corp <- function(FileName, Size){
data <- rLines(FileName, Size)
corpus <- CleanCorp(data)
return(corpus)
}
# Tokenization 1 through 5 words
Term1 <- function(corpus){
uni_tokenizer <- function(x) NGramTokenizer(x,Weka_control(min = 1, max = 1))
#s.tdm <- TermDocumentMatrix(corpus, control=list(stemDocument=TRUE))
s.tdm <- TermDocumentMatrix(corpus, control=list(tokenize = uni_tokenizer))
su <- sort(row_sums(s.tdm), decreasing=TRUE)
return(su)
#m <- as.matrix(s.tdm)
#(terms <- apply_as_df(corpus, freq_terms, top=15, stopwords=tm::stopwords("en")))
#plot(terms)
#return(terms)
}
Term2 <- function(corpt) {
bi_tokenizer <- function(x) NGramTokenizer(x,Weka_control(min = 2, max = 2))
btm <- TermDocumentMatrix(corpt,control=list(tokenize = bi_tokenizer))
su <- sort(row_sums(btm), decreasing=TRUE)
return(su)
}
Term3 <- function(corpt) {
TrigramTokenizer <- function(x) NGramTokenizer(x,Weka_control(min = 3, max = 3))
ttm <- TermDocumentMatrix(corpt, control = list(tokenize = TrigramTokenizer))
su <- sort(row_sums(ttm), decreasing=TRUE)
return(su)
}
Term4 <- function(corpt) {
QuagramTokenizer <- function(x) NGramTokenizer(x,Weka_control(min = 4, max = 4))
qtm <- TermDocumentMatrix(corpt, control = list(tokenize = QuagramTokenizer))
su <- sort(row_sums(qtm), decreasing=TRUE)
return(su)
}
Term5 <- function(corpt) {
CingramTokenizer <- function(x) NGramTokenizer(x,Weka_control(min = 5, max = 5))
ctm <- TermDocumentMatrix(corpt, control = list(tokenize = CingramTokenizer))
su <- sort(row_sums(ctm), decreasing=TRUE)
return(su)
}
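Once a corpus has been built (see the Corp calls in the sections below), the most frequent n-grams can be inspected directly; for example, assuming the Twitter corpus corpt from the next section:

# Top five bigrams in the Twitter sample
head(Term2(corpt), 5)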
# Plot the data cloud plot
CPlot <- function(su,ntop) {
wordcloud(names(su), su,
max.words=ntop, random.order=F,
colors=brewer.pal(8,"Dark2"),
scale=c(2,0.2))
}
# Plot the data bar plot
BPlot <- function(x,ntop, title){
d <- data.frame(Word = names(x),Frequency=x)
d <- d[with(d, order(-Frequency)),]
  # reorder() keeps the bars sorted by frequency rather than alphabetically
  g <- ggplot(d[1:ntop,], aes(x = reorder(Word, -Frequency), y = Frequency))
p <- g +
geom_bar(stat="identity", position="identity",fill="pink",
colour="white") +
theme(axis.text.x = element_text(angle = 90, hjust=1, size=15, colour="black")) +
labs(x="Words", y="Frequency") +
ggtitle(title)
print(p)
}
# Plotting General
GenPlot <- function(su,ntop,title) {
BPlot(su,ntop, title)
CPlot(su,ntop)
}
The Twitter data file
# Twitter - chunk of 5000 lines
corpt <- Corp("en_US.twitter.txt",5000)
# Twitter unigram plot
lt <- Term1(corpt)
GenPlot(lt,15, "Twitter UniGram")
# Twitter bigram plot
lt <- Term2(corpt)
GenPlot(lt,15, "Twitter BiGram")
# Twitter trigram plot
lt <- Term3(corpt)
GenPlot(lt,15, "Twitter TriGram")
The News data file
# news - chunk of 5000 lines
corpn <- Corp("en_US.news.txt",5000)
# news unigram plot
ln <- Term1(corpn)
GenPlot(ln,15, "News UniGram")
# news bigram plot
ln <- Term2(corpn)
GenPlot(ln,15, "News BiGram")
# news trigram
ln <- Term3(corpn)
GenPlot(ln,15, "News TriGram")
The Blogs data file
# blogs - chunk of 5000 lines
corpb <- Corp("en_US.blogs.txt",5000)
# blogs unigram plot
lb <- Term1(corpb)
GenPlot(lb,15, "Blogs UniGram")
# blogs bigram plot
lb <- Term2(corpb)
GenPlot(lb,15, "Blogs BiGram")
# blogs- Trigram
lb <- Term3(corpb)
GenPlot(lb,15, "Blogs TriGram")
In this text exploration, we can see that the appropriate pre-processing steps may differ with the source of the text. For example, word repetitions were found mostly in the Twitter file, but since word repetition is a general enough phenomenon, the step that removes these repetitions is applied to the news and blogs sources as well. Words like “i” and “you” are extremely frequent in the blogs file, so we might decide to additionally remove these pronouns when pre-processing the blogs source (a minimal sketch of such a step follows below). Whatever pre-processing routines the prediction model applies to the training files, the same routines must also be applied to the incoming sentence at prediction time.
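If we do decide to drop such pronouns, tm's removeWords covers it; the pronoun list below is only an illustration.

# Hypothetical extra cleaning step for blog text: blank out personal pronouns
pronouns <- c("i", "you")   # illustrative list only
removeWords(tolower("I told you I would go"), pronouns)
# the pronouns are blanked out; stripWhitespace later squeezes the extra spaces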
Since the prediction target is a word, categorical methods may be more appropriate than a linear model. I am considering a tree model or a random forest, combined with a Markov n-gram approach: the n-th word can be approximately predicted from the n-1 words that precede it. Limiting the context to 3 or 4 words will probably be required for performance reasons, depending on the power of the platform.
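To make the Markov n-gram idea concrete, here is a rough sketch of a frequency-table lookup with a simple back-off from trigrams to bigrams; predict_next and its arguments are placeholder names, not the final model.

# Toy next-word predictor: find the most frequent trigram that starts with the
# last two words of the input, and back off to bigrams if nothing matches.
# tri_freq / bi_freq are named frequency vectors such as those returned by
# Term3() and Term2() above.
predict_next <- function(sentence, tri_freq, bi_freq) {
  words <- unlist(strsplit(tolower(sentence), "\\s+"))
  n <- length(words)
  pick <- function(freq, prefix) {
    hits <- freq[grepl(paste0("^", prefix, " "), names(freq))]
    if (length(hits) == 0) return(NA_character_)
    # n-gram names are space-separated words; return the last word of the best hit
    tail(unlist(strsplit(names(hits)[1], " ")), 1)
  }
  out <- NA_character_
  if (n >= 2) out <- pick(tri_freq, paste(words[n - 1], words[n]))
  if (is.na(out) && n >= 1) out <- pick(bi_freq, words[n])
  out
}
# Example call (assuming lt3 <- Term3(corpt) and lt2 <- Term2(corpt)):
# predict_next("thanks for the", lt3, lt2)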