General Setup

The data was made available on the Coursera website through their partner SwiftKey: https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip

The goal of this project is to understand the process of doing text analysis in R, and then to build a predictive model that predicts the next word a user is likely to type.

Predictive text modeling using NLP follows generally the same approach to data as we learned in the Data Science Specialization. The data is obtained, cleaned and explored, before moving to the predictive modeling stage using a training, validation and test set, and finally text prediction itself.

Load the necessary packages

Read in data

# Read a whole text file into a character vector, one element per line.
#
# Every file is opened in binary mode ("rb"); previously only the news file
# was, so the three data sets were not read the same way. on.exit() closes the
# connection even when readLines() stops with an error.
#
# path: path of the text file to read.
# Returns a character vector with the lines marked as UTF-8; embedded NUL
# bytes are skipped (skipNul = TRUE).
read_corpus_lines <- function(path) {
  con <- file(path, "rb")
  on.exit(close(con), add = TRUE)
  readLines(con, encoding = "UTF-8", n = -1, skipNul = TRUE)
}

twitter <- read_corpus_lines("en_US.twitter.txt")
blogs <- read_corpus_lines("en_US.blogs.txt")
news <- read_corpus_lines("en_US.news.txt")

set seed

# Fix the random number generator state so the random draws made later in
# this report give the same result on every run.
set.seed(seed = 2018)

Data Basic Info Handling

## Size of each file on disk, converted from bytes to megabytes (2^20 bytes)
x1 <- file.size("en_US.blogs.txt") / 2^20    # about 200 MB
x2 <- file.size("en_US.news.txt") / 2^20     # about 196 MB
x3 <- file.size("en_US.twitter.txt") / 2^20  # about 159 MB

## Number of lines (one vector element per line read)
y1 <- length(blogs)    # 899,288
y2 <- length(news)     # 1,010,242
y3 <- length(twitter)  # 2,360,148

## Total number of words: per-line word counts summed over the data set
z1 <- sum(stri_count_words(str = blogs))    # 37,546,246
z2 <- sum(stri_count_words(str = news))     # 34,762,395
z3 <- sum(stri_count_words(str = twitter))  # 30,093,410

## Length in characters of the longest line in each of the three data sets
t1 <- max(nchar(blogs, type = "chars"))    # 40,833
t2 <- max(nchar(news, type = "chars"))     # 11,384
t3 <- max(nchar(twitter, type = "chars"))  # 140

Display the info of files

# Summary table with one row per data set: file size in MB (x*), number of
# lines (y*), number of words (z*) and length of the longest line (t*).
Info <- data.frame(
  files_Name    = c("Blogs", "News", "Twitter"),
  files_Size    = c(x1, x2, x3),
  lines_Count   = c(y1, y2, y3),
  words_Count   = c(z1, z2, z3),
  max_size_Line = c(t1, t2, t3)
)
print(Info)
##   files_Name files_Size lines_Count words_Count max_size_Line
## 1      Blogs   200.4242      899288    37546246         40833
## 2       News   196.2775     1010242    34762395         11384
## 3    Twitter   159.3641     2360148    30093410           140

Data cleaning

# Remove non-ASCII characters.
# The lines were read with encoding = "UTF-8", so "UTF-8" (not "latin1") is
# declared as the source encoding. sub = "" replaces every byte that cannot be
# converted to ASCII with nothing, i.e. drops it; without `sub` such lines
# would become NA.
twitter <- iconv(twitter, from = "UTF-8", to = "ASCII", sub = "")
blogs <- iconv(blogs, from = "UTF-8", to = "ASCII", sub = "")
news <- iconv(news, from = "UTF-8", to = "ASCII", sub = "")

Sampling a certain size of data

# Draw a random sub-sample of up to 10,000 lines from each data set.
# Sampling is done without replacement so that no line can be drawn twice;
# a repeated line would be over-counted in any frequency computed from the
# sample. The size is capped at the number of available lines because
# sampling without replacement cannot return more elements than it is given.
sub_twitter <- sample(twitter, size = min(10000, length(twitter)),
                      replace = FALSE)
sub_blogs <- sample(blogs, size = min(10000, length(blogs)),
                    replace = FALSE)
sub_news <- sample(news, size = min(10000, length(news)),
                   replace = FALSE)

removing punctuation, stopwords, numbers, and converting characters to lower case

Explore N-grams

Uni-grams analysis

# Tokenizer handed to TermDocumentMatrix(): asks NGramTokenizer() for n-grams
# of exactly one word (min = max = 1).
UnigramTokenizer <- function(x) {
  NGramTokenizer(x, control = Weka_control(min = 1, max = 1))
}

# Term-document matrix of unigram counts, then a printed summary of it.
# NOTE(review): `corpus` is not created in the visible part of this report;
# confirm it is built before this chunk runs.
corpusMatrix <- TermDocumentMatrix(
  corpus,
  control = list(tokenize = UnigramTokenizer)
)
inspect(corpusMatrix)
## <<TermDocumentMatrix (terms: 51188, documents: 3)>>
## Non-/sparse entries: 75805/77759
## Sparsity           : 51%
## Maximal term length: 78
## Weighting          : term frequency (tf)
## Sample             :
##       Docs
## Terms  blogs.txt news.txt twitter.txt
##   can       1100      557         367
##   get        764      456         513
##   just      1108      563         664
##   like      1060      490         516
##   new        636      677         296
##   now        673      351         381
##   one       1386      775         299
##   said       408     2522          58
##   time       940      497         326
##   will      1150     1082         403
# Total frequency of every unigram, summed over the documents of the matrix
# and sorted from most to least frequent.
uniGram <- as.data.frame(as.matrix(corpusMatrix))
uniGramSorted <- sort(rowSums(uniGram), decreasing = TRUE)
uniGram_df <- data.frame(word = names(uniGramSorted), freq = uniGramSorted)

# Ten most frequent unigrams. head() simply returns fewer rows when fewer than
# ten terms exist, whereas indexing with 1:10 would pad the table with NA rows.
data <- head(uniGram_df, 10)

# Horizontal bar chart of the top ten with the count printed next to each bar.
# reorder() arranges the bars by frequency (a plain character axis would be
# laid out alphabetically); xlab() keeps the axis title "word". geom_text()
# inherits data and aesthetics from ggplot(), so only the label is mapped.
g <- ggplot(data = data, aes(x = reorder(word, freq), y = freq)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = freq), hjust = -1, position = "identity") +
  coord_flip() +
  xlab("word") +
  ggtitle("Frequency")
g

# Word cloud of the unigrams, limited to 50 words (max.words) and not placed
# in random order. The palette is the 9-colour "BuPu" set with its first three
# colours removed.
# `<-` and FALSE replace `=` and F: F is an ordinary variable that can be
# reassigned, FALSE cannot.
p <- brewer.pal(9, "BuPu")
p <- p[-(1:3)]
wordcloud(uniGram_df$word, uniGram_df$freq, max.words = 50,
          random.order = FALSE, colors = p)

Bi-gram analysis

# Tokenizer handed to TermDocumentMatrix(): asks NGramTokenizer() for n-grams
# of exactly two words (min = max = 2).
BigramTokenizer <- function(x) {
  NGramTokenizer(x, control = Weka_control(min = 2, max = 2))
}

# Bigram term-document matrix, passed through removeSparseTerms() with a
# sparsity threshold of 0.25 before its summary is printed.
corpusMatrix2 <- removeSparseTerms(
  TermDocumentMatrix(corpus, control = list(tokenize = BigramTokenizer)),
  sparse = 0.25
)
inspect(corpusMatrix2)
## <<TermDocumentMatrix (terms: 2374, documents: 3)>>
## Non-/sparse entries: 7122/0
## Sparsity           : 0%
## Maximal term length: 21
## Weighting          : term frequency (tf)
## Sample             :
##              Docs
## Terms         blogs.txt news.txt twitter.txt
##   cant wait          22        1          84
##   dont know          71       34          41
##   feel like          58       17          29
##   first time         39       45          19
##   high school        34       85          13
##   last week          49       70          12
##   last year          48      119           5
##   new york           49      117           9
##   right now          50       34          73
##   years ago          57       66           8
# Collapse the bigram matrix to one total count per term (summed over the
# documents) and rank the terms from most to least frequent.
biGram <- as.data.frame(as.matrix(corpusMatrix2))
biGramSorted <- sort(rowSums(biGram), decreasing = TRUE)
biGram_df <- data.frame(
  word = names(biGramSorted),
  freq = biGramSorted
)

# Render the ten most frequent bigrams as a table without row names.
kable(
  head(biGram_df, n = 10),
  row.names = FALSE,
  caption = "Top Ten Bigrams"
)
Top Ten Bigrams
word freq
new york 175
last year 172
right now 157
dont know 146
high school 132
last week 131
years ago 131
cant wait 107
feel like 104
first time 103
# Word cloud of the bigrams, limited to 50 terms (max.words) and not placed in
# random order. The palette is the 9-colour "Blues" set with its first three
# colours removed.
# `<-` and FALSE replace `=` and F: F is an ordinary variable that can be
# reassigned, FALSE cannot.
p <- brewer.pal(9, "Blues")
p <- p[-(1:3)]
wordcloud(biGram_df$word, biGram_df$freq, max.words = 50,
          random.order = FALSE, colors = p)

Tri-gram analysis

# Tokenizer handed to TermDocumentMatrix(): asks NGramTokenizer() for n-grams
# of exactly three words (min = max = 3).
TrigramTokenizer <- function(x) {
  NGramTokenizer(x, control = Weka_control(min = 3, max = 3))
}

# Trigram term-document matrix, passed through removeSparseTerms() with a
# sparsity threshold of 0.66 before its summary is printed.
corpusMatrix3 <- removeSparseTerms(
  TermDocumentMatrix(corpus, control = list(tokenize = TrigramTokenizer)),
  sparse = 0.66
)
inspect(corpusMatrix3)
## <<TermDocumentMatrix (terms: 929, documents: 3)>>
## Non-/sparse entries: 1909/878
## Sparsity           : 32%
## Maximal term length: 28
## Weighting          : term frequency (tf)
## Sample             :
##                         Docs
## Terms                    blogs.txt news.txt twitter.txt
##   cant wait see                  5        0          19
##   happy mothers day              1        0          15
##   happy new year                 6        0           8
##   im looking forward             7        3           5
##   let us know                    4        1          10
##   new york city                 10       10           0
##   new york times                 8       10           0
##   president barack obama         3       17           0
##   two years ago                  7        8           1
##   world war ii                   1       13           0
# Collapse the trigram matrix to one total count per term (summed over the
# documents) and rank the terms from most to least frequent.
triGram <- as.data.frame(as.matrix(corpusMatrix3))
triGramSorted <- sort(rowSums(triGram), decreasing = TRUE)
triGram_df <- data.frame(
  word = names(triGramSorted),
  freq = triGramSorted
)

# Render the ten most frequent trigrams as a table without row names.
kable(
  head(triGram_df, n = 10),
  row.names = FALSE,
  caption = "Top Ten Trigrams"
)
Top Ten Trigrams
word freq
cant wait see 24
new york city 20
president barack obama 20
new york times 18
happy mothers day 16
two years ago 16
im looking forward 15
let us know 15
happy new year 14
world war ii 14
# Word cloud of the trigrams, limited to 50 terms (max.words) and not placed
# in random order. The palette is the 9-colour "Greens" set with its first
# three colours removed.
# `<-` and FALSE replace `=` and F: F is an ordinary variable that can be
# reassigned, FALSE cannot.
p <- brewer.pal(9, "Greens")
p <- p[-(1:3)]
wordcloud(triGram_df$word, triGram_df$freq, max.words = 50,
          random.order = FALSE, colors = p)

Next step

In the next step, I’ll dive deeper into building the predictive model, using the most closely related words found in the bi-gram and tri-gram analysis. After that, I’ll try to build an effective model and insert it into a Shiny app to display the predictions and improve user interaction.