Capstone - Milestone Report

General Setup

The data was available from the Coursera website through their partner Swiftkey. https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip

The goal of this project is to understand the process of doing text analysis in R, and then to build a predictive model that predicts the next word a user is likely to type.

Predictive text modeling using NLP follows generally the same approach to data as we learned in the Data Science Specialization. The data is obtained, cleaned and explored, before moving to the predictive modeling stage using a training, validation and test set, and finally text prediction itself.

Load the necessary packages

Read in data

# Read all lines of a text file and mark them as UTF-8.
#
# The connection is opened explicitly in binary mode ("rb") for every file.
# Previously only the news file was opened this way; using one mode for all
# three keeps the reads consistent.
# NOTE(review): presumably "rb" was chosen for the news file to avoid
# platform-specific text-mode handling - confirm.
#
# path: path to the text file.
# Returns a character vector with one element per line. Embedded nul bytes
# are skipped (skipNul = TRUE).
read_corpus_lines <- function(path) {
  con <- file(path, open = "rb")
  # Close the connection even when readLines() fails.
  on.exit(close(con), add = TRUE)
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}

twitter <- read_corpus_lines("en_US.twitter.txt")
blogs <- read_corpus_lines("en_US.blogs.txt")
news <- read_corpus_lines("en_US.news.txt")

set seed

# Fix the random-number seed so the sample() draws later in the report are
# reproducible.
set.seed(seed = 2018L)

Data Basic Info Handling

## File sizes on disk, converted from bytes to megabytes
x1 <- file.size("en_US.blogs.txt") / 1024^2   # about 200.4 MB
x2 <- file.size("en_US.news.txt") / 1024^2    # about 196.3 MB
x3 <- file.size("en_US.twitter.txt") / 1024^2 # about 159.4 MB

## Number of lines read from each file
y1 <- length(blogs)   # 899,288
y2 <- length(news)    # 1,010,242
y3 <- length(twitter) # 2,360,148

## Total word count per file (per-line counts summed)
z1 <- sum(stri_count_words(blogs))   # 37,546,246
z2 <- sum(stri_count_words(news))    # 34,762,395
z3 <- sum(stri_count_words(twitter)) # 30,093,410

## Length, in characters, of the longest line in each data set
t1 <- max(nchar(blogs, type = "chars"))   # 40,833
t2 <- max(nchar(news, type = "chars"))    # 11,384
t3 <- max(nchar(twitter, type = "chars")) # 140

Display the info of files

# Summary table with one row per source: file size in MB, number of lines,
# number of words, and length of the longest line.
Info <- data.frame(
  files_Name    = c("Blogs", "News", "Twitter"),
  files_Size    = c(x1, x2, x3),
  lines_Count   = c(y1, y2, y3),
  words_Count   = c(z1, z2, z3),
  max_size_Line = c(t1, t2, t3)
)
Info

##   files_Name files_Size lines_Count words_Count max_size_Line
## 1      Blogs   200.4242      899288    37546246         40833
## 2       News   196.2775     1010242    34762395         11384
## 3    Twitter   159.3641     2360148    30093410           140

Data cleaning

# Remove non-ASCII characters
twitter <- iconv(twitter, from = "latin1", "ASCII", sub = "")
blogs <- iconv(blogs, from = "latin1", "ASCII", sub = "")
news <- iconv(news, from = "latin1", "ASCII", sub = "")

Sampling a certain size of data

# Draw 10,000 lines from each source, with replacement: sample line positions
# first, then index the character vector with them.
sub_twitter <- twitter[
  sample.int(length(twitter), size = 10000, replace = TRUE)
]
sub_blogs <- blogs[
  sample.int(length(blogs), size = 10000, replace = TRUE)
]
sub_news <- news[
  sample.int(length(news), size = 10000, replace = TRUE)
]

Removing punctuation, stopwords, and numbers, and converting characters to lower case

Explore N-grams

Uni-grams analysis

# Tokenizer producing n-grams of exactly one word (min = max = 1).
UnigramTokenizer <- function(x) {
  NGramTokenizer(x, Weka_control(min = 1, max = 1))
}

# Term-document matrix of unigram counts.
# `corpus` is created outside the code shown in this report.
corpusMatrix <- TermDocumentMatrix(
  corpus,
  control = list(tokenize = UnigramTokenizer)
)

inspect(corpusMatrix)

## <<TermDocumentMatrix (terms: 51188, documents: 3)>>
## Non-/sparse entries: 75805/77759
## Sparsity           : 51%
## Maximal term length: 78
## Weighting          : term frequency (tf)
## Sample             :
##       Docs
## Terms  blogs.txt news.txt twitter.txt
##   can       1100      557         367
##   get        764      456         513
##   just      1108      563         664
##   like      1060      490         516
##   new        636      677         296
##   now        673      351         381
##   one       1386      775         299
##   said       408     2522          58
##   time       940      497         326
##   will      1150     1082         403

# Sum each term's counts across the documents and rank terms by total
# frequency.
uniGram <- as.data.frame(as.matrix(corpusMatrix))
uniGramSorted <- sort(rowSums(uniGram), decreasing = TRUE)
uniGram_df <- data.frame(word = names(uniGramSorted), freq = uniGramSorted)

# Keep the ten most frequent terms. head() returns fewer rows when fewer than
# ten terms exist, instead of padding with NA rows as `[1:10, ]` would.
data <- head(uniGram_df, 10)

# Horizontal bar chart of the top terms. reorder() orders the bars by
# frequency rather than alphabetically; xlab() keeps the axis label "word".
# geom_text() inherits the data and aesthetics from ggplot().
g <- ggplot(data = data, aes(x = reorder(word, freq), y = freq)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  ggtitle("Frequency") +
  xlab("word") +
  geom_text(aes(label = freq), hjust = -1, position = "identity")
g

# Word cloud of the unigrams, limited to 50 words and not placed in random
# order. Colours are the 9-colour "BuPu" palette without its first three
# entries.
p <- brewer.pal(9, "BuPu")[4:9]
wordcloud(
  words = uniGram_df$word,
  freq = uniGram_df$freq,
  max.words = 50,
  random.order = FALSE,
  colors = p
)

Bi-gram analysis

# Tokenizer producing n-grams of exactly two words (min = max = 2).
BigramTokenizer <- function(x) {
  NGramTokenizer(x, Weka_control(min = 2, max = 2))
}

# Bigram term-document matrix, with sparse terms removed at a threshold of
# 0.25.
# `corpus` is created outside the code shown in this report.
corpusMatrix2 <- removeSparseTerms(
  TermDocumentMatrix(corpus, control = list(tokenize = BigramTokenizer)),
  0.25
)
inspect(corpusMatrix2)

## <<TermDocumentMatrix (terms: 2374, documents: 3)>>
## Non-/sparse entries: 7122/0
## Sparsity           : 0%
## Maximal term length: 21
## Weighting          : term frequency (tf)
## Sample             :
##              Docs
## Terms         blogs.txt news.txt twitter.txt
##   cant wait          22        1          84
##   dont know          71       34          41
##   feel like          58       17          29
##   first time         39       45          19
##   high school        34       85          13
##   last week          49       70          12
##   last year          48      119           5
##   new york           49      117           9
##   right now          50       34          73
##   years ago          57       66           8

# Sum each bigram's counts across the documents and rank by total frequency.
biGram <- as.data.frame(as.matrix(corpusMatrix2))
biGramSorted <- sort(rowSums(biGram), decreasing = TRUE)
biGram_df <- data.frame(
  word = names(biGramSorted),
  freq = biGramSorted
)

# Render the ten most frequent bigrams as a table.
kable(
  head(biGram_df, n = 10),
  row.names = FALSE,
  caption = "Top Ten Bigrams"
)

Top Ten Bigrams
word	freq
new york	175
last year	172
right now	157
dont know	146
high school	132
last week	131
years ago	131
cant wait	107
feel like	104
first time	103

# Word cloud of the bigrams, limited to 50 entries and not placed in random
# order. Colours are the 9-colour "Blues" palette without its first three
# entries.
p <- brewer.pal(9, "Blues")[4:9]
wordcloud(
  words = biGram_df$word,
  freq = biGram_df$freq,
  max.words = 50,
  random.order = FALSE,
  colors = p
)

Tri-gram analysis

# Tokenizer producing n-grams of exactly three words (min = max = 3).
TrigramTokenizer <- function(x) {
  NGramTokenizer(x, Weka_control(min = 3, max = 3))
}

# Trigram term-document matrix, with sparse terms removed at a threshold of
# 0.66.
# `corpus` is created outside the code shown in this report.
corpusMatrix3 <- removeSparseTerms(
  TermDocumentMatrix(corpus, control = list(tokenize = TrigramTokenizer)),
  0.66
)
inspect(corpusMatrix3)

## <<TermDocumentMatrix (terms: 929, documents: 3)>>
## Non-/sparse entries: 1909/878
## Sparsity           : 32%
## Maximal term length: 28
## Weighting          : term frequency (tf)
## Sample             :
##                         Docs
## Terms                    blogs.txt news.txt twitter.txt
##   cant wait see                  5        0          19
##   happy mothers day              1        0          15
##   happy new year                 6        0           8
##   im looking forward             7        3           5
##   let us know                    4        1          10
##   new york city                 10       10           0
##   new york times                 8       10           0
##   president barack obama         3       17           0
##   two years ago                  7        8           1
##   world war ii                   1       13           0

# Sum each trigram's counts across the documents and rank by total frequency.
triGram <- as.data.frame(as.matrix(corpusMatrix3))
triGramSorted <- sort(rowSums(triGram), decreasing = TRUE)
triGram_df <- data.frame(
  word = names(triGramSorted),
  freq = triGramSorted
)

# Render the ten most frequent trigrams as a table.
kable(
  head(triGram_df, n = 10),
  row.names = FALSE,
  caption = "Top Ten Trigrams"
)

Top Ten Trigrams
word	freq
cant wait see	24
new york city	20
president barack obama	20
new york times	18
happy mothers day	16
two years ago	16
im looking forward	15
let us know	15
happy new year	14
world war ii	14

# Word cloud of the trigrams, limited to 50 entries and not placed in random
# order. Colours are the 9-colour "Greens" palette without its first three
# entries.
p <- brewer.pal(9, "Greens")[4:9]
wordcloud(
  words = triGram_df$word,
  freq = triGram_df$freq,
  max.words = 50,
  random.order = FALSE,
  colors = p
)

Next step

In the next step, I’ll dive deeper into building the predictive model, using the bi-gram and tri-gram analysis to find the most closely related words. After that I’ll try to build an effective model and embed it in a Shiny app to display predictions and improve user interaction.